package talon

  1. Overview
  2. Docs

Source file talon_csv.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
(*---------------------------------------------------------------------------
  Copyright (c) 2026 The Raven authors. All rights reserved.
  SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** Association list from a column name to the dtype tag that column should be
    parsed as. NOTE(review): presumably the type of the [?dtype_spec] argument
    accepted by the readers in this file; confirm against the interface. *)
type dtype_spec =
  (string * [ `Float32 | `Float64 | `Int32 | `Int64 | `Bool | `String ]) list

(** Cell strings treated as missing values when the caller does not pass
    [?na_values] (it is the default of that argument in [df_of_rows]).
    Matching is exact and case-sensitive. *)
let default_na_values = [ ""; "NA"; "N/A"; "null"; "NULL"; "nan"; "NaN" ]
(** [is_null_value na_values s] is [true] iff [s] is equal to one of the
    markers listed in [na_values]. *)
let is_null_value na_values s =
  List.exists (fun marker -> compare marker s = 0) na_values

(** [detect_dtype na_values values] infers a dtype tag for a column from its
    raw cell strings.

    Cells listed in [na_values] are ignored. The remaining cells are tried
    against each candidate in turn and the first one that accepts every cell
    wins:
    - [`Bool] if every cell is one of true/t/yes/y/1/false/f/no/n/0
      (ASCII case-insensitive);
    - [`Int32] if every cell is accepted by [Int64.of_string_opt] and fits in
      32 bits, [`Int64] if at least one does not fit;
    - [`Float32] if every cell is accepted by [float_of_string_opt];
    - [`String] otherwise, and also when no non-null cell exists. *)
let detect_dtype na_values values =
  let is_bool_literal v =
    match String.lowercase_ascii v with
    | "true" | "t" | "yes" | "y" | "1" | "false" | "f" | "no" | "n" | "0" ->
        true
    | _ -> false
  in
  let int32_lo = Int64.of_int32 Int32.min_int in
  let int32_hi = Int64.of_int32 Int32.max_int in
  (* [Some wide] when every cell is an integer literal, where [wide] tells
     whether any of them falls outside the int32 range; [None] as soon as one
     cell is not an integer. Option-returning parsers are used so that no
     unrelated exception can be mistaken for "does not parse". *)
  let rec scan_ints wide = function
    | [] -> Some wide
    | v :: rest -> (
        match Int64.of_string_opt v with
        | None -> None
        | Some i ->
            let out_of_int32 =
              Int64.compare i int32_hi > 0 || Int64.compare i int32_lo < 0
            in
            scan_ints (wide || out_of_int32) rest)
  in
  let is_float_literal v = Option.is_some (float_of_string_opt v) in
  match List.filter (fun v -> not (is_null_value na_values v)) values with
  | [] -> `String
  | present -> (
      if List.for_all is_bool_literal present then `Bool
      else
        match scan_ints false present with
        | Some true -> `Int64
        | Some false -> `Int32
        | None ->
            if List.for_all is_float_literal present then `Float32
            else `String)

(** [columns_of_rows na_values dtype_spec column_names data_rows] transposes
    row-major cell strings into one [(name, column)] pair per entry of
    [column_names], in the same order.

    - The dtype of a column is taken from [dtype_spec] when it is [Some specs]
      and [specs] has an entry for the column name; otherwise it is inferred
      with [detect_dtype].
    - A cell becomes null when it is listed in [na_values] or when it cannot
      be parsed as the column's dtype.
    - A row with fewer cells than [column_names] contributes a null to every
      column it does not reach, so all columns have exactly one entry per
      row. Cells beyond the last column name are dropped. *)
let columns_of_rows na_values dtype_spec column_names data_rows =
  let num_cols = List.length column_names in
  (* One accumulator per column, built in reverse row order. [None] marks a
     cell that is absent because its row was too short. *)
  let columns_data = Array.make num_cols [] in
  let push i cell = columns_data.(i) <- cell :: columns_data.(i) in
  let rec fill i cells =
    if i < num_cols then (
      match cells with
      | [] ->
          push i None;
          fill (i + 1) []
      | cell :: rest ->
          push i (Some cell);
          fill (i + 1) rest)
  in
  List.iter (fill 0) data_rows;
  List.mapi
    (fun i name ->
      let cells = List.rev columns_data.(i) in
      let dtype =
        match Option.bind dtype_spec (List.assoc_opt name) with
        | Some dtype -> dtype
        | None ->
            (* Absent cells carry no type information. *)
            detect_dtype na_values (List.filter_map Fun.id cells)
      in
      let parse_col ~parse ~make =
        cells
        |> List.map (function
             | None -> None
             | Some v ->
                 if is_null_value na_values v then None
                 else (
                   (* The stdlib parsers signal bad input with [Failure]; the
                      bool parser below uses [Exit]. Anything else is not a
                      parse error and is left to propagate. *)
                   try Some (parse v) with Failure _ | Exit -> None))
        |> Array.of_list
        |> make
      in
      let column =
        match dtype with
        | `Float32 ->
            parse_col ~parse:float_of_string ~make:Talon.Col.float32_opt
        | `Float64 ->
            parse_col ~parse:float_of_string ~make:Talon.Col.float64_opt
        | `Int32 -> parse_col ~parse:Int32.of_string ~make:Talon.Col.int32_opt
        | `Int64 -> parse_col ~parse:Int64.of_string ~make:Talon.Col.int64_opt
        | `Bool ->
            parse_col ~make:Talon.Col.bool_opt ~parse:(fun v ->
                match String.lowercase_ascii v with
                | "true" | "t" | "yes" | "y" | "1" -> true
                | "false" | "f" | "no" | "n" | "0" -> false
                | _ -> raise Exit)
        | `String -> parse_col ~parse:Fun.id ~make:Talon.Col.string_opt
      in
      (name, column))
    column_names

(** [col_string_fns na_repr df] builds, for each column of [df] in
    [Talon.column_names] order, the function obtained from
    [Talon.Col.to_string_fn] with [~null:na_repr]. Callers in this file apply
    each function to a row index to get that row's cell text. *)
let col_string_fns na_repr df =
  let render_fn_for name =
    let column = Talon.get_column_exn df name in
    Talon.Col.to_string_fn ~null:na_repr column
  in
  Talon.column_names df |> List.map render_fn_for

(** [df_of_rows ?names ?na_values ?dtype_spec rows] builds a data frame from
    already-split CSV rows.

    - With [~names], every row is data and [names] supplies the column names.
    - Without [~names], the first row is the header and the rest is data.
    - When there are no data rows, the result has one empty string column per
      column name; with neither names nor rows it is [Talon.empty].

    [na_values] defaults to [default_na_values]. *)
let df_of_rows ?names ?(na_values = default_na_values) ?dtype_spec rows =
  let empty_string_frame column_names =
    column_names
    |> List.map (fun name -> (name, Talon.Col.string [||]))
    |> Talon.create
  in
  let typed_frame column_names data =
    Talon.create (columns_of_rows na_values dtype_spec column_names data)
  in
  match (names, rows) with
  | Some column_names, [] -> empty_string_frame column_names
  | Some column_names, (_ :: _ as data) -> typed_frame column_names data
  | None, [] -> Talon.empty
  | None, [ header ] -> empty_string_frame header
  | None, header :: data -> typed_frame header data

(** [of_string ?sep ?names ?na_values ?dtype_spec s] splits the CSV text [s]
    into rows with [Csv_io.parse], using [sep] (default [',']) as separator,
    and builds a data frame from them with [df_of_rows]. *)
let of_string ?(sep = ',') ?names ?na_values ?dtype_spec s =
  let rows = Csv_io.parse ~separator:sep s in
  df_of_rows ?names ?na_values ?dtype_spec rows

(** [to_string ?sep ?na_repr df] renders [df] as CSV text: a header row with
    the column names, then one row per data row. [sep] defaults to [','];
    [na_repr] (default [""]) is passed on as the text for null cells. *)
let to_string ?(sep = ',') ?(na_repr = "") df =
  let out = Buffer.create 1024 in
  let cell_fns = col_string_fns na_repr df in
  let row_count = Talon.num_rows df in
  let emit cells = Csv_io.write_row out sep cells in
  emit (Talon.column_names df);
  for row = 0 to row_count - 1 do
    emit (List.map (fun cell_at -> cell_at row) cell_fns)
  done;
  Buffer.contents out

(** [read ?sep ?names ?na_values ?dtype_spec path] loads the CSV file at
    [path] into a data frame.

    The file is read one physical line at a time: each line goes through
    [Csv_io.strip_cr], lines that are empty afterwards are skipped, and every
    other line is split with [Csv_io.parse_row] using [sep] (default [',']).
    The collected rows are then handed to [df_of_rows].

    NOTE(review): because lines are parsed independently, a quoted field that
    contains a line break would presumably be split across rows; confirm
    whether this should match [of_string]. *)
let read ?(sep = ',') ?names ?na_values ?dtype_spec path =
  In_channel.with_open_text path (fun ic ->
      (* Accumulates rows in reverse and flips them once at end of file. *)
      let rec collect acc =
        match In_channel.input_line ic with
        | None -> List.rev acc
        | Some raw ->
            let line = Csv_io.strip_cr raw in
            if line = "" then collect acc
            else collect (Csv_io.parse_row sep line :: acc)
      in
      let rows = collect [] in
      df_of_rows ?names ?na_values ?dtype_spec rows)

(** [write ?sep ?na_repr path df] writes [df] to the file at [path] as CSV: a
    header row with the column names, then one row per data row. [sep]
    defaults to [',']; [na_repr] (default [""]) is passed on as the text for
    null cells. Rows are formatted one at a time through a reused buffer. *)
let write ?(sep = ',') ?(na_repr = "") path df =
  Out_channel.with_open_text path (fun oc ->
      let scratch = Buffer.create 256 in
      (* Formats one row into the scratch buffer and flushes it to [oc]. *)
      let emit cells =
        Buffer.clear scratch;
        Csv_io.write_row scratch sep cells;
        Out_channel.output_string oc (Buffer.contents scratch)
      in
      let cell_fns = col_string_fns na_repr df in
      let row_count = Talon.num_rows df in
      emit (Talon.column_names df);
      for row = 0 to row_count - 1 do
        emit (List.map (fun cell_at -> cell_at row) cell_fns)
      done)