1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
open Ast
let register env =
(*
--# Create a DataFrame
--#
--# Constructs a DataFrame from either a list of rows (Dicts) or a Dictionary of columns (Vectors/Lists).
--#
--# @name dataframe
--# @param data :: List[Dict]|Dict The data rows or columns.
--# @return :: DataFrame The created DataFrame.
--# @example
--# # Row-wise construction:
--# df = dataframe([
--# {"a": 1, "b": 2},
--# {"a": 3, "b": 4}
--# ])
--#
--# # Column-wise construction (supported for VDict):
--# df2 = dataframe([a: [1, 3], b: [2, 4]])
--#
--# # Scalar values are recycled to match other column lengths:
--# df3 = dataframe([x: [1, 2, 3], constant: 0])
--# @family dataframe
--# @seealso read_csv
--# @export
*)
let env = Env.add "dataframe"
  (make_builtin ~name:"dataframe" 1 (fun args _env ->
    (* Builds a DataFrame from the single argument:
       - a Dict is read column-wise (one entry per column);
       - a List or Vector is read row-wise (one Dict or named List per row).
       Anything else is reported as a type error. *)
    let na = VNA NAGeneric in
    let empty_frame () =
      VDataFrame { arrow_table = Arrow_table.empty; group_keys = [] }
    in
    let frame_of_columns columns nrows =
      let arrow_table = Arrow_bridge.table_from_value_columns columns nrows in
      VDataFrame { arrow_table; group_keys = [] }
    in
    (* Column-wise construction from a Dictionary: dataframe([x: [1,2], y: [3,4]]).
       May raise [Failure] on a column-length mismatch; the caller converts it
       into a value error. *)
    let frame_of_dict d =
      let raw_columns = List.map (fun (name, v) ->
        match v with
        | VVector vec -> (name, vec)
        | VList l -> (name, Array.of_list (List.map snd l))
        | scalar -> (name, [| scalar |])
      ) d in
      match raw_columns with
      | [] -> empty_frame ()
      | _ ->
        (* The longest column fixes the row count. *)
        let nrows =
          List.fold_left (fun acc (_, vec) -> max acc (Array.length vec)) 0 raw_columns
        in
        let columns = List.map (fun (name, vec) ->
          let len = Array.length vec in
          if len = 1 && nrows > 1 then
            (* Length-1 columns (scalars included) are recycled to [nrows]. *)
            (name, Array.make nrows vec.(0))
          else if len <> nrows then
            failwith (Printf.sprintf "column `%s` length %d does not match max length %d" name len nrows)
          else
            (name, vec)
        ) raw_columns in
        frame_of_columns columns nrows
    in
    (* Row-wise construction. [rows] is an array so that each cell lookup
       reaches its row in O(1); indexing a list with List.nth here made the
       construction quadratic in the number of rows. *)
    let frame_of_rows rows =
      let nrows = Array.length rows in
      (* [cell_of col_name row] extracts one cell; the column set comes from
         the first row only. *)
      let build headers cell_of =
        let columns = List.map (fun col_name ->
          (col_name, Array.map (cell_of col_name) rows)
        ) headers in
        frame_of_columns columns nrows
      in
      if nrows = 0 then empty_frame ()
      else
        (* Inspect first row to determine columns *)
        match rows.(0) with
        | VDict pairs ->
          build (List.map fst pairs) (fun col_name -> function
            | VDict row_pairs ->
              (match List.assoc_opt col_name row_pairs with
               | Some v -> v
               | None -> na) (* Missing key = NA *)
            | _ -> na) (* Row of a different shape than the first = NA *)
        | VList pairs ->
          (* Only named items of the first row become columns. *)
          build (List.filter_map fst pairs) (fun col_name -> function
            | VList row_pairs ->
              (match List.find_opt (fun (n, _) -> n = Some col_name) row_pairs with
               | Some (_, v) -> v
               | None -> na)
            | _ -> na)
        | first_row_val ->
          Error.type_error (Printf.sprintf "Function `dataframe` expects a list of Dicts (rows). First row is: %s" (Ast.Utils.value_to_string first_row_val))
    in
    match args with
    | [VDict d] ->
      (try frame_of_dict d
       with Failure msg -> Error.value_error ("dataframe: " ^ msg))
    | [VList l] -> frame_of_rows (Array.of_list (List.map snd l))
    | [VVector a] -> frame_of_rows a
    | _ ->
      Error.type_error "Function `dataframe` expects a single argument (List or Vector of rows, or a Dict of columns)."
  )) env in
(*
--# Extract column as vector
--#
--# Extracts a single column from a DataFrame as a Vector.
--#
--# @name pull
--# @param df :: DataFrame The input DataFrame.
--# @param col :: String The column name.
--# @return :: Vector The column data.
--# @example
--# pull(mtcars, "mpg")
--# @family dataframe
--# @seealso select
--# @export
*)
let env = Env.add "pull"
  (make_builtin ~name:"pull" 2 (fun args _env ->
    let generic_na = VNA NAGeneric in
    (* Turns a column of optional cells into a Vector: present cells go
       through [wrap], absent cells become [missing]. *)
    let lift ~missing wrap cells =
      VVector (Array.map (fun cell ->
        match cell with
        | Some x -> wrap x
        | None -> missing
      ) cells)
    in
    (* One case per column representation handled by this builtin. *)
    let vector_of_column = function
      | Arrow_table.FloatColumn cells ->
        lift ~missing:generic_na (fun f -> VFloat f) cells
      | Arrow_table.IntColumn cells ->
        lift ~missing:generic_na (fun i -> VInt i) cells
      | Arrow_table.StringColumn cells ->
        lift ~missing:generic_na (fun s -> VString s) cells
      | Arrow_table.BoolColumn cells ->
        lift ~missing:generic_na (fun b -> VBool b) cells
      | Arrow_table.DateColumn cells ->
        lift ~missing:(VNA NADate) (fun d -> VDate d) cells
      | Arrow_table.DatetimeColumn (cells, tz) ->
        (* Missing datetimes use the same NADate marker as plain dates. *)
        lift ~missing:(VNA NADate) (fun ts -> VDatetime (ts, tz)) cells
      | Arrow_table.NAColumn n ->
        VVector (Array.make n generic_na)
      | Arrow_table.DictionaryColumn (cells, levels, ordered) ->
        lift ~missing:generic_na (fun i -> VFactor (i, levels, ordered)) cells
      | Arrow_table.ListColumn cells ->
        (* Each present cell is a nested table, surfaced as an ungrouped DataFrame. *)
        lift ~missing:generic_na
          (fun t -> VDataFrame { arrow_table = t; group_keys = [] }) cells
    in
    match args with
    | [VDataFrame frame; col_arg] ->
      begin match Utils.extract_column_name col_arg with
      | None ->
        Error.type_error "pull: second argument must be a column name ($col or \"col\")."
      | Some col_name ->
        begin match Arrow_table.get_column frame.arrow_table col_name with
        | Some column -> vector_of_column column
        | None ->
          Error.make_error KeyError (Printf.sprintf "Column `%s` not found in DataFrame." col_name)
        end
      end
    | _ -> Error.type_error "pull expects (DataFrame, column_name)."
  )) env in
(*
--# Convert to NDArray
--#
--# Converts numeric columns of a DataFrame to a matrix (NDArray).
--#
--# @name to_array
--# @param df :: DataFrame The input DataFrame.
--# @param cols :: List[Symbol|String] (Optional) Columns to include. Defaults to all numeric.
--# @return :: NDArray A 2D array of the data.
--# @example
--# mat = to_array(mtcars)
--# mat = to_array(mtcars, [$mpg, $wt])
--# @family dataframe
--# @seealso dataframe
--# @export
*)
let env = Env.add "to_array"
  (make_builtin ~name:"to_array" ~variadic:true 1 (fun args _env ->
    (* Packs the named columns of [table] into a row-major nrows x ncols
       NDArray. Returns a type error value if any column cannot be converted
       by the Owl bridge. Shared by both call shapes below. *)
    let matrix_of_columns table col_names =
      let nrows = Arrow_table.num_rows table in
      let ncols = List.length col_names in
      let data = Array.make (nrows * ncols) 0.0 in
      let rec fill idx = function
        | [] -> Ok ()
        | name :: rest ->
          (match Arrow_owl_bridge.numeric_column_to_owl table name with
           | None ->
             Error (Error.type_error (Printf.sprintf "Column `%s` is not numeric or contains NAs." name))
           | Some view ->
             (* Column [idx] occupies every [ncols]-th slot starting at [idx]. *)
             for i = 0 to nrows - 1 do
               data.(i * ncols + idx) <- view.arr.(i)
             done;
             fill (idx + 1) rest)
      in
      (* NOTE(review): Invalid_argument is presumably an out-of-bounds read
         when a view holds fewer than [nrows] values — confirm against
         Arrow_owl_bridge. *)
      match fill 0 col_names with
      | Ok () -> VNDArray { shape = [|nrows; ncols|]; data }
      | Error e -> e
      | exception Invalid_argument _ -> Error.type_error "Invalid column list"
    in
    match args with
    | [VDataFrame df] ->
      (* Auto-select all numeric columns *)
      let is_numeric name =
        match Arrow_table.get_column df.arrow_table name with
        | Some (Arrow_table.FloatColumn _)
        | Some (Arrow_table.IntColumn _) -> true
        | _ -> false
      in
      (match List.filter is_numeric (Arrow_table.column_names df.arrow_table) with
       | [] -> Error.value_error "to_array: DataFrame has no numeric columns."
       | col_names -> matrix_of_columns df.arrow_table col_names)
    | [VDataFrame df; v_cols] ->
      (* Accept a List or Vector of column references, or a single one.
         [None] marks an unsupported argument, which keeps it distinct from
         an explicitly empty selection. *)
      let items = match v_cols with
        | VList l -> Some (List.map snd l)
        | VVector a -> Some (Array.to_list a)
        | VString _ | VSymbol _ -> Some [v_cols]
        | _ -> None
      in
      (match items with
       | None ->
         Error.type_error "to_array expects (DataFrame, [column_names] | column_name)."
       | Some items ->
         let maybe_col_names = List.map Utils.extract_column_name items in
         if List.exists Option.is_none maybe_col_names then
           Error.type_error "Column names must be strings or symbols."
         else
           matrix_of_columns df.arrow_table (List.filter_map Fun.id maybe_col_names))
    | _ -> Error.type_error "to_array expects (DataFrame, [column_names])."
  )) env in
env