1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
open Ast
open Arrow_table
(*
--# Separate a character column into multiple columns
--#
--# Given a regular expression, separate() splits a single character column
--# into multiple new columns.
--#
--# @name separate
--# @param df :: DataFrame The DataFrame.
--# @param col :: Symbol The column to separate (use $col syntax).
--# @param into :: List[String] Names of the new columns to create.
--# @param sep :: String (Optional) Regular expression to separate at.
--# Defaults to "[^[:alnum:]]+".
--# @param remove :: Bool (Optional) If true, remove the input column from the result.
--# Defaults to true.
--# @return :: DataFrame The separated DataFrame.
--# @example
--# separate(df, $date, into = ["year", "month", "day"], sep = "-")
--# @family colcraft
--# @export
*)
(* Registers the `separate` builtin in [env] and returns the extended
   environment.

   The builtin takes a DataFrame as first argument, then `col` and `into`
   (each either named or positional, in that order), plus the optional named
   arguments `sep` (a Str regular expression) and `remove` (bool, default
   true). It yields a new DataFrame in which the new columns are inserted at
   the position of the source column, or an error value when an argument is
   missing, the column does not exist, or `sep` is not a valid regular
   expression. *)
let register env =
  (* ASCII equivalent of the documented POSIX default "[^[:alnum:]]+".
     The Str library has no POSIX character classes: it would read that
     pattern as the set [^[:alnum:] followed by a literal "]+", which never
     matches ordinary separators such as "-" or "/". *)
  let default_sep = "[^a-zA-Z0-9]+" in
  Env.add "separate"
    (make_builtin_named ~name:"separate" ~variadic:true 1 (fun named_args _env ->
      let df_arg =
        match named_args with
        | (_, VDataFrame df) :: _ -> Some df
        | _ -> None
      in
      let get_named k =
        List.find_map (fun (nk, v) -> if nk = Some k then Some v else None) named_args
      in
      let positional =
        List.filter_map (fun (k, v) -> if k = None then Some v else None) named_args
      in
      (* `col` is the second positional argument unless passed by name. *)
      let col_val =
        match get_named "col" with
        | Some v -> Some v
        | None -> (match positional with _ :: v :: _ -> Some v | _ -> None)
      in
      let col_name =
        match col_val with
        | Some v -> (match Utils.extract_column_name v with Some s -> s | None -> "")
        | None -> ""
      in
      (* `into` is the third positional argument unless passed by name. *)
      let into_val =
        match get_named "into" with
        | Some v -> Some v
        | None -> (match positional with _ :: _ :: v :: _ -> Some v | _ -> None)
      in
      let into_cols =
        match into_val with
        | Some (VList items) ->
          List.filter_map (fun (_, v) -> match v with VString s -> Some s | _ -> None) items
        | _ -> []
      in
      let sep =
        match get_named "sep" with
        | Some (VString s) -> s
        | _ -> default_sep
      in
      let remove =
        match get_named "remove" with
        | Some (VBool b) -> b
        | _ -> true
      in
      match df_arg with
      | None -> Error.type_error "Function `separate` expects a DataFrame as first argument."
      | Some df ->
        if col_name = "" || into_cols = [] then
          Error.make_error ValueError "Function `separate` requires `col` and `into` arguments."
        else if not (Arrow_table.has_column df.arrow_table col_name) then
          Error.make_error KeyError (Printf.sprintf "Function `separate`: column `%s` not found." col_name)
        else
          let orig_nrows = Arrow_table.num_rows df.arrow_table in
          let col_data = Arrow_table.get_column df.arrow_table col_name in
          (* Textual form of a cell; [None] stands for a missing value. *)
          let val_to_str = function
            | VString s -> Some s
            | VInt n -> Some (string_of_int n)
            | VFloat f -> Some (string_of_float f)
            | VBool b -> Some (string_of_bool b)
            | VDate d ->
              let micros_per_day = 86_400_000_000L in
              Some (Chrono.format_datetime_value (Int64.mul (Int64.of_int d) micros_per_day) None "%Y-%m-%d")
            | VDatetime (dt, tz) -> Some (Chrono.format_datetime_value dt tz "%Y-%m-%dT%H:%M:%S")
            | VNA _ -> None
            | other -> Some (Utils.value_to_raw_string other)
          in
          begin match col_data with
          | None -> Error.make_error KeyError (Printf.sprintf "Function `separate`: column `%s` not found." col_name)
          | Some data ->
            (* Str.regexp signals a malformed pattern with Failure; report it
               as an error value like the other argument problems. *)
            begin match Str.regexp sep with
            | exception Failure msg ->
              Error.make_error ValueError
                (Printf.sprintf "Function `separate`: invalid `sep` regular expression: %s." msg)
            | sep_re ->
              let values = Arrow_bridge.column_to_values data in
              let n_into = List.length into_cols in
              (* One array of exactly [n_into] cells per row. Missing inputs
                 give all-missing cells; surplus pieces are dropped and short
                 rows are padded with empty strings. Cells are options so a
                 piece whose text is literally "NA" stays a real string. *)
              let split_row cell =
                match val_to_str cell with
                | None -> Array.make n_into None
                | Some s ->
                  let parts = Array.of_list (Str.split sep_re s) in
                  let n_parts = Array.length parts in
                  Array.init n_into (fun j -> Some (if j < n_parts then parts.(j) else ""))
              in
              let split_rows = Array.init orig_nrows (fun i -> split_row values.(i)) in
              let new_cols_data =
                List.mapi
                  (fun j name -> (name, StringColumn (Array.map (fun row -> row.(j)) split_rows)))
                  into_cols
              in
              (* Rebuild the column list in the original order, splicing the
                 new columns in where the source column sits. *)
              let final_columns =
                Arrow_table.column_names df.arrow_table
                |> List.concat_map (fun name ->
                     if name = col_name then
                       if remove then new_cols_data else (name, data) :: new_cols_data
                     else
                       let existing =
                         match Arrow_table.get_column df.arrow_table name with
                         | Some d -> d
                         | None -> NAColumn orig_nrows
                       in
                       [ (name, existing) ])
              in
              let new_schema =
                List.map (fun (n, c) -> (n, Arrow_table.column_type_of c)) final_columns
              in
              let table =
                { schema = new_schema; columns = final_columns; nrows = orig_nrows; native_handle = None }
                |> Arrow_table.materialize
              in
              VDataFrame { arrow_table = table; group_keys = df.group_keys }
            end
          end
    ))
    env