1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
open Ast
open Arrow_table
(*
--# Combine multiple columns into one character column
--#
--# unite() is a convenience function that pastes together multiple columns
--# into a single character column.
--#
--# @name unite
--# @param df :: DataFrame The DataFrame.
--# @param col :: String The name of the new column to create.
--# @param ... :: Symbol The columns to combine (use $col syntax).
--# @param sep :: String (Optional) Separator to use between values.
--# Defaults to "_".
--# @param remove :: Bool (Optional) If true, remove the input columns from the result.
--# Defaults to true.
--# @param na_rm :: Bool (Optional) If true, missing values will be removed prior to uniting.
--# Defaults to false.
--# @return :: DataFrame The united DataFrame.
--# @example
--# unite(df, "full_name", $first_name, $last_name, sep = " ")
--# @family colcraft
--# @export
*)
let register env =
Env.add "unite"
(make_builtin_named ~name:"unite" ~variadic:true 1 (fun named_args _env ->
let df_arg = match named_args with
| (_, VDataFrame df) :: _ -> Some df
| _ -> None
in
let get_named k = List.find_map (fun (nk, v) -> if nk = Some k then Some v else None) named_args in
let positional = List.filter_map (fun (k, v) -> if k = None then Some v else None) named_args in
let (col_from_named, new_col_name) = match get_named "col" with
| Some (VString s) -> (true, s)
| _ ->
(match positional with
| _ :: VString s :: _ -> (false, s)
| _ -> (false, ""))
in
let cols_variants =
match positional with
| _ :: _ when col_from_named ->
(* col provided as named arg: positional = [df, first_src_col, ...] *)
(match positional with _ :: tail -> tail | _ -> [])
| _ :: _ :: tail ->
(* col provided positionally: positional = [df, new_col_name, first_src_col, ...] *)
tail
| _ -> []
in
let cols_to_unite = List.filter_map Utils.extract_column_name cols_variants in
let sep = match get_named "sep" with
| Some (VString s) -> s
| _ -> "_"
in
let remove = match get_named "remove" with
| Some (VBool b) -> b
| _ -> true
in
let na_rm = match get_named "na_rm" with
| Some (VBool b) -> b
| _ -> false
in
match df_arg with
| None -> Error.type_error "Function `unite` expects a DataFrame as first argument."
| Some df ->
if new_col_name = "" || cols_to_unite = [] then
Error.make_error ValueError "Function `unite` requires `col` and at least one source column."
else
let orig_nrows = Arrow_table.num_rows df.arrow_table in
let all_names = Arrow_table.column_names df.arrow_table in
(* Check existence of columns *)
let missing = List.filter (fun c -> not (List.mem c all_names)) cols_to_unite in
if missing <> [] then
Error.make_error KeyError (Printf.sprintf "Function `unite`: column(s) not found: %s" (String.concat ", " missing))
else
let get_val_str col_name i =
match Arrow_table.get_column df.arrow_table col_name with
| Some (IntColumn a) -> (match a.(i) with Some v -> Some (string_of_int v) | None -> None)
| Some (FloatColumn a) -> (match a.(i) with Some v -> Some (string_of_float v) | None -> None)
| Some (StringColumn a) -> (match a.(i) with Some v -> Some v | None -> None)
| Some (BoolColumn a) -> (match a.(i) with Some v -> Some (string_of_bool v) | None -> None)
| Some (DateColumn a) ->
(match a.(i) with
| Some d ->
let tm = Unix.gmtime (float_of_int d *. 86400.) in
Some (Printf.sprintf "%04d-%02d-%02d" (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday)
| None -> None)
| Some (DatetimeColumn (a, _)) ->
(match a.(i) with
| Some ts ->
let tm = Unix.gmtime (Int64.to_float ts) in
Some (Printf.sprintf "%04d-%02d-%02d %02d:%02d:%02d"
(tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday
tm.tm_hour tm.tm_min tm.tm_sec)
| None -> None)
| _ -> None
in
let new_col_vals = Array.init orig_nrows (fun i ->
let parts = List.filter_map (fun col ->
match get_val_str col i with
| Some s -> Some s
| None -> if na_rm then None else Some "NA"
) cols_to_unite in
Some (String.concat sep parts)
) in
let new_col_data = (new_col_name, StringColumn new_col_vals) in
(* Find insertion point of FIRST column to unite *)
let final_columns = ref [] in
let inserted = ref false in
List.iter (fun name ->
if List.mem name cols_to_unite then
begin
if not !inserted then
begin
final_columns := new_col_data :: !final_columns;
inserted := true
end;
if not remove then
final_columns := (name, match Arrow_table.get_column df.arrow_table name with Some d -> d | None -> NAColumn orig_nrows) :: !final_columns
end
else
final_columns := (name, match Arrow_table.get_column df.arrow_table name with Some d -> d | None -> NAColumn orig_nrows) :: !final_columns
) all_names;
let final_columns = List.rev !final_columns in
let new_schema = List.map (fun (n, c) -> (n, Arrow_table.column_type_of c)) final_columns in
VDataFrame { arrow_table = { schema = new_schema; columns = final_columns; nrows = orig_nrows; native_handle = None } |> Arrow_table.materialize; group_keys = df.group_keys }
))
env