1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
open Ast
open Arrow_table

(* Implementation of the `separate_rows` built-in.

   Arguments (as a list of optionally-named values):
   - first argument: the DataFrame to reshape;
   - exactly one further positional argument: the column to split, resolved
     through [Utils.extract_column_name];
   - optional named argument [sep]: a [Str] regular expression used as the
     delimiter. When it is absent (or not a string) the default
     "[^A-Za-z0-9]+" is used.

   Every cell of the target String column is split on [sep]; each resulting
   token becomes its own row and the values of all other columns are repeated
   for those rows.

   Each input row yields at least one output row:
   - a missing cell stays missing (one row, still null);
   - a cell that splits into no tokens (empty string, or delimiters only)
     yields one row holding the empty string.

   Returns the reshaped DataFrame, or an error value when the arguments are
   invalid, the column is not a String column, or [sep] is not a valid
   regular expression. *)
let separate_rows_impl (named_args : (string option * value) list) _env =
  match named_args with
  | (_, VDataFrame df) :: rest ->
      let col_to_sep = match List.filter (fun (k, _) -> k = None) rest with
        | [(_, v)] -> Utils.extract_column_name v
        | _ -> None
      in
      let sep = match List.assoc_opt (Some "sep") rest with Some (VString s) -> s | _ -> "[^A-Za-z0-9]+" in

      (match col_to_sep with
       | None -> Error.make_error ArityError "separate_rows expects a column to separate ($col)."
       | Some col_name ->
           (match Arrow_table.get_column df.arrow_table col_name with
            | Some (StringColumn data) ->
                (* [Str.regexp] raises [Failure] on a malformed pattern; since
                   [sep] comes from the caller, report it as an error value
                   instead of letting the exception escape the built-in. *)
                (match Str.regexp sep with
                 | exception Failure msg ->
                     Error.type_error
                       (Printf.sprintf
                          "Function `separate_rows` received an invalid `sep` pattern `%s`: %s."
                          sep msg)
                 | re ->
                     (* Tokens of one cell, as the cells of its expanded rows. *)
                     let split_cell = function
                       | None -> [None]
                       | Some s ->
                           (match Str.split re s with
                            (* [Str.split] yields [] for "" and for strings made
                               only of delimiters; keep the row anyway. *)
                            | [] -> [Some ""]
                            | parts -> List.map (fun part -> Some part) parts)
                     in
                     let tokens = Array.map split_cell data in

                     let final_nrows =
                       Array.fold_left (fun acc cell -> acc + List.length cell) 0 tokens
                     in

                     (* [expansion_indices.(k)] is the source row of output row k;
                        [sep_values.(k)] is the token stored in that output row. *)
                     let expansion_indices = Array.make final_nrows 0 in
                     let sep_values : string option array = Array.make final_nrows None in
                     let curr = ref 0 in
                     Array.iteri (fun i cell ->
                       List.iter (fun tok ->
                         expansion_indices.(!curr) <- i;
                         sep_values.(!curr) <- tok;
                         incr curr
                       ) cell
                     ) tokens;

                     let new_columns = List.map (fun (name, _) ->
                       if name = col_name then
                         (* Built directly as a String column so that nulls and
                            the column type survive the reshape. *)
                         (name, Arrow_table.StringColumn sep_values)
                       else
                         match Arrow_table.get_column df.arrow_table name with
                         | Some col -> (name, Arrow_table.take_col col expansion_indices final_nrows)
                         | None -> (name, Arrow_table.NAColumn final_nrows)
                     ) df.arrow_table.schema in

                     (* The native handle is reset because it no longer matches
                        the rebuilt columns. *)
                     VDataFrame { df with arrow_table = { df.arrow_table with columns = new_columns; nrows = final_nrows; native_handle = None } })
            | _ -> Error.type_error (Printf.sprintf "Column `%s` is not a String column." col_name)))
  | _ :: _ -> Error.type_error "Function `separate_rows` expects a DataFrame as first argument."
  | [] -> Error.make_error ArityError "Function `separate_rows` requires a DataFrame."

(*
--# Split delimited values into rows
--#
--# Expands delimited string values into multiple rows while repeating the remaining columns.
--#
--# @name separate_rows
--# @family colcraft
--# @export
*)
let register env =
  (* Wrap the implementation as a named-argument built-in, then bind it under
     the name `separate_rows` in [env] and return the resulting environment.
     NOTE(review): the literal 1 is presumably the minimum positional arity
     expected by [make_builtin_named] — confirm against its definition. *)
  let builtin =
    make_builtin_named ~name:"separate_rows" ~variadic:true 1 separate_rows_impl
  in
  Env.add "separate_rows" builtin env