1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
open Ast
open Arrow_table

(* Combines the results of resolving several selectors into one ordered
   list of column names. When at least one element is an error, the first
   error in list order is returned instead. *)
let nest_collect_names results =
  match List.find_opt Result.is_error results with
  | Some (Result.Error e) -> Result.Error e
  | _ ->
    Result.Ok
      (List.concat_map
         (function Result.Ok names -> names | Result.Error _ -> [])
         results)

(* Removes repeated names while keeping the first occurrence of each, so a
   column selected twice (for example by name and by an overlapping matcher)
   is only nested once. Quadratic, which is acceptable for column counts. *)
let nest_dedup_names names =
  List.rev
    (List.fold_left
       (fun seen n -> if List.mem n seen then seen else n :: seen)
       [] names)

(* Resolves one selector value to the column names it designates.
   - symbols and strings name a single column (a symbol whose name cannot
     be extracted contributes nothing);
   - vectors and lists are resolved element by element;
   - a builtin is treated as a matcher: it is called with the DataFrame as
     its only argument and must return a list of strings/symbols;
   - an error value is propagated; anything else is a TypeError. *)
let rec nest_resolve_selection df v =
  match v with
  | VSymbol _ ->
    (match Utils.extract_column_name v with
     | Some s -> Result.Ok [s]
     | None -> Result.Ok [])
  | VString s -> Result.Ok [s]
  | VVector arr ->
    Array.to_list arr
    |> List.map (nest_resolve_selection df)
    |> nest_collect_names
  | VList items ->
    List.map snd items
    |> List.map (nest_resolve_selection df)
    |> nest_collect_names
  | VBuiltin b ->
    (match b.b_func [(None, VDataFrame df)] (ref Env.empty) with
     | VList items ->
       let names = List.map snd items |> List.filter_map (function
         | VString s -> Some s
         | VSymbol _ as item_v -> Utils.extract_column_name item_v
         | _ -> None)
       in
       Result.Ok names
     | VError e -> Result.Error e
     | other -> Result.Error (Error.make_error_info TypeError ("Matcher returned " ^ Utils.value_to_string other)))
  | VError e -> Result.Error e
  | _ -> Result.Error (Error.make_error_info TypeError ("Invalid column selection: " ^ Utils.value_to_string v))

(* Schema entries for the key columns [keys], typed as in [table]; a column
   whose type cannot be looked up is given ArrowNA. *)
let nest_key_schema table keys =
  List.map (fun k ->
    (k, match Arrow_table.column_type table k with Some t -> t | None -> ArrowNA)
  ) keys

(* Implementation of the nest builtin.

   Arguments: the first argument must be a DataFrame. The columns to nest
   come from the data keyword when present, otherwise from the positional
   arguments; if there are no positional arguments and the DataFrame has
   group keys, every column except the group keys is nested. The optional
   name keyword (a string) names the new list-column and defaults to "data".

   Result:
   - an error value if a selector is invalid or names a column that is not
     in the DataFrame;
   - the input DataFrame unchanged when the selection is empty;
   - otherwise an ungrouped DataFrame made of the remaining (key) columns,
     one row per distinct key combination, plus one list-column holding the
     nested sub-tables. When every column is nested the result is a single
     row containing the whole projected table.

   Selected names are de-duplicated (first occurrence wins) before use. *)
let nest_impl (named_args : (string option * value) list) _env =
  match named_args with
  | (_, VDataFrame df) :: rest ->
    let all_names = Arrow_table.column_names df.arrow_table in
    let resolve = nest_resolve_selection df in

    (* An explicit data = ... keyword wins over positional selectors. *)
    let to_nest_res =
      match List.assoc_opt (Some "data") rest with
      | Some v -> resolve v
      | None ->
        let positional =
          List.filter (fun (k, _) -> k = None) rest |> List.map snd
        in
        if positional = [] && df.group_keys <> [] then
          (* Grouped input, no selection: nest everything EXCEPT group keys. *)
          Result.Ok (List.filter (fun n -> not (List.mem n df.group_keys)) all_names)
        else
          nest_collect_names (List.map resolve positional)
    in

    begin match to_nest_res with
    | Result.Error e -> VError e
    | Result.Ok selected ->
      let to_nest = nest_dedup_names selected in
      let missing = List.filter (fun n -> not (List.mem n all_names)) to_nest in
      if missing <> [] then
        Error.make_error KeyError (Printf.sprintf "Column(s) not found in DataFrame: %s." (String.concat ", " missing))
      else if to_nest = [] then VDataFrame df
      else
        (* Every column that is not nested becomes a grouping key. *)
        let group_cols = List.filter (fun n -> not (List.mem n to_nest)) all_names in
        let new_col_name = match List.assoc_opt (Some "name") rest with Some (VString s) -> s | _ -> "data" in

        if group_cols = [] then
          (* Nest everything: a single row holding the whole projected table. *)
          let sub_table = Arrow_compute.project df.arrow_table to_nest in
          let arrow_col = Arrow_table.ListColumn [|Some sub_table|] in
          let new_table = {
            Arrow_table.schema = [(new_col_name, ArrowList (ArrowStruct sub_table.schema))];
            columns = [(new_col_name, arrow_col)];
            nrows = 1;
            native_handle = None;
          } in
          VDataFrame { arrow_table = new_table; group_keys = [] }
        else
          let grouped = Arrow_compute.group_by_optimized df.arrow_table group_cols in
          let groups = Arrow_compute.get_ocaml_groups grouped in
          let n_groups = List.length groups in
          let key_schema = nest_key_schema df.arrow_table group_cols in

          if n_groups = 0 then begin
            (* Fast-path for empty input: no groups, produce correct empty schema. *)
            let sub_table = Arrow_compute.project df.arrow_table to_nest in
            let key_zero_cols = List.map (fun (k, t) ->
              let col = match t with
                | ArrowInt64 -> IntColumn [||]
                | ArrowFloat64 -> FloatColumn [||]
                | ArrowBoolean -> BoolColumn [||]
                | ArrowString -> StringColumn [||]
                | _ -> NAColumn 0
              in
              (k, col)
            ) key_schema in
            let nested_col = (new_col_name, Arrow_table.ListColumn [||]) in
            let final_schema = key_schema @ [(new_col_name, ArrowList (ArrowStruct sub_table.schema))] in
            VDataFrame {
              arrow_table = {
                Arrow_table.schema = final_schema;
                columns = key_zero_cols @ [nested_col];
                nrows = 0;
                native_handle = None;
              };
              group_keys = [];
            }
          end else begin
            let nested_results_arr = Array.of_list (Arrow_compute.nest grouped to_nest) in
            let groups_arr = Array.of_list groups in

            (* Each key column takes, per group, the value found at the
               first row index of that group (NA for an empty group or a
               column that cannot be fetched). *)
            let key_columns = List.map (fun k ->
              let values =
                match Arrow_table.get_column df.arrow_table k with
                | Some col ->
                  Array.init n_groups (fun i ->
                    let (_, indices) = groups_arr.(i) in
                    match indices with
                    | first :: _ -> Arrow_bridge.value_at col first
                    | [] -> VNA NAGeneric)
                | None -> Array.make n_groups (VNA NAGeneric)
              in
              (k, Arrow_bridge.values_to_column values)
            ) group_cols in

            let nested_data = Array.init n_groups (fun i ->
              let (_, sub_table) = nested_results_arr.(i) in
              Some sub_table
            ) in

            (* The struct type of the list-column is taken from the first
               sub-table; n_groups > 0 here so index 0 exists. *)
            let first_sub = match nested_data.(0) with Some t -> t | None -> Arrow_table.empty in
            let nested_col = (new_col_name, Arrow_table.ListColumn nested_data) in
            let final_schema = key_schema @ [(new_col_name, ArrowList (ArrowStruct first_sub.schema))] in

            VDataFrame {
              arrow_table = {
                Arrow_table.schema = final_schema;
                columns = key_columns @ [nested_col];
                nrows = n_groups;
                native_handle = None;
              };
              group_keys = [];
            }
          end
    end
  | _ :: _ -> Error.type_error "Function `nest` expects a DataFrame as first argument."
  | [] -> Error.make_error ArityError "Function `nest` requires a DataFrame."

(*
--# Nest columns into sub-dataframes
--#
--# Packs selected columns into nested DataFrame values grouped by the remaining columns.
--# Supports flexible column selection using symbols, strings, or selection helpers
--# (like starts_with, ends_with).
--#
--# If the DataFrame is already grouped (via group_by()) and no columns are
--# specified, nest() will automatically nest all columns except the grouping keys.
--#
--# @name nest
--# @param df :: DataFrame The DataFrame to nest.
--# @param data :: Selection (Optional) Columns or matchers to nest.
--# @param name :: String (Optional) Name for the new nested column, defaults to "data".
--# @param ... :: Selection (Optional) Positional columns to nest if 'data' is not provided.
--# @return :: DataFrame A new DataFrame with grouped keys and a nested list-column.
--# @family colcraft
--# @export
*)
let register env =
  (* Wrap nest_impl as a named builtin, then bind it under "nest" in env.
     The result is whatever Env.add returns for that binding. *)
  let nest_builtin = make_builtin_named ~name:"nest" ~variadic:true 1 nest_impl in
  Env.add "nest" nest_builtin env