1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
(* src/arrow/arrow_column.ml *)
(* Column access and views for Arrow-backed DataFrames.                  *)
(* Provides zero-copy column views that keep the backing table alive.    *)
(* Supports both native Arrow tables (via FFI) and pure OCaml storage.   *)

open Bigarray

(** A column view — references the backing table to prevent GC collection *)
type column_view = {
  backing : Arrow_table.t;
  column_name : string;
  data : Arrow_table.column_data;
}

(** A zero-copy numeric view backed by a Bigarray over Arrow memory *)
type numeric_view =
  | FloatView of (float, float64_elt, c_layout) Array1.t
  | IntView of (int64, int64_elt, c_layout) Array1.t

(** Get a column view from an Arrow table.
    For native-backed tables, this extracts column data via FFI. *)
let get_column (table : Arrow_table.t) (name : string) : column_view option =
  match Arrow_table.get_column table name with
  | Some data -> Some { backing = table; column_name = name; data }
  | None -> None

(** Get the Arrow type of a column view *)
let column_type (view : column_view) : Arrow_table.arrow_type =
  Arrow_table.column_type_of view.data

(** Get the length of a column view *)
let column_length (view : column_view) : int =
  Arrow_table.column_length view.data

(** Get the raw column data from a view *)
let column_data (view : column_view) : Arrow_table.column_data =
  view.data

(** Create a zero-copy Bigarray view over an Arrow column's buffer.
    Only works for numeric columns (Float64, Int64) backed by a native
    Arrow table. Returns None for non-numeric types or pure OCaml tables.
    The returned Bigarray shares memory with the Arrow buffer — no copy. *)
let zero_copy_view (col : column_view) : numeric_view option =
  match col.backing.native_handle with
  | Some handle when not handle.Arrow_table.freed ->
    (match Arrow_table.column_type_of col.data with
     | Arrow_table.ArrowFloat64 ->
       (match Arrow_ffi.arrow_table_get_column_data handle.ptr col.column_name with
        | Some array_ptr ->
          (match Arrow_ffi.arrow_float64_array_to_bigarray array_ptr with
           | Some ba ->
             (* Keep the backing table alive as long as this view is reachable *)
             ignore (Sys.opaque_identity col.backing);
             Some (FloatView ba)
           | None -> None)
        | None -> None)
     | Arrow_table.ArrowInt64 ->
       (match Arrow_ffi.arrow_table_get_column_data handle.ptr col.column_name with
        | Some array_ptr ->
          (match Arrow_ffi.arrow_int64_array_to_bigarray array_ptr with
           | Some ba ->
             ignore (Sys.opaque_identity col.backing);
             Some (IntView ba)
           | None -> None)
        | None -> None)
     | _ -> None)
  | _ -> None

(** Access a single element from a column view without copying.
    Returns the T value at the given index, or (VNA NAGeneric) if out of bounds.
    For numeric columns backed by a zero-copy view, reads directly from
    the Arrow buffer. Otherwise, falls back to column_data indexing. *)
let get_value_at (view : column_view) (idx : int) : Ast.value =
  let len = column_length view in
  if idx < 0 || idx >= len then Ast.(VNA NAGeneric)
  else
    match view.data with
    | Arrow_table.IntColumn a ->
      (match a.(idx) with Some i -> Ast.VInt i | None -> Ast.VNA Ast.NAInt)
    | Arrow_table.FloatColumn a ->
      (match a.(idx) with Some f -> Ast.VFloat f | None -> Ast.VNA Ast.NAFloat)
    | Arrow_table.BoolColumn a ->
      (match a.(idx) with Some b -> Ast.VBool b | None -> Ast.VNA Ast.NABool)
    | Arrow_table.StringColumn a ->
      (match a.(idx) with Some s -> Ast.VString s | None -> Ast.VNA Ast.NAString)
    | Arrow_table.DateColumn a ->
      (match a.(idx) with Some d -> Ast.VDate d | None -> Ast.VNA Ast.NADate)
    | Arrow_table.DatetimeColumn (a, tz) ->
      (match a.(idx) with Some ts -> Ast.VDatetime (ts, tz) | None -> Ast.VNA Ast.NADate)
    | Arrow_table.NAColumn _ -> Ast.VNA Ast.NAGeneric
    | Arrow_table.DictionaryColumn (a, levels, ordered) ->
      (match a.(idx) with Some i -> Ast.VFactor (i, levels, ordered) | None -> Ast.VNA Ast.NAGeneric)
    | Arrow_table.ListColumn a ->
      (match a.(idx) with Some t -> Ast.VDataFrame { arrow_table = t; group_keys = [] } | None -> Ast.VNA Ast.NAGeneric)

(** Get a slice (sub-view) of a column view.
    Returns a new column_view covering [start, start+length) of the original.
    The backing table reference is shared to prevent GC collection. *)
let get_slice (view : column_view) (start : int) (len : int) : column_view =
  let total = column_length view in
  let actual_start = max 0 (min start total) in
  let actual_len = max 0 (min len (total - actual_start)) in
  let slice_data = match view.data with
    | Arrow_table.IntColumn a ->
      Arrow_table.IntColumn (Array.sub a actual_start actual_len)
    | Arrow_table.FloatColumn a ->
      Arrow_table.FloatColumn (Array.sub a actual_start actual_len)
    | Arrow_table.BoolColumn a ->
      Arrow_table.BoolColumn (Array.sub a actual_start actual_len)
    | Arrow_table.StringColumn a ->
      Arrow_table.StringColumn (Array.sub a actual_start actual_len)
    | Arrow_table.DateColumn a ->
      Arrow_table.DateColumn (Array.sub a actual_start actual_len)
    | Arrow_table.DatetimeColumn (a, tz) ->
      Arrow_table.DatetimeColumn (Array.sub a actual_start actual_len, tz)
    | Arrow_table.NAColumn _ ->
      Arrow_table.NAColumn actual_len
    | Arrow_table.DictionaryColumn (a, levels, ordered) ->
      Arrow_table.DictionaryColumn (Array.sub a actual_start actual_len, levels, ordered)
    | Arrow_table.ListColumn a ->
      Arrow_table.ListColumn (Array.sub a actual_start actual_len)
  in
  { backing = view.backing; column_name = view.column_name; data = slice_data }

(** Convert a column view to a T value list (fallback for non-vectorizable ops).
    Always works regardless of backing storage. *)
let column_view_to_list (view : column_view) : Ast.value list =
  Array.to_list (Arrow_bridge.column_to_values view.data)