1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
open Ast
open Arrow_table
(*
--# Remove rows with missing values
--#
--# drop_na() removes every row of a DataFrame in which at least one of the
--# checked columns holds a missing (NA) value.
--#
--# @name drop_na
--# @param df :: DataFrame The DataFrame.
--# @param ... :: Symbol (Optional) Columns to check for missing values (use $col syntax).
--# If none specified, checks all columns.
--# @return :: DataFrame The DataFrame with NA rows removed.
--# @example
--# drop_na(df)
--# drop_na(df, $age, $score)
--# @family colcraft
--# @export
*)
(* Adds the `drop_na` builtin to [env] via [Env.add] and returns the result.

   The builtin takes a DataFrame followed by zero or more column arguments:
   - with no column arguments, every column of the table is checked;
   - otherwise each argument must be accepted by [Utils.extract_column_name],
     and each resulting name must be present in the table.

   A row is kept only when none of the checked columns is [None] at that
   row. The result is a new DataFrame carrying the original [group_keys].

   Error values produced:
   - type error  : a column argument is rejected by
                   [Utils.extract_column_name], or the first argument is not
                   a DataFrame;
   - [KeyError]  : one or more requested columns are not in the table;
   - [ArityError]: no arguments were supplied. *)
let register env =
  Env.add "drop_na"
    (make_builtin ~name:"drop_na" ~variadic:true 1 (fun args _env ->
      match args with
      | VDataFrame df :: col_args ->
        let table = df.arrow_table in
        let all_names = Arrow_table.column_names table in
        (* [None] signals that at least one argument was not a column
           reference; [Some names] is the list of columns to inspect. *)
        let requested =
          match col_args with
          | [] -> Some all_names
          | _ :: _ ->
            let parsed = List.filter_map Utils.extract_column_name col_args in
            if List.compare_lengths parsed col_args = 0 then Some parsed
            else None
        in
        (match requested with
         | None ->
           Error.type_error "Function `drop_na` expects all column arguments to use $col syntax."
         | Some cols_to_check ->
           (* Validate all requested columns exist *)
           let missing =
             List.filter (fun c -> not (List.mem c all_names)) cols_to_check
           in
           (match missing with
            | _ :: _ ->
              Error.make_error KeyError (Printf.sprintf "Function `drop_na`: column(s) not found: %s" (String.concat ", " missing))
            | [] ->
              (* Resolve each column exactly once, outside the row loop, into
                 a predicate telling whether row [i] is NA in that column. *)
              let na_checks =
                List.map
                  (fun col ->
                    match Arrow_table.get_column table col with
                    | Some (IntColumn a) -> (fun i -> Option.is_none a.(i))
                    | Some (FloatColumn a) -> (fun i -> Option.is_none a.(i))
                    | Some (StringColumn a) -> (fun i -> Option.is_none a.(i))
                    | Some (BoolColumn a) -> (fun i -> Option.is_none a.(i))
                    (* Any other result (lookup failure or a column variant
                       not listed above) counts as NA on every row, so all
                       rows get dropped.
                       NOTE(review): confirm this is the intended behaviour
                       for other column kinds. *)
                    | _ -> (fun _ -> true))
                  cols_to_check
              in
              let nrows = Arrow_table.num_rows table in
              (* Walk the rows backwards so that consing yields the kept
                 indices already in ascending order. *)
              let kept = ref [] in
              for i = nrows - 1 downto 0 do
                if not (List.exists (fun is_na -> is_na i) na_checks) then
                  kept := i :: !kept
              done;
              let indices = Array.of_list !kept in
              (* Presumably [sort_by_indices] gathers the rows at [indices] in
                 the given order, i.e. acts as a row selection here. Verify
                 against Arrow_compute. *)
              let new_table = Arrow_compute.sort_by_indices table indices in
              VDataFrame { arrow_table = new_table; group_keys = df.group_keys }))
      | _ :: _ -> Error.type_error "Function `drop_na` expects a DataFrame as first argument."
      | _ -> Error.make_error ArityError "Function `drop_na` requires a DataFrame."
    ))
    env