1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
open Ast

(*
--# Count distinct values
--#
--# Returns the number of distinct values in a vector or list.
--# Inside `summarize()`, this acts as an aggregation expression.
--#
--# @name n_distinct
--# @param x :: Vector | List The input values.
--# @return :: Int The number of distinct values.
--# @example
--#   summarize(df, $unique_species = n_distinct($species))
--# @family colcraft
--# @seealso summarize, distinct
--# @export
*)
let count_distinct_in_array values =
  let seen = Value_hash.ValueHash.create (max 1 (min 64 (Array.length values))) in
  Array.iter (fun value -> Value_hash.ValueHash.replace seen value ()) values;
  Value_hash.ValueHash.length seen

let register env =
  Env.add "n_distinct"
    (make_builtin ~name:"n_distinct" 1 (fun args _env ->
      match args with
      | [VVector values] -> VInt (count_distinct_in_array values)
      | [VList values] ->
          let arr = Array.of_list (List.map snd values) in
          VInt (count_distinct_in_array arr)
      | [VNA _] -> Error.type_error "Function `n_distinct` expects a vector or list, got NA."
      | [_] -> Error.type_error "Function `n_distinct` expects a vector or list."
      | _ -> Error.arity_error_named "n_distinct" 1 (List.length args)
    ))
    env