1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
(* src/packages/stats/residuals.ml *)
open Ast
(** residuals(data, model, type="response") — computes residuals for the given data. *)
(*
--# Model Residuals
--#
--# Computes residuals for a model given a dataset.
--#
--# @name residuals
--# @param data :: DataFrame Input data.
--# @param model :: Model The model object.
--# @param type :: String (Optional) Type of residuals = "response" (default) or "pearson".
--# @return :: DataFrame Columns = `actual`, `fitted`, `resid`.
--#
--# @details
--# Residuals represent the difference between observed data and model predictions.
--#
--# * **Response Residuals**: $r_i = y_i - \hat{y}_i$. These are raw residuals on the scale
--# of the dependent variable. In GLMs, this corresponds to the difference on the
--# original scale (e.g., probability for binomial).
--#
--# * **Pearson Residuals**: $r_i^P = \frac{y_i - \hat{y}_i}{\sqrt{V(\hat{y}_i)}}$. These are
--# standardized by the variance function of the model. They are useful for detecting
--# outliers and checking constant variance (homoscedasticity) in GLMs where variance
--# typically changes with the mean.
--#
--# @example
--# res = residuals(mtcars, model)
--# res_p = residuals(mtcars, model, type = "pearson")
--# @family stats
--# @export
*)
let register env =
Env.add "residuals"
(make_builtin_named ~name:"residuals" ~variadic:true 2 (fun args _env ->
let parsed = match args with
| (None, VDataFrame df) :: (None, VDict model) :: rest ->
let t = match List.assoc_opt "type" (List.filter_map (fun (n, v) -> match n with Some s -> Some (s, v) | None -> None) rest) with
| Some (VString s) -> String.lowercase_ascii s
| _ -> "response"
in
Ok (df, model, t)
| _ -> Error (Error.type_error "Function `residuals` expects (DataFrame, Model).")
in
match parsed with
| Error e -> e
| Ok (df, model, r_type) ->
(* 1. Get fitted values using predict *)
let predict_fn = match Env.find_opt "predict" _env with
| Some (VBuiltin b) -> b.b_func
| _ -> fun _ _ -> Error.type_error "Internal error: `predict` not found."
in
let fitted_v = predict_fn [(None, VDataFrame df); (None, VDict model)] (ref _env) in
match fitted_v with
| VVector fitted ->
(* 2. Get actual values from response variable *)
let response_name = match List.assoc_opt "formula" model with
| Some (VFormula f) -> (match f.response with [name] -> Some name | _ -> None)
| _ -> None
in
let actuals = match response_name with
| Some name -> Arrow_table.get_float_column df.arrow_table name
| None -> Array.make (Array.length fitted) None
in
let n = Array.length fitted in
let resids = Array.init n (fun i ->
match actuals.(i), fitted.(i) with
| Some y, VFloat y_hat ->
let raw = y -. y_hat in
if r_type = "pearson" then
let family = match List.assoc_opt "_model_data" model with
| Some (VDict d) -> (match List.assoc_opt "family" d with Some (VString f) -> String.lowercase_ascii f | _ -> "gaussian")
| _ -> "gaussian"
in
let var_mu = match family with
| "binomial" -> y_hat *. (1.0 -. y_hat)
| "poisson" -> y_hat
| "gamma" -> y_hat *. y_hat
| _ -> 1.0
in
if var_mu > 0.0 then Some (raw /. sqrt var_mu) else Some raw
else Some raw
| _ -> None
) in
let columns = [
("actual", Arrow_table.FloatColumn actuals);
("fitted", Arrow_table.FloatColumn (Array.map (function VFloat f -> Some f | _ -> None) fitted));
("resid", Arrow_table.FloatColumn resids);
] in
let table = Arrow_table.create columns n in
VDataFrame { arrow_table = table; group_keys = [] }
| VError e -> VError e
| _ -> Error.type_error "Function `predict` did not return a Vector."
)) env