Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
RFR.ml1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251(* Copyright (C) 2021, Francois Berenger Tsuda laboratory, Tokyo university, 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan. *) (* Random Forests Regressor *) module A = BatArray module IntSet = BatSet.Int module L = BatList module Log = Dolog.Log module RNG = Random.State module Ht = BatHashtbl type features = int Feature_vector.t type dep_var = float type sample = features (* X *) * dep_var (* y *) type tree = Leaf of dep_var | Node of tree (* lhs *) * int * int (* (feature, threshold) *) * tree (* rhs *) type metric = MSE (* Mean Squared Error *) | MAE (* Mean Absolute Error *) | MAD (* Median Absolute Deviation *) let square x = x *. x let average_dep_vars samples = let n = float (A.length samples) in let sum = A.fold (fun acc (_features, value) -> acc +. value ) 0.0 samples in sum /. n let mean_squared_error samples = let n = float (A.length samples) in let avg = average_dep_vars samples in let sum_squared_errors = A.fold (fun acc (_sample, y) -> acc +. square (y -. avg) ) 0.0 samples in sum_squared_errors /. n let mean_absolute_error samples = let n = float (A.length samples) in let avg = average_dep_vars samples in let sum_abs_errors = A.fold (fun acc (_sample, y) -> acc +. abs_float (y -. avg) ) 0.0 samples in sum_abs_errors /. n let mean_absolute_deviation samples = let n = float (A.length samples) in let values = A.map snd samples in let med = Utls.array_medianf values in let abs_devs = A.map (fun x -> abs_float (x -. med)) values in (Utls.array_medianf abs_devs) /. n let metric_of = function | MSE -> mean_squared_error | MAE -> mean_absolute_error | MAD -> mean_absolute_deviation (* maybe this is called the "Classification And Regression Tree" (CART) algorithm in the litterature *) let tree_grow (rng: Random.State.t) (* seeded RNG *) (metric: sample array -> float) (* hyper params *) (max_features: int) (max_samples: int) (min_node_size: int) (training_set: sample array) (* dataset *) : tree * int array = let bootstrap, oob = (* First randomization introduced by random forests: bootstrap sampling *) Utls.array_bootstrap_sample_OOB rng max_samples training_set in let rec loop samples = (* min_node_size is a regularization parameter; it also allows to * abort tree building (might be interesting for very large datasets) *) if A.length samples <= min_node_size then Leaf (average_dep_vars samples) else (* collect all non constant features *) let split_candidates = let all_candidates = RFC.collect_non_constant_features samples in (* randomly keep only N of them: Second randomization introduced by random forests (random feature sampling). *) L.take max_features (L.shuffle ~state:rng all_candidates) in match split_candidates with | [] -> (* cannot discriminate samples further *) Leaf (average_dep_vars samples) | _ -> (* select the (feature, threshold) pair minimizing cost *) let candidate_splits = L.fold (fun acc1 (feature, values) -> IntSet.fold (fun value acc2 -> (feature, value, RFC.partition_samples feature value samples) :: acc2 ) values acc1 ) [] split_candidates in let split_costs = L.rev_map (fun (feature, value, (left, right)) -> let cost = RFC.cost_function metric left right in (cost, feature, value, (left, right)) ) candidate_splits in (* choose one split minimizing cost *) let cost, feature, threshold, (left, right) = RFC.choose_min_cost rng split_costs in if A.length left = 0 then Leaf (average_dep_vars right) else if A.length right = 0 then Leaf (average_dep_vars left) else if cost = 0.0 then (* if the cost is minimal: pure nodes -> stop digging *) Node (Leaf (average_dep_vars left), feature, threshold, Leaf (average_dep_vars right)) else Node (loop left, feature, threshold, loop right) in (loop (* 0 *) bootstrap, oob) (* array of all samples whose index is listed *) let extract indexes (samples: sample array): sample array = A.map (A.unsafe_get samples) indexes let rand_max_bound = 1073741823 (* 2^30 - 1 *) let forest_grow ncores rng metric ntrees max_features max_samples min_node_size train = (* treat the RNG as a seed stream, for reproducibility despite potentially out of order parallel run *) let seeds = A.init ntrees (fun _ -> RNG.int rng rand_max_bound) in RFC.array_parmap ncores (fun seed -> let rng' = RNG.make [|seed|] in tree_grow rng' metric max_features max_samples min_node_size train ) seeds (Leaf 0.0, [||]) type forest = (tree * int array) array (* before saving a model, we might want to just get rid of the OOB * sample indexes *) let drop_OOB (f: forest): forest = A.map (fun (t, _oob) -> (t, [||])) f let train (ncores: int) (rng: Random.State.t) (metric: metric) (ntrees: int) (max_features: RFC.int_or_float) (card_features: int) (max_samples: RFC.int_or_float) (min_node_size: int) (train: sample array): forest = Utls.enforce (1 <= ntrees) "RFC.train: ntrees < 1"; let metric_f = metric_of metric in let max_feats = RFC.ratio_to_int 1 card_features "max_features" max_features in let n = A.length train in let max_samps = RFC.ratio_to_int 1 n "max_samples" max_samples in let min_node = let () = Utls.enforce (1 <= min_node_size && min_node_size < n) "RFC.train: min_node_size not in [1,n[" in min_node_size in forest_grow ncores rng metric_f ntrees max_feats max_samps min_node train (* predict for one sample using one tree *) let tree_predict tree (features, _dep_var) = let rec loop = function | Leaf dep_var -> dep_var | Node (lhs, feature, threshold, rhs) -> let value = Feature_vector.get feature features in if value <= threshold then loop lhs else loop rhs in loop tree (* predict an average value and its standard deviation * over all trees in the forest *) let predict_one ncores forest x = let pred_vals = RFC.array_parmap ncores (fun (tree, _oob) -> tree_predict tree x) forest 0.0 in let avg = A.favg pred_vals in let std = Utls.std_dev avg pred_vals in (avg, std) (* will scale better than predict_one *) let predict_many ncores forest xs = RFC.array_parmap ncores (predict_one 1 forest) xs (0.0, 0.0) let predict_OOB forest train = let n = A.length train in let oob_idx2preds = Ht.create n in A.iter (fun (tree, oob) -> let train_OOB = extract oob train in let truths = A.map snd train_OOB in let preds = A.map (tree_predict tree) train_OOB in Utls.array_iter3 oob truths preds (fun oob_idx truth pred -> try let prev_truth, prev_preds = Ht.find oob_idx2preds oob_idx in assert(prev_truth = truth); Ht.replace oob_idx2preds oob_idx (prev_truth, pred :: prev_preds) with Not_found -> Ht.add oob_idx2preds oob_idx (truth, [pred]) ) ) forest; let truth_preds = A.make (Ht.length oob_idx2preds) (0.0, 0.0) in Utls.ht_iteri (fun i _oob_idx (truth, preds) -> A.unsafe_set truth_preds i (truth, L.favg preds) ) oob_idx2preds; truth_preds type filename = string let save fn forest = Utls.save fn (drop_OOB forest) let restore fn = Utls.restore fn (* coefficient of determination R2 *) let r2 truth_preds = let n = float (A.length truth_preds) in let sum_squared_diffs = A.fold_left (fun acc (x, y) -> acc +. Utls.square (x -. y) ) 0.0 truth_preds in let sum_squared_truth_diffs = let avg_truth = let sum = A.fold_left (fun acc (truth, _pred) -> acc +. truth ) 0.0 truth_preds in sum /. n in A.fold_left (fun acc (truth, _pred) -> acc +. square (truth -. avg_truth) ) 0.0 truth_preds in 1.0 -. (sum_squared_diffs /. sum_squared_truth_diffs)