Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file linear_algebra_wrapper.ml
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187openCoreletdebug=falseletrandom_indices_in_place~maxarr=letlen=Array.lengtharrinfori=0tolen-1doarr.(i)<-Random.intmaxdone(* [quantile_of_array] sorts the array and returns the values at the quantile indices. If
we ever expose this function, we should check that low_quantile and high_quantile are
in the interval [0,1]. *)letquantile_of_arrayarr?(failures=0)~len~low_quantile~high_quantile=Array.sortarr~len~compare:Float.compare;letindexq=Float.iround_towards_zero_exn(Float.of_intlen*.q+.0.5*.Float.of_intfailures)in(* [extended_get i] retrieves entry [i] from [arr], pretending that
[arr.(i) = infinity] when [i > len - 1], and that [arr.(i) = neg_infinity]
when [i < failures].
It assumes [i >= 0] and that entries with [i < failures] are already [neg_infinity].
*)letextended_geti=ifi>=lenthenFloat.infinityelsearr.(i)in(* For the low_quantile calculation, if the index is too large (too many failures),
return the last entry of sorted array to preserve monotonicity *)letleft_endpoint=extended_get(Int.min(indexlow_quantile)(len-1))inletright_endpoint=extended_get(Int.max(indexhigh_quantile)failures)inAnalysis_result.Ci95.create~left_endpoint~right_endpointletdebug_printpred_matrixresp_vector=fori=0to(Array.lengthpred_matrix-1)doprintf"(%4d) "i;forj=0to(Array.lengthpred_matrix.(0)-1)doprintf"%3g "pred_matrix.(i).(j);done;printf"| %3g\n%!"resp_vector.(i);done(* val ols : Measurement.t -> responder -> predictors -> float array *)letmake_lr_inputs?indicesmeas~resp~preds=letmoduleMs=Measurement_sampleinletpreds_acc=Array.mappreds~f:Ms.accessorinletresp_acc=Ms.accessorrespinletmake_pred_valuesms=Array.mappreds_acc~f:(funpred_acc->pred_accms)inletmss=Measurement.samplesmeasinletpred_matrix,resp_vector=matchindiceswith|Someindices->Array.mapindices~f:(funi->make_pred_valuesmss.(i)),Array.mapindices~f:(funi->resp_accmss.(i))|None->Array.init(Measurement.sample_countmeas)~f:(funi->make_pred_valuesmss.(i)),Array.init(Measurement.sample_countmeas)~f:(funi->resp_accmss.(i))inifdebugthendebug_printpred_matrixresp_vector;pred_matrix,resp_vectorletolsmeas~resp~preds=ifdebugthenbeginArray.iteripreds~f:(funipred->printf"(%d) %s "i(Variable.to_stringpred));printf"\n%!";end;letmatrix,vector=make_lr_inputsmeas~resp~predsinmatchLinear_algebra.ols~in_place:truematrixvectorwith|Ok_asx->x|Error_->Or_error.error_string"\
Regression failed. (In Bench, this is usually because the predictors were
linearly dependent, commonly as a result of having a predictor that is always
zero, or having two predictors that are multiples of each other.)"(* val r_square : Measurement.t -> responder -> predictors -> coefficients:float array -> float *)letr_squaremeas~resp~preds~coeffs=letpredictors_matrix,responder_vector=make_lr_inputsmeas~resp~predsinletsum_responder=Array.foldresponder_vector~init:0.~f:(+.)inletmean=sum_responder/.Float.of_int(Array.lengthresponder_vector)inlettot_ss=ref0.inletres_ss=ref0.inletpredictedi=letx=ref0.inforj=0toArray.lengthcoeffs-1dox:=!x+.predictors_matrix.(i).(j)*.coeffs.(j)done;!xinfori=0toArray.lengthresponder_vector-1dotot_ss:=!tot_ss+.(responder_vector.(i)-.mean)**2.;res_ss:=!res_ss+.(responder_vector.(i)-.predictedi)**2.;done;1.-.!res_ss/.!tot_ss(* A note about the constant [threshold = 10] (for the number of nonzero values each
predictor must have) below:
We are interested in producing 95% confidence intervals. As a result, we want enough
nonzero entries so that at least 95% of bootstrap replications succeed. The
probability that a particular row is omitted in a particular bootstrap replication is
about 1/e = 0.36788. If there are n nonzero entries in a column, the probability that
they're all omitted is 0.36788^n; we want that to be less than 0.05. n = 3 is
sufficiently large for that.
Of course, there are multiple columns to worry about. Supposing conservatively that the
user wants to use up to 20 predictors, and noting that the probability that we get
failure in some column is bounded above by the sum of the probabilities for the
individual columns, we want 0.36788^n < 0.05/20. n = 6 is sufficiently large for that.
(In practice, the predictors will tend to be correlated, so the upper bound obtained by
summing is conservative.)
Something else to worry about is that the fundamental assumption made when using
bootstrapping--that the empirical distribution is a good approximation of the
underlying distribution--starts to break down if we have very few nonzero values. So,
for a little "breathing room", n = 10 should be sufficient.
This should yield far fewer than 5% of bootstrap trials failing, so the following is a
relatively minor point. In the presence of failures, our 95% confidence interval still
encompasses 95% of _all_ trials, including failures. We position the confidence
interval so that it is centered within the interval of all trials that succeeded.
E.g., if there were 1000 trials, and no failures, we would ordinarily take the values
at indices 25 and 975 as endpoints for the 95% confidence interval; if there are 20
failures, we will (with all the failures sorted to the front of the array) instead take
the values at indices 35 and 985.
*)letbootstrap_threshold=10letcan_bootstrapmeas~resp~preds=letmatrix,_vector=make_lr_inputsmeas~resp~predsinletnon_zero=Array.create~len:(Array.lengthpreds)0inletnon_zero_cols=ref0inArray.itermatrix~f:(funrow->fori=0toArray.lengthnon_zero-1doifrow.(i)<>0.0thenbeginnon_zero.(i)<-non_zero.(i)+1;ifnon_zero.(i)=bootstrap_thresholdthenincrnon_zero_colsenddone);if!non_zero_cols=Array.lengthnon_zerothenNoneelseSome(Error.of_string(sprintf"Columns %s have less that %d non-zero values."(Array.foldinon_zero~init:""~f:(funistrcol_count->ifcol_count<bootstrap_thresholdthensprintf"%s %s(non-zero %d)"str(Variable.to_stringpreds.(i))col_countelsestr))bootstrap_threshold))letbootstrap~trialsmeas~resp~preds=letnum_preds=Array.lengthpredsinmatchcan_bootstrapmeas~resp~predswith|Someerr->Errorerr|None->letbootstrap_fails=ref0inletindices=Array.create~len:(Measurement.sample_countmeas)0inletbootstrap_coeffs=Array.initnum_preds~f:(fun_->Array.create~len:trials0.0)in(* Each bootstrap replication samples with replacement from the rows we have. *)fori=0totrials-1dorandom_indices_in_placeindices~max:(Measurement.sample_countmeas);letmatrix,vector=make_lr_inputs~indicesmeas~resp~predsinmatchLinear_algebra.ols~in_place:truematrixvectorwith(* If the run succeeded, save the coefficients. *)|Okbt_ols_result->forp=0tonum_preds-1dobootstrap_coeffs.(p).(i)<-bt_ols_result.(p);done(* If the run failed, assume neg_infinity *)|_->incrbootstrap_fails;forp=0tonum_preds-1dobootstrap_coeffs.(p).(i)<-Float.neg_infinity;donedone;Ok(Array.initnum_preds~f:(funi->iftrials=0thenAnalysis_result.Ci95.bad_ci95elsequantile_of_array(bootstrap_coeffs.(i))~failures:!bootstrap_fails~len:trials~low_quantile:0.025~high_quantile:0.975))