package oplsr

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file PLS.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
(* Copyright (C) 2020, Francois Berenger

   Yamanishi laboratory,
   Department of Bioscience and Bioinformatics,
   Faculty of Computer Science and Systems Engineering,
   Kyushu Institute of Technology,
   680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)

open Printf

module Log = Dolog.Log

(* CSV file must have modeled variable as first column, all other columns are
   feature values. CSV file must be in space separated dense format.
   The first line is the CSV header (column numbers are fine). *)
let optimize debug train_data_csv_fn nb_folds =
  let validation_str =
    if nb_folds > 1 then
      sprintf "validation = 'CV', segments = %d" nb_folds
    else
      "validation = 'none'" in
  (* create R script and store it in a temp file *)
  let r_script_fn = Filename.temp_file "oplsr_optim_" ".r" in
  Utls.with_out_file r_script_fn (fun out ->
      fprintf out
        "library('pls', quietly = TRUE, verbose = FALSE)\n\
         data <- as.matrix(read.table('%s', colClasses = 'numeric', \
                           header = TRUE))\n\
         ncols <- dim(data)[2]\n\
         xs <- data[, 2:ncols]\n\
         ys <- data[, 1:1]\n\
         train_data <- data.frame(y = ys, x = I(xs))\n\
         model <- plsr(y ~ x, method = 'simpls', data = train_data,\n\
                       %s)\n\
         r2 <- R2(model)\n\
         r2s <- unlist(r2[1])\n\
         ncomp_best <- which.max(r2s)\n\
         r2_max = r2s[ncomp_best]\n\
         printf <- function(...) cat(sprintf(...))\n\
         printf('ncomp: %%d R2: %%f\n', ncomp_best, r2_max)\n\
         quit()\n"
        train_data_csv_fn
        validation_str
    );
  let r_log_fn = Filename.temp_file "oplsr_optim_" ".log" in
  (* execute it *)
  let cmd =
    sprintf "(R --vanilla --slave < %s 2>&1) > %s" r_script_fn r_log_fn in
  if debug then Log.debug "%s" cmd;
  if Sys.command cmd <> 0 then
    failwith ("PLS.optimize: R failure: " ^ cmd)
  else
    let last_log_line =
      Utls.get_command_output debug (sprintf "tail -1 %s" r_log_fn) in
    let ncomp, r2 =
      try Scanf.sscanf last_log_line "ncomp: %d R2: %f" (fun x y -> (x, y))
      with exn -> (Log.error "cannot parse: %s" last_log_line;
                   raise exn) in
    if not debug then
      List.iter Sys.remove [r_script_fn; r_log_fn];
    (ncomp, r2)

let train debug train_data_csv_fn ncomp_best =
  (* create R script and store it in a temp file *)
  let r_script_fn = Filename.temp_file "oplsr_train_" ".r" in
  let r_model_fn = Filename.temp_file "oplsr_train_model_" ".bin" in
  Utls.with_out_file r_script_fn (fun out ->
      fprintf out
        "library('pls', quietly = TRUE, verbose = FALSE)\n\
         data <- as.matrix(read.table('%s', colClasses = 'numeric',\n\
                           header = TRUE))\n\
         ncols <- dim(data)[2]\n\
         xs <- data[, 2:ncols]\n\
         ys <- data[, 1:1]\n\
         train_data <- data.frame(y = ys, x = I(xs))\n\
         model <- plsr(y ~ x, ncomp = %d, method = 'simpls', \
                       data = train_data, validation = 'none')\n\
         save(model, file='%s')\n\
         quit()\n"
        train_data_csv_fn
        ncomp_best
        r_model_fn
    );
  let r_log_fn = Filename.temp_file "oplsr_train_" ".log" in
  (* execute it *)
  let cmd =
    sprintf "(R --vanilla --slave < %s 2>&1) > %s" r_script_fn r_log_fn in
  if debug then Log.debug "%s" cmd;
  if Sys.command cmd <> 0 then
    failwith ("PLS.train: R failure: " ^ cmd);
  if not debug then
    List.iter Sys.remove [r_script_fn; r_log_fn];
  r_model_fn

let predict_to_file debug ncomp_best trained_model_fn test_data_csv_fn
    out_preds_fn =
  (* create R script and store it in a temp file *)
  let r_script_fn = Filename.temp_file "oplsr_predict_" ".r" in
  Utls.with_out_file r_script_fn (fun out ->
      fprintf out
        "library('pls', quietly = TRUE, verbose = FALSE)\n\
         load('%s')\n\
         data <- as.matrix(read.table('%s',\n\
                           colClasses = 'numeric', header = TRUE))\n\
         ncols <- dim(data)[2]\n\
         xs <- data[, 2:ncols]\n\
         ys <- data[, 1:1]\n\
         test_data <- data.frame(y = ys, x = I(xs))\n\
         values <- predict(model, ncomp = %d, newdata = test_data)\n\
         write.table(values, file = '%s', sep = '\n',\n\
                     row.names = F, col.names = F)\n\
         quit()\n"
        trained_model_fn
        test_data_csv_fn
        ncomp_best
        out_preds_fn
    );
  let r_log_fn = Filename.temp_file "oplsr_train_" ".log" in
  (* execute it *)
  let cmd =
    sprintf "(R --vanilla --slave < %s 2>&1) > %s" r_script_fn r_log_fn in
  if debug then Log.debug "%s" cmd;
  if Sys.command cmd <> 0 then
    failwith ("PLS.predict: R failure: " ^ cmd);
  if not debug then
    List.iter Sys.remove [r_script_fn; r_log_fn]

let predict debug ncomp_best trained_model_fn test_data_csv_fn =
  let out_preds_fn = Filename.temp_file "oplsr_preds_" ".txt" in
  predict_to_file debug ncomp_best trained_model_fn test_data_csv_fn
    out_preds_fn;
  let preds = Utls.float_list_of_file out_preds_fn in
  (if not debug then Sys.remove out_preds_fn);
  preds