Source file consent.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
open Printf
module Logger = Dolog.Log
module Log = Dolog.Log
module Cons = Consensus
module Fp = Fingerprint
module L = MyList
module Mol = Molecule
module Pol = Policy
module StringSet = BatSet.Make(BatString)
module TopN = Top_keeper
module Utls = MyUtils
module SL = struct
type t = string * float * int * bool
let get_score (_, s, _, _) = s
let get_label (_, _, _, l) = l
end
module BEDROC = Cpm.MakeROC.Make (SL)
module Vpt_point =
struct
type t = Fp.t
let dist = Score.fp_tanimoto_dist
end
module Vpt = Vpt.Vp_tree.Make(Vpt_point)
let output_all_scores = -1
let use_all_actives = -1
let no_index = -1
let no_label = false
let no_tversky = -1.0
type filename = string
let output_scores
(out_fn: filename)
(score_labels: Score_label.t list)
(top_N: int): unit =
if out_fn = "" then
Log.warn "output_scores: OFF"
else
(Log.info "writing scores in %s ..." out_fn;
Utls.with_out_file out_fn (fun out ->
if top_N = output_all_scores then
let high_scores_first = ROC.rank_order_by_score score_labels in
L.iter (fun (mol_name, score, _index, _label) ->
fprintf out "%s %.4f\n" mol_name score
) high_scores_first
else
let res =
L.fold_left (fun acc (mol_name, score, _index, _label) ->
TopN.add mol_name score acc
) (TopN.create top_N) score_labels in
let high_scores_first = TopN.high_scores_first res in
L.iter (fun (score, name) ->
fprintf out "%s %.4f\n" name score
) high_scores_first
))
let compute_cumulated_activity
(score_labels: Score_label.t list)
(all_actives: Mol.t list): float list =
let _, name_to_activity = Mol.potency_scale all_actives in
let high_scores_first = ROC.rank_order_by_score score_labels in
let acc = ref 0.0 in
L.map (fun (name, _score, _index, _label) ->
let activity =
try Hashtbl.find name_to_activity name
with Not_found -> 0.0 in
acc := !acc +. activity;
!acc
) high_scores_first
let compute_chemical_diversity
(score_labels: Score_label.t list)
(name2cluster: (string, int) Hashtbl.t): float list =
if Hashtbl.length name2cluster = 0 then []
else
let high_scores_first = ROC.rank_order_by_score score_labels in
let clusters_seen = Hashtbl.create 11 in
L.map (fun (name, _score, _index, is_active) ->
if is_active then
begin
let m_name = Utls.remove_string_prefix "active" name in
try
let cluster = Hashtbl.find name2cluster m_name in
Hashtbl.replace clusters_seen cluster ()
with Not_found -> ()
end;
float (Hashtbl.length clusters_seen)
) high_scores_first
let force_scaffolds_diversity
(name2cluster: (string, int) Hashtbl.t)
(score_labels: Score_label.t list): Score_label.t list =
let keep_count = 1 in
if Hashtbl.length name2cluster = 0 then
failwith "force_scaffolds_diversity: name2cluster is empty";
let sorted = ROC.rank_order_by_score score_labels in
let clusters_seen = Hashtbl.create 11 in
List.filter (fun (name', _score, _index, _label) ->
let name = Utls.remove_string_prefix "active" name' in
match BatHashtbl.Exceptionless.find name2cluster name with
| None -> (Log.warn "unknown cluster for %s" name'; false)
| Some c ->
try
let count = Hashtbl.find clusters_seen c in
if count >= keep_count then false
else (Hashtbl.replace clusters_seen c (count + 1); true)
with Not_found ->
(Hashtbl.add clusters_seen c 1; true)
) sorted
let output_float_curve (verbose: bool) (out_fn: filename) (curve: float list)
: unit =
if not verbose || out_fn = "" then
Log.warn "output_float_curve: OFF"
else
Utls.with_out_file out_fn (fun out ->
L.iteri (fun i x ->
fprintf out "%d %.3f\n" i x
) curve
)
let output_int_curve (verbose: bool) (out_fn: filename) (curve: int list)
: unit =
if not verbose || out_fn = "" then
Log.warn "output_int_curve: OFF"
else
Utls.with_out_file out_fn (fun out ->
L.iteri (fun i x ->
fprintf out "%d %d\n" i x
) curve
)
type cons_id = string
type name2cluster = (string, int) Hashtbl.t
type curve =
| AUC of cons_id * float
| PM of cons_id * float
| BEDROC of cons_id * float
| Accum_actives of cons_id * float list
| Accum_activity of cons_id * Score_label.t list * Mol.t list
| Chemical_diversity of cons_id * Score_label.t list * name2cluster
type merged_curve =
| AUC of cons_id * float list
| PM of cons_id * float list
| BEDROC of cons_id * float list
| Accum_actives of cons_id * float list
| Accum_activity of cons_id * float list
| Chemical_diversity of cons_id * float list
let get_cid: curve -> cons_id = function
| AUC (cid, _)
| PM (cid, _)
| BEDROC (cid, _)
| Accum_actives (cid, _)
| Accum_activity (cid, _, _)
| Chemical_diversity (cid, _, _) -> cid
let get_floats: curve -> float list = function
| AUC (_, auc) -> [auc]
| PM (_, pm) -> [pm]
| BEDROC (_, br) -> [br]
| Accum_actives (_, floats) -> floats
| Accum_activity (_, score_labels, mols) ->
compute_cumulated_activity score_labels mols
| Chemical_diversity (_, score_labels, map) ->
compute_chemical_diversity score_labels map
let is_accum_activity_curve: curve -> bool = function
| Accum_activity _ -> true
| _ -> false
let get_score_labels (curves: curve list) =
match List.filter is_accum_activity_curve curves with
| [Accum_activity (_cid, score_labels, _molecules)] -> Some score_labels
| _ -> None
let similar_curves (lhs: curve) (rhs: curve): bool =
match lhs, rhs with
| AUC _, AUC _
| PM _, PM _
| BEDROC _, BEDROC _
| Accum_actives _, Accum_actives _
| Accum_activity _, Accum_activity _
| Chemical_diversity _, Chemical_diversity _ -> (get_cid lhs) = (get_cid rhs)
| AUC _, _
| PM _, _
| BEDROC _, _
| Accum_actives _, _
| Accum_activity _, _
| Chemical_diversity _, _ -> false
let group_curves (curves: curve list): curve list list =
let rec loop acc = function
| [] -> acc
| x :: xs ->
let ok, rest = List.partition (similar_curves x) xs in
loop ((x :: ok) :: acc) rest
in
loop [] curves
let merge_curves (grouped_curves: curve list): merged_curve =
let float_lists = L.map get_floats grouped_curves in
match grouped_curves with
| [] -> assert(false)
| AUC (cid, _) :: _ ->
AUC (cid, L.concat float_lists)
| PM (cid, _) :: _ ->
PM (cid, L.concat float_lists)
| BEDROC (cid, _) :: _ ->
BEDROC (cid, L.concat float_lists)
| Accum_actives (cid, _) :: _ ->
Accum_actives (cid, L.nfmedian float_lists)
| Accum_activity (cid, _, _) :: _ ->
Accum_activity (cid, L.nfmedian float_lists)
| Chemical_diversity (cid, _, _) :: _ ->
Chemical_diversity (cid, L.nfmedian float_lists)
let output_curves
(dst_dir: string) (verbose: bool) (l: merged_curve list): unit =
L.iter (fun curve ->
let prfx, cid, floats = match curve with
| AUC (cid, auc) ->
let () = match auc with
| [x] -> Log.info "AUC: %.3f" x
| _ -> ()
in
("auc", cid, auc)
| PM (cid, pm) ->
let () = match pm with
| [x] -> Log.info "PM: %.3f" x
| _ -> ()
in
("pm", cid, pm)
| BEDROC (cid, br) ->
let () = match br with
| [x] -> Log.info "BEDROC: %.3f" x
| _ -> ()
in
("bedroc", cid, br)
| Accum_actives (cid, floats) -> ("cum", cid, floats)
| Accum_activity (cid, floats) -> ("cum_act", cid, floats)
| Chemical_diversity (cid, floats) -> ("chem_div", cid, floats)
in
let out_fn = sprintf "%s/%s.%s.txt" dst_dir prfx cid in
Log.debug "creating: %s" out_fn;
output_float_curve verbose out_fn floats
) l
let remove_queries_from_database cons_q candidates' =
let query_names = Cons.get_query_names cons_q in
L.filter (fun mol ->
let name = Mol.get_name mol in
if StringSet.mem name query_names then
(Log.warn "rm %s from DB" name; false)
else
true
) candidates'
let single_query
(candidates': Mol.t list)
(all_actives: Mol.t list)
(name2cluster: (string, int) Hashtbl.t)
(cons_q: Cons.t): curve list =
let candidates = remove_queries_from_database cons_q candidates' in
match cons_q with
| Cons.Single _ ->
let cstr = Cons.to_string cons_q in
let csize = Cons.size cons_q in
let cons_id = sprintf "%s.%03d" cstr csize in
let tani_or_tver = Score.get_fp_score !Flags.curr_score in
let score_fun =
(fun (q: Fp.t) (cand: Mol.t) ->
let cand_fp = Mol.get_fp cand in
let score = tani_or_tver q cand_fp in
let label = Mol.is_active cand in
Mol.(cand.name, score, no_index, label))
in
let queries = Cons.get_fprints cons_q in
let curves: curve list list =
List.map (fun q ->
let score_labels = List.map (score_fun q) candidates in
let auc, cum_curve = ROC.auc score_labels in
let bedroc = BEDROC.bedroc_auc score_labels in
let pm = ROC.power_metric !Flags.pm_percent score_labels in
let avg_cum_curve = L.map float cum_curve in
(AUC (cons_id, auc): curve) ::
PM (cons_id, pm) ::
BEDROC (cons_id, bedroc) ::
Accum_actives (cons_id, avg_cum_curve) ::
Accum_activity (cons_id, score_labels, all_actives) ::
[Chemical_diversity (cons_id, score_labels, name2cluster)]
) queries in
L.concat curves
| _ -> assert(false)
let oppo_scorer
(tani_or_tver: Fp.t -> Fp.t -> float)
(tap_fun: Mol.t -> unit)
(fprints: Fp.t list)
(cand: Mol.t): Score_label.t =
tap_fun cand;
let cand_name = Mol.get_name cand in
let cand_fp = Mol.get_fp cand in
let scores =
L.map (fun q ->
tani_or_tver q cand_fp
) fprints
in
let score = L.max scores in
let label = Mol.is_active cand in
(cand_name, score, no_index, label)
let index_queries (fprints: Fp.t list): Vpt.t =
assert(!Flags.curr_score = Flags.Tanimoto);
Vpt.(create Optimal fprints)
let fast_oppo_scorer
(tap_fun: Mol.t -> unit)
(vpt: Vpt.t)
(cand: Mol.t): Score_label.t =
tap_fun cand;
let cand_name = Mol.get_name cand in
let cand_fp = Mol.get_fp cand in
let nearest_tdist, _nearest_fp =
Vpt.nearest_neighbor cand_fp vpt in
let score = 1.0 -. nearest_tdist in
let label = Mol.is_active cand in
(cand_name, score, no_index, label)
let cons_scorer (score_fun: Fp.t -> float) (tap_fun: Mol.t -> unit) (cand: Mol.t)
: Score_label.t =
tap_fun cand;
let cand_name = Mol.get_name cand in
let cand_fp = Mol.get_fp cand in
let score = score_fun cand_fp in
let label = Mol.is_active cand in
(cand_name, score, no_index, label)
let compute_score_labels
(candidates': Mol.t list)
(name2cluster: (string, int) Hashtbl.t)
(cons_q: Cons.t): Score_label.t list =
let candidates = remove_queries_from_database cons_q candidates' in
let score_fun: (Mol.t -> Score_label.t) =
match cons_q with
| Cons.Single _ -> assert(false)
| Cons.Opportunist (_queries, fprints) ->
if !Flags.fast_oppo_score then
(Log.info "fast oppo score";
fast_oppo_scorer ignore (index_queries fprints))
else
let tani_or_tver = Score.get_fp_score !Flags.curr_score in
oppo_scorer tani_or_tver ignore fprints
| Cons.Knowledgeable_ecfp4 _
| Cons.Knowledgeable_maccs _
| Cons.Knowledgeable_mop2d _
| Cons.Optimist_ecfp4 _
| Cons.Optimist_maccs _
| Cons.Optimist_mop2d _
| Cons.Realist_ecfp4 _
| Cons.Realist_maccs _
| Cons.Realist_mop2d _ -> cons_scorer (Cons.cons_score cons_q) ignore
in
let score_labels' = List.map score_fun candidates in
if !Flags.force_scaffolds_diversity then
force_scaffolds_diversity name2cluster score_labels'
else
score_labels'
let rank_active_molecules score_labels =
L.fold_lefti (fun acc i sl ->
if Score_label.is_active sl then (i, sl) :: acc
else acc
) [] (ROC.rank_order_by_score score_labels)
let query
(candidates': Mol.t list)
(all_actives: Mol.t list)
(name2cluster: (string, int) Hashtbl.t)
(cons_q: Cons.t): curve list =
if Cons.is_single cons_q then
single_query candidates' all_actives name2cluster cons_q
else
let cstr = Cons.to_string cons_q in
let csize = Cons.size cons_q in
let cons_id = sprintf "%s.%03d" cstr csize in
let score_labels = compute_score_labels candidates' name2cluster cons_q in
let auc, cum_curve = ROC.auc score_labels in
let pm = ROC.power_metric 0.1 score_labels in
let avg_cum_curve = L.map float cum_curve in
AUC (cons_id, auc) ::
PM (cons_id, pm) ::
Accum_actives (cons_id, avg_cum_curve) ::
Accum_activity (cons_id, score_labels, all_actives) ::
[Chemical_diversity (cons_id, score_labels, name2cluster)]
let pcons_query
(cons_q: Cons.t)
(database_fn: filename)
(scores_out_fn: filename)
(nprocs: int)
(top_N: int): unit =
Log.info "reading candidates from %s ..." database_fn;
let output, tap_fun =
if BatString.ends_with database_fn ".bin" || nprocs > 1 then
(None, ignore)
else
let bin_out_fn = database_fn ^ ".bin" in
let output = Gzip.open_out ~level:1 bin_out_fn in
(Some output, Mol.to_chan output) in
let score_molecule =
match cons_q with
| Cons.Single _ -> failwith "pcons_query: single strat. not supported"
| Cons.Knowledgeable_ecfp4 _
| Cons.Knowledgeable_maccs _
| Cons.Knowledgeable_mop2d _
| Cons.Optimist_ecfp4 _
| Cons.Optimist_maccs _
| Cons.Optimist_mop2d _
| Cons.Realist_ecfp4 _
| Cons.Realist_maccs _
| Cons.Realist_mop2d _ -> cons_scorer (Cons.cons_score cons_q) tap_fun
| Cons.Opportunist (_queries, fprints) ->
if !Flags.fast_oppo_score then
(Log.info "fast oppo score";
fast_oppo_scorer tap_fun (index_queries fprints))
else
let tani_or_tver = Score.get_fp_score !Flags.curr_score in
oppo_scorer tani_or_tver tap_fun fprints
in
begin
if nprocs > 1 || not (BatString.ends_with database_fn ".bin") then
let mol_reader =
Mol.mol_reader_of_filename_fp !Flags.curr_fingerprint in
Log.warn "sorted scores (will not run in constant memory)";
let score_labels =
Utls.parmap_on_file nprocs database_fn score_molecule mol_reader in
output_scores scores_out_fn score_labels top_N
else
let input = Gzip.open_in database_fn in
Utls.with_out_file scores_out_fn (fun out ->
let top_scores = ref (TopN.create top_N) in
let count = ref 0 in
let total = ref 0 in
if top_N = output_all_scores then
Log.warn "scores are unsorted (to run in constant memory)";
try
while true do
let mol = Mol.from_chan input in
incr count;
if !count = 50_000 then
(total := !total + !count;
count := 0;
eprintf "scored %d molecules\r%!" !total;
);
let mol_name, score, _index, _label = score_molecule mol in
if top_N = output_all_scores then
fprintf out "%s %.4f\n" mol_name score
else
top_scores := TopN.add mol_name score !top_scores
done;
with End_of_file ->
begin
eprintf "\n";
let high_scores_first = TopN.high_scores_first !top_scores in
Gzip.close_in input;
L.iter (fun (score, name) ->
fprintf out "%s %.4f\n" name score
) high_scores_first
end
)
end;
Utls.may_apply Gzip.close_out output
let shuffle_then_cut size mols =
let shuffled = L.real_shuffle mols in
fst (L.split_at size shuffled)
let split_then_shuffle size mols' =
let mols = Mol.most_potent_first mols' in
let n = List.length mols in
let slice_size = int_of_float (floor (float n /. float size)) in
let all_slices = L.ntake slice_size mols in
let enough_slices = fst (L.split_at size all_slices) in
let res = L.map (shuffle_then_cut 1) enough_slices in
L.flatten res
let random_choices smart_IC50_sampling size max mols =
if size = use_all_actives then [mols]
else
let sampler =
if smart_IC50_sampling then
split_then_shuffle
else
shuffle_then_cut
in
MyUtils.n_times max (fun () ->
let res = sampler size mols in
assert(L.length res = size);
res)
let scan_policies size max mols =
let chosen = random_choices false size max mols in
let n = L.length chosen in
if n < max then
Log.warn "scan_policies: not enough queries: %d < %d" n max;
List.flatten (L.map (Cons.create Pol.Single) chosen ::
L.map (Cons.create Pol.Opportunist) chosen ::
L.map (Cons.create Pol.Optimist) chosen ::
L.map (Cons.create Pol.Realist) chosen :: [])
let create_cons_queries verbose size (strat: Pol.t) max mols =
let chosen = random_choices (strat = Pol.Knowledgeable) size max mols in
let res = L.map (Cons.create ~verbose:verbose strat) chosen in
let n = L.length res in
if n < max then
Log.warn "create_cons_queries: not enough queries: %d < %d" n max
;
res
let single_cons_query verbose size strat mols =
match create_cons_queries verbose size strat 1 mols with
| [one] -> one
| _ -> assert(false)
let main () =
let start = Unix.gettimeofday () in
Logger.set_log_level Logger.INFO;
Logger.set_output stderr;
Logger.color_on ();
let strat = ref "" in
let query_fn = ref "" in
let database_fn = ref "" in
let clusters_fn = ref "" in
let scores_out_fn = ref "/dev/stdout" in
let output_top_N = ref output_all_scores in
let cons_size = ref use_all_actives in
let downsize = ref 1.0 in
let nb_queries = ref 1 in
let tversky_alpha = ref no_tversky in
let verbose = ref false in
let scan_all_policies = ref false in
let randomize_DB = ref false in
let nprocs = ref 1 in
let strategies_str = "{sing|oppo|opti|real|know}" in
let usage_message =
sprintf
"usage:\n%s -s %s \
-q queries.{sdf|mol2|csv|ecfp4|maccs|pubc} \
-db candidates.{sdf|mol2|csv|ecfp4|maccs|pubc}\n"
Sys.argv.(0) strategies_str in
let argc = Array.length Sys.argv in
if argc = 1 then
let () = eprintf "%s" usage_message in
let _ = exit 1 in
()
else
Arg.parse
["-s", Arg.Set_string strat,
sprintf "<strat> consensus strategy %s (mandatory)" strategies_str;
"-tani", Arg.Unit (fun () -> Flags.curr_score := Flags.Tanimoto),
"use Tanimoto score (default)";
"-tver", Arg.Set_float tversky_alpha,
"<float> use Tversky score with given alpha weight instead \
of Tanimoto (optional)";
"-pm", Arg.Set_float Flags.pm_percent,
sprintf "<float> Power Metric threshold (optional; default=%.2f)"
!Flags.pm_percent;
"-q", Arg.Set_string query_fn,
"<filename> queries file (known actives; mandatory)";
"-c", Arg.Set_string clusters_fn,
"<filename> mapping of molecule name to Bemis-Murcko framework and \
corresponding cluster (optional) to create chemical diversity curves";
"-dsize", Arg.Set_float downsize,
"<float> downsize dataset by factor (x in ]0.0;1.0]; optional)";
"-fdiv", Arg.Set Flags.force_scaffolds_diversity,
"force scaffolds diversity (experimental, optional)";
"-db", Arg.Set_string database_fn,
"<filename> database to rank order (mandatory)";
"-linpot", Arg.Unit (fun () -> Flags.potency_scaling := Flags.Linear),
"use linear potency scaling (optional; experimental)";
"-o", Arg.Set_string scores_out_fn,
"<filename> where to write scores (can be combined with -top)";
"-n", Arg.Set_int cons_size,
"<int> consensus size; #known actives used to create query (optional; \
default=all molecules in query file)";
"-top", Arg.Set_int output_top_N,
"<int> how many top scoring molecules to write out \
(optional; default=all; must be combined with -o)";
"-np", Arg.Set_int nprocs,
"<int> number of cores to use (default=1; optional)";
"-max", Arg.Set_int nb_queries,
"<int> number of distinct queries to run for statistics \
(default=1; optional; developer)";
"-randDB", Arg.Set randomize_DB,
"randomize DB prior to screening (optional)";
"-scan", Arg.Set scan_all_policies,
"scan all policies (optional; developer)";
"-fast", Arg.Set Flags.fast_oppo_score,
"index queries before oppo strat tanimoto scoring (optional)";
"-v", Arg.Set verbose, "verbose/debug mode (optional)"]
(fun arg -> raise (Arg.Bad ("Bad argument: " ^ arg)))
usage_message;
if !verbose then Logger.set_log_level Logger.DEBUG;
if !tversky_alpha <> no_tversky then
Flags.curr_score := Flags.Tversky !tversky_alpha;
if !Flags.force_scaffolds_diversity && !clusters_fn = "" then
failwith "-fdiv requires -c";
let name2cluster =
if !clusters_fn <> "" then Mol.load_clusters !clusters_fn
else Hashtbl.create 0 in
let query_mols, candidate_mols =
if !downsize = 1.0 then
Pair.map Mol.from_file (!query_fn, !database_fn)
else
Mol.from_downsized_file !downsize !database_fn in
let candidate_mols =
(if !randomize_DB then L.real_shuffle else Utls.id) candidate_mols in
if L.length query_mols < !cons_size then
Log.warn "not enough molecules in %s" !query_fn
;
let nb_molecules = List.length candidate_mols in
let cons_queries =
if !scan_all_policies then
scan_policies !cons_size !nb_queries query_mols
else
let strategy = Pol.of_string !strat in
create_cons_queries !verbose !cons_size strategy !nb_queries query_mols
in
let nb_screens = L.sum (L.map Cons.length cons_queries) in
Log.info "nb_screens: %d" nb_screens;
let curves' =
Utls.list_parmap !nprocs
(query candidate_mols query_mols name2cluster) cons_queries
in
let curves = L.concat curves' in
Utls.may_apply
(fun scores -> output_scores !scores_out_fn scores !output_top_N)
(get_score_labels curves);
let grouped_curves = group_curves curves in
let merged_curves = L.map merge_curves grouped_curves in
let dst_dir = Filename.dirname !database_fn in
output_curves dst_dir !verbose merged_curves;
let stop = Unix.gettimeofday () in
let delta_t = stop -. start in
Log.info "processed %.3f molecule/s" ((float nb_molecules) /. delta_t)