package bistro-bio

  1. Overview
  2. Docs
Bistro workflows for computational biology

Install

dune-project
 Dependency

Authors

Maintainers

Sources

bistro-0.6.0.tbz
sha256=146177faaaa9117a8e2bf0fd60cb658662c0aa992f35beb246e6fd0766050e66
sha512=553fe0c20f236316449b077a47e6e12626d193ba1916e9da233e5526dd39090e8677277e1c79baace3bdc940cb009f25431730a8efc00ae4ed9cc42a0add9609

doc/src/bistro-bio/fastq_sample.ml.html

Source file fastq_sample.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
open Core_kernel
open Bistro
open Biotk
open Formats

(** A FASTQ sample: either plain or gzip-compressed files, each in a
    single-end or paired-end layout as described by [SE_or_PE.t]. *)
type t =
  | Fq of fastq file SE_or_PE.t
  (** Uncompressed FASTQ file(s). *)
  | Fq_gz of fastq gz file SE_or_PE.t
  (** Gzip-compressed FASTQ file(s). *)

(** [is_single_end sample] is [true] when [sample] has a single-end
    layout and [false] when it is paired-end, regardless of whether the
    files are compressed. *)
let is_single_end sample =
  (* The layout test does not depend on the file format, so it is
     written once and shared by both constructors. *)
  let single_layout = function
    | SE_or_PE.Single_end _ -> true
    | SE_or_PE.Paired_end _ -> false
  in
  match sample with
  | Fq layout -> single_layout layout
  | Fq_gz layout -> single_layout layout

(** [plain_se fq] wraps the uncompressed file [fq] as a single-end sample. *)
let plain_se fq =
  let layout = SE_or_PE.Single_end fq in
  Fq layout
(** [plain_pe fq1 fq2] wraps the uncompressed files [fq1] and [fq2] as a
    paired-end sample, keeping them in that order. *)
let plain_pe fq1 fq2 =
  let layout = SE_or_PE.Paired_end (fq1, fq2) in
  Fq layout
(** [compressed_se fq_gz] wraps the gzip-compressed file [fq_gz] as a
    single-end sample. *)
let compressed_se fq_gz =
  let layout = SE_or_PE.Single_end fq_gz in
  Fq_gz layout
(** [compressed_pe fq1_gz fq2_gz] wraps the gzip-compressed files
    [fq1_gz] and [fq2_gz] as a paired-end sample, keeping them in that
    order. *)
let compressed_pe fq1_gz fq2_gz =
  let layout = SE_or_PE.Paired_end (fq1_gz, fq2_gz) in
  Fq_gz layout

(** [dep sample] maps every file of [sample] through a dependency
    constructor while preserving the single-end / paired-end layout:
    [Shell_dsl.dep] for plain files and [Bistro_unix.Cmd.gzdep] for
    gzip-compressed ones. *)
let dep sample =
  match sample with
  | Fq plain_files ->
    SE_or_PE.map plain_files ~f:Shell_dsl.dep
  | Fq_gz gz_files ->
    SE_or_PE.map gz_files ~f:Bistro_unix.Cmd.gzdep

(** [explode fq_samples] sorts the files of [fq_samples] into six lists,
    returned as
    [(single_end, first_mates, second_mates),
     (single_end_gz, first_mates_gz, second_mates_gz)].
    The first triple holds plain files and the second gzip-compressed
    ones. Within each list, files appear in the same order as the
    samples they come from. *)
let explode fq_samples =
  (* Folding from the right and consing keeps the input order, so no
     final reversal is needed. *)
  let add sample (((se, r1, r2) as plain), ((se_gz, r1_gz, r2_gz) as gz)) =
    match sample with
    | Fq (Single_end fq) ->
      (fq :: se, r1, r2), gz
    | Fq (Paired_end (fq1, fq2)) ->
      (se, fq1 :: r1, fq2 :: r2), gz
    | Fq_gz (Single_end fq) ->
      plain, (fq :: se_gz, r1_gz, r2_gz)
    | Fq_gz (Paired_end (fq1, fq2)) ->
      plain, (se_gz, fq1 :: r1_gz, fq2 :: r2_gz)
  in
  List.fold_right fq_samples ~f:add ~init:(([], [], []), ([], [], []))

(** Where the reads of a sample come from. *)
type source =
  | Fastq_url of string SE_or_PE.t
  (** Location(s) of uncompressed FASTQ file(s); each string is resolved
      by [unsafe_file_of_url]. *)
  | Fastq_gz_url of string SE_or_PE.t
  (** Location(s) of gzip-compressed FASTQ file(s); each string is
      resolved by [unsafe_file_of_url]. *)
  | SRA_dataset of { srr_id : string ;
                     library_type : [`single_end | `paired_end] }
  (** A dataset fetched through [Sra_toolkit]: [srr_id] is passed to it
      as the dataset identifier and [library_type] selects between the
      single-end and paired-end dump. *)

(** [unsafe_file_of_url url] turns [url] into a workflow file.

    Strings starting with [http://], [https://] or [ftp://] are fetched
    with [Bistro_unix.wget]; any other string is handed to
    [Workflow.input] as is.

    The function is "unsafe" in that the format parameter ['a] of the
    result is unconstrained: nothing checks that the content matches the
    format the caller picks. *)
let unsafe_file_of_url url : 'a file =
  (* [https://] is listed explicitly: without it a secure URL would be
     mistaken for a local path and passed to [Workflow.input]. *)
  let remote_schemes = [ "http://" ; "https://" ; "ftp://" ] in
  let is_remote =
    List.exists remote_schemes ~f:(fun prefix -> String.is_prefix url ~prefix)
  in
  if is_remote
  then Bistro_unix.wget url
  else Workflow.input url

(** [fastq_gz_of_source src] gives the gzip-compressed FASTQ file(s) for
    [src]:
    - plain FASTQ sources are obtained through [fastq_of_source] and
      then passed to [Bistro_unix.gzip];
    - gzip-compressed sources are resolved with [unsafe_file_of_url];
    - SRA datasets are dumped with [Sra_toolkit], using the paired-end
      or single-end dump according to [library_type]. *)
let rec fastq_gz_of_source src =
  match src with
  | Fastq_url _ ->
    SE_or_PE.map ~f:Bistro_unix.gzip (fastq_of_source src)
  | Fastq_gz_url urls ->
    SE_or_PE.map urls ~f:unsafe_file_of_url
  | SRA_dataset { srr_id ; library_type = `paired_end } ->
    let r1, r2 = Sra_toolkit.(fastq_dump_pe fastq_gz) (`id srr_id) in
    SE_or_PE.Paired_end (r1, r2)
  | SRA_dataset { srr_id ; library_type = `single_end } ->
    SE_or_PE.Single_end (Sra_toolkit.(fastq_dump fastq_gz) (`id srr_id))

(** [fastq_of_source src] gives the uncompressed FASTQ file(s) for
    [src]: plain FASTQ sources are resolved with [unsafe_file_of_url],
    every other source goes through [fastq_gz_of_source] and is then
    passed to [Bistro_unix.gunzip]. *)
and fastq_of_source src =
  match src with
  | Fastq_url urls ->
    SE_or_PE.map urls ~f:unsafe_file_of_url
  | Fastq_gz_url _ | SRA_dataset _ ->
    SE_or_PE.map (fastq_gz_of_source src) ~f:Bistro_unix.gunzip

(** [of_source src] builds a sample from [src] in the source's own
    representation: plain FASTQ URLs give an [Fq] sample, while
    gzip-compressed URLs and SRA datasets give an [Fq_gz] sample. *)
let of_source = function
  | Fastq_url _ as src ->
    Fq (fastq_of_source src)
  | (Fastq_gz_url _ | SRA_dataset _) as src ->
    Fq_gz (fastq_gz_of_source src)

(** Input signature of {!Make}: a type of sample descriptions together
    with the read sources attached to each of them. *)
module type Data = sig
  (** A sample description. *)
  type t

  (** [source x] lists the read sources of [x], as a [List1.t]. *)
  val source : t -> source List1.t
end

(** [Make(Data)] derives per-sample workflows from the sources that
    [Data.source] attaches to each sample. Every function returns one
    result per source, in a [List1.t]. *)
module Make(Data : Data) = struct
  (** Gzip-compressed FASTQ file(s) for each source of [x]. *)
  let fastq_gz x =
    Data.source x |> List1.map ~f:fastq_gz_of_source

  (** Uncompressed FASTQ file(s) for each source of [x]. *)
  let fastq x =
    Data.source x |> List1.map ~f:fastq_of_source

  (** One sample per source of [s], built with [of_source]. *)
  let fastq_samples s =
    Data.source s |> List1.map ~f:of_source

  (** [FastQC.fastqc_gz] applied to every compressed FASTQ file of [x]. *)
  let fastqc x =
    List1.map (fastq_gz x) ~f:(fun reads ->
        SE_or_PE.map reads ~f:FastQC.fastqc_gz)
end