package biotk

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file gtf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
open Core
open CFStream
open Biocaml_base

(* https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md *)

type record = Biocaml_base.Gff.record = {
  seqname    : string ;
  source     : string option ;
  feature    : string option ;
  start_pos  : int ;
  stop_pos   : int ;
  score      : float option ;
  strand     : [`Plus | `Minus | `Not_stranded | `Unknown ] ;
  phase      : int option ;
  attributes : (string * string list) list ;
}
[@@ deriving sexp]

(* http://gmod.org/wiki/GFF2 *)
module Parser = struct
  open Angstrom

  let field name =
    take_while1 (function
        | '\t' | '\n' -> false
        | _ -> true
      )
    <?> ("field:" ^ name)

  let to_int fieldnum s =
    match Int.of_string s with
    | n -> return n
    | exception _ ->
      fail (sprintf "failed to convert %s to integer at field %d" s fieldnum)

  let to_float fieldnum s =
    match Float.of_string s with
    | x -> return x
    | exception _ ->
      fail (sprintf "failed to convert %s to float at field %d" s fieldnum)

  let option = function
    | "." -> None
    | s -> Some s

  let maybe_convert f = function
    | "." -> return None
    | s -> f s >>| Option.some

  let strand fieldnum = function
    | "." -> return `Not_stranded
    | "?" -> return `Unknown
    | "+" -> return `Plus
    | "-" -> return `Minus
    | _ -> fail (sprintf "Incorrect strand character at field %d" fieldnum)

  let attribute_id = take_while1 (function
      | 'a'..'z' | 'A'..'Z'
      | '_' -> true
      | _ -> false
    )

  let quoted_attribute_value =
    char '"' *> take_while (function
        | '"' | '\n' -> false
        | _ -> true
      ) <*
    char '"'

  let unquoted_attribute_value =
    take_while (function
        | ' ' | ';' | '\n' -> false
        | _ -> true
      )

  let one_gtf_attributes =
    attribute_id >>= fun id ->
    skip_many1 (char ' ') >>= fun () ->
    (quoted_attribute_value <|> unquoted_attribute_value) >>= fun s ->
    return (id, [ s ])

  let attribute_separator =
    skip_many (char ' ') >>= fun () ->
    char ';' >>= fun _ ->
    skip_many (char ' ')

  let gtf_attributes =
    sep_by1 attribute_separator one_gtf_attributes
    <* Angstrom.option ';' (char ';')
    <?> "gtf_attributes"

  let tab = char '\t'

  let record =
    field "seqname" >>= fun seqname ->
    tab *>
    field "source" >>| option >>= fun source ->
    tab *>
    field "feature" >>| option >>= fun feature ->
    tab *>
    field "start_pos" >>= to_int 4 >>= fun start_pos ->
    tab *>
    field "stop_pos" >>= to_int 5 >>= fun stop_pos ->
    tab *>
    field "score" >>= maybe_convert (to_float 6) >>= fun score ->
    tab *>
    field "strand" >>= strand 7 >>= fun strand ->
    tab *>
    field "phase" >>= maybe_convert (to_int 8) >>= fun phase ->
    tab *>
    gtf_attributes >>| fun attributes ->
    `Record Gff.{
      seqname ;
      source ;
      feature ;
      start_pos ;
      stop_pos ;
      score ;
      strand ;
      phase ;
      attributes ;
    }
  let record = record <?> "record"

  let comment =
    char '#' >>= fun _ ->
    take_while (Char.( <> ) '\n') >>| fun s ->
    `Comment s

  let space =
    skip_while (function
        | ' ' | '\t' -> true
        | _ -> false
      )

  let file =
    let rec line_start acc lno =
      (end_of_input *> return acc)
      <|> (comment >>= fun c -> after_comment (c :: acc) lno )
      <|> (record >>= fun r -> after_record (r :: acc) lno)
    and after_comment acc lno =
      (end_of_line >>= fun () -> line_start acc (lno + 1))
      <|> (end_of_input *> return acc)
    and after_record acc lno =
      (end_of_input *> return acc)
      <|> (space *> end_of_line >>= fun () -> line_start acc (lno + 1))
      <|> (space *> comment >>= fun c -> after_comment (c :: acc) lno)
    in
    line_start [] 1
    >>| List.rev

  let test p s v =
    match parse_string p s ~consume:All with
    | Ok x -> Poly.(x = v)
    | Error msg ->
      print_endline msg ;
      false

  let%test "Gtf.gtf_attributes1" =
    test
      gtf_attributes
      {|gene_id "FBgn0031081"|}
      [ ("gene_id", ["FBgn0031081"]) ]

  let%test "Gtf.gtf_attributes2" =
    test
      gtf_attributes
      {|gene_id "FBgn0031081"; gene_symbol "Nep3"; transcript_id "FBtr0070000"; transcript_symbol "Nep3-RA";|}
      [ ("gene_id", ["FBgn0031081"]) ; ("gene_symbol", ["Nep3"]) ;
        ("transcript_id", ["FBtr0070000"]) ; ("transcript_symbol", ["Nep3-RA"]) ]

  let%test "Gtf.gtf_attributes3" =
    test
      gtf_attributes
      {|Transcript B0273.1; Note "Zn-Finger"|}
      [ "Transcript", ["B0273.1"] ; "Note", ["Zn-Finger"]]

  let%test "Gtf.comment" =
    test comment "#comment" (`Comment "comment")

  type item = [ `Comment of string | `Record of record ]
  [@@ deriving sexp]

  let expect_record s =
    parse_string ~consume:All record s
    |> [%sexp_of: ([`Record of record], string) Result.t]
    |> Sexp.output_hum Stdio.stdout

  let expect s =
    parse_string ~consume:All file s
    |> [%sexp_of: (item list, string) Result.t]
    |> Sexp.output_hum Stdio.stdout

  let ex1 = {|IV	curated	mRNA	5506800	5508917	.	+	.	Transcript B0273.1; Note "Zn-Finger";|}
  let%expect_test "parse GTF record" =
    expect_record ex1;
    [%expect {|
      (Ok
       (Record
        ((seqname IV) (source (curated)) (feature (mRNA)) (start_pos 5506800)
         (stop_pos 5508917) (score ()) (strand Plus) (phase ())
         (attributes ((Transcript (B0273.1)) (Note (Zn-Finger))))))) |}]

  let ex2 = {|X	FlyBase	gene	19961297	19969323	.	+	.	gene_id "FBgn0031081"; gene_symbol "Nep3";|}
  let%expect_test "parse GTF record 2" =
    expect_record ex2;
    [%expect {|
      (Ok
       (Record
        ((seqname X) (source (FlyBase)) (feature (gene)) (start_pos 19961297)
         (stop_pos 19969323) (score ()) (strand Plus) (phase ())
         (attributes ((gene_id (FBgn0031081)) (gene_symbol (Nep3))))))) |}]

  let ex2 = {|IV	curated	mRNA	5506800	5508917	.	+	.	Transcript B0273.1; Note "Zn-Finger" # some comment
IV	curated	mRNA	5506800	5508917	.	+	.	Transcript B0273.1; Note "Zn-Finger"
X	FlyBase	gene	19961297	19969323	.	+	.	gene_id "FBgn0031081"; gene_symbol "Nep3";|}

  let%expect_test "parse GTF file" =
    expect ex2;
    [%expect {|
      (Ok
       ((Record
         ((seqname IV) (source (curated)) (feature (mRNA)) (start_pos 5506800)
          (stop_pos 5508917) (score ()) (strand Plus) (phase ())
          (attributes ((Transcript (B0273.1)) (Note (Zn-Finger))))))
        (Comment " some comment")
        (Record
         ((seqname IV) (source (curated)) (feature (mRNA)) (start_pos 5506800)
          (stop_pos 5508917) (score ()) (strand Plus) (phase ())
          (attributes ((Transcript (B0273.1)) (Note (Zn-Finger))))))
        (Record
         ((seqname X) (source (FlyBase)) (feature (gene)) (start_pos 19961297)
          (stop_pos 19969323) (score ()) (strand Plus) (phase ())
          (attributes ((gene_id (FBgn0031081)) (gene_symbol (Nep3)))))))) |}]
end

module Item = struct
  type t = Gff.item

  let parse line =
    match Gff.gtf_item_of_line line with
    | Ok item -> item
    | Error (`Msg msg) -> failwith msg

  let unparse item =
    (Gff.line_of_item `two item :> string)

  let to_record = function
    | `Record r -> Some r
    | `Comment _ -> None
end

include Line_oriented.Make(Item)

(* Introduced a new load function because it so happens that comment
   items may follow records in gtf files, which is incompatible with
   the assumptions in Line_oriented. *)
let load fn =
  let _unconsumed, result =
    In_channel.with_file fn ~f:(fun ic ->
        Angstrom_unix.parse Parser.file ic
      )
  in
  match result with
  | Ok xs -> xs
  | Error msg -> failwith msg

module Record = struct
  type t = Gff.record
  let loc r = GLoc.{ chr = r.Gff.seqname ; lo = r.start_pos ; hi = r.stop_pos }
  let compare p q =
    match GLoc.compare (loc p) (loc q) with
    | 0 -> Option.compare String.compare p.feature q.feature
    | c -> c
end

module Annotation = struct
  type t = {
    gene_id_label : string ;
    transcript_id_label : string ;
    items : Gff.record list String.Table.t ;
  }

  let of_items ?(gene_id_label = "gene_id") ?(transcript_id_label = "transcript_id") items =
    let items =
      CFStream.Stream.of_list items
      |> CFStream.Stream.filter_map ~f:(function
          | `Comment _ -> None
          | `Record r ->
            match List.Assoc.find r.Biocaml_base.Gff.attributes gene_id_label ~equal:String.equal with
            | Some (id :: _) -> Some (id, r)
            | Some []
            | None -> None
        )
      |> Biocaml_unix.Accu.relation
      |> CFStream.Stream.map ~f:(fun (x, y) -> x, List.rev y)
      |> CFStream.Stream.to_list
      |> String.Table.of_alist_exn
    in
    { gene_id_label ; transcript_id_label ; items }

  let%test_unit "Annotation.of_items tail rec" =
    List.init 1_000_000 ~f:(fun _ -> `Comment "")
    |> of_items
    |> ignore

  let strand_of_entry items =
    let strands =
      List.map items ~f:(fun i -> i.Gff.strand)
      |> List.dedup_and_sort ~compare:Poly.compare
    in
    match strands with
    | [] -> raise (Invalid_argument "strand_of_entry")
    | [ s ] -> Ok s
    | _ -> Or_error.error_string "more than one strand in entry"

  let exons_of_entry items =
    List.filter_map items ~f:(fun r ->
        match r.Gff.feature with
        | Some "exon" -> Some r
        | _ -> None
      )

  let sort_by_attribute ~attr_label items =
    Stream.of_list items
    |> Stream.filter_map ~f:(fun r ->
        match List.Assoc.find r.Biocaml_base.Gff.attributes attr_label ~equal:String.equal with
          | Some (id :: _) -> Some (id, r)
          | Some []
          | None -> None
      )
    |> Biocaml_unix.Accu.relation
    |> Stream.to_list

  let gene_of_entry ~id ~transcript_id_label items =
    match exons_of_entry items with
    | [] -> Ok None
    | exons ->
      let open Or_error.Monad_infix in
      strand_of_entry exons |> Or_error.tag ~tag:id >>= function
      | `Unknown | `Not_stranded -> Or_error.error_string "no defined strand"
      | `Plus | `Minus as strand ->
        let transcripts =
          sort_by_attribute ~attr_label:transcript_id_label exons
          |> List.map ~f:(fun (s, items) -> s, List.map ~f:Record.loc items)
        in
        Gene.make ~strand ~id transcripts
        |> Or_error.map ~f:Option.some

  let genes annot =
    let r = String.Table.create () in
    let errors =
      String.Table.fold annot.items ~init:[] ~f:(fun ~key:id ~data:items errors ->
          match gene_of_entry ~id ~transcript_id_label:annot.transcript_id_label items with
          | Ok (Some g) ->
            String.Table.set r ~key:id ~data:g ;
            errors
          | Ok None -> errors
          | Error e -> (id, e) :: errors
        )
    in
    r, errors

  let utr5' annot =
    String.Table.filter_map annot.items ~f:(fun items ->
        List.find_map items ~f:(fun r ->
            match r.Gff.feature with
            | Some "5UTR" -> Some r
            | _ -> None
          )
      )

  let utr3' annot =
    String.Table.filter_map annot.items ~f:(fun items ->
        List.find_map items ~f:(fun r ->
            match r.Gff.feature with
            | Some "3UTR" -> Some r
            | _ -> None
          )
      )
end