package SZXX

  1. Overview
  2. Docs

Source file parsing.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
open! Base
open Angstrom

let ( .*{} ) bs pos = Bigstringaf.unsafe_get bs pos |> Char.to_int

let return_true = return true

let return_false = return false

let return_none = return None

let return_unit = return ()

let return_nil = return []

let fail_pattern = fail "pattern"

let fail_backtrack = fail "backtrack"

(** More efficient than [Angstrom.skip_many]. See https://github.com/inhabitedtype/angstrom/pull/219 *)
let skip_many p =
  fix (fun m ->
    p >>= (fun _ -> return_true) <|> return_false >>= function
    | true -> m
    | false -> return_unit )

(* Runs [p] until it returns a [Some], then return that *)
let skip_find p =
  fix (fun m ->
    let found = ref None in
    p
    >>= (function
          | None -> return_true
          | Some _ as x ->
            found := x;
            return_false)
    <|> return_false
    >>= function
    | true -> m
    | false -> return !found )

(* Same as [skip_find] but backtracks to the position right before it received [Some] *)
let skip_find_backtrack p =
  fix (fun m ->
    let found = ref None in
    p
    >>= (function
          | None -> return_true
          | Some _ as x ->
            found := x;
            fail_backtrack)
    <|> return_false
    >>= function
    | true -> m
    | false -> return !found )

let skip_n_times n p =
  if n <= 0
  then return_unit
  else
    fix (fun m ->
      let count = ref 0 in
      let* _ = p in
      Int.incr count;
      if !count < n then m else return_unit )

(* Faster version of Angstrom.count *)
let count n p =
  if n <= 0
  then return_nil
  else (
    let rec loop = function
      | 0 -> return_nil
      | n -> lift2 List.cons p (loop (n - 1))
    in
    loop n )

let peek_with len f =
  Unsafe.peek len f <|> (advance len *> fail_backtrack <|> return_unit) *> Unsafe.peek len f

let equal_sub buf1 ~pos1 buf2 ~pos2 ~len =
  let i = ref 0 in
  while !i < len && buf1.*{pos1 + !i} = buf2.*{pos2 + !i} do
    Int.incr i
  done;
  !i = len

let string s =
  let len = String.length s in
  let bs = Bigstringaf.of_string s ~off:0 ~len in
  Unsafe.take len (fun buf ~off ~len ->
    if equal_sub buf ~pos1:off bs ~pos2:0 ~len then return_unit else fail_pattern )
  >>= Fn.id

let bigstring bs =
  Unsafe.take (Bigstringaf.length bs) (fun buf ~off ~len ->
    if equal_sub buf ~pos1:off bs ~pos2:0 ~len then return_unit else fail_pattern )
  >>= Fn.id

let drop p = p *> return_unit

(** Fastest benchmarked way to do this. Inspired by [Cstruct.ffind_sub] *)
let sub_index ~pattern ~patlen bs ~off ~len =
  if patlen > len
  then None
  else (
    let max_zidx_sub = patlen - 1 in
    let max_zidx_s = off + len - patlen in
    let rec loop i k =
      if i > max_zidx_s
      then None
      else if k > max_zidx_sub
      then Some (i - off)
      else if k > 0
      then if pattern.*{k} = bs.*{i + k} then loop i (k + 1) else loop (i + 1) 0
      else if pattern.*{0} = bs.*{i}
      then loop i 1
      else loop (i + 1) 0
    in
    loop off 0 )

module Storage = struct
  type t = {
    add: Bigstringaf.t -> off:int -> len:int -> unit;
    finalize: unit -> unit;
    commit: unit Angstrom.t;
  }

  let noop_backtrack =
    { add = (fun _ ~off:_ ~len:_ -> ()); finalize = (fun () -> ()); commit = return_unit }

  let noop = { add = (fun _ ~off:_ ~len:_ -> ()); finalize = (fun () -> ()); commit }
end

(* Boyer–Moore–Horspool algorithm *)
module BMH = struct
  type t =
    | Found
    | Shift of int
    | Restart

  let make_table ~pattern ~patlen =
    let table = Array.create ~len:256 Restart in
    for i = 0 to patlen - 2 do
      table.(pattern.*{i}) <- Shift (patlen - (i + 1))
    done;
    table

  let run ~pattern ~patlen table bs =
    let rec loop = function
      | -1 -> Found
      | i when bs.*{i} = pattern.*{i} -> (loop [@tailcall]) (i - 1)
      | _ -> table.(bs.*{patlen - 1})
    in
    loop (patlen - 1)
end

let bounded_file_reader ~slice_size ~pattern Storage.{ add; finalize; commit } =
  let patlen = String.length pattern in
  let pattern = Bigstringaf.of_string pattern ~off:0 ~len:patlen in
  let rec slow_path table window =
    match BMH.run ~pattern ~patlen table window with
    | Found ->
      finalize ();
      commit
    | Shift by ->
      add ~off:0 ~len:by window;
      let diff = patlen - by in
      Bigstringaf.unsafe_blit window ~src_off:by window ~dst_off:0 ~len:diff;
      let* () =
        Unsafe.take by (fun buf ~off ~len ->
          Bigstringaf.unsafe_blit buf ~src_off:off window ~dst_off:diff ~len )
      in
      (slow_path [@tailcall]) table window
    | Restart ->
      add ~off:0 ~len:patlen window;
      let* () =
        Unsafe.take patlen (fun buf ~off ~len ->
          Bigstringaf.unsafe_blit buf ~src_off:off window ~dst_off:0 ~len )
      in
      (slow_path [@tailcall]) table window
  in

  let fast_path =
    peek_with (slice_size + patlen) (sub_index ~pattern ~patlen) >>= function
    | Some _ as x -> return x
    | None -> Unsafe.take slice_size add *> commit *> return_none
  in

  skip_find_backtrack fast_path >>= function
  | None ->
    (* This branch is hit when the fast path fails due to the remaining
       length being less than the slice_size. That's why the slow_path
       cannot be using a large Unsafe.peek/take: we don't know how much
       is left. *)
    take_bigstring patlen >>= slow_path (BMH.make_table ~pattern ~patlen)
  | Some at ->
    ((if at > 0 then Unsafe.take at add else return_unit) >>| finalize) *> commit *> bigstring pattern

let take_until_pattern ~slice_size ~pattern =
  let storage buf partial staged =
    Storage.
      {
        add =
          (fun bs ~off ~len ->
            if !staged + len > slice_size
            then (
              Buffer.add_subbytes buf partial ~pos:0 ~len:!staged;
              staged := 0 );
            Bigstringaf.unsafe_blit_to_bytes bs ~src_off:off partial ~dst_off:!staged ~len;
            staged := !staged + len);
        finalize = (fun () -> if !staged > 0 then Buffer.add_subbytes buf partial ~pos:0 ~len:!staged);
        commit = return_unit;
      }
  in
  let* () = return_unit in
  let buf = Buffer.create 32 in
  let partial = Bytes.create slice_size in
  let staged = ref 0 in
  let+ () = bounded_file_reader ~slice_size ~pattern (storage buf partial staged) in
  Buffer.contents buf