package matrix

  1. Overview
  2. Docs

Source file tokenizer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
let esc = Char.chr 0x1b
let br_paste_start = "\x1b[200~"
let br_paste_end = "\x1b[201~"
let br_paste_start_len = String.length br_paste_start
let br_paste_end_len = String.length br_paste_end

(* Hard caps to prevent unbounded buffer growth from malformed input. *)
let max_paste_len = 1_048_576 (* 1 MB *)
let max_sequence_len = 4_096 (* 4 KB *)

let br_paste_end_failure =
  let len = br_paste_end_len in
  let fail = Array.make len 0 in
  let j = ref 0 in
  for i = 1 to len - 1 do
    while !j > 0 && br_paste_end.[!j] <> br_paste_end.[i] do
      j := fail.(!j - 1)
    done;
    if br_paste_end.[!j] = br_paste_end.[i] then incr j;
    fail.(i) <- !j
  done;
  fail

type token = Sequence of string | Text of string | Paste of string

type parser = {
  buffer : Buffer.t;
  mutable paste_buffer : bytes;
  mutable paste_len : int;
  mutable paste_match : int;
  mutable flush_deadline : float option;
  mutable mode : [ `Normal | `Paste ];
}

(* Timeout for ambiguous lone ESC (could be ESC key or start of Alt+key) *)
let ambiguity_timeout = 0.050

(* Timeout for clearly incomplete escape sequences (CSI, OSC, DCS, etc.) *)
let incomplete_seq_timeout = 0.100

let schedule_flush t now =
  if t.mode = `Paste || Buffer.length t.buffer = 0 then t.flush_deadline <- None
  else
    let len = Buffer.length t.buffer in
    let delay =
      if len = 1 && Buffer.nth t.buffer 0 = esc then
        (* Lone ESC: ambiguous between ESC key and start of escape sequence *)
        ambiguity_timeout
      else if len >= 2 && Buffer.nth t.buffer 0 = esc then
        (* Incomplete escape sequence - use longer timeout *)
        incomplete_seq_timeout
      else
        (* Plain text with no ESC - shouldn't normally happen but use short *)
        ambiguity_timeout
    in
    t.flush_deadline <- Some (now +. delay)

let create () =
  {
    buffer = Buffer.create 128;
    paste_buffer = Bytes.create 128;
    paste_len = 0;
    paste_match = 0;
    flush_deadline = None;
    mode = `Normal;
  }

let pending t = Bytes.of_string (Buffer.contents t.buffer)

let reset t =
  Buffer.clear t.buffer;
  t.paste_len <- 0;
  t.paste_match <- 0;
  t.mode <- `Normal;
  t.flush_deadline <- None

(* helpers *)

let push_tokens acc tokens =
  (* tokens are in left-to-right order; we build acc in reverse *)
  List.rev_append tokens acc

let add_paste_tokens acc payload =
  let acc = if payload = "" then acc else Paste payload :: acc in
  Sequence br_paste_end :: acc

let has_substring_at s ~sub ~pos =
  let sub_len = String.length sub in
  let limit = String.length s - sub_len in
  if pos < 0 || pos > limit then false
  else
    let rec loop i =
      if i = sub_len then true
      else if s.[pos + i] <> sub.[i] then false
      else loop (i + 1)
    in
    loop 0

let find_substring_from s sub start =
  let sub_len = String.length sub in
  let len = String.length s in
  let limit = len - sub_len in
  let rec scan i =
    if i > limit then -1
    else if has_substring_at s ~sub ~pos:i then i
    else scan (i + 1)
  in
  if sub_len = 0 || start > limit then -1 else scan start

let ensure_paste_capacity t needed =
  let required = t.paste_len + needed in
  if required > Bytes.length t.paste_buffer then (
    let new_cap = max required (Bytes.length t.paste_buffer * 2) in
    let buf = Bytes.create new_cap in
    Bytes.blit t.paste_buffer 0 buf 0 t.paste_len;
    t.paste_buffer <- buf)

let reset_paste_state t =
  t.paste_len <- 0;
  t.paste_match <- 0

let complete_paste t =
  let payload_len = t.paste_len - br_paste_end_len in
  let payload =
    if payload_len <= 0 then ""
    else Bytes.sub_string t.paste_buffer 0 payload_len
  in
  reset_paste_state t;
  t.mode <- `Normal;
  payload

let rec advance_paste_match current c =
  if c = br_paste_end.[current] then current + 1
  else if current = 0 then 0
  else advance_paste_match br_paste_end_failure.(current - 1) c

let add_paste_char t c =
  if t.paste_len < max_paste_len then (
    ensure_paste_capacity t 1;
    Bytes.unsafe_set t.paste_buffer t.paste_len c;
    t.paste_len <- t.paste_len + 1);
  (* Always advance the KMP match state so we detect the end marker even when
     the payload has been truncated. *)
  t.paste_match <- advance_paste_match t.paste_match c;
  t.paste_match = br_paste_end_len

(* escape-sequence parsing *)

let is_csi_final c =
  let code = Char.code c in
  (code >= 0x40 && code <= 0x7e) || code = 0x24 || code = 0x5e

let rec find_st s i len =
  if i + 1 >= len then None
  else if s.[i] = esc && s.[i + 1] = '\\' then Some (i + 2)
  else find_st s (i + 1) len

let find_sequence_end s start len =
  if start + 1 >= len then None
  else
    match s.[start + 1] with
    | '[' ->
        (* Mouse reporting: ESC [ M ... (3 bytes after M) *)
        if start + 2 < len && s.[start + 2] = 'M' then
          let expected = start + 6 in
          if expected <= len then Some expected else None
        else
          let rec loop i =
            if i >= len then None
            else if is_csi_final s.[i] then
              if (s.[i] = '$' || s.[i] = '^') && i + 1 < len then loop (i + 1)
              else Some (i + 1)
            else loop (i + 1)
          in
          loop (start + 2)
    | ']' ->
        (* OSC terminates with BEL or ST (ESC \) *)
        let rec loop i =
          if i >= len then None
          else
            let c = s.[i] in
            if c = '\x07' then Some (i + 1)
            else if c = esc && i + 1 < len && s.[i + 1] = '\\' then Some (i + 2)
            else loop (i + 1)
        in
        loop (start + 2)
    | 'P' | '_' ->
        (* DCS / APC, terminated by ST *)
        find_st s (start + 2) len
    | 'O' ->
        (* SS3: ESC O <char> *)
        if start + 2 < len then Some (start + 3) else None
    | _ ->
        (* Generic short escape: ESC X *)
        Some (start + 2)

let extract_sequences_from s =
  let len = String.length s in
  let rec loop pos acc =
    if pos >= len then (List.rev acc, "")
    else
      let c = s.[pos] in
      if c = esc then
        match find_sequence_end s pos len with
        | None ->
            (* incomplete sequence: keep the rest for later *)
            (List.rev acc, String.sub s pos (len - pos))
        | Some end_pos ->
            let seq = String.sub s pos (end_pos - pos) in
            loop end_pos (Sequence seq :: acc)
      else
        (* run of plain text until next ESC or end *)
        let rec find_esc i =
          if i >= len then len else if s.[i] = esc then i else find_esc (i + 1)
        in
        let stop = find_esc (pos + 1) in
        let txt = String.sub s pos (stop - pos) in
        loop stop (Text txt :: acc)
  in
  loop 0 []

(* state machine *)

let consume_paste_from_string t s start stop acc =
  if start >= stop then (acc, None)
  else
    let rec loop i acc =
      if i >= stop then (acc, None)
      else
        let matched = add_paste_char t s.[i] in
        if matched then
          let payload = complete_paste t in
          let acc = add_paste_tokens acc payload in
          (acc, Some (i + 1))
        else loop (i + 1) acc
    in
    loop start acc

let consume_paste_from_bytes t bytes start stop acc =
  if start >= stop then (acc, None)
  else
    let rec loop i acc =
      if i >= stop then (acc, None)
      else
        let matched = add_paste_char t (Bytes.unsafe_get bytes i) in
        if matched then
          let payload = complete_paste t in
          let acc = add_paste_tokens acc payload in
          (acc, Some (i + 1))
        else loop (i + 1) acc
    in
    loop start acc

let rec process t now acc =
  if t.mode = `Paste then List.rev acc
  else if Buffer.length t.buffer = 0 then List.rev acc
  else
    let buf_str = Buffer.contents t.buffer in
    Buffer.clear t.buffer;
    let len = String.length buf_str in
    let start_idx = find_substring_from buf_str br_paste_start 0 in
    if start_idx < 0 then
      let seqs, rem = extract_sequences_from buf_str in
      if rem <> "" then
        if String.length rem > max_sequence_len then (
          (* Incomplete sequence exceeded the safety cap — treat as plain text
             rather than buffering without bound. *)
          t.flush_deadline <- None;
          let acc = push_tokens acc seqs in
          List.rev (Text rem :: acc))
        else (
          Buffer.add_string t.buffer rem;
          schedule_flush t now;
          let acc = push_tokens acc seqs in
          List.rev acc)
      else (
        t.flush_deadline <- None;
        let acc = push_tokens acc seqs in
        List.rev acc)
    else
      let before = String.sub buf_str 0 start_idx in
      let after_start = start_idx + br_paste_start_len in
      let after_len = len - after_start in
      let after =
        if after_len > 0 then String.sub buf_str after_start after_len else ""
      in
      let seqs, rem = extract_sequences_from before in
      reset_paste_state t;
      t.mode <- `Paste;
      t.flush_deadline <- None;
      let acc = push_tokens acc seqs in
      let acc = Sequence br_paste_start :: acc in
      let acc, rem_stop =
        if rem = "" then (acc, None)
        else consume_paste_from_string t rem 0 (String.length rem) acc
      in
      if t.mode = `Normal then (
        (match rem_stop with
        | Some idx when idx < String.length rem ->
            Buffer.add_substring t.buffer rem idx (String.length rem - idx)
        | _ -> ());
        if after <> "" then Buffer.add_string t.buffer after;
        t.flush_deadline <- None;
        process t now acc)
      else
        let acc, after_stop =
          if after = "" then (acc, None)
          else consume_paste_from_string t after 0 (String.length after) acc
        in
        if t.mode = `Normal then (
          (match after_stop with
          | Some idx when idx < String.length after ->
              Buffer.add_substring t.buffer after idx (String.length after - idx)
          | _ -> ());
          t.flush_deadline <- None;
          process t now acc)
        else List.rev acc

let feed t bytes off len ~now =
  if off < 0 || len < 0 || off + len > Bytes.length bytes then
    invalid_arg "Input_tokenizer.feed: out of bounds";
  if t.mode = `Paste then (
    let acc, stop_opt = consume_paste_from_bytes t bytes off (off + len) [] in
    match stop_opt with
    | None -> List.rev acc
    | Some stop ->
        let remaining = off + len - stop in
        if remaining > 0 then Buffer.add_subbytes t.buffer bytes stop remaining;
        t.flush_deadline <- None;
        process t now acc)
  else (
    Buffer.add_subbytes t.buffer bytes off len;
    t.flush_deadline <- None;
    process t now [])

let deadline t = t.flush_deadline

let flush_expired t now =
  match t.flush_deadline with
  | Some expiry when now >= expiry && t.mode = `Normal ->
      t.flush_deadline <- None;
      if Buffer.length t.buffer = 0 then []
      else
        let leftover = Buffer.contents t.buffer in
        Buffer.clear t.buffer;
        if leftover = "" then [] else [ Sequence leftover ]
  | _ -> []