package quickjs

  1. Overview
  2. Docs

Source file RegExp.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
type t = {
  bc : Unsigned.uint8 Ctypes_static.ptr;
  source : string;
  flags : int;
  mutable lastIndex : int;
}

type matchResult = {
  captures : string array;
  input : string;
  index : int;
  groups : (string * string) list;
}

type result = (matchResult, string) Stdlib.result

type compile_error =
  [ `Unexpected_end
  | `Malformed_unicode_char
  | `Invalid_escape_sequence
  | `Nothing_to_repeat
  | `Unknown of string ]

let compile_error_to_string = function
  | `Unexpected_end -> "unexpected end"
  | `Malformed_unicode_char -> "malformed unicode char"
  | `Invalid_escape_sequence -> "invalid escape sequence"
  | `Nothing_to_repeat -> "nothing to repeat"
  | `Unknown s -> s

(* #define LRE_FLAG_GLOBAL (1 << 0) *)
let lre_flag_global = 0b01

(* #define LRE_FLAG_IGNORECASE (1 << 1) *)
let lre_flag_ignorecase = 0b10

(* #define LRE_FLAG_MULTILINE (1 << 2) *)
let lre_flag_multiline = 0b100

(* #define LRE_FLAG_DOTALL (1 << 3) *)
let lre_flag_dotall = 0b1000

(* #define LRE_FLAG_UNICODE (1 << 4) *)
let lre_flag_unicode = 0b10000

(* #define LRE_FLAG_STICKY (1 << 5) *)
let lre_flag_sticky = 0b100000
let has_flag flags flag = flags land flag != 0
let global regexp = has_flag regexp.flags lre_flag_global
let ignorecase regexp = has_flag regexp.flags lre_flag_ignorecase
let multiline regexp = has_flag regexp.flags lre_flag_multiline
let dotall regexp = has_flag regexp.flags lre_flag_dotall
let sticky regexp = has_flag regexp.flags lre_flag_sticky
let unicode regexp = has_flag regexp.flags lre_flag_unicode

let parse_flags flags =
  let rec parse_flags' flags acc =
    match flags with
    | [] -> acc
    | 'g' :: rest -> parse_flags' rest (acc lor lre_flag_global)
    | 'i' :: rest -> parse_flags' rest (acc lor lre_flag_ignorecase)
    | 'm' :: rest -> parse_flags' rest (acc lor lre_flag_multiline)
    | 's' :: rest -> parse_flags' rest (acc lor lre_flag_dotall)
    | 'u' :: rest -> parse_flags' rest (acc lor lre_flag_unicode)
    | 'y' :: rest -> parse_flags' rest (acc lor lre_flag_sticky)
    | _ :: rest -> parse_flags' rest acc
  in
  parse_flags' (Stdlib.String.to_seq flags |> List.of_seq) 0

let flags_to_string flags =
  let rec flags_to_string' flags acc =
    match flags with
    | 0 -> acc
    | _ when has_flag flags lre_flag_global ->
        flags_to_string' (flags land lre_flag_global lxor flags) (acc ^ "g")
    | _ when has_flag flags lre_flag_ignorecase ->
        flags_to_string' (flags land lre_flag_ignorecase lxor flags) (acc ^ "i")
    | _ when has_flag flags lre_flag_multiline ->
        flags_to_string' (flags land lre_flag_multiline lxor flags) (acc ^ "m")
    | _ when has_flag flags lre_flag_dotall ->
        flags_to_string' (flags land lre_flag_dotall lxor flags) (acc ^ "s")
    | _ when has_flag flags lre_flag_unicode ->
        flags_to_string' (flags land lre_flag_unicode lxor flags) (acc ^ "u")
    | _ when has_flag flags lre_flag_sticky ->
        flags_to_string' (flags land lre_flag_sticky lxor flags) (acc ^ "y")
    | _ -> acc
  in
  flags_to_string' flags ""

let strlen ptr =
  let rec aux ptr len =
    let c = Ctypes.( !@ ) ptr in
    if c = char_of_int 0 then len else aux (Ctypes.( +@ ) ptr 1) (len + 1)
  in
  aux ptr 0

(* Check if a string contains non-ASCII bytes that require Unicode mode in libregexp.
   Any byte >= 0x80 indicates multi-byte UTF-8 which needs Unicode mode for proper matching. *)
let needs_unicode_mode s =
  let len = Stdlib.String.length s in
  let rec check i =
    if i >= len then false
    else
      let byte = Char.code (Stdlib.String.get s i) in
      if byte >= 0x80 then true else check (i + 1)
  in
  check 0

let compile ~flags re =
  let compiled_byte_code_len = Ctypes.allocate Ctypes.int 0 in
  let size_of_error_msg = 64 in
  let error_msg = Ctypes.allocate_n ~count:size_of_error_msg Ctypes.char in
  let input = Ctypes.ocaml_string_start re in
  let input_length = Stdlib.String.length re |> Unsigned.Size_t.of_int in
  let parsed_flags = parse_flags flags in
  (* Auto-enable Unicode mode for patterns containing 4-byte UTF-8 sequences
     (code points >= U+10000, like emojis). libregexp requires this. *)
  let flags =
    if needs_unicode_mode re then parsed_flags lor lre_flag_unicode
    else parsed_flags
  in
  let compiled_byte_code =
    Libregexp.compile compiled_byte_code_len error_msg size_of_error_msg input
      input_length flags Ctypes.null
  in
  match Ctypes.is_null compiled_byte_code with
  | false -> Ok { bc = compiled_byte_code; flags; lastIndex = 0; source = re }
  | true ->
      let length = strlen error_msg in
      let error = Ctypes.string_from_ptr ~length error_msg in
      Error
        (match error with
        | "unexpected end" -> `Unexpected_end
        | "malformed unicode char" -> `Malformed_unicode_char
        | "nothing to repeat" -> `Nothing_to_repeat
        | "invalid escape sequence in regular expression" ->
            `Invalid_escape_sequence
        | unknown -> `Unknown unknown)

let index result = match result with Ok result -> result.index | Error _ -> 0
let lastIndex regexp = regexp.lastIndex
let source regexp = regexp.source
let input result = match result with Ok result -> result.input | Error _ -> ""
let setLastIndex regexp lastIndex = regexp.lastIndex <- lastIndex

let captures result =
  match result with Ok result -> result.captures | Error _ -> [||]

let groups result =
  match result with Ok result -> result.groups | Error _ -> []

let group name result =
  match result with
  | Ok result -> List.assoc_opt name result.groups
  | Error _ -> None

let flags regexp = flags_to_string regexp.flags

(* Convert UTF-8 string to UTF-16 code units (as uint8_t pairs, little-endian) *)
let utf8_to_utf16_bytes s =
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
  let buf = Buffer.create (Stdlib.String.length s * 2) in
  let add_u16 code =
    Buffer.add_char buf (Char.chr (code land 0xFF));
    Buffer.add_char buf (Char.chr ((code lsr 8) land 0xFF))
  in
  let rec loop () =
    match Uutf.decode decoder with
    | `Uchar u ->
        let code = Uchar.to_int u in
        (if code < 0x10000 then add_u16 code
         else
           (* Surrogate pair for code points >= 0x10000 *)
           let code' = code - 0x10000 in
           add_u16 (0xD800 lor (code' lsr 10));
           add_u16 (0xDC00 lor (code' land 0x3FF)));
        loop ()
    | `End -> Buffer.contents buf
    | `Malformed _ ->
        add_u16 0xFFFD;
        (* Replacement character *)
        loop ()
    | `Await -> assert false
  in
  loop ()

(* Build a mapping from UTF-16 code unit index to UTF-8 byte index *)
let build_utf16_to_utf8_map s =
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
  let map = ref [] in
  let utf16_idx = ref 0 in
  let rec loop () =
    let byte_idx = Uutf.decoder_byte_count decoder in
    match Uutf.decode decoder with
    | `Uchar u ->
        map := (!utf16_idx, byte_idx) :: !map;
        let code = Uchar.to_int u in
        if code < 0x10000 then incr utf16_idx else utf16_idx := !utf16_idx + 2;
        (* Surrogate pair *)
        loop ()
    | `End ->
        map := (!utf16_idx, byte_idx) :: !map;
        Array.of_list (List.rev !map)
    | `Malformed _ ->
        map := (!utf16_idx, byte_idx) :: !map;
        incr utf16_idx;
        loop ()
    | `Await -> assert false
  in
  loop ()

(* Convert UTF-16 index to UTF-8 byte index using the map *)
let utf16_to_utf8_index map utf16_idx =
  (* Binary search or linear scan *)
  let rec find i =
    if i >= Array.length map then snd map.(Array.length map - 1)
    else
      let u16, u8 = map.(i) in
      if u16 = utf16_idx then u8
      else if u16 > utf16_idx then if i = 0 then 0 else snd map.(i - 1)
      else find (i + 1)
  in
  find 0

(* Convert UTF-8 byte index to UTF-16 code unit index *)
let utf8_to_utf16_index s utf8_idx =
  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String s) in
  let utf16_idx = ref 0 in
  let rec loop () =
    let byte_idx = Uutf.decoder_byte_count decoder in
    if byte_idx >= utf8_idx then !utf16_idx
    else
      match Uutf.decode decoder with
      | `Uchar u ->
          let code = Uchar.to_int u in
          if code < 0x10000 then incr utf16_idx else utf16_idx := !utf16_idx + 2;
          (* Surrogate pair *)
          loop ()
      | `End -> !utf16_idx
      | `Malformed _ ->
          incr utf16_idx;
          loop ()
      | `Await -> assert false
  in
  loop ()

(* exec is not a binding to lre_exec but an implementation of `js_regexp_exec` *)
let exec regexp input =
  let capture_count = Libregexp.get_capture_count regexp.bc in
  let capture_size = capture_count * 2 in
  let capture = Ctypes.CArray.make (Ctypes.ptr Ctypes.uint8_t) capture_size in
  let start_capture = Ctypes.CArray.start capture in

  (* Check if we need Unicode mode (pattern has non-ASCII or unicode flag) *)
  let use_unicode =
    unicode regexp
    || needs_unicode_mode regexp.source
    || needs_unicode_mode input
  in

  (* Create buffer and keep it alive - bufp must not go out of scope before lre_exec *)
  let bufp, matching_length, shift, utf16_map =
    if use_unicode then
      (* Convert UTF-8 input to UTF-16 for proper Unicode matching *)
      let utf16_str = utf8_to_utf16_bytes input in
      let utf16_len = Stdlib.String.length utf16_str in
      let bytes_list =
        List.init utf16_len (fun i ->
            Unsigned.UInt8.of_int (Char.code (Stdlib.String.get utf16_str i)))
      in
      let bufp = Ctypes.CArray.of_list Ctypes.uint8_t bytes_list in
      let map = build_utf16_to_utf8_map input in
      (bufp, utf16_len / 2, 1, Some map)
    else
      (* ASCII-only: use bytes directly *)
      let bytes_list =
        List.init (Stdlib.String.length input) (fun i ->
            Unsigned.UInt8.of_int (Char.code (Stdlib.String.get input i)))
      in
      let bufp = Ctypes.CArray.of_list Ctypes.uint8_t bytes_list in
      (bufp, Stdlib.String.length input, 0, None)
  in
  let buffer = Ctypes.CArray.start bufp in

  let lastIndex =
    match global regexp || sticky regexp with
    | true ->
        if use_unicode then
          (* Convert UTF-8 byte position to UTF-16 code unit position *)
          utf8_to_utf16_index input regexp.lastIndex
        else regexp.lastIndex
    | false -> 0
  in

  (* Check if lastIndex is beyond string length (QuickJS does this check) *)
  if lastIndex > matching_length then (
    (* No match possible - reset lastIndex and return empty result *)
    (match sticky regexp || global regexp with
    | true -> regexp.lastIndex <- 0
    | false -> ());
    Ok { captures = [||]; input; index = 0; groups = [] })
  else
    let exec_result =
      Libregexp.exec start_capture regexp.bc buffer lastIndex matching_length
        shift Ctypes.null
    in
    (* Keep bufp alive until after lre_exec completes *)
    let _ = bufp in

    match exec_result with
    | 1 ->
        let substrings = Array.make capture_count "" in
        let i = ref 0 in
        let index = ref 0 in
        let groups = ref [] in
        let group_name_ptr = ref (Libregexp.get_groupnames regexp.bc) in
        while !i < capture_size - 1 do
          let start_ptr = Ctypes.CArray.get capture !i in
          let end_ptr = Ctypes.CArray.get capture (!i + 1) in
          let raw_start = Ctypes.ptr_diff buffer start_ptr in
          let raw_length = Ctypes.ptr_diff start_ptr end_ptr in

          (* Convert indices based on mode *)
          let start_index, length =
            match utf16_map with
            | Some map ->
                (* UTF-16 mode: convert indices back to UTF-8 byte positions *)
                let u16_start = raw_start / 2 in
                let u16_end = u16_start + (raw_length / 2) in
                let u8_start = utf16_to_utf8_index map u16_start in
                let u8_end = utf16_to_utf8_index map u16_end in
                (u8_start, u8_end - u8_start)
            | None ->
                (* ASCII mode: indices are byte positions *)
                (raw_start, raw_length)
          in

          (* Only set index on first capture (the full match) *)
          if !i = 0 then index := start_index;
          let substring =
            match Stdlib.String.sub input start_index length with
            | sub -> sub
            | exception _ -> ""
          in
          (* Store the captured substring *)
          substrings.(!i / 2) <- substring;
          (* Update the lastIndex *)
          regexp.lastIndex <- start_index + length;

          (* if (\*group_name_ptr) { *)
          (match !group_name_ptr with
          (* if (group_name_ptr && i > 0) { *)
          | Some pointer when !i > 0 ->
              (*
              if (JS_DefinePropertyValueStr(ctx, groups, group_name_ptr, JS_DupValue(ctx, val), prop_flags) < 0) {
                  JS_FreeValue(ctx, val);
                  goto fail;
              }
            *)
              (* store the group name and its captured value, but only if named *)
              let name_len = strlen pointer in
              (if name_len > 0 then
                 let current_group_name =
                   Ctypes.string_from_ptr ~length:name_len pointer
                 in
                 groups := (current_group_name, substring) :: !groups);
              (* group_name_ptr += strlen(group_name_ptr) + 1; *)
              let next_group_name_ptr = Ctypes.( +@ ) pointer (name_len + 1) in
              if Ctypes.is_null next_group_name_ptr then group_name_ptr := None
              else group_name_ptr := Some next_group_name_ptr
          | None | Some _ -> ());
          (* Incement the index *)
          i := !i + 2
        done;
        Ok { captures = substrings; input; index = !index; groups = !groups }
    | 0 ->
        (* When there's no matches left, lastIndex goes back to 0 *)
        (match sticky regexp || global regexp with
        | true -> regexp.lastIndex <- 0
        | false -> ());
        Ok { captures = [||]; input; index = 0; groups = [] }
    | _ (* -1 *) -> Error "Error"

let test regexp input =
  let result = exec regexp input in
  match result with
  | Ok result -> Array.length result.captures > 0
  | Error _ -> false