Source file tokenizer.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
open Common
type token = { ending : int; scopes : string list }
let ending token = token.ending
let scopes token = token.scopes
type stack_elem = {
stack_delim : delim;
stack_region : Oniguruma.Region.t;
stack_begin_line : string;
stack_grammar : grammar;
stack_repos : (string, repo_item) Hashtbl.t list;
stack_scopes : string list;
stack_prev_scopes : string list;
}
type stack = stack_elem list
let empty = []
let rec add_scopes scopes = function
| [] -> scopes
| None :: xs -> add_scopes scopes xs
| Some x :: xs -> add_scopes (x :: scopes) xs
let next_pats grammar = function
| [] -> grammar.patterns
| s :: _ -> s.stack_delim.delim_patterns
let is_special = function
| '|' | '.' | '*' | '+' | '?' | '^' | '$' | '-' | ':' | '~' | '#' | '&' | '('
| ')' | '[' | ']' | '{' | '}' | '<' | '>' | '\\' | '\'' ->
true
| _ -> false
let insert_capture buf line beg end_ =
let rec loop i =
if i = end_ then ()
else
let ch = line.[i] in
if is_special ch then Buffer.add_char buf '\\';
Buffer.add_char buf ch;
loop (i + 1)
in
loop beg
let subst_backrefs stack_top =
let {
stack_delim = { delim_end = regex_str; delim_begin = begin_re; _ };
stack_begin_line = line;
stack_region = region;
_;
} =
stack_top
in
let buf = Buffer.create (String.length regex_str) in
let num_beg_captures = Oniguruma.num_captures begin_re in
let rec loop i escaped =
if i < String.length regex_str then
match (regex_str.[i], escaped) with
| '\\', true ->
Buffer.add_string buf "\\\\";
loop (i + 1) false
| '\\', false -> loop (i + 1) true
| char, true ->
if char >= '0' && char <= '9' then (
let idx = Char.code char - Char.code '0' in
if idx < num_beg_captures then
let beg = Oniguruma.Region.capture_beg region idx in
let end_ = Oniguruma.Region.capture_end region idx in
if beg <> -1 then insert_capture buf line beg end_)
else (
Buffer.add_char buf '\\';
Buffer.add_char buf char);
loop (i + 1) false
| char, false ->
Buffer.add_char buf char;
loop (i + 1) false
in
loop 0 false;
Buffer.contents buf
let match_subst se =
match
Oniguruma.create (subst_backrefs se) Oniguruma.Options.none
Oniguruma.Encoding.utf8 Oniguruma.Syntax.default
with
| Error e -> error ("End pattern: " ^ se.stack_delim.delim_end ^ ": " ^ e)
| Ok re -> re
let rec find_nested scope = function
| [] -> None
| repo :: repos -> (
match Hashtbl.find_opt repo scope with
| Some x -> Some x
| None -> find_nested scope repos)
let remove_empties =
let rec go acc = function
| [] -> acc
| tok :: toks ->
let prev =
match toks with
| [] -> 0
| tok :: _ -> tok.ending
in
if tok.ending = prev then go acc toks else go (tok :: acc) toks
in
go []
let handle_captures re scopes default mat_start mat_end region captures tokens
=
let captures =
Array.concat
(Hashtbl.fold
(fun k capture acc ->
let captures =
match k with
| Capture_idx idx -> [| (idx, capture) |]
| Capture_name str ->
Array.map
(fun idx -> (idx, capture))
(Oniguruma.name_to_group_numbers re str)
in
captures :: acc)
captures [])
in
let captures = Array.to_list captures in
let captures =
List.filter_map
(fun (idx, capture) ->
if idx < 0 || idx >= Oniguruma.Region.length region then None
else
let beg = Oniguruma.Region.capture_beg region idx in
let end_ = Oniguruma.Region.capture_end region idx in
Some (capture, beg, end_))
captures
in
let captures =
List.stable_sort
(fun (_, a, b) (_, c, d) -> compare (a, b) (d, c))
captures
in
let _, _, stack, tokens =
List.fold_left
(fun (prev_idx, start, stack, tokens) (capture, cap_start, cap_end) ->
if cap_start = -1 then
(prev_idx, start, stack, tokens)
else
let rec pop prev_idx start tokens = function
| [] ->
let ending = if prev_idx > start then prev_idx else start in
( ending,
{ scopes = add_scopes scopes [ default ]; ending } :: tokens,
[] )
| (ending, scopes) :: stack' as stack ->
if start >= ending then
let ending = if prev_idx > ending then prev_idx else ending in
pop ending start ({ scopes; ending } :: tokens) stack'
else
let ending = if prev_idx > start then prev_idx else start in
(ending, { scopes; ending } :: tokens, stack)
in
let cap_start = if cap_start < start then start else cap_start in
let cap_end = if cap_end > mat_end then mat_end else cap_end in
let prev_idx, tokens, stack = pop prev_idx cap_start tokens stack in
( prev_idx,
cap_start,
(cap_end, add_scopes scopes [ capture.capture_name ]) :: stack,
tokens ))
(mat_start, mat_start, [], tokens)
captures
in
let rec pop tokens = function
| [] -> tokens
| (ending, scopes) :: stack -> pop ({ scopes; ending } :: tokens) stack
in
pop tokens stack
let get_whiles =
let rec loop acc = function
| [] -> acc
| se :: stack -> (
match se.delim_kind with
| End -> loop acc stack
| While -> loop acc stack)
in
loop []
let rec match_line ~t ~grammar ~stack ~pos ~toks ~line rem_pats =
let len = String.length line in
let scopes, stk_pats, repos, cur_grammar =
match stack with
| [] ->
( [ grammar.scope_name ],
grammar.patterns,
[ grammar.repository ],
grammar )
| se :: _ ->
let d = se.stack_delim in
(se.stack_scopes, d.delim_patterns, se.stack_repos, se.stack_grammar)
in
let rec try_pats repos cur_grammar ~k = function
| [] -> k ()
| Match m :: pats -> (
let match_result =
Oniguruma.match_ m.pattern line pos Oniguruma.Options.none
in
match match_result with
| None -> try_pats repos cur_grammar ~k pats
| Some region ->
let start = Oniguruma.Region.capture_beg region 0 in
let end_ = Oniguruma.Region.capture_end region 0 in
assert (start = pos);
let toks = { scopes; ending = pos } :: toks in
let toks =
handle_captures m.pattern scopes m.name pos end_ region m.captures
toks
in
let toks =
{ scopes = add_scopes scopes [ m.name ]; ending = end_ } :: toks
in
match_line ~t ~grammar ~stack ~pos:end_ ~toks ~line
(next_pats grammar stack))
| Delim d :: pats -> (
let match_result =
Oniguruma.match_ d.delim_begin line pos Oniguruma.Options.none
in
match match_result with
| None -> try_pats repos cur_grammar ~k pats
| Some region -> (
let start = Oniguruma.Region.capture_beg region 0 in
let end_ = Oniguruma.Region.capture_end region 0 in
assert (start = pos);
let toks = { scopes; ending = pos } :: toks in
let toks =
handle_captures d.delim_begin scopes d.delim_name pos end_ region
d.delim_begin_captures toks
in
let toks =
{ scopes = add_scopes scopes [ d.delim_name ]; ending = end_ }
:: toks
in
let se =
{
stack_delim = d;
stack_region = region;
stack_begin_line = line;
stack_repos = repos;
stack_grammar = cur_grammar;
stack_scopes =
add_scopes scopes [ d.delim_name; d.delim_content_name ];
stack_prev_scopes = scopes;
}
in
match d.delim_kind with
| End ->
match_line ~t ~grammar ~stack:(se :: stack) ~pos:end_ ~toks ~line
d.delim_patterns
| While ->
( remove_empties
({
scopes =
add_scopes scopes [ d.delim_name; d.delim_content_name ];
ending = len;
}
:: toks),
se :: stack )))
| Include_scope name :: pats -> (
match find_by_scope_name t name with
| None ->
try_pats repos cur_grammar ~k pats
| Some nested_grammar ->
let k () = try_pats repos cur_grammar ~k pats in
try_pats
[ nested_grammar.repository ]
nested_grammar nested_grammar.patterns ~k)
| Include_base :: pats ->
let k () = try_pats repos cur_grammar ~k pats in
try_pats [ grammar.repository ] grammar grammar.patterns ~k
| Include_self :: pats ->
let k () = try_pats repos cur_grammar ~k pats in
try_pats [ cur_grammar.repository ] cur_grammar cur_grammar.patterns ~k
| Include_local key :: pats -> (
match find_nested key repos with
| None -> error ("Unknown repository key " ^ key ^ ".")
| Some item -> (
match item.repo_item_kind with
| Repo_rule rule ->
try_pats (item.repo_inner :: repos) cur_grammar (rule :: pats) ~k
| Repo_patterns pats' ->
let k () = try_pats repos cur_grammar ~k pats in
try_pats (item.repo_inner :: repos) cur_grammar pats' ~k))
in
let try_delim stack_top stack' ~k =
let delim = stack_top.stack_delim in
let end_match =
let re = match_subst stack_top in
match Oniguruma.match_ re line pos Oniguruma.Options.none with
| None -> None
| Some region ->
let start = Oniguruma.Region.capture_beg region 0 in
let end_ = Oniguruma.Region.capture_end region 0 in
assert (start = pos);
let toks =
{
scopes =
add_scopes stack_top.stack_prev_scopes
[ delim.delim_name; delim.delim_content_name ];
ending = pos;
}
:: toks
in
let toks =
handle_captures re stack_top.stack_prev_scopes delim.delim_name pos
end_ region delim.delim_end_captures toks
in
Some (end_, toks)
in
match (delim.delim_kind, end_match) with
| End, None -> k ()
| End, Some (end_, toks) ->
let toks =
{ scopes = add_scopes scopes [ delim.delim_name ]; ending = end_ }
:: toks
in
match_line ~t ~grammar ~stack:stack' ~pos:end_ ~toks ~line
(next_pats grammar stack')
| While, _ -> error "Unreachable"
in
if pos > len then
match stack with
| [] -> (remove_empties ({ scopes; ending = len } :: toks), stack)
| se :: _stack' ->
let d = se.stack_delim in
( remove_empties
({ scopes = add_scopes scopes [ d.delim_name ]; ending = len }
:: toks),
stack )
else
let k () =
match_line ~t ~grammar:cur_grammar ~stack ~pos:(pos + 1) ~toks ~line
stk_pats
in
match stack with
| [] -> try_pats repos grammar rem_pats ~k
| se :: stack' -> (
match se.stack_delim.delim_kind with
| While -> try_pats repos se.stack_grammar rem_pats ~k
| End ->
if se.stack_delim.delim_apply_end_pattern_last then
try_pats repos se.stack_grammar rem_pats ~k:(fun () ->
try_delim se stack' ~k)
else
try_delim se stack' ~k:(fun () ->
try_pats repos se.stack_grammar rem_pats ~k))
let tokenize_exn t grammar stack line =
let rec try_while_rules pos toks rem_stack = function
| [] -> (toks, pos, rem_stack)
| se :: stack -> (
match se.stack_delim.delim_kind with
| End -> try_while_rules pos toks (se :: rem_stack) stack
| While ->
let rec loop pos' =
if pos' = String.length line then (toks, pos, rem_stack)
else
let re = match_subst se in
match Oniguruma.match_ re line pos' Oniguruma.Options.none with
| None -> loop (pos' + 1)
| Some region ->
let start = Oniguruma.Region.capture_beg region 0 in
let end_ = Oniguruma.Region.capture_end region 0 in
assert (start = pos');
let toks =
{ scopes = se.stack_prev_scopes; ending = pos' } :: toks
in
let toks =
handle_captures re se.stack_prev_scopes
se.stack_delim.delim_name pos' end_ region
se.stack_delim.delim_end_captures toks
in
let toks =
{
scopes =
add_scopes se.stack_prev_scopes
[ se.stack_delim.delim_name ];
ending = end_;
}
:: toks
in
try_while_rules end_ toks (se :: rem_stack) stack
in
loop pos)
in
let toks, pos, stack = try_while_rules 0 [] [] (List.rev stack) in
match_line ~t ~grammar ~stack ~pos ~toks ~line (next_pats grammar stack)