package catala

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file lexer_common.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
(* This file is part of the Catala compiler, a specification language for tax
   and social benefits computation rules. Copyright (C) 2020 Inria,
   contributors: Denis Merigoux <denis.merigoux@inria.fr>, Emile Rolley
   <emile.rolley@tuta.io>

   Licensed under the Apache License, Version 2.0 (the "License"); you may not
   use this file except in compliance with the License. You may obtain a copy of
   the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   License for the specific language governing permissions and limitations under
   the License. *)

open Tokens
open Sedlexing
open Catala_utils
module R = Re.Pcre

(* Computes the precedence of a law heading from {!val: matched_regex}, the run
   of '#' characters (of the form '[#]+') that opens the heading.

   @note -1 because a [LAW_HEADING] starts with at least one "#"; the number of
   '#' remaining after that first one is the precedence ("#" gives 0, "##"
   gives 1, and so on). *)
let calc_precedence (matched_regex : string) : int =
  let hash_count = String.length matched_regex in
  hash_count - 1

(* Builds the [LAW_HEADING] token from the lexeme currently matched in
   {!val: lexbuf}.

   The lexeme is split with a regular expression whose groups are: 1 = the
   leading run of '#', 2 = the title (everything up to an optional '|'), 4 = the
   article identifier following '|', 6 = the literal "[archive]" marker. The
   title and the identifier are trimmed; the precedence is derived from group 1
   by {!val: calc_precedence}.

   NOTE(review): a lexeme that the regular expression does not match is not
   handled here; whatever [R.exec] raises in that case propagates to the
   caller. *)
let get_law_heading (lexbuf : lexbuf) : token =
  let heading_rex =
    R.regexp "([#]+)\\s*([^\\|]+)(\\|\\s*([^\\s]+)|)(\\s*(\\[archive\\])|)"
  in
  let groups = R.exec ~rex:heading_rex (Utf8.lexeme lexbuf) in
  let trimmed_group n = String.trim (R.get_substring groups n) in
  let title = trimmed_group 2 in
  (* The identifier is optional: an unmatched group means "no identifier". *)
  let article_id =
    match trimmed_group 4 with
    | id -> Some id
    | exception Not_found -> None
  in
  let is_archive =
    match Re.Group.get_opt groups 6 with Some _ -> true | None -> false
  in
  let precedence = calc_precedence (trimmed_group 1) in
  LAW_HEADING (title, article_id, is_archive, precedence)

(** Lexing modes stored in {!val:context}. The state starts as [Inactive], and
    {!val:with_lexing_context} switches it to [Law] for the duration of a
    lexing run. NOTE(review): the other constructors are presumably selected by
    the language-specific lexers (code blocks, raw blocks, directives and their
    arguments) -- confirm against those lexers, which are not in this file. *)
type lexing_context = Law | Raw | Code | Directive | Directive_args | Inactive

(** Reference used by the lexer as the mutable state to distinguish whether it
    is lexing code or law. It starts as [Inactive]; {!val:with_lexing_context}
    sets it to [Law] while its callback runs and puts the previous value back
    afterwards. *)
let context : lexing_context ref = ref Inactive

(** Pair of positions attached to the current {!val:context}. It holds dummy
    positions until {!val:with_lexing_context} resets both components to the
    beginning of the file being lexed; that function also uses this pair as the
    location of its "Unclosed block" error. NOTE(review): presumably refreshed
    by the lexers whenever the context changes -- not visible in this file. *)
let context_start_pos : (Lexing.position * Lexing.position) ref =
  ref (Lexing.dummy_pos, Lexing.dummy_pos)

(** Mutable reference to the optional buffer that accumulates the string
    representation of the body of code being lexed. This string representation
    is used in the literate programming backends to faithfully capture the
    spacing pattern of the original program.

    The reference holds [None] until {!val:with_lexing_context} installs a
    fresh buffer; {!val:update_acc} appends to that buffer and {!val:flush_acc}
    empties it. *)
let code_buffer : Buffer.t option ref = ref None

(** [with_lexing_context filename f] runs [f ()] with a fresh lexer state and
    returns its result.

    Before the call, {!val:context} is set to [Law], {!val:context_start_pos}
    to a pair of positions at the very beginning of [filename], and
    {!val:code_buffer} to a new empty buffer. When [f] finishes, whether
    normally or with an exception, a delayed lexing error is emitted if the
    context is not back to [Law] or if the buffer still holds text; the three
    references are then restored to the values they had on entry. *)
let with_lexing_context filename f =
  (* Snapshot the global lexer state so that it can be put back on exit. *)
  let previous_context = !context
  and previous_start_pos = !context_start_pos
  and previous_buffer = !code_buffer in
  let file_start =
    { Lexing.pos_fname = filename; pos_lnum = 0; pos_bol = 0; pos_cnum = 0 }
  in
  context := Law;
  context_start_pos := (file_start, file_start);
  code_buffer := Some (Buffer.create 4000);
  let finalise () =
    (* Text left in the buffer means a block was opened and never flushed. *)
    let pending_code =
      match !code_buffer with
      | Some buf -> Buffer.length buf > 0
      | None -> false
    in
    if !context <> Law || pending_code then
      Message.delayed_error ~kind:Lexing ()
        ~pos:(Pos.from_lpos !context_start_pos)
        "Unclosed block or missing newline at the end of file.@ Did you \
         forget a @{<yellow>```@} delimiter ?";
    context := previous_context;
    context_start_pos := previous_start_pos;
    code_buffer := previous_buffer
  in
  Fun.protect ~finally:finalise f

(** Appends the lexeme currently matched in [lexbuf] to {!val:code_buffer}.
    Emits an internal error when no buffer is installed, i.e. when
    {!val:code_buffer} holds [None]. *)
let update_acc (lexbuf : lexbuf) : unit =
  let append buf = Buffer.add_string buf (Utf8.lexeme lexbuf) in
  match !code_buffer with
  | Some buf -> append buf
  | None ->
    Message.error ~internal:true "Lexer update outside of a lexing context"

(** Returns the text accumulated so far in {!val:code_buffer} and empties the
    buffer. Emits an internal error when no buffer is installed, i.e. when
    {!val:code_buffer} holds [None]. *)
let flush_acc () =
  match !code_buffer with
  | Some buf ->
    let accumulated = Buffer.contents buf in
    Buffer.clear buf;
    accumulated
  | None ->
    Message.error ~internal:true "Lexer update outside of a lexing context"

(** Lexing failure, carrying a source position together with a string (the
    helper below passes the string it receives as [token]). *)
exception Lexing_error of (Pos.t * string)

(** Error-generating helper: raises {!exception:Lexing_error} with [loc] and
    [token] as its payload. Never returns. *)
let raise_lexer_error (loc : Pos.t) (token : string) =
  let payload = loc, token in
  raise (Lexing_error payload)

(** Associative list matching each punctuation string part of the Catala syntax
    with its {!module: Surface.Parser} token. Same for all the input languages
    (English, French, etc.).

    Every string appears at most once, so the list can be searched in either
    direction (string to token or token to string) without ambiguity. Square
    brackets map to [LBRACKET]/[RBRACKET] and curly braces to
    [LBRACE]/[RBRACE].

    NOTE(review): there is no entry for "<" although "<=", ">=" and ">" are
    listed -- confirm whether this omission is intentional. *)
let token_list_language_agnostic : (string * token) list =
  [
    ".", DOT;
    "<=", LESSER_EQUAL KPoly;
    ">=", GREATER_EQUAL KPoly;
    ">", GREATER KPoly;
    "!=", NOT_EQUAL;
    "=", EQUAL;
    "(", LPAREN;
    ")", RPAREN;
    "{", LBRACE;
    "}", RBRACE;
    (* Brackets are the square delimiters; they used to be keyed by "{" and
       "}", which duplicated the brace entries above. *)
    "[", LBRACKET;
    "]", RBRACKET;
    "+", PLUS KPoly;
    "-", MINUS KPoly;
    "*", MULT KPoly;
    "/", DIV KPoly;
    ":", COLON;
    ";", SEMICOLON;
    "--", ALT;
    "++", PLUSPLUS;
  ]

(** Tokens produced by the line-oriented lexer ([lex_line] in
    {!modtype:LocalisedLexer}). Each constructor classifies one line of input;
    the comment next to it shows the kind of line it stands for. *)
type line_token =
  | LINE_INLINE_TEST (* ```catala-test-cli *)
  | LINE_BLOCK_END (* ``` *)
  | LINE_INCLUDE of string (* > Include foo.catala_en *)
  | LINE_MODULE_DEF of string * bool (* > Module Xxx [external] *)
  | LINE_MODULE_USE of string (* > Using Xxx [as Yyy] *)
  | LINE_TEST_ATTRIBUTE (* any line containing a #[test] attribute *)
  | LINE_ANY (* anything else *)

(** Signature implemented by each language-specific lexer (one per input
    language). It exposes the language's token table, small lexers for builtin
    names, the law/code lexing functions and a line-level lexer. *)
module type LocalisedLexer = sig
  val token_list : (string * Tokens.token) list
  (** Same as {!val: token_list_language_agnostic}, but with tokens specialized
      to a given language. *)

  val lex_builtin : string -> Ast.builtin_expression option
  (** Simple lexer for builtins (from an lident). Returns [None] when the
      string is not a builtin. *)

  val lex_primitive_type : string -> Ast.primitive_typ option
  (** Simple lexer for builtin primitive types (from an lident). Returns [None]
      when the string is not a primitive type name. *)

  val lex_builtin_constr : string -> Ast.builtin_constr option
  (** Simple lexer for builtin constructors (from an uident). Returns [None]
      when the string is not a builtin constructor. *)

  val lex_code : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used in code blocks *)

  val lex_law : Sedlexing.lexbuf -> Tokens.token
  (** Main lexing function used outside code blocks *)

  val lexer : Sedlexing.lexbuf -> Tokens.token
  (** Entry point of the lexer, distributes to {!val: lex_code} or
      {!val:lex_law} depending on the current
      {!val:Surface.Lexer_common.context}. *)

  val lex_line :
    context:[ `Law | `Code | `Test | `Raw ] ref ->
    Sedlexing.lexbuf ->
    (string * line_token) option
  (** Low-level lexer intended for dependency extraction. The lexing mode is
      carried by the caller-supplied [context] reference rather than by the
      global {!val:Surface.Lexer_common.context}. NOTE(review): the result is
      presumably the text of the line paired with its classification, and
      [None] at end of input -- confirm against an implementation. *)
end