Input.ml

(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*   Copyright Inria. All rights reserved. This file is distributed under    *)
(*   the terms of the GNU Library General Public License version 2, with a   *)
(*   special exception on linking, as described in the file LICENSE.         *)
(*                                                                            *)
(******************************************************************************)

open Lexing

module V = MiniVector

type 'token input = {
  lexer: lexbuf -> 'token;
    (** The lexer. *)
  lexbuf: lexbuf;
    (** The lexing buffer. *)
  mutable lookahead_token: 'token;
    (** The first unconsumed token may or may not have been obtained already
        from the lexer. This field should have type ['token option]. To
        avoid allocating an option, we use the address of the [input] object
        itself as a sentinel value, standing for [None]. (Cute, eh?)
        (Danger!) We save 2 words of memory per token, and about 2% in time,
        it seems. *)
  startpos: position V.vector;
    (** A vector of the start positions of the tokens that we have
        consumed. *)
  endpos: position V.vector;
    (** A vector of the end positions of the tokens that we have
        consumed. *)
  initp: position;
    (** The initial position. This position is obtained from the lexing
        buffer at the very beginning. *)
}

let create lexer lexbuf =
  let lookahead_token = Obj.magic () (* overwritten below *)
  and startpos = V.create()
  and endpos = V.create()
  and initp = lexbuf.lex_curr_p in
  let input = { lexer; lexbuf; lookahead_token; startpos; endpos; initp } in
  input.lookahead_token <- Obj.magic input;
  input

let[@inline] consume input =
  (* It would be surprising if the parser consumed a token without having
     examined it first. (An LR parser will never do this.) So, we consider
     this situation an error. *)
  assert (input.lookahead_token != Obj.magic input);
  (* Push the positions of the current lookahead symbol (which we are about
     to consume) onto the position vectors. *)
  V.push input.startpos input.lexbuf.lex_start_p;
  V.push input.endpos input.lexbuf.lex_curr_p;
  (* Record the fact that we have not yet read the next token. *)
  input.lookahead_token <- Obj.magic input

let[@inline] lookahead input =
  let token = input.lookahead_token in
  if token != Obj.magic input then
    token
  else
    let token = input.lexer input.lexbuf in
    input.lookahead_token <- token;
    token
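(* As an illustration (not part of the original file), here is a minimal
   sketch of a driver loop over the interface above. The predicate [is_eof]
   is hypothetical: it recognizes the end-of-file token of the client's
   token type. The driver peeks at the next token via [lookahead], then
   commits to it via [consume]; thus, every token is examined before it is
   consumed, as the assertion in [consume] demands. *)

let _count_tokens lexer lexbuf ~is_eof =
  let input = create lexer lexbuf in
  (* Count the tokens that precede the end-of-file token. *)
  let rec loop n =
    if is_eof (lookahead input) then n
    else begin consume input; loop (n + 1) end
  in
  loop 0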
(* -------------------------------------------------------------------------- *)

(* A note on keeping track of positions. *)

(* Every node in the GSS carries a date, that is, an offset into the input,
   measured in number of tokens. A date represents a position *between* two
   tokens. *)

(* It would make sense to give the user access to dates, perhaps via new
   keywords that could be used in the semantic actions. *)

(* For the moment, for historical reasons, the user does not have access to
   dates. Instead, the user has access to Menhir's traditional position
   keywords, such as [$startpos], [$endpos], [$startpos(x)], etc. *)

(* There is an unfortunate mismatch between the concept of position and the
   concept of date. Indeed, because all positions are originally produced by
   the lexer, every position must be the start position or the end position
   of some token. Thus, in between two tokens, we have *two* possible
   positions: the end position of the first token, and the start position of
   the second token. This creates a problem in the translation of a date to
   a position: which of the two positions should be used? A somewhat
   sensible approach, which we adopt, is to use the end position of the
   first token if the date is supposed to represent an end date, and to use
   the start position of the second token if the date is supposed to
   represent a start date. *)

(* This approach still has a potentially surprising property. The empty date
   interval [i, i) is translated to an interval [pos1, pos2) where the
   position [pos2] possibly lies leftwards of the position [pos1]. To avoid
   this problem, perhaps a good rule of thumb is to avoid using the keywords
   [$startpos] and [$endpos] inside an epsilon production, and to avoid
   using the keywords [$startpos(x)] and [$endpos(x)] when the symbol
   denoted by [x] is nullable. Another idea is to just be aware of this
   problem and to exchange the positions [pos1] and [pos2] in this case. *)

(* In this module, the vectors [startpos] and [endpos] record the start
   position and end position of the input symbols that have been consumed.
   These vectors serve as a basis in the translation of dates to
   positions. *)

type date = int

let[@inline] end_position input date =
  let endpos = input.endpos in
  assert (0 <= date && date <= V.length endpos);
  if 0 < date then
    V.unsafe_get endpos (date - 1)
  else
    (* This case is problematic, but should arise only if we are asked about
       the end position of an epsilon production or of a nullable symbol. *)
    (* In this case, instead of the end position of the token at date [-1],
       which does not exist, we return the initial position of the input
       stream. *)
    input.initp

let[@inline] start_position input date =
  let startpos = input.startpos in
  assert (0 <= date && date <= V.length startpos);
  if date < V.length startpos then
    V.unsafe_get startpos date
  else
    (* This case is problematic, but should arise only if we are asked about
       the start position of an epsilon production or of a nullable symbol. *)
    (* In this case, instead of the start position of the token at [date],
       which does not exist, we return the end position of the previous
       token. *)
    end_position input date
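(* As an illustration (not part of the original file), the hypothetical
   helper below shows how a date interval is translated to a pair of
   positions using the two functions above. A nonempty interval [i, j) over
   the consumed tokens is mapped to the start position of token [i] and the
   end position of token [j-1]. Beware: for an empty interval [i, i), the
   two resulting positions may be "crossed", as discussed above. *)

let _interval_positions input (i : date) (j : date) : position * position =
  assert (0 <= i && i <= j);
  (start_position input i, end_position input j)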