1 /** Ada Lexer.  */
2 module nxt.ada_lexer;
3 
4 import std.typecons;
5 import std.meta;
6 import std.array;
7 import std.algorithm;
8 import std.range;
9 
10 import nxt.lexer;
11 import nxt.ada_defs;
12 import nxt.stringcache;
13 
/// Operator token strings, shared with the Ada definitions module.
private enum operators = ada_defs.operators;

/// Reserved words of Ada 2012 (Ada RM 2.9).
private enum keywords = ada_defs.keywords2012;

/// Token kinds whose text varies per occurrence (unlike fixed operators/keywords).
private enum dynamicTokens = [
    `string`, `number`, `identifier`, `comment`, `whitespace`
    ];
24 
/// Dispatch table consumed by the `Lexer` mixin: each pair maps a leading
/// character sequence to the name of the member function that lexes the
/// token starting with it.  Note only `--` introduces comments in Ada.
private enum pseudoTokenHandlers = [
    `"`, `lexStringLiteral`,
    `0`, `lexNumber`,
    `1`, `lexNumber`,
    `2`, `lexNumber`,
    `3`, `lexNumber`,
    `4`, `lexNumber`,
    `5`, `lexNumber`,
    `6`, `lexNumber`,
    `7`, `lexNumber`,
    `8`, `lexNumber`,
    `9`, `lexNumber`,
    ` `, `lexWhitespace`,
    `\t`, `lexWhitespace`,
    `\r`, `lexWhitespace`,
    `\n`, `lexWhitespace`,
    `--`, `lexComment`,
    ];
43 
/// Token ID type for the Ada lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
46 
47 /**
48  * Function used for converting an IdType to a string.
49  *
50  * Examples:
51  * ---
52  * IdType c = tok!"case";
53  * assert (str(c) == "case");
54  * ---
55  */
56 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
57 
58 /**
59  * Template used to refer to D token types.
60  *
61  * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
62  * values that can be passed to this template.
63  * Example:
64  * ---
65  * import std.d.lexer;
66  * IdType t = tok!"floatLiteral";
67  * ---
68  */
69 public template tok(string token)
70 {
71     alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
72 }
73 
/// Extra members mixed into every `Token` by `lexer.TokenStructure` (this is
/// the source text of a string mixin): the doc comment attached before the
/// token, the trailing comment on the same line, and ordering by source index.
private enum extraFields = q{
    string comment;
    string trailingComment;

    /// Order this token relative to a raw source index (enables binary search).
    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    /// Tokens compare by their position in the source buffer.
    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};
88 
/// The token type produced by the Ada lexer.
public alias Token = lexer.TokenStructure!(IdType, extraFields);
91 
92 /**
93  * Lexer configuration struct
94  */
95 public struct LexerConfig
96 {
97     string fileName;
98 }
99 
100 /**
101  * Returns: an array of tokens lexed from the given source code to the output range. All
102  * whitespace tokens are skipped and comments are attached to the token nearest
103  * to them.
104  */
105 const(Token)[] getTokensForParser(ubyte[] sourceCode, const LexerConfig config,
106                                   StringCache* cache)
107 {
108 //	import std.stdio;
109     enum CommentType : ubyte
110     {
111         notDoc,
112         line,
113         block
114 	}
115 
116     static CommentType commentType(string comment) pure nothrow @safe
117     {
118         if (comment.length < 3)
119             return CommentType.notDoc;
120         if (comment[0 ..3] == "///")
121             return CommentType.line;
122         if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
123             return CommentType.block;
124         return CommentType.notDoc;
125     }
126 
127     auto output = appender!(typeof(return))();
128     auto lexer = AdaLexer(sourceCode, config, cache);
129     string blockComment;
130     size_t tokenCount;
131     while (!lexer.empty)
132     {
133         switch (lexer.front.type)
134         {
135             case tok!"whitespace":
136                 lexer.popFront();
137             break;
138             case tok!"comment":
139                 final switch (commentType(lexer.front.text))
140             {
141                 case CommentType.block:
142                     blockComment = lexer.front.text;
143                     lexer.popFront();
144                     break;
145                 case CommentType.line:
146                     if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
147                     {
148                         // writeln("attaching comment");
149                         (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
150                     }
151                     else
152                     {
153                         blockComment = cache.intern(blockComment.length == 0 ? lexer.front.text
154                                                     : blockComment ~ "\n" ~ lexer.front.text);
155                     }
156                     lexer.popFront();
157                     break;
158                 case CommentType.notDoc:
159                     lexer.popFront();
160                     break;
161             }
162                 break;
163             default:
164                 Token t = lexer.front;
165                 lexer.popFront();
166                 tokenCount++;
167                 t.comment = blockComment;
168                 blockComment = null;
169                 output.put(t);
170                 break;
171         }
172     }
173 
174     return output.data;
175 }
176 
177 /**
178  * The Ada lexer.
179  */
180 public struct AdaLexer
181 {
182     import core.vararg;
183 
184     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
185                  keywords, pseudoTokenHandlers);
186 
187     @disable this();
188 
189     /**
190      * Params:
191      *     range = the bytes that compose the source code that will be lexed.
192      *     config = the lexer configuration to use.
193      *     cache = the string interning cache for de-duplicating identifiers and
194      *         other token text.
195      */
196     this(ubyte[] range, const LexerConfig config, StringCache* cache)
197     {
198         this.range = LexerRange(range);
199         this.config = config;
200         this.cache = cache;
201         popFront();
202     }
203 
204     public void popFront() pure
205     {
206         _popFront();
207     }
208 
209     bool isWhitespace() pure const nothrow
210     {
211         switch (range.front)
212         {
213             case ' ':
214             case '\r':
215             case '\n':
216             case '\t':
217                 return true;
218             case 0xe2:
219                 auto peek = range.peek(2);
220                 return peek.length == 2
221                 && peek[0] == 0x80
222                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
223             default:
224                 return false;
225         }
226     }
227 
228     void popFrontWhitespaceAware() pure nothrow
229     {
230         switch (range.front)
231         {
232             case '\r':
233                 range.popFront();
234                 if (!range.empty && range.front == '\n')
235                 {
236                     range.popFront();
237                     range.incrementLine();
238                 }
239                 else
240                 range.incrementLine();
241                 return;
242             case '\n':
243                 range.popFront();
244                 range.incrementLine();
245                 return;
246             case 0xe2:
247                 auto lookahead = range.peek(3);
248                 if (lookahead.length == 3 && lookahead[1] == 0x80
249                     && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
250                 {
251                     range.popFront();
252                     range.popFront();
253                     range.popFront();
254                     range.incrementLine();
255                     return;
256                 }
257                 else
258                 {
259                     range.popFront();
260                     return;
261                 }
262             default:
263                 range.popFront();
264                 return;
265         }
266     }
267 
268     /// https://en.wikibooks.org/wiki/Ada_Programming/Lexical_elements#String_literals
269     Token lexStringLiteral() pure nothrow @safe
270     {
271         mixin (tokenStart);
272         ubyte quote = range.front;
273         range.popFront();
274         while (true)
275         {
276             if (range.empty)
277                 return Token(tok!"", null, 0, 0, 0);
278             if (range.front == '\\')
279             {
280                 range.popFront();
281                 if (range.empty)
282                     return Token(tok!"", null, 0, 0, 0);
283                 range.popFront();
284             }
285             else if (range.front == quote)
286             {
287                 range.popFront();
288                 break;
289             }
290             else
291             range.popFront();
292         }
293         return Token(tok!"string", cache.intern(range.slice(mark)), line,
294                      column, index);
295     }
296 
297     Token lexWhitespace() pure nothrow @safe
298     {
299         import std.ascii: isWhite;
300         mixin (tokenStart);
301         while (!range.empty && isWhite(range.front))
302             range.popFront();
303         string text = cache.intern(range.slice(mark));
304         return Token(tok!"whitespace", text, line, column, index);
305     }
306 
307     void lexExponent() pure nothrow @safe
308     {
309         range.popFront();
310         bool foundSign = false;
311         bool foundDigit = false;
312         while (!range.empty)
313         {
314             switch (range.front)
315             {
316                 case '-':
317                 case '+':
318                     if (foundSign)
319                         return;
320                     foundSign = true;
321                     range.popFront();
322                     break;
323                 case '0': .. case '9':
324                     foundDigit = true;
325                     range.popFront();
326                     break;
327                 default:
328                     return;
329             }
330         }
331     }
332 
333     Token lexNumber() pure nothrow
334     {
335         mixin (tokenStart);
336         bool foundDot = range.front == '.';
337         if (foundDot)
338             range.popFront();
339     decimalLoop: while (!range.empty)
340         {
341             switch (range.front)
342             {
343                 case '0': .. case '9':
344                     range.popFront();
345                     break;
346                 case 'e':
347                 case 'E':
348                     lexExponent();
349                     break decimalLoop;
350                 case '.':
351                     if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
352                         break decimalLoop;
353                     else
354                     {
355                         // The following bit of silliness tries to tell the
356                         // difference between "int dot identifier" and
357                         // "double identifier".
358                         if (range.canPeek(1))
359                         {
360                             switch (range.peekAt(1))
361                             {
362                                 case '0': .. case '9':
363                                     goto doubleLiteral;
364                                 default:
365                                     break decimalLoop;
366                             }
367                         }
368                         else
369                         {
370                         doubleLiteral:
371                             range.popFront();
372                             foundDot = true;
373                         }
374                     }
375                     break;
376                 default:
377                     break decimalLoop;
378             }
379         }
380         return Token(tok!"number", cache.intern(range.slice(mark)),
381                      line, column, index);
382     }
383 
384     Token lexComment() pure
385     {
386         mixin (tokenStart);
387         IdType type = tok!"comment";
388         range.popFrontN(2);
389         while (!range.empty)
390         {
391             if (range.front == '*')
392             {
393                 range.popFront();
394                 if (!range.empty && range.front == '/')
395                 {
396                     range.popFront();
397                     break;
398                 }
399             }
400             else
401             popFrontWhitespaceAware();
402         }
403     end:
404         return Token(type, cache.intern(range.slice(mark)), line, column,
405                      index);
406     }
407 
408     Token lexSlashSlashComment() pure nothrow
409     {
410         mixin (tokenStart);
411         IdType type = tok!"comment";
412         range.popFrontN(2);
413         while (!range.empty)
414         {
415             if (range.front == '\r' || range.front == '\n')
416                 break;
417             range.popFront();
418         }
419     end:
420         return Token(type, cache.intern(range.slice(mark)), line, column,
421                      index);
422     }
423 
424     Token lexIdentifier() pure nothrow
425     {
426         import std.stdio;
427         mixin (tokenStart);
428         uint hash = 0;
429         if (isSeparating(0) || range.empty)
430         {
431             error("Invalid identifier");
432             range.popFront();
433         }
434         while (!range.empty && !isSeparating(0))
435         {
436             hash = StringCache.hashStep(range.front, hash);
437             range.popFront();
438         }
439         return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
440                      column, index);
441     }
442 
443     bool isNewline() pure @safe nothrow
444     {
445         if (range.front == '\n') return true;
446         if (range.front == '\r') return true;
447         return (range.front & 0x80) && range.canPeek(2)
448         && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
449     }
450 
451     bool isSeparating(size_t offset) pure nothrow @safe
452     {
453         if (!range.canPeek(offset)) return true;
454         auto c = range.peekAt(offset);
455         if (c >= 'A' && c <= 'Z') return false;
456         if (c >= 'a' && c <= 'z') return false;
457         if (c <= 0x2f) return true;
458         if (c >= ':' && c <= '@') return true;
459         if (c >= '[' && c <= '^') return true;
460         if (c >= '{' && c <= '~') return true;
461         if (c == '`') return true;
462         if (c & 0x80)
463         {
464             auto r = range;
465             range.popFrontN(offset);
466             return (r.canPeek(2) && (r.peek(2) == "\u2028"
467                                      || r.peek(2) == "\u2029"));
468         }
469         return false;
470     }
471 
472     enum tokenStart = q{
473         size_t index = range.index;
474         size_t column = range.column;
475         size_t line = range.line;
476         auto mark = range.mark();
477     };
478 
479     void error(string message) pure nothrow @safe
480     {
481         messages ~= Message(range.line, range.column, message, true);
482     }
483 
484     void warning(string message) pure nothrow @safe
485     {
486         messages ~= Message(range.line, range.column, message, false);
487         assert (messages.length > 0);
488     }
489 
490     struct Message
491     {
492         size_t line;
493         size_t column;
494         string message;
495         bool isError;
496     }
497 
498     Message[] messages;
499     StringCache* cache;
500     LexerConfig config;
501 }
502 
/// Returns: a token range over `range`, using a freshly allocated string
/// cache and the default configuration.
public auto byToken(ubyte[] range)
{
    auto cache = new StringCache(StringCache.defaultBucketCount);
    return AdaLexer(range, LexerConfig(), cache);
}
509 
/// Returns: a token range over `range` sharing the caller-supplied string
/// cache, with the default configuration.
public auto byToken(ubyte[] range, StringCache* cache)
{
    return AdaLexer(range, LexerConfig(), cache);
}
515 
/// Returns: a token range over `range` using the given configuration and
/// caller-supplied string cache.
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    return AdaLexer(range, config, cache);
}
520 
/// `X;` lexes to an identifier followed by a semicolon.
unittest
{
    auto cache = new StringCache(StringCache.defaultBucketCount);
    const toks = getTokensForParser(cast(ubyte[])`X;`, LexerConfig(), cache);
    assert(toks.map!(t => t.type)
               .equal([tok!`identifier`, tok!`;`]));
}
528 
/// A simple assignment lexes to identifier, `=`, string literal, semicolon.
unittest
{
    auto cache = new StringCache(StringCache.defaultBucketCount);
    const toks = getTokensForParser(cast(ubyte[])`x = "a";`, LexerConfig(), cache);
    assert(toks.map!(t => t.type)
               .equal([tok!`identifier`, tok!`=`, tok!`string`, tok!`;`]));
}