1 // Written in the D programming language
2 
3 /**
4  * $(H2 Summary)
5  * This module contains a range-based compile-time _lexer generator.
6  *
7  * $(H2 Overview)
8  * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
9  * several helper templates for generating such things as token identifiers.
10  *
11  * To write a _lexer using this API:
12  * $(OL
13  *     $(LI Create the string array constants for your language.
14  *         $(UL
15  *             $(LI $(LINK2 #.staticTokens, staticTokens))
16  *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
17  *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
18  *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
19  *         ))
20  *     $(LI Create aliases for the various token and token identifier types
21  *         specific to your language.
22  *         $(UL
23  *             $(LI $(LREF TokenIdType))
24  *             $(LI $(LREF tokenStringRepresentation))
25  *             $(LI $(LREF TokenStructure))
26  *             $(LI $(LREF TokenId))
27  *         ))
28  *     $(LI Create a struct that mixes in the Lexer template mixin and
29  *         implements the necessary functions.
30  *         $(UL
31  *             $(LI $(LREF Lexer))
32  *         ))
33  * )
34  * Examples:
35  * $(UL
36  * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
37  * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
38  * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
39  * )
40  * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
41  * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
45  * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
46  * $(DD A function that is able to determine if an identifier/keyword has come
47  *     to an end. This function must return bool and take a single size_t
48  *     argument representing the number of bytes to skip over before looking for
49  *     a separating character.)
50  * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
51  * $(DD A listing of the tokens whose exact value never changes and which cannot
52  *     possibly be a token handled by the default token lexing function. The
53  *     most common example of this kind of token is an operator such as
54  *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
55  * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
56  * $(DD A listing of tokens whose value is variable, such as whitespace,
57  *     identifiers, number literals, and string literals.)
58  * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
61  *     a keyword such as $(D_STRING "for"), which looks like the beginning of
62  *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
63  *     called to determine if the character after the $(D_STRING 'r') separates
64  *     the identifier, indicating that the token is $(D_STRING "for"), or if
65  *     lexing should be turned over to the $(B defaultTokenFunction).)
66  * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-indexed elements of this array,
 *     and then call the function whose name is the element immediately after
 *     the matched element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
72  * )
73  *
74  * Here are some example constants for a simple calculator _lexer:
75  * ---
 * // There are a nearly infinite number of valid number literals, so numbers are
77  * // dynamic tokens.
78  * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
79  *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens.
82  * enum string[] staticTokens = ["-", "+", "*", "/"];
83  *
84  * // In this simple example there are no keywords or other tokens that could
85  * // look like dynamic tokens, so this is blank.
86  * enum string[] possibleDefaultTokens = [];
87  *
88  * // If any whitespace character or digit is encountered, pass lexing over to
89  * // our custom handler functions. These will be demonstrated in an example
90  * // later on.
91  * enum string[] tokenHandlers = [
92  *     "0", "lexNumber",
93  *     "1", "lexNumber",
94  *     "2", "lexNumber",
95  *     "3", "lexNumber",
96  *     "4", "lexNumber",
97  *     "5", "lexNumber",
98  *     "6", "lexNumber",
99  *     "7", "lexNumber",
100  *     "8", "lexNumber",
101  *     "9", "lexNumber",
102  *     " ", "lexWhitespace",
103  *     "\n", "lexWhitespace",
104  *     "\t", "lexWhitespace",
105  *     "\r", "lexWhitespace"
106  * ];
107  * ---
108  *
109  * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
111  * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
112  * Source: $(PHOBOSSRC std/experimental/_lexer.d)
113  */
114 
115 module std.experimental.lexer;
116 
117 /**
 * Template for determining the type used for a token's type identifier.
 *
 * Selects the smallest unsigned integral type that is able to hold the value
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length + 1,
 * where the extra slot accounts for the end-of-input token. For example, if
 * there are 20 static tokens, 30 dynamic tokens, and 10 possible default
 * tokens, this template will alias itself to ubyte, as 20 + 30 + 10 + 1 <
 * $(D_KEYWORD ubyte).max.
125  * Examples:
126  * ---
127  * // In our calculator example this means that IdType is an alias for ubyte.
128  * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
129  * ---
130  */
131 template TokenIdType(alias staticTokens, alias dynamicTokens,
132     alias possibleDefaultTokens)
133 {
134     immutable tokenCount = staticTokens.length + dynamicTokens.length
135         + possibleDefaultTokens.length + 1;
136     static if (tokenCount <= ubyte.max)
137         alias TokenIdType = ubyte;
138     else static if (tokenCount <= ushort.max)
139         alias TokenIdType = ushort;
140     else static if (tokenCount <= uint.max)
141         alias TokenIdType = uint;
142     else
143         static assert (false, "The number of tokens must be less than uint.max");
144 }
145 
146 /**
147  * Looks up the string representation of the given token type.
148  *
 * This is the inverse of the $(LREF TokenId) template.
150  * Params: type = the token type identifier
151  * Examples:
152  * ---
 * alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
154  * assert (str(tok!"*") == "*");
155  * ---
156  * See_Also: $(LREF TokenId)
157  */
158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens,
159     alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe
160 {
    // Concatenate the token arrays in a helper so that the (allocating)
    // concatenation happens once, at compile time; the cast to immutable is
    // valid because the compile-time arrays are never mutated afterwards.
162     static auto f() pure nothrow @trusted
163     {
164         return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens;
165     }
166 
167     static immutable tokens = f();
168 
169     if (type == 0)
170         return "!ERROR!";
171     else if (type < tokens.length + 1)
172         return tokens[type - 1];
173     else
174         return null;
175 }
176 
177 unittest
178 {
179     alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
180     enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);
181     alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]);
182 
183     static assert (str(tok!"foo") == "foo");
184     static assert (str(tok!"bar") == "bar");
185     static assert (str(tok!"doo") == "doo");
186 }
187 
188 /**
189  * Generates the token type identifier for the given symbol.
190  *
191  * There are two special cases:
192  * $(UL
193  *     $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
194  *     $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum
195  *         valid token type identifier)
196  * )
197  * In all cases this template will alias itself to a constant of type IdType.
198  * This template will fail at compile time if $(D_PARAM symbol) is not one of
199  * the staticTokens, dynamicTokens, or possibleDefaultTokens.
200  * Examples:
201  * ---
202  * template tok(string symbol)
203  * {
204  *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
205  *         possibleDefaultTokens, symbol);
206  * }
207  * // num and plus are of type ubyte.
208  * IdType plus = tok!"+";
209  * IdType num = tok!"numberLiteral";
210  * ---
211  */
212 template TokenId(IdType, alias staticTokens, alias dynamicTokens,
213     alias possibleDefaultTokens, string symbol)
214 {
215     enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;
216 
    import std.algorithm : countUntil;
218     static if (symbol == "")
219     {
220         enum id = 0;
221         alias TokenId = id;
222     }
223     else static if (symbol == "\0")
224     {
225         enum id = 1 + tokens.length;
226         alias TokenId = id;
227     }
228     else
229     {
230         enum i = tokens.countUntil(symbol);
231         static if (i != -1)
232         {
233             enum id = i + 1;
234             static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
235             alias TokenId = id;
236         }
237         else
238             static assert (0, "Invalid token: " ~ symbol);
239     }
240 }
241 
242 /**
243  * The token that is returned by the lexer.
244  * Params:
 *     IdType = The type of the token type identifier (i.e. of the $(B type)
 *         field, see $(LREF TokenIdType)).
246  *     extraFields = A string containing D code for any extra fields that should
247  *         be included in the token structure body. This string is passed
248  *         directly to a mixin statement.
249  * Examples:
250  * ---
251  * // No extra struct fields are desired in this example, so leave it blank.
252  * alias Token = TokenStructure!(IdType, "");
253  * Token minusToken = Token(tok!"-");
254  * ---
255  */
256 struct TokenStructure(IdType, string extraFields = "")
257 {
258 public pure nothrow @safe @nogc:
259 
260     bool opEquals(ref const typeof(this) other) const
261     {
262         return this.type == other.type && this.text == other.text;
263     }
264 
265     /**
     * Returns: true if the token has the given type, false otherwise.
267      */
268     bool opEquals(IdType type) const
269     {
270         return this.type == type;
271     }
272 
273     /**
274      * Constructs a token from a token type.
275      * Params: type = the token type
276      */
277     this(IdType type)
278     {
279         this.type = type;
280     }
281 
282     /**
283      * Constructs a token.
284      * Params:
285      *     type = the token type
286      *     text = the text of the token, which may be null
287      *     line = the line number at which this token occurs
288      *     column = the column number at which this token occurs
289      *     index = the byte offset from the beginning of the input at which this
290      *         token occurs
291      */
292     this(IdType type, string text, size_t line, size_t column, size_t index)
293     {
294         this.text = text;
295         this.line = line;
296         this.column = column;
297         this.type = type;
298         this.index = index;
299     }
300 
301     /**
302      * The _text of the token.
303      */
304     string text;
305 
306     /**
307      * The _line number at which this token occurs.
308      */
309     size_t line;
310 
311     /**
312      * The _column number at which this token occurs. This is measured in bytes
313      * and may not be correct when tab characters are involved.
314      */
315     size_t column;
316 
317     /**
318      * The byte offset from the beginning of the input at which this token
319      * occurs.
320      */
321     size_t index;
322 
323     /**
324      * The token type.
325      */
326     IdType type;
327 
328     mixin (extraFields);
329 }
330 
331 /**
332  * The implementation of the _lexer is contained within this mixin template.
333  *
334  * To use it, this template should be mixed in to a struct that represents the
335  * _lexer for your language. This struct should implement the following methods:
336  * $(UL
337  *     $(LI popFront, which should call this mixin's _popFront() and
338  *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment
 *         or whitespace tokens.)
341  *     $(LI A function that serves as the default token lexing function. For
342  *         most languages this will be the identifier lexing function. This
343  *         should then be passed to the $(LREF Lexer) template mixin as the
344  *         $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
345  *         parameter.)
346  *     $(LI A function that is able to determine if an identifier/keyword has
347  *         come to an end. This function must return $(D_KEYWORD bool) and take
348  *         a single $(D_KEYWORD size_t) argument representing the number of
349  *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
 *         These functions must be marked $(D_KEYWORD pure nothrow) and take a
 *         single $(D_KEYWORD ref) token argument, which they fill in with the
 *         lexed token.)
353  *     $(LI A constructor that initializes the range field as well as calls
354  *         popFront() exactly once (to initialize the _front field).)
355  * )
356  * Params:
357  *     Token = $(LREF TokenStructure)
358  *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
359  *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
360  *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
361  *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
362  *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
363  *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
364  * Examples:
365  * ---
366  * struct CalculatorLexer
367  * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
370  *
371  *     this (ubyte[] bytes)
372  *     {
373  *         this.range = LexerRange(bytes);
374  *         popFront();
375  *     }
376  *
377  *     void popFront() pure
378  *     {
379  *         _popFront();
380  *     }
381  *
 *     void lexNumber(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void lexWhitespace(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void defaultTokenFunction(ref Token token) pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         token = Token(tok!"");
 *     }
399  *
400  *     bool isSeparating(size_t offset) pure nothrow @safe
401  *     {
402  *         // For this example language, always return true.
403  *         return true;
404  *     }
405  * }
406  * ---
407  */
408 mixin template Lexer(Token, alias defaultTokenFunction,
409     alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
410     alias possibleDefaultTokens, alias tokenHandlers)
411 {
412     private alias _IDType = typeof(Token.type);
413     private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol);
414 
415     static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must"
416         ~ " have a corresponding handler function name.");
417 
    // Formats the first (up to) 8 bytes of arr as a ulong hex literal with
    // arr[0] in the least significant byte, matching the layout produced by
    // getFront on little-endian targets.
    private static string generateMask(const ubyte[] arr)
419     {
        import std.string : format;
421         ulong u;
422         for (size_t i = 0; i < arr.length && i < 8; i++)
423         {
424             u |= (cast(ulong) arr[i]) << (i * 8);
425         }
426         return format("0x%016x", u);
427     }
428 
    // Returns a hex literal masking off all but the low l bytes; used to
    // compare tokens shorter than 8 bytes against frontBytes.
    private static string generateByteMask(size_t l)
430     {
        import std.string : format;
432         return format("0x%016x", ulong.max >> ((8 - l) * 8));
433     }
434 
    // Returns the number of times a must be halved before it drops below b;
    // this bounds the depth of the generated binary search over first bytes.
    private static size_t calcSplitCount(size_t a, size_t b) pure nothrow
436     {
437         int i;
438         while (true)
439         {
440             i++;
441             a /= 2;
442             if (a < b)
443                 break;
444         }
445         return i;
446     }
447 
    // Returns the distinct first characters of allTokens, which is assumed
    // to be sorted, so the result is sorted as well.
    private static char[] getBeginningChars(string[] allTokens)
449     {
450         char[] beginningChars;
451         for (size_t i = 0; i < allTokens.length; i++)
452         {
453             if (allTokens[i].length == 0)
454                 continue;
455             beginningChars ~= allTokens[i][0];
456             size_t j = i + 1;
457             while (j < allTokens.length && allTokens[i][0] == allTokens[j][0])
458                 j++;
459             i = j - 1;
460         }
461         return beginningChars;
462     }
463 
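    // Builds, at compile time, the dispatch code mixed into advance(): a
    // binary search over the first byte of the input, bottoming out in
    // switch statements whose cases are generated by printCase().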
464     private static string generateStatements()
465     {
        import std.algorithm : sort, uniq;
        import std.array : array;
        import std.range : stride;
468 
469         string[] pseudoTokens = array(tokenHandlers.stride(2));
470         string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq());
471         // Array consisting of a sorted list of the first characters of the
472         // tokens.
473         char[] beginningChars = getBeginningChars(allTokens);
474         size_t i = calcSplitCount(beginningChars.length, 8);
475         return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i);
476     }
477 
478     private static string generateStatementsStep(string[] allTokens,
479         string[] pseudoTokens, char[] chars, size_t i, string indent = "")
480     {
        import std.string : format;
482         string code;
483         if (i > 0)
484         {
485             size_t p = chars.length / 2;
486             code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent);
487             code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ "    ");
488             code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n";
489             code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ "    ");
490             code ~= indent ~ "}\n";
491         }
492         else
493         {
494             code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n";
495             foreach (char c; chars)
496             {
497                 size_t begin;
498                 size_t end;
499                 for (size_t j = 0; j < allTokens.length; j++)
500                 {
501                     if (allTokens[j].length == 0 || allTokens[j][0] != c)
502                         continue;
503                     begin = j;
504                     end = j + 1;
505                     while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0])
506                         end++;
507                     break;
508                 }
509                 code ~= format("%scase 0x%02x:\n", indent, c);
510                 code ~= printCase(allTokens[begin .. end], pseudoTokens, indent ~ "    ");
511             }
512             code ~= indent ~ "default: goto _defaultTokenFunction;\n";
513             code ~= indent ~ "}\n";
514         }
515 
516         return code;
517     }
518 
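    // Emits the body of one switch case: all tokens sharing a first byte are
    // tried longest-first using masked comparisons against frontBytes.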
519     private static string printCase(string[] tokens, string[] pseudoTokens, string indent)
520     {
        import std.array : array;
        import std.algorithm : countUntil, sort;
        import std.conv : text;
        string[] sortedTokens = array(sort!"a.length > b.length"(tokens));

        if (tokens.length == 1 && tokens[0].length == 1)
        {
            if (pseudoTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1]
                    ~ "(token);\n" ~ indent ~ "return;\n";
            }
            else if (staticTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ "range.index++; range.column++;\n"
                    ~ indent ~ "token = Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n"
                    ~ indent ~ "return;\n";
            }
        }
547 
548         string code;
549 
550         bool insertTrailingGoto = true;
551         foreach (i, token; sortedTokens)
552         {
553             immutable mask = generateMask(cast (const ubyte[]) token);
554             if (token.length >= 8)
555                 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n";
556             else if (token.length != 1)
557                 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n";
558             if (token.length != 1)
559                 code ~= indent ~ "{\n";
560             if (pseudoTokens.countUntil(token) >= 0)
561             {
562                 if (token.length <= 8)
563                 {
564                     code ~= indent ~ "    "
565                         ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
566                         ~ "(token);\n";
567                     code ~= indent ~ "return;\n";
568                 }
569                 else
570                 {
571                     code ~= indent ~ "    if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n";
572                     code ~= indent ~ "        "
573                         ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
574                         ~ "();\n";
575                     code ~= indent ~ "return;\n";
576                 }
577             }
578             else if (staticTokens.countUntil(token) >= 0)
579             {
580                 if (token.length <= 8)
581                 {
582                     insertTrailingGoto = false;
583                     code ~= indent ~ (token.length != 1 ? "    " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
584                     code ~= indent ~ (token.length != 1 ? "    " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
585                     code ~= indent ~ (token.length != 1 ? "    " : "") ~ "return;\n";
586                 }
587                 else
588                 {
589                     code ~= indent ~ "    pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n";
590                 }
591             }
592             else
593             {
594                 // possible default
595                 if (token.length <= 8)
596                 {
597                     code ~= indent ~ "    if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
598                     code ~= indent ~ "    {\n";
599                     code ~= indent ~ "        range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
600                     code ~= indent ~ "        token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
601                     code ~= indent ~ "        return;\n";
602                     code ~= indent ~ "    }\n";
603                     code ~= indent ~ "    else\n";
604                     code ~= indent ~ "        goto _defaultTokenFunction;\n";
605                 }
606                 else
607                 {
608                     code ~= indent ~ "    if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n";
609                     code ~= indent ~ "    {\n";
610                     code ~= indent ~ "        range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
611                     code ~= indent ~ "        token =  Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
612                     code ~= indent ~ "        return;\n";
613                     code ~= indent ~ "    }\n";
614                     code ~= indent ~ "    else\n";
615                     code ~= indent ~ "        goto _defaultTokenFunction;\n";
616                 }
617             }
618             if (token.length != 1)
619             {
620                 code ~= indent ~ "}\n";
621             }
622         }
623         if (insertTrailingGoto)
624             code ~= indent ~ "goto _defaultTokenFunction;\n";
625         return code;
626     }
627 
628     /**
629      * Implements the range primitive _front.
630      */
631     ref const(Token) front()() pure nothrow const @property @safe
632     {
633         return _front;
634     }
635 
636     /**
637      * Advances the lexer to the next token and stores the new current token in
638      * the _front variable.
639      */
640     void _popFront()() pure nothrow @safe
641     {
642         advance(_front);
643     }
644 
645     /**
646      * Implements the range primitive _empty.
647      */
648     bool empty()() pure const nothrow @property @safe @nogc
649     {
650         return _front.type == _tok!"\0";
651     }
652 
653     static string escape(string input) pure @trusted
654     {
655         string retVal;
656         foreach (ubyte c; cast(ubyte[]) input)
657         {
658             switch (c)
659             {
660             case '\\': retVal ~= `\\`; break;
661             case '"': retVal ~= `\"`; break;
662             case '\'': retVal ~= `\'`; break;
663             case '\t': retVal ~= `\t`; break;
664             case '\n': retVal ~= `\n`; break;
665             case '\r': retVal ~= `\r`; break;
666             default: retVal ~= c; break;
667             }
668         }
669         return retVal;
670     }
671 
672     enum tokenSearch = generateStatements();
673 
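    // Packs the next (up to) 8 input bytes into a ulong so the generated
    // tokenSearch code can compare several bytes at once; on little-endian
    // targets the layout matches the literals produced by generateMask.
    // Unused high bytes keep the 0xFF fill from ulong.max so that a truncated
    // input never compares equal to a full 8-byte token mask.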
674     static ulong getFront(const ubyte[] arr) pure nothrow @trusted
675     {
676         static union ByteArr { ulong l; ubyte[8] arr; }
677         static assert(ByteArr.sizeof == ulong.sizeof);
678         ByteArr b;
679         b.l = ulong.max;
680         b.arr[0 .. arr.length] = arr[];
681         return b.l;
682     }
683 
684     void advance(ref Token token) pure nothrow @trusted
685     {
686         if (range.index >= range.bytes.length)
687         {
688             token.type = _tok!"\0";
689             return;
690         }
691         immutable size_t index = range.index;
692         immutable size_t column = range.column;
693         immutable size_t line = range.line;
694         immutable ulong frontBytes = range.index + 8 <= range.bytes.length
695             ? getFront(range.bytes[range.index .. range.index + 8])
696             : getFront(range.bytes[range.index .. $]);
697         ubyte f = cast(ubyte) frontBytes;
698 //        pragma(msg, tokenSearch);
699         mixin(tokenSearch);
700     _defaultTokenFunction:
701         defaultTokenFunction(token);
702     }
703 
704     /**
705      * The lexer input.
706      */
707     LexerRange range;
708 
709     /**
710      * The token that is currently at the front of the range.
711      */
712     Token _front;
713 }
714 
715 /**
716  * Range structure that wraps the _lexer's input.
717  */
718 struct LexerRange
719 {
    // TODO: When D gets @forceinline, the template inlining hack
    // (i.e. `void front()() { ... }`) should be removed.
722 
723 public nothrow pure @safe @nogc:
724     /**
725      * Params:
726      *     bytes = the _lexer input
727      *     index = the initial offset from the beginning of $(D_PARAM bytes)
728      *     column = the initial _column number
729      *     line = the initial _line number
730      */
731     this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1)
732     {
733         this.bytes = bytes;
734         this.index = index;
735         this.column = column;
736         this.line = line;
737     }
738 
739     /**
740      * Returns: a mark at the current position that can then be used with slice.
741      */
742     size_t mark()() const
743     {
744         return index;
745     }
746 
747     /**
748      * Sets the range to the given position.
749      * Params: m = the position to seek to
750      */
751     void seek()(size_t m)
752     {
753         index = m;
754     }
755 
756     /**
757      * Returs a slice of the input byte array between the given mark and the
758      * current position.
759      * Params m = the beginning index of the slice to return
760      */
761     const(ubyte)[] slice()(size_t m) const
762     {
763         return bytes[m .. index];
764     }
765 
766     /**
767      * Implements the range primitive _empty.
768      */
769     bool empty()() const
770     {
771         return index >= bytes.length;
772     }
773 
774     /**
775      * Implements the range primitive _front.
776      */
777     ubyte front()() const
778     {
779         return bytes[index];
780     }
781 
782     /**
     * Returns: the current item as well as the next $(D_PARAM p) items.
784      */
785     const(ubyte)[] peek(size_t p) const
786     {
787         return index + p + 1 > bytes.length
788             ? bytes[index .. $]
789             : bytes[index .. index + p + 1];
790     }
791 
792     /**
793      * Returns: true if the range starts with the given byte sequence
794      */
795     bool startsWith(const(ubyte[]) needle) const
796     {
797         if (needle.length + index > bytes.length)
798             return false;
799         foreach (i; 0 .. needle.length)
800             if (needle[i] != bytes[index + i])
801                 return false;
802         return true;
803     }
804 
805     /**
806      *
807      */
808     ubyte peekAt()(size_t offset) const
809     {
810         return bytes[index + offset];
811     }
812 
813     /**
814      * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead.
815      */
816     bool canPeek()(size_t p) const
817     {
818         return index + p < bytes.length;
819     }
820 
821     /**
822      * Implements the range primitive _popFront.
823      */
824     void popFront()()
825     {
826         index++;
827         column++;
828     }
829 
830     /**
831      * Implements the algorithm _popFrontN more efficiently. This function does
832      * not detect or handle newlines.
833      */
834     void popFrontN()(size_t n)
835     {
836         index += n;
837         column += n;
838     }
839 
840     /**
841      * Increments the range's line number and resets the column counter.
842      */
843     void incrementLine()(size_t i = 1)
844     {
845         column = 1;
846         line += i;
847     }
848 
849     /**
850      * The input _bytes.
851      */
852     const(ubyte)[] bytes;
853 
854     /**
855      * The range's current position.
856      */
857     size_t index;
858 
859     /**
860      * The current _column number.
861      */
862     size_t column;
863 
864     /**
865      * The current _line number.
866      */
867     size_t line;
868 }