1 // Written in the D programming language 2 3 /** 4 * $(H2 Summary) 5 * This module contains a range-based compile-time _lexer generator. 6 * 7 * $(H2 Overview) 8 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with 9 * several helper templates for generating such things as token identifiers. 10 * 11 * To write a _lexer using this API: 12 * $(OL 13 * $(LI Create the string array constants for your language. 14 * $(UL 15 * $(LI $(LINK2 #.staticTokens, staticTokens)) 16 * $(LI $(LINK2 #.dynamicTokens, dynamicTokens)) 17 * $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)) 18 * $(LI $(LINK2 #.tokenHandlers, tokenHandlers)) 19 * )) 20 * $(LI Create aliases for the various token and token identifier types 21 * specific to your language. 22 * $(UL 23 * $(LI $(LREF TokenIdType)) 24 * $(LI $(LREF tokenStringRepresentation)) 25 * $(LI $(LREF TokenStructure)) 26 * $(LI $(LREF TokenId)) 27 * )) 28 * $(LI Create a struct that mixes in the Lexer template mixin and 29 * implements the necessary functions. 30 * $(UL 31 * $(LI $(LREF Lexer)) 32 * )) 33 * ) 34 * Examples: 35 * $(UL 36 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).) 37 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).) 38 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).) 39 * ) 40 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions) 41 * $(DL 42 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction) 43 * $(DD A function that serves as the default token lexing function. For most 44 * languages this will be the identifier lexing function.)) 45 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction)) 46 * $(DD A function that is able to determine if an identifier/keyword has come 47 * to an end. 
This function must return bool and take a single size_t 48 * argument representing the number of bytes to skip over before looking for 49 * a separating character.) 50 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens)) 51 * $(DD A listing of the tokens whose exact value never changes and which cannot 52 * possibly be a token handled by the default token lexing function. The 53 * most common example of this kind of token is an operator such as 54 * $(D_STRING "*"), or $(D_STRING "-") in a programming language.) 55 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens)) 56 * $(DD A listing of tokens whose value is variable, such as whitespace, 57 * identifiers, number literals, and string literals.) 58 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens)) 59 * $(DD A listing of tokens that could possibly be one of the tokens handled by 60 * the default token handling function. A common example of this is 61 * a keyword such as $(D_STRING "for"), which looks like the beginning of 62 * the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is 63 * called to determine if the character after the $(D_STRING 'r') separates 64 * the identifier, indicating that the token is $(D_STRING "for"), or if 65 * lexing should be turned over to the $(B defaultTokenFunction).) 66 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers)) 67 * $(DD A mapping of prefixes to custom token handling function names. The 68 * generated _lexer will search for the even-index elements of this array, 69 * and then call the function whose name is the element immediately after the 70 * even-indexed element. This is used for lexing complex tokens whose prefix 71 * is fixed.) 72 * ) 73 * 74 * Here are some example constants for a simple calculator _lexer: 75 * --- 76 * // There are a near infinite number of valid number literals, so numbers are 77 * // dynamic tokens.
78 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"]; 79 * 80 * // The operators are always the same, and cannot start a numberLiteral, so 81 * // they are staticTokens 82 * enum string[] staticTokens = ["-", "+", "*", "/"]; 83 * 84 * // In this simple example there are no keywords or other tokens that could 85 * // look like dynamic tokens, so this is blank. 86 * enum string[] possibleDefaultTokens = []; 87 * 88 * // If any whitespace character or digit is encountered, pass lexing over to 89 * // our custom handler functions. These will be demonstrated in an example 90 * // later on. 91 * enum string[] tokenHandlers = [ 92 * "0", "lexNumber", 93 * "1", "lexNumber", 94 * "2", "lexNumber", 95 * "3", "lexNumber", 96 * "4", "lexNumber", 97 * "5", "lexNumber", 98 * "6", "lexNumber", 99 * "7", "lexNumber", 100 * "8", "lexNumber", 101 * "9", "lexNumber", 102 * " ", "lexWhitespace", 103 * "\n", "lexWhitespace", 104 * "\t", "lexWhitespace", 105 * "\r", "lexWhitespace" 106 * ]; 107 * --- 108 * 109 * Copyright: Brian Schott 2013 110 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) 111 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu 112 * Source: $(PHOBOSSRC std/experimental/_lexer.d) 113 */ 114 115 module std.experimental.lexer; 116 117 /** 118 * Template for determining the type used for a token type. 119 * 120 * Selects the smallest unsigned integral type that is able to hold the value 121 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length. 122 * For example if there are 20 static tokens, 30 dynamic tokens, 123 * and 10 possible default tokens, this template will alias itself to ubyte, 124 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max. 125 * Examples: 126 * --- 127 * // In our calculator example this means that IdType is an alias for ubyte. 
128 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); 129 * --- 130 */ 131 template TokenIdType(alias staticTokens, alias dynamicTokens, 132 alias possibleDefaultTokens) 133 { 134 immutable tokenCount = staticTokens.length + dynamicTokens.length 135 + possibleDefaultTokens.length + 1; 136 static if (tokenCount <= ubyte.max) 137 alias TokenIdType = ubyte; 138 else static if (tokenCount <= ushort.max) 139 alias TokenIdType = ushort; 140 else static if (tokenCount <= uint.max) 141 alias TokenIdType = uint; 142 else 143 static assert (false, "The number of tokens must be less than uint.max"); 144 } 145 146 /** 147 * Looks up the string representation of the given token type. 148 * 149 * This is the opposite of the function of the TokenId template. 150 * Params: type = the token type identifier 151 * Examples: 152 * --- 153 * alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); 154 * assert (str(tok!"*") == "*"); 155 * --- 156 * See_Also: $(LREF TokenId) 157 */ 158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, 159 alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe 160 { 161 // hax 162 static auto f() pure nothrow @trusted 163 { 164 return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 165 } 166 167 static immutable tokens = f(); 168 169 if (type == 0) 170 return "!ERROR!"; 171 else if (type < tokens.length + 1) 172 return tokens[type - 1]; 173 else 174 return null; 175 } 176 177 unittest { 178 alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]); 179 enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token); 180 alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]); 181 182 static assert (str(tok!"foo") == "foo"); 183 static assert (str(tok!"bar") == "bar"); 184 static assert (str(tok!"doo") == "doo"); 185 } 186 187 /** 188 * Generates the token type identifier for the 
given symbol. 189 * 190 * There are two special cases: 191 * $(UL 192 * $(LI If symbol is $(D_STRING ""), then the token identifier will be 0) 193 * $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum 194 * valid token type identifier) 195 * ) 196 * In all cases this template will alias itself to a constant of type IdType. 197 * This template will fail at compile time if $(D_PARAM symbol) is not one of 198 * the staticTokens, dynamicTokens, or possibleDefaultTokens. 199 * Examples: 200 * --- 201 * template tok(string symbol) 202 * { 203 * alias tok = TokenId!(IdType, staticTokens, dynamicTokens, 204 * possibleDefaultTokens, symbol); 205 * } 206 * // num and plus are of type ubyte. 207 * IdType plus = tok!"+"; 208 * IdType num = tok!"numberLiteral"; 209 * --- 210 */ 211 template TokenId(IdType, alias staticTokens, alias dynamicTokens, 212 alias possibleDefaultTokens, string symbol) 213 { 214 enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 215 216 import std.algorithm; 217 static if (symbol == "") 218 { 219 enum id = 0; 220 alias TokenId = id; 221 } 222 else static if (symbol == "\0") 223 { 224 enum id = 1 + tokens.length; 225 alias TokenId = id; 226 } 227 else 228 { 229 enum i = tokens.countUntil(symbol); 230 static if (i != -1) 231 { 232 enum id = i + 1; 233 static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol); 234 alias TokenId = id; 235 } 236 else 237 static assert (0, "Invalid token: " ~ symbol); 238 } 239 } 240 241 /** 242 * The token that is returned by the lexer. 243 * Params: 244 * IdType = The D type of the "type" token type field. 245 * extraFields = A string containing D code for any extra fields that should 246 * be included in the token structure body. This string is passed 247 * directly to a mixin statement. 248 * Examples: 249 * --- 250 * // No extra struct fields are desired in this example, so leave it blank. 
251 * alias Token = TokenStructure!(IdType, ""); 252 * Token minusToken = Token(tok!"-"); 253 * --- 254 */ 255 struct TokenStructure(IdType, string extraFields = "") 256 { 257 public pure nothrow @safe @nogc: 258 259 bool opEquals(ref const typeof(this) other) const 260 { 261 return this.type == other.type && this.text == other.text; 262 } 263 264 /** 265 * Returs: true if the token has the given type, false otherwise. 266 */ 267 bool opEquals(IdType type) const 268 { 269 return this.type == type; 270 } 271 272 /** 273 * Constructs a token from a token type. 274 * Params: type = the token type 275 */ 276 this(IdType type) 277 { 278 this.type = type; 279 } 280 281 /** 282 * Constructs a token. 283 * Params: 284 * type = the token type 285 * text = the text of the token, which may be null 286 * line = the line number at which this token occurs 287 * column = the column number at which this token occurs 288 * index = the byte offset from the beginning of the input at which this 289 * token occurs 290 */ 291 this(IdType type, string text, size_t line, size_t column, size_t index) 292 { 293 this.text = text; 294 this.line = line; 295 this.column = column; 296 this.type = type; 297 this.index = index; 298 } 299 300 /** 301 * The _text of the token. 302 */ 303 string text; 304 305 /** 306 * The _line number at which this token occurs. 307 */ 308 size_t line; 309 310 /** 311 * The _column number at which this token occurs. This is measured in bytes 312 * and may not be correct when tab characters are involved. 313 */ 314 size_t column; 315 316 /** 317 * The byte offset from the beginning of the input at which this token 318 * occurs. 319 */ 320 size_t index; 321 322 /** 323 * The token type. 324 */ 325 IdType type; 326 327 mixin (extraFields); 328 } 329 330 /** 331 * The implementation of the _lexer is contained within this mixin template. 332 * 333 * To use it, this template should be mixed in to a struct that represents the 334 * _lexer for your language. 
This struct should implement the following methods:
 * $(UL
 * $(LI popFront, which should call this mixin's _popFront() and
 *     additionally perform any token filtering or shuffling you deem
 *     necessary. For example, you can implement popFront to skip comment or
 *     whitespace tokens.)
 * $(LI A function that serves as the default token lexing function. For
 *     most languages this will be the identifier lexing function. This
 *     should then be passed to the $(LREF Lexer) template mixin as the
 *     $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
 *     parameter.)
 * $(LI A function that is able to determine if an identifier/keyword has
 *     come to an end. This function must return $(D_KEYWORD bool) and take
 *     a single $(D_KEYWORD size_t) argument representing the number of
 *     bytes to skip over before looking for a separating character.)
 * $(LI Any functions referred to in the tokenHandlers template parameter.
 *     These functions must be marked $(D_KEYWORD pure nothrow), take no
 *     arguments, and return a token)
 * $(LI A constructor that initializes the range field as well as calls
 *     popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     Token lexNumber() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token lexWhitespace() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token defaultTokenFunction() pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         return Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
402 * return true; 403 * } 404 * } 405 * --- 406 */ 407 mixin template Lexer(Token, alias defaultTokenFunction, 408 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 409 alias possibleDefaultTokens, alias tokenHandlers) 410 { 411 private alias _IDType = typeof(Token.type); 412 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 413 414 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 415 ~ " have a corresponding handler function name."); 416 417 static string generateMask(const ubyte[] arr) 418 { 419 import std.string : format; 420 ulong u; 421 for (size_t i = 0; i < arr.length && i < 8; i++) 422 { 423 u |= (cast(ulong) arr[i]) << (i * 8); 424 } 425 return format("0x%016x", u); 426 } 427 428 private static string generateByteMask(size_t l) 429 { 430 import std.string : format; 431 return format("0x%016x", ulong.max >> ((8 - l) * 8)); 432 } 433 434 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 435 { 436 int i; 437 while (true) 438 { 439 i++; 440 a /= 2; 441 if (a < b) 442 break; 443 } 444 return i; 445 } 446 447 private static char[] getBeginningChars(string[] allTokens) 448 { 449 char[] beginningChars; 450 for (size_t i = 0; i < allTokens.length; i++) 451 { 452 if (allTokens[i].length == 0) 453 continue; 454 beginningChars ~= allTokens[i][0]; 455 size_t j = i + 1; 456 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 457 j++; 458 i = j - 1; 459 } 460 return beginningChars; 461 } 462 463 private static string generateStatements() 464 { 465 import std.algorithm : sort; 466 import std.range : stride; 467 468 string[] pseudoTokens = array(tokenHandlers.stride(2)); 469 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 470 // Array consisting of a sorted list of the first characters of the 471 // tokens. 
472 char[] beginningChars = getBeginningChars(allTokens); 473 size_t i = calcSplitCount(beginningChars.length, 8); 474 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 475 } 476 477 private static string generateStatementsStep(string[] allTokens, 478 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 479 { 480 import std.string : format; 481 string code; 482 if (i > 0) 483 { 484 size_t p = chars.length / 2; 485 code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent); 486 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 487 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 488 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 489 code ~= indent ~ "}\n"; 490 } 491 else 492 { 493 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 494 foreach (char c; chars) 495 { 496 size_t begin; 497 size_t end; 498 for (size_t j = 0; j < allTokens.length; j++) 499 { 500 if (allTokens[j].length == 0 || allTokens[j][0] != c) 501 continue; 502 begin = j; 503 end = j + 1; 504 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 505 end++; 506 break; 507 } 508 code ~= format("%scase 0x%02x:\n", indent, c); 509 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 510 } 511 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 512 code ~= indent ~ "}\n"; 513 } 514 515 return code; 516 } 517 518 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 519 { 520 import std.array : array; 521 import std.algorithm : countUntil; 522 import std.conv : text; 523 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 524 525 526 if (tokens.length == 1 && tokens[0].length == 1) 527 { 528 if (pseudoTokens.countUntil(tokens[0]) >= 0) 529 { 530 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 531 ~ "(token);\n" ~ indent ~ "return;\n"; 532 } 533 else if (staticTokens.countUntil(tokens[0]) >= 0) 534 { 535 return indent ~ "range.index++; range.column++;\n" 536 ~ indent ~ "token= Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 537 ~ indent ~ "return;"; 538 } 539 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 540 { 541 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 542 ~ "(token);\n" ~ indent ~ "return;\n"; 543 544 } 545 } 546 547 string code; 548 549 bool insertTrailingGoto = true; 550 foreach (i, token; sortedTokens) 551 { 552 immutable mask = generateMask(cast (const ubyte[]) token); 553 if (token.length >= 8) 554 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 555 else if (token.length != 1) 556 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 557 if (token.length != 1) 558 code ~= indent ~ "{\n"; 559 if (pseudoTokens.countUntil(token) >= 0) 560 { 561 if (token.length <= 8) 562 { 563 code ~= indent ~ " " 564 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 565 ~ "(token);\n"; 566 code ~= indent ~ "return;\n"; 567 } 568 else 569 { 570 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 571 code ~= indent ~ " " 572 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 573 ~ "();\n"; 574 code ~= 
indent ~ "return;\n"; 575 } 576 } 577 else if (staticTokens.countUntil(token) >= 0) 578 { 579 if (token.length <= 8) 580 { 581 insertTrailingGoto = false; 582 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 583 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 584 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 585 } 586 else 587 { 588 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 589 } 590 } 591 else 592 { 593 // possible default 594 if (token.length <= 8) 595 { 596 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 597 code ~= indent ~ " {\n"; 598 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 599 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 600 code ~= indent ~ " return;\n"; 601 code ~= indent ~ " }\n"; 602 code ~= indent ~ " else\n"; 603 code ~= indent ~ " goto _defaultTokenFunction;\n"; 604 } 605 else 606 { 607 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 608 code ~= indent ~ " {\n"; 609 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 610 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 611 code ~= indent ~ " return;\n"; 612 code ~= indent ~ " }\n"; 613 code ~= indent ~ " else\n"; 614 code ~= indent ~ " goto _defaultTokenFunction;\n"; 615 } 616 } 617 if (token.length != 1) 618 { 619 code ~= indent ~ "}\n"; 620 } 621 } 622 if (insertTrailingGoto) 623 code ~= indent ~ "goto _defaultTokenFunction;\n"; 624 return code; 625 } 626 627 /** 628 * Implements the range primitive _front. 
629 */ 630 ref const(Token) front()() pure nothrow const @property @safe 631 { 632 return _front; 633 } 634 635 /** 636 * Advances the lexer to the next token and stores the new current token in 637 * the _front variable. 638 */ 639 void _popFront()() pure nothrow @safe 640 { 641 advance(_front); 642 } 643 644 /** 645 * Implements the range primitive _empty. 646 */ 647 bool empty()() pure const nothrow @property @safe @nogc 648 { 649 return _front.type == _tok!"\0"; 650 } 651 652 static string escape(string input) pure @trusted 653 { 654 string retVal; 655 foreach (ubyte c; cast(ubyte[]) input) 656 { 657 switch (c) 658 { 659 case '\\': retVal ~= `\\`; break; 660 case '"': retVal ~= `\"`; break; 661 case '\'': retVal ~= `\'`; break; 662 case '\t': retVal ~= `\t`; break; 663 case '\n': retVal ~= `\n`; break; 664 case '\r': retVal ~= `\r`; break; 665 default: retVal ~= c; break; 666 } 667 } 668 return retVal; 669 } 670 671 enum tokenSearch = generateStatements(); 672 673 static ulong getFront(const ubyte[] arr) pure nothrow @trusted 674 { 675 static union ByteArr { ulong l; ubyte[8] arr; } 676 static assert(ByteArr.sizeof == ulong.sizeof); 677 ByteArr b; 678 b.l = ulong.max; 679 b.arr[0 .. arr.length] = arr[]; 680 return b.l; 681 } 682 683 void advance(ref Token token) pure nothrow @trusted 684 { 685 if (range.index >= range.bytes.length) 686 { 687 token.type = _tok!"\0"; 688 return; 689 } 690 immutable size_t index = range.index; 691 immutable size_t column = range.column; 692 immutable size_t line = range.line; 693 immutable ulong frontBytes = range.index + 8 <= range.bytes.length 694 ? getFront(range.bytes[range.index .. range.index + 8]) 695 : getFront(range.bytes[range.index .. $]); 696 ubyte f = cast(ubyte) frontBytes; 697 // pragma(msg, tokenSearch); 698 mixin(tokenSearch); 699 _defaultTokenFunction: 700 defaultTokenFunction(token); 701 } 702 703 /** 704 * The lexer input. 
705 */ 706 LexerRange range; 707 708 /** 709 * The token that is currently at the front of the range. 710 */ 711 Token _front; 712 } 713 714 /** 715 * Range structure that wraps the _lexer's input. 716 */ 717 struct LexerRange 718 { 719 /+ TODO: When D gets @forceinline the template inline hack (i.e +/ 720 // `void front()() { ... }` )should be removed. 721 722 public nothrow pure @safe @nogc: 723 /** 724 * Params: 725 * bytes = the _lexer input 726 * index = the initial offset from the beginning of $(D_PARAM bytes) 727 * column = the initial _column number 728 * line = the initial _line number 729 */ 730 this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) 731 { 732 this.bytes = bytes; 733 this.index = index; 734 this.column = column; 735 this.line = line; 736 } 737 738 /** 739 * Returns: a mark at the current position that can then be used with slice. 740 */ 741 size_t mark()() const 742 { 743 return index; 744 } 745 746 /** 747 * Sets the range to the given position. 748 * Params: m = the position to seek to 749 */ 750 void seek()(size_t m) 751 { 752 index = m; 753 } 754 755 /** 756 * Returs a slice of the input byte array between the given mark and the 757 * current position. 758 * Params m = the beginning index of the slice to return 759 */ 760 const(ubyte)[] slice()(size_t m) const 761 { 762 return bytes[m .. index]; 763 } 764 765 /** 766 * Implements the range primitive _empty. 767 */ 768 bool empty()() const 769 { 770 return index >= bytes.length; 771 } 772 773 /** 774 * Implements the range primitive _front. 775 */ 776 ubyte front()() const 777 { 778 return bytes[index]; 779 } 780 781 /** 782 * Returns: the current item as well as the items $(D_PARAM p) items ahead. 783 */ 784 const(ubyte)[] peek(size_t p) const 785 { 786 return index + p + 1 > bytes.length 787 ? bytes[index .. $] 788 : bytes[index .. 
index + p + 1]; 789 } 790 791 /** 792 * Returns: true if the range starts with the given byte sequence 793 */ 794 bool startsWith(const(ubyte[]) needle) const 795 { 796 if (needle.length + index > bytes.length) 797 return false; 798 foreach (i; 0 .. needle.length) 799 if (needle[i] != bytes[index + i]) 800 return false; 801 return true; 802 } 803 804 /** 805 * 806 */ 807 ubyte peekAt()(size_t offset) const 808 { 809 return bytes[index + offset]; 810 } 811 812 /** 813 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead. 814 */ 815 bool canPeek()(size_t p) const 816 { 817 return index + p < bytes.length; 818 } 819 820 /** 821 * Implements the range primitive _popFront. 822 */ 823 void popFront()() 824 { 825 index++; 826 column++; 827 } 828 829 /** 830 * Implements the algorithm _popFrontN more efficiently. This function does 831 * not detect or handle newlines. 832 */ 833 void popFrontN()(size_t n) 834 { 835 index += n; 836 column += n; 837 } 838 839 /** 840 * Increments the range's line number and resets the column counter. 841 */ 842 void incrementLine()(size_t i = 1) 843 { 844 column = 1; 845 line += i; 846 } 847 848 /** 849 * The input _bytes. 850 */ 851 const(ubyte)[] bytes; 852 853 /** 854 * The range's current position. 855 */ 856 size_t index; 857 858 /** 859 * The current _column number. 860 */ 861 size_t column; 862 863 /** 864 * The current _line number. 865 */ 866 size_t line; 867 }