1 // Written in the D programming language 2 3 /** 4 * $(H2 Summary) 5 * This module contains a range-based compile-time _lexer generator. 6 * 7 * $(H2 Overview) 8 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with 9 * several helper templates for generating such things as token identifiers. 10 * 11 * To write a _lexer using this API: 12 * $(OL 13 * $(LI Create the string array constants for your language. 14 * $(UL 15 * $(LI $(LINK2 #.staticTokens, staticTokens)) 16 * $(LI $(LINK2 #.dynamicTokens, dynamicTokens)) 17 * $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)) 18 * $(LI $(LINK2 #.tokenHandlers, tokenHandlers)) 19 * )) 20 * $(LI Create aliases for the various token and token identifier types 21 * specific to your language. 22 * $(UL 23 * $(LI $(LREF TokenIdType)) 24 * $(LI $(LREF tokenStringRepresentation)) 25 * $(LI $(LREF TokenStructure)) 26 * $(LI $(LREF TokenId)) 27 * )) 28 * $(LI Create a struct that mixes in the Lexer template mixin and 29 * implements the necessary functions. 30 * $(UL 31 * $(LI $(LREF Lexer)) 32 * )) 33 * ) 34 * Examples: 35 * $(UL 36 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).) 37 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).) 38 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).) 39 * ) 40 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions) 41 * $(DL 42 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction) 43 * $(DD A function that serves as the default token lexing function. For most 44 * languages this will be the identifier lexing function.)) 45 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction)) 46 * $(DD A function that is able to determine if an identifier/keyword has come 47 * to an end. 
This function must return bool and take a single size_t
 * argument representing the number of bytes to skip over before looking for
 * a separating character.)
 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after the
 *     even-indexed element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
78 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"]; 79 * 80 * // The operators are always the same, and cannot start a numberLiteral, so 81 * // they are staticTokens 82 * enum string[] staticTokens = ["-", "+", "*", "/"]; 83 * 84 * // In this simple example there are no keywords or other tokens that could 85 * // look like dynamic tokens, so this is blank. 86 * enum string[] possibleDefaultTokens = []; 87 * 88 * // If any whitespace character or digit is encountered, pass lexing over to 89 * // our custom handler functions. These will be demonstrated in an example 90 * // later on. 91 * enum string[] tokenHandlers = [ 92 * "0", "lexNumber", 93 * "1", "lexNumber", 94 * "2", "lexNumber", 95 * "3", "lexNumber", 96 * "4", "lexNumber", 97 * "5", "lexNumber", 98 * "6", "lexNumber", 99 * "7", "lexNumber", 100 * "8", "lexNumber", 101 * "9", "lexNumber", 102 * " ", "lexWhitespace", 103 * "\n", "lexWhitespace", 104 * "\t", "lexWhitespace", 105 * "\r", "lexWhitespace" 106 * ]; 107 * --- 108 * 109 * Copyright: Brian Schott 2013 110 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) 111 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu 112 * Source: $(PHOBOSSRC std/experimental/_lexer.d) 113 */ 114 115 module std.experimental.lexer; 116 117 /** 118 * Template for determining the type used for a token type. 119 * 120 * Selects the smallest unsigned integral type that is able to hold the value 121 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length. 122 * For example if there are 20 static tokens, 30 dynamic tokens, 123 * and 10 possible default tokens, this template will alias itself to ubyte, 124 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max. 125 * Examples: 126 * --- 127 * // In our calculator example this means that IdType is an alias for ubyte. 
128 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); 129 * --- 130 */ 131 template TokenIdType(alias staticTokens, alias dynamicTokens, 132 alias possibleDefaultTokens) 133 { 134 immutable tokenCount = staticTokens.length + dynamicTokens.length 135 + possibleDefaultTokens.length + 1; 136 static if (tokenCount <= ubyte.max) 137 alias TokenIdType = ubyte; 138 else static if (tokenCount <= ushort.max) 139 alias TokenIdType = ushort; 140 else static if (tokenCount <= uint.max) 141 alias TokenIdType = uint; 142 else 143 static assert (false, "The number of tokens must be less than uint.max"); 144 } 145 146 /** 147 * Looks up the string representation of the given token type. 148 * 149 * This is the opposite of the function of the TokenId template. 150 * Params: type = the token type identifier 151 * Examples: 152 * --- 153 * alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); 154 * assert (str(tok!"*") == "*"); 155 * --- 156 * See_Also: $(LREF TokenId) 157 */ 158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, 159 alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe 160 { 161 // hax 162 static auto f() pure nothrow @trusted 163 { 164 return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 165 } 166 167 static immutable tokens = f(); 168 169 if (type == 0) 170 return "!ERROR!"; 171 else if (type < tokens.length + 1) 172 return tokens[type - 1]; 173 else 174 return null; 175 } 176 177 unittest 178 { 179 alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]); 180 enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token); 181 alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]); 182 183 static assert (str(tok!"foo") == "foo"); 184 static assert (str(tok!"bar") == "bar"); 185 static assert (str(tok!"doo") == "doo"); 186 } 187 188 /** 189 * Generates the token type identifier for 
the given symbol. 190 * 191 * There are two special cases: 192 * $(UL 193 * $(LI If symbol is $(D_STRING ""), then the token identifier will be 0) 194 * $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum 195 * valid token type identifier) 196 * ) 197 * In all cases this template will alias itself to a constant of type IdType. 198 * This template will fail at compile time if $(D_PARAM symbol) is not one of 199 * the staticTokens, dynamicTokens, or possibleDefaultTokens. 200 * Examples: 201 * --- 202 * template tok(string symbol) 203 * { 204 * alias tok = TokenId!(IdType, staticTokens, dynamicTokens, 205 * possibleDefaultTokens, symbol); 206 * } 207 * // num and plus are of type ubyte. 208 * IdType plus = tok!"+"; 209 * IdType num = tok!"numberLiteral"; 210 * --- 211 */ 212 template TokenId(IdType, alias staticTokens, alias dynamicTokens, 213 alias possibleDefaultTokens, string symbol) 214 { 215 enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 216 217 import std.algorithm; 218 static if (symbol == "") 219 { 220 enum id = 0; 221 alias TokenId = id; 222 } 223 else static if (symbol == "\0") 224 { 225 enum id = 1 + tokens.length; 226 alias TokenId = id; 227 } 228 else 229 { 230 enum i = tokens.countUntil(symbol); 231 static if (i != -1) 232 { 233 enum id = i + 1; 234 static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol); 235 alias TokenId = id; 236 } 237 else 238 static assert (0, "Invalid token: " ~ symbol); 239 } 240 } 241 242 /** 243 * The token that is returned by the lexer. 244 * Params: 245 * IdType = The D type of the "type" token type field. 246 * extraFields = A string containing D code for any extra fields that should 247 * be included in the token structure body. This string is passed 248 * directly to a mixin statement. 249 * Examples: 250 * --- 251 * // No extra struct fields are desired in this example, so leave it blank. 
252 * alias Token = TokenStructure!(IdType, ""); 253 * Token minusToken = Token(tok!"-"); 254 * --- 255 */ 256 struct TokenStructure(IdType, string extraFields = "") 257 { 258 public pure nothrow @safe @nogc: 259 260 bool opEquals(ref const typeof(this) other) const 261 { 262 return this.type == other.type && this.text == other.text; 263 } 264 265 /** 266 * Returs: true if the token has the given type, false otherwise. 267 */ 268 bool opEquals(IdType type) const 269 { 270 return this.type == type; 271 } 272 273 /** 274 * Constructs a token from a token type. 275 * Params: type = the token type 276 */ 277 this(IdType type) 278 { 279 this.type = type; 280 } 281 282 /** 283 * Constructs a token. 284 * Params: 285 * type = the token type 286 * text = the text of the token, which may be null 287 * line = the line number at which this token occurs 288 * column = the column number at which this token occurs 289 * index = the byte offset from the beginning of the input at which this 290 * token occurs 291 */ 292 this(IdType type, string text, size_t line, size_t column, size_t index) 293 { 294 this.text = text; 295 this.line = line; 296 this.column = column; 297 this.type = type; 298 this.index = index; 299 } 300 301 /** 302 * The _text of the token. 303 */ 304 string text; 305 306 /** 307 * The _line number at which this token occurs. 308 */ 309 size_t line; 310 311 /** 312 * The _column number at which this token occurs. This is measured in bytes 313 * and may not be correct when tab characters are involved. 314 */ 315 size_t column; 316 317 /** 318 * The byte offset from the beginning of the input at which this token 319 * occurs. 320 */ 321 size_t index; 322 323 /** 324 * The token type. 325 */ 326 IdType type; 327 328 mixin (extraFields); 329 } 330 331 /** 332 * The implementation of the _lexer is contained within this mixin template. 333 * 334 * To use it, this template should be mixed in to a struct that represents the 335 * _lexer for your language. 
This struct should implement the following methods:
 * $(UL
 *     $(LI popFront, which should call this mixin's _popFront() and
 *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment or
 *         whitespace tokens.)
 *     $(LI A function that serves as the default token lexing function. For
 *         most languages this will be the identifier lexing function. This
 *         should then be passed to the $(LREF Lexer) template mixin as the
 *         $(LINK2 #.defaultTokenFunction, defaultTokenFunction) template
 *         parameter.)
 *     $(LI A function that is able to determine if an identifier/keyword has
 *         come to an end. This function must return $(D_KEYWORD bool) and take
 *         a single $(D_KEYWORD size_t) argument representing the number of
 *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
 *         These functions must be marked $(D_KEYWORD pure nothrow), take no
 *         arguments, and return a token)
 *     $(LI A constructor that initializes the range field as well as calls
 *         popFront() exactly once (to initialize the _front field).)
355 * ) 356 * Params: 357 * Token = $(LREF TokenStructure) 358 * defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction) 359 * tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction) 360 * staticTokens = $(LINK2 #.staticTokens, staticTokens) 361 * dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens) 362 * possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens) 363 * tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers) 364 * Examples: 365 * --- 366 * struct CalculatorLexer 367 * { 368 * mixin Lexer!(IdType, Token, defaultTokenFunction, isSeparating, 369 * staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers); 370 * 371 * this (ubyte[] bytes) 372 * { 373 * this.range = LexerRange(bytes); 374 * popFront(); 375 * } 376 * 377 * void popFront() pure 378 * { 379 * _popFront(); 380 * } 381 * 382 * Token lexNumber() pure nothrow @safe 383 * { 384 * // implementation goes here 385 * } 386 * 387 * Token lexWhitespace() pure nothrow @safe 388 * { 389 * // implementation goes here 390 * } 391 * 392 * Token defaultTokenFunction() pure nothrow @safe 393 * { 394 * // There is no default token in the example calculator language, so 395 * // this is always an error. 396 * range.popFront(); 397 * return Token(tok!""); 398 * } 399 * 400 * bool isSeparating(size_t offset) pure nothrow @safe 401 * { 402 * // For this example language, always return true. 
403 * return true; 404 * } 405 * } 406 * --- 407 */ 408 mixin template Lexer(Token, alias defaultTokenFunction, 409 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 410 alias possibleDefaultTokens, alias tokenHandlers) 411 { 412 private alias _IDType = typeof(Token.type); 413 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 414 415 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 416 ~ " have a corresponding handler function name."); 417 418 static string generateMask(const ubyte[] arr) 419 { 420 import std..string : format; 421 ulong u; 422 for (size_t i = 0; i < arr.length && i < 8; i++) 423 { 424 u |= (cast(ulong) arr[i]) << (i * 8); 425 } 426 return format("0x%016x", u); 427 } 428 429 private static string generateByteMask(size_t l) 430 { 431 import std..string : format; 432 return format("0x%016x", ulong.max >> ((8 - l) * 8)); 433 } 434 435 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 436 { 437 int i; 438 while (true) 439 { 440 i++; 441 a /= 2; 442 if (a < b) 443 break; 444 } 445 return i; 446 } 447 448 private static char[] getBeginningChars(string[] allTokens) 449 { 450 char[] beginningChars; 451 for (size_t i = 0; i < allTokens.length; i++) 452 { 453 if (allTokens[i].length == 0) 454 continue; 455 beginningChars ~= allTokens[i][0]; 456 size_t j = i + 1; 457 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 458 j++; 459 i = j - 1; 460 } 461 return beginningChars; 462 } 463 464 private static string generateStatements() 465 { 466 import std.algorithm : sort; 467 import std.range : stride; 468 469 string[] pseudoTokens = array(tokenHandlers.stride(2)); 470 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 471 // Array consisting of a sorted list of the first characters of the 472 // tokens. 
473 char[] beginningChars = getBeginningChars(allTokens); 474 size_t i = calcSplitCount(beginningChars.length, 8); 475 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 476 } 477 478 private static string generateStatementsStep(string[] allTokens, 479 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 480 { 481 import std..string : format; 482 string code; 483 if (i > 0) 484 { 485 size_t p = chars.length / 2; 486 code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent); 487 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 488 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 489 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 490 code ~= indent ~ "}\n"; 491 } 492 else 493 { 494 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 495 foreach (char c; chars) 496 { 497 size_t begin; 498 size_t end; 499 for (size_t j = 0; j < allTokens.length; j++) 500 { 501 if (allTokens[j].length == 0 || allTokens[j][0] != c) 502 continue; 503 begin = j; 504 end = j + 1; 505 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 506 end++; 507 break; 508 } 509 code ~= format("%scase 0x%02x:\n", indent, c); 510 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 511 } 512 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 513 code ~= indent ~ "}\n"; 514 } 515 516 return code; 517 } 518 519 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 520 { 521 import std.array : array; 522 import std.algorithm : countUntil; 523 import std.conv : text; 524 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 525 526 527 if (tokens.length == 1 && tokens[0].length == 1) 528 { 529 if (pseudoTokens.countUntil(tokens[0]) >= 0) 530 { 531 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 532 ~ "(token);\n" ~ indent ~ "return;\n"; 533 } 534 else if (staticTokens.countUntil(tokens[0]) >= 0) 535 { 536 return indent ~ "range.index++; range.column++;\n" 537 ~ indent ~ "token= Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 538 ~ indent ~ "return;"; 539 } 540 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 541 { 542 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 543 ~ "(token);\n" ~ indent ~ "return;\n"; 544 545 } 546 } 547 548 string code; 549 550 bool insertTrailingGoto = true; 551 foreach (i, token; sortedTokens) 552 { 553 immutable mask = generateMask(cast (const ubyte[]) token); 554 if (token.length >= 8) 555 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 556 else if (token.length != 1) 557 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 558 if (token.length != 1) 559 code ~= indent ~ "{\n"; 560 if (pseudoTokens.countUntil(token) >= 0) 561 { 562 if (token.length <= 8) 563 { 564 code ~= indent ~ " " 565 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 566 ~ "(token);\n"; 567 code ~= indent ~ "return;\n"; 568 } 569 else 570 { 571 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 572 code ~= indent ~ " " 573 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 574 ~ "();\n"; 575 code ~= 
indent ~ "return;\n"; 576 } 577 } 578 else if (staticTokens.countUntil(token) >= 0) 579 { 580 if (token.length <= 8) 581 { 582 insertTrailingGoto = false; 583 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 584 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 585 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 586 } 587 else 588 { 589 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 590 } 591 } 592 else 593 { 594 // possible default 595 if (token.length <= 8) 596 { 597 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 598 code ~= indent ~ " {\n"; 599 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 600 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 601 code ~= indent ~ " return;\n"; 602 code ~= indent ~ " }\n"; 603 code ~= indent ~ " else\n"; 604 code ~= indent ~ " goto _defaultTokenFunction;\n"; 605 } 606 else 607 { 608 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 609 code ~= indent ~ " {\n"; 610 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 611 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 612 code ~= indent ~ " return;\n"; 613 code ~= indent ~ " }\n"; 614 code ~= indent ~ " else\n"; 615 code ~= indent ~ " goto _defaultTokenFunction;\n"; 616 } 617 } 618 if (token.length != 1) 619 { 620 code ~= indent ~ "}\n"; 621 } 622 } 623 if (insertTrailingGoto) 624 code ~= indent ~ "goto _defaultTokenFunction;\n"; 625 return code; 626 } 627 628 /** 629 * Implements the range primitive _front. 
     */
    ref const(Token) front()() pure nothrow const @property @safe
    {
        return _front;
    }

    /**
     * Advances the lexer to the next token and stores the new current token in
     * the _front variable.
     */
    void _popFront()() pure nothrow @safe
    {
        advance(_front);
    }

    /**
     * Implements the range primitive _empty.
     *
     * Returns: true once the end-of-input token ("\0") has been produced.
     */
    bool empty()() pure const nothrow @property @safe @nogc
    {
        return _front.type == _tok!"\0";
    }

    /**
     * Escapes a token string so that it can be embedded in a double-quoted
     * string literal inside the generated dispatch code.
     */
    static string escape(string input) pure @trusted
    {
        string retVal;
        foreach (ubyte c; cast(ubyte[]) input)
        {
            switch (c)
            {
            case '\\': retVal ~= `\\`; break;
            case '"': retVal ~= `\"`; break;
            case '\'': retVal ~= `\'`; break;
            case '\t': retVal ~= `\t`; break;
            case '\n': retVal ~= `\n`; break;
            case '\r': retVal ~= `\r`; break;
            default: retVal ~= c; break;
            }
        }
        return retVal;
    }

    // The dispatch code generated at compile time; mixed into advance() below.
    enum tokenSearch = generateStatements();

    // Packs up to 8 input bytes into a ulong, little-endian. When fewer than
    // 8 bytes remain, the unfilled high bytes keep the 0xff fill from
    // ulong.max rather than reading past the end of the input.
    static ulong getFront(const ubyte[] arr) pure nothrow @trusted
    {
        static union ByteArr { ulong l; ubyte[8] arr; }
        static assert(ByteArr.sizeof == ulong.sizeof);
        ByteArr b;
        b.l = ulong.max;
        b.arr[0 .. arr.length] = arr[];
        return b.l;
    }

    // Lexes the token starting at the range's current position into `token`.
    // Falls through to defaultTokenFunction when no static/pseudo/default
    // token matches.
    void advance(ref Token token) pure nothrow @trusted
    {
        if (range.index >= range.bytes.length)
        {
            // End of input: produce the end-of-input token.
            token.type = _tok!"\0";
            return;
        }
        // Snapshot the position; the generated code references these locals
        // when constructing the token.
        immutable size_t index = range.index;
        immutable size_t column = range.column;
        immutable size_t line = range.line;
        // Cache up to 8 bytes of lookahead for the generated mask comparisons.
        immutable ulong frontBytes = range.index + 8 <= range.bytes.length
            ? getFront(range.bytes[range.index .. range.index + 8])
            : getFront(range.bytes[range.index .. $]);
        ubyte f = cast(ubyte) frontBytes;
        // pragma(msg, tokenSearch);
        mixin(tokenSearch);
    _defaultTokenFunction:
        defaultTokenFunction(token);
    }

    /**
     * The lexer input.
     */
    LexerRange range;

    /**
     * The token that is currently at the front of the range.
     */
    Token _front;
}

/**
 * Range structure that wraps the _lexer's input.
 */
struct LexerRange
{
    // TODO: When D gets @forceinline the template inline hack (i.e
    // `void front()() { ... }` )should be removed.

public nothrow pure @safe @nogc:
    /**
     * Params:
     *     bytes = the _lexer input
     *     index = the initial offset from the beginning of $(D_PARAM bytes)
     *     column = the initial _column number
     *     line = the initial _line number
     */
    this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1)
    {
        this.bytes = bytes;
        this.index = index;
        this.column = column;
        this.line = line;
    }

    /**
     * Returns: a mark at the current position that can then be used with slice.
     */
    size_t mark()() const
    {
        return index;
    }

    /**
     * Sets the range to the given position.
     * Params: m = the position to seek to
     */
    void seek()(size_t m)
    {
        index = m;
    }

    /**
     * Returns: a slice of the input byte array between the given mark and the
     * current position.
     * Params: m = the beginning index of the slice to return
     */
    const(ubyte)[] slice()(size_t m) const
    {
        return bytes[m .. index];
    }

    /**
     * Implements the range primitive _empty.
     */
    bool empty()() const
    {
        return index >= bytes.length;
    }

    /**
     * Implements the range primitive _front.
     */
    ubyte front()() const
    {
        return bytes[index];
    }

    /**
     * Returns: the current item as well as the items $(D_PARAM p) items ahead.
     * The slice is clipped at the end of the input.
     */
    const(ubyte)[] peek(size_t p) const
    {
        return index + p + 1 > bytes.length
            ? bytes[index .. $]
            : bytes[index .. index + p + 1];
    }

    /**
     * Returns: true if the range starts with the given byte sequence
     */
    bool startsWith(const(ubyte[]) needle) const
    {
        if (needle.length + index > bytes.length)
            return false;
        foreach (i; 0 .. needle.length)
            if (needle[i] != bytes[index + i])
                return false;
        return true;
    }

    /**
     * Returns: the byte at the given _offset past the current position,
     * without advancing the range. Callers should check canPeek first; no
     * extra bounds checking is performed here.
     */
    ubyte peekAt()(size_t offset) const
    {
        return bytes[index + offset];
    }

    /**
     * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead.
     */
    bool canPeek()(size_t p) const
    {
        return index + p < bytes.length;
    }

    /**
     * Implements the range primitive _popFront.
     */
    void popFront()()
    {
        index++;
        column++;
    }

    /**
     * Implements the algorithm _popFrontN more efficiently. This function does
     * not detect or handle newlines.
     */
    void popFrontN()(size_t n)
    {
        index += n;
        column += n;
    }

    /**
     * Increments the range's line number and resets the column counter.
     */
    void incrementLine()(size_t i = 1)
    {
        column = 1;
        line += i;
    }

    /**
     * The input _bytes.
     */
    const(ubyte)[] bytes;

    /**
     * The range's current position.
     */
    size_t index;

    /**
     * The current _column number.
     */
    size_t column;

    /**
     * The current _line number.
     */
    size_t line;
}