1 // Written in the D programming language 2 3 /** 4 * $(H2 Summary) 5 * This module contains a range-based compile-time _lexer generator. 6 * 7 * $(H2 Overview) 8 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with 9 * several helper templates for generating such things as token identifiers. 10 * 11 * To write a _lexer using this API: 12 * $(OL 13 * $(LI Create the string array constants for your language. 14 * $(UL 15 * $(LI $(LINK2 #.staticTokens, staticTokens)) 16 * $(LI $(LINK2 #.dynamicTokens, dynamicTokens)) 17 * $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)) 18 * $(LI $(LINK2 #.tokenHandlers, tokenHandlers)) 19 * )) 20 * $(LI Create aliases for the various token and token identifier types 21 * specific to your language. 22 * $(UL 23 * $(LI $(LREF TokenIdType)) 24 * $(LI $(LREF tokenStringRepresentation)) 25 * $(LI $(LREF TokenStructure)) 26 * $(LI $(LREF TokenId)) 27 * )) 28 * $(LI Create a struct that mixes in the Lexer template mixin and 29 * implements the necessary functions. 30 * $(UL 31 * $(LI $(LREF Lexer)) 32 * )) 33 * ) 34 * Examples: 35 * $(UL 36 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).) 37 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).) 38 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).) 39 * ) 40 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions) 41 * $(DL 42 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction) 43 * $(DD A function that serves as the default token lexing function. For most 44 * languages this will be the identifier lexing function.)) 45 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction)) 46 * $(DD A function that is able to determine if an identifier/keyword has come 47 * to an end. 
This function must return bool and take a single size_t 48 * argument representing the number of bytes to skip over before looking for 49 * a separating character.) 50 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens)) 51 * $(DD A listing of the tokens whose exact value never changes and which cannot 52 * possibly be a token handled by the default token lexing function. The 53 * most common example of this kind of token is an operator such as 54 * $(D_STRING "*"), or $(D_STRING "-") in a programming language.) 55 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens)) 56 * $(DD A listing of tokens whose value is variable, such as whitespace, 57 * identifiers, number literals, and string literals.) 58 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens)) 59 * $(DD A listing of tokens that could possibly be one of the tokens handled by 60 * the default token handling function. A common example of this is 61 * a keyword such as $(D_STRING "for"), which looks like the beginning of 62 * the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is 63 * called to determine if the character after the $(D_STRING 'r') separates 64 * the identifier, indicating that the token is $(D_STRING "for"), or if 65 * lexing should be turned over to the $(B defaultTokenFunction).) 66 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers)) 67 * $(DD A mapping of prefixes to custom token handling function names. The 68 * generated _lexer will search for the even-index elements of this array, 69 * and then call the function whose name is the element immediately after the 70 * even-indexed element. This is used for lexing complex tokens whose prefix 71 * is fixed.) 72 * ) 73 * 74 * Here are some example constants for a simple calculator _lexer: 75 * --- 76 * // There are a near infinite number of valid number literals, so numbers are 77 * // dynamic tokens.
78 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"]; 79 * 80 * // The operators are always the same, and cannot start a numberLiteral, so 81 * // they are staticTokens 82 * enum string[] staticTokens = ["-", "+", "*", "/"]; 83 * 84 * // In this simple example there are no keywords or other tokens that could 85 * // look like dynamic tokens, so this is blank. 86 * enum string[] possibleDefaultTokens = []; 87 * 88 * // If any whitespace character or digit is encountered, pass lexing over to 89 * // our custom handler functions. These will be demonstrated in an example 90 * // later on. 91 * enum string[] tokenHandlers = [ 92 * "0", "lexNumber", 93 * "1", "lexNumber", 94 * "2", "lexNumber", 95 * "3", "lexNumber", 96 * "4", "lexNumber", 97 * "5", "lexNumber", 98 * "6", "lexNumber", 99 * "7", "lexNumber", 100 * "8", "lexNumber", 101 * "9", "lexNumber", 102 * " ", "lexWhitespace", 103 * "\n", "lexWhitespace", 104 * "\t", "lexWhitespace", 105 * "\r", "lexWhitespace" 106 * ]; 107 * --- 108 * 109 * Copyright: Brian Schott 2013 110 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) 111 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu 112 * Source: $(PHOBOSSRC std/experimental/_lexer.d) 113 */ 114 115 module std.experimental.lexer; 116 117 /** 118 * Template for determining the type used for a token type. 119 * 120 * Selects the smallest unsigned integral type that is able to hold the value 121 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length. 122 * For example if there are 20 static tokens, 30 dynamic tokens, 123 * and 10 possible default tokens, this template will alias itself to ubyte, 124 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max. 125 * Examples: 126 * --- 127 * // In our calculator example this means that IdType is an alias for ubyte. 
128 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); 129 * --- 130 */ 131 template TokenIdType(alias staticTokens, alias dynamicTokens, 132 alias possibleDefaultTokens) 133 { 134 immutable tokenCount = staticTokens.length + dynamicTokens.length 135 + possibleDefaultTokens.length + 1; 136 static if (tokenCount <= ubyte.max) 137 alias TokenIdType = ubyte; 138 else static if (tokenCount <= ushort.max) 139 alias TokenIdType = ushort; 140 else static if (tokenCount <= uint.max) 141 alias TokenIdType = uint; 142 else 143 static assert (false, "The number of tokens must be less than uint.max"); 144 } 145 146 /** 147 * Looks up the string representation of the given token type. 148 * 149 * This is the opposite of the function of the TokenId template. 150 * Params: type = the token type identifier 151 * Examples: 152 * --- 153 * alias str = tokenStringRepresentation(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); 154 * assert (str(tok!"*") == "*"); 155 * --- 156 * See_Also: $(LREF TokenId) 157 */ 158 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, 159 alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe 160 { 161 // hax 162 static auto f() pure nothrow @trusted 163 { 164 return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 165 } 166 167 static immutable tokens = f(); 168 169 if (type == 0) 170 return "!ERROR!"; 171 else if (type < tokens.length + 1) 172 return tokens[type - 1]; 173 else 174 return null; 175 } 176 177 unittest { 178 alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]); 179 enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token); 180 alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]); 181 182 static assert (str(tok!"foo") == "foo"); 183 static assert (str(tok!"bar") == "bar"); 184 static assert (str(tok!"doo") == "doo"); 185 } 186 187 /** 188 * Generates the token type identifier for the 
given symbol. 189 * 190 * There are two special cases: 191 * $(UL 192 * $(LI If symbol is $(D_STRING ""), then the token identifier will be 0) 193 * $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum 194 * valid token type identifier) 195 * ) 196 * In all cases this template will alias itself to a constant of type IdType. 197 * This template will fail at compile time if $(D_PARAM symbol) is not one of 198 * the staticTokens, dynamicTokens, or possibleDefaultTokens. 199 * Examples: 200 * --- 201 * template tok(string symbol) 202 * { 203 * alias tok = TokenId!(IdType, staticTokens, dynamicTokens, 204 * possibleDefaultTokens, symbol); 205 * } 206 * // num and plus are of type ubyte. 207 * IdType plus = tok!"+"; 208 * IdType num = tok!"numberLiteral"; 209 * --- 210 */ 211 template TokenId(IdType, alias staticTokens, alias dynamicTokens, 212 alias possibleDefaultTokens, string symbol) 213 { 214 enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens; 215 216 import std.algorithm; 217 static if (symbol == "") 218 { 219 enum id = 0; 220 alias TokenId = id; 221 } 222 else static if (symbol == "\0") 223 { 224 enum id = 1 + tokens.length; 225 alias TokenId = id; 226 } 227 else 228 { 229 enum i = tokens.countUntil(symbol); 230 static if (i != -1) 231 { 232 enum id = i + 1; 233 static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol); 234 alias TokenId = id; 235 } 236 else 237 static assert (0, "Invalid token: " ~ symbol); 238 } 239 } 240 241 /** 242 * The token that is returned by the lexer. 243 * Params: 244 * IdType = The D type of the "type" token type field. 245 * extraFields = A string containing D code for any extra fields that should 246 * be included in the token structure body. This string is passed 247 * directly to a mixin statement. 248 * Examples: 249 * --- 250 * // No extra struct fields are desired in this example, so leave it blank. 
251 * alias Token = TokenStructure!(IdType, ""); 252 * Token minusToken = Token(tok!"-"); 253 * --- 254 */ 255 struct TokenStructure(IdType, string extraFields = "") 256 { 257 public pure nothrow @safe @nogc: 258 259 bool opEquals(ref const typeof(this) other) const 260 { 261 return this.type == other.type && this.text == other.text; 262 } 263 264 /** 265 * Returs: true if the token has the given type, false otherwise. 266 */ 267 bool opEquals(IdType type) const 268 { 269 return this.type == type; 270 } 271 272 /** 273 * Constructs a token from a token type. 274 * Params: type = the token type 275 */ 276 this(IdType type) 277 { 278 this.type = type; 279 } 280 281 /** 282 * Constructs a token. 283 * Params: 284 * type = the token type 285 * text = the text of the token, which may be null 286 * line = the line number at which this token occurs 287 * column = the column number at which this token occurs 288 * index = the byte offset from the beginning of the input at which this 289 * token occurs 290 */ 291 this(IdType type, string text, size_t line, size_t column, size_t index) 292 { 293 this.text = text; 294 this.line = line; 295 this.column = column; 296 this.type = type; 297 this.index = index; 298 } 299 300 /** 301 * The _text of the token. 302 */ 303 string text; 304 305 /** 306 * The _line number at which this token occurs. 307 */ 308 size_t line; 309 310 /** 311 * The _column number at which this token occurs. This is measured in bytes 312 * and may not be correct when tab characters are involved. 313 */ 314 size_t column; 315 316 /** 317 * The byte offset from the beginning of the input at which this token 318 * occurs. 319 */ 320 size_t index; 321 322 /** 323 * The token type. 324 */ 325 IdType type; 326 327 mixin (extraFields); 328 } 329 330 /** 331 * The implementation of the _lexer is contained within this mixin template. 332 * 333 * To use it, this template should be mixed in to a struct that represents the 334 * _lexer for your language. 
This struct should implement the following methods:
 * $(UL
 * $(LI popFront, which should call this mixin's _popFront() and
 *     additionally perform any token filtering or shuffling you deem
 *     necessary. For example, you can implement popFront to skip comment or
 *     whitespace tokens.)
 * $(LI A function that serves as the default token lexing function. For
 *     most languages this will be the identifier lexing function. This
 *     should then be passed to the $(LREF Lexer) template mixin as the
 *     $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
 *     parameter.)
 * $(LI A function that is able to determine if an identifier/keyword has
 *     come to an end. This function must return $(D_KEYWORD bool) and take
 *     a single $(D_KEYWORD size_t) argument representing the number of
 *     bytes to skip over before looking for a separating character.)
 * $(LI Any functions referred to in the tokenHandlers template parameter.
 *     These functions must be marked $(D_KEYWORD pure nothrow), take no
 *     arguments, and return a token)
 * $(LI A constructor that initializes the range field as well as calls
 *     popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     Token lexNumber() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token lexWhitespace() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token defaultTokenFunction() pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         return Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
402 * return true; 403 * } 404 * } 405 * --- 406 */ 407 mixin template Lexer(Token, alias defaultTokenFunction, 408 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 409 alias possibleDefaultTokens, alias tokenHandlers) 410 { 411 private alias _IDType = typeof(Token.type); 412 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 413 414 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 415 ~ " have a corresponding handler function name."); 416 417 static string generateMask(const ubyte[] arr) 418 { 419 import std.string : format; 420 ulong u; 421 for (size_t i = 0; i < arr.length && i < 8; i++) 422 { 423 u |= (cast(ulong) arr[i]) << (i * 8); 424 } 425 return format("0x%016x", u); 426 } 427 428 private static string generateByteMask(size_t l) 429 { 430 import std.string : format; 431 return format("0x%016x", ulong.max >> ((8 - l) * 8)); 432 } 433 434 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 435 { 436 int i; 437 while (true) 438 { 439 i++; 440 a /= 2; 441 if (a < b) 442 break; 443 } 444 return i; 445 } 446 447 private static char[] getBeginningChars(string[] allTokens) 448 { 449 char[] beginningChars; 450 for (size_t i = 0; i < allTokens.length; i++) 451 { 452 if (allTokens[i].length == 0) 453 continue; 454 beginningChars ~= allTokens[i][0]; 455 size_t j = i + 1; 456 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 457 j++; 458 i = j - 1; 459 } 460 return beginningChars; 461 } 462 463 private static string generateStatements() 464 { 465 import std.algorithm : sort; 466 import std.range : stride; 467 468 string[] pseudoTokens = array(tokenHandlers.stride(2)); 469 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 470 // Array consisting of a sorted list of the first characters of the 471 // tokens. 
472 char[] beginningChars = getBeginningChars(allTokens); 473 size_t i = calcSplitCount(beginningChars.length, 8); 474 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 475 } 476 477 private static string generateStatementsStep(string[] allTokens, 478 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 479 { 480 import std.string : format; 481 string code; 482 if (i > 0) 483 { 484 size_t p = chars.length / 2; 485 code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent); 486 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 487 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 488 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 489 code ~= indent ~ "}\n"; 490 } 491 else 492 { 493 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 494 foreach (char c; chars) 495 { 496 size_t begin; 497 size_t end; 498 for (size_t j = 0; j < allTokens.length; j++) 499 { 500 if (allTokens[j].length == 0 || allTokens[j][0] != c) 501 continue; 502 begin = j; 503 end = j + 1; 504 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 505 end++; 506 break; 507 } 508 code ~= format("%scase 0x%02x:\n", indent, c); 509 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 510 } 511 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 512 code ~= indent ~ "}\n"; 513 } 514 515 return code; 516 } 517 518 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 519 { 520 import std.array : array; 521 import std.algorithm : countUntil; 522 import std.conv : text; 523 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 524 525 526 if (tokens.length == 1 && tokens[0].length == 1) 527 { 528 if (pseudoTokens.countUntil(tokens[0]) >= 0) 529 { 530 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 531 ~ "(token);\n" ~ indent ~ "return;\n"; 532 } 533 else if (staticTokens.countUntil(tokens[0]) >= 0) 534 { 535 return indent ~ "range.index++; range.column++;\n" 536 ~ indent ~ "token= Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 537 ~ indent ~ "return;"; 538 } 539 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 540 { 541 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 542 ~ "(token);\n" ~ indent ~ "return;\n"; 543 544 } 545 } 546 547 string code; 548 549 bool insertTrailingGoto = true; 550 foreach (i, token; sortedTokens) 551 { 552 immutable mask = generateMask(cast (const ubyte[]) token); 553 if (token.length >= 8) 554 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 555 else if (token.length != 1) 556 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 557 if (token.length != 1) 558 code ~= indent ~ "{\n"; 559 if (pseudoTokens.countUntil(token) >= 0) 560 { 561 if (token.length <= 8) 562 { 563 code ~= indent ~ " " 564 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 565 ~ "(token);\n"; 566 code ~= indent ~ "return;\n"; 567 } 568 else 569 { 570 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 571 code ~= indent ~ " " 572 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 573 ~ "();\n"; 574 code ~= 
indent ~ "return;\n"; 575 } 576 } 577 else if (staticTokens.countUntil(token) >= 0) 578 { 579 if (token.length <= 8) 580 { 581 insertTrailingGoto = false; 582 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 583 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 584 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 585 } 586 else 587 { 588 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 589 } 590 } 591 else 592 { 593 // possible default 594 if (token.length <= 8) 595 { 596 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 597 code ~= indent ~ " {\n"; 598 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 599 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 600 code ~= indent ~ " return;\n"; 601 code ~= indent ~ " }\n"; 602 code ~= indent ~ " else\n"; 603 code ~= indent ~ " goto _defaultTokenFunction;\n"; 604 } 605 else 606 { 607 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 608 code ~= indent ~ " {\n"; 609 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 610 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 611 code ~= indent ~ " return;\n"; 612 code ~= indent ~ " }\n"; 613 code ~= indent ~ " else\n"; 614 code ~= indent ~ " goto _defaultTokenFunction;\n"; 615 } 616 } 617 if (token.length != 1) 618 { 619 code ~= indent ~ "}\n"; 620 } 621 } 622 if (insertTrailingGoto) 623 code ~= indent ~ "goto _defaultTokenFunction;\n"; 624 return code; 625 } 626 627 /** 628 * Implements the range primitive _front. 
629 */ 630 ref const(Token) front()() pure nothrow const @property @safe 631 { 632 return _front; 633 } 634 635 /** 636 * Advances the lexer to the next token and stores the new current token in 637 * the _front variable. 638 */ 639 void _popFront()() pure nothrow @safe 640 { 641 advance(_front); 642 } 643 644 /** 645 * Implements the range primitive _empty. 646 */ 647 bool empty()() pure const nothrow @property @safe @nogc 648 { 649 return _front.type == _tok!"\0"; 650 } 651 652 static string escape(string input) pure @trusted 653 { 654 string retVal; 655 foreach (ubyte c; cast(ubyte[]) input) 656 { 657 switch (c) 658 { 659 case '\\': retVal ~= `\\`; break; 660 case '"': retVal ~= `\"`; break; 661 case '\'': retVal ~= `\'`; break; 662 case '\t': retVal ~= `\t`; break; 663 case '\n': retVal ~= `\n`; break; 664 case '\r': retVal ~= `\r`; break; 665 default: retVal ~= c; break; 666 } 667 } 668 return retVal; 669 } 670 671 enum tokenSearch = generateStatements(); 672 673 static ulong getFront(const ubyte[] arr) pure nothrow @trusted 674 { 675 static union ByteArr { ulong l; ubyte[8] arr; } 676 static assert(ByteArr.sizeof == ulong.sizeof); 677 ByteArr b; 678 b.l = ulong.max; 679 b.arr[0 .. arr.length] = arr[]; 680 return b.l; 681 } 682 683 void advance(ref Token token) pure nothrow @trusted 684 { 685 if (range.index >= range.bytes.length) 686 { 687 token.type = _tok!"\0"; 688 return; 689 } 690 immutable size_t index = range.index; 691 immutable size_t column = range.column; 692 immutable size_t line = range.line; 693 immutable ulong frontBytes = range.index + 8 <= range.bytes.length 694 ? getFront(range.bytes[range.index .. range.index + 8]) 695 : getFront(range.bytes[range.index .. $]); 696 ubyte f = cast(ubyte) frontBytes; 697 // pragma(msg, tokenSearch); 698 mixin(tokenSearch); 699 _defaultTokenFunction: 700 defaultTokenFunction(token); 701 } 702 703 /** 704 * The lexer input. 
705 */ 706 LexerRange range; 707 708 /** 709 * The token that is currently at the front of the range. 710 */ 711 Token _front; 712 } 713 714 /** 715 * Range structure that wraps the _lexer's input. 716 */ 717 struct LexerRange 718 { 719 /+ TODO: When D gets @forceinline the template inline hack (i.e +/ 720 // `void front()() { ... }` )should be removed. 721 722 public nothrow pure @safe @nogc: 723 /** 724 * Params: 725 * bytes = the _lexer input 726 * index = the initial offset from the beginning of $(D_PARAM bytes) 727 * column = the initial _column number 728 * line = the initial _line number 729 */ 730 this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) 731 { 732 this.bytes = bytes; 733 this.index = index; 734 this.column = column; 735 this.line = line; 736 } 737 738 /** 739 * Returns: a mark at the current position that can then be used with slice. 740 */ 741 size_t mark()() const 742 { 743 return index; 744 } 745 746 /** 747 * Sets the range to the given position. 748 * Params: m = the position to seek to 749 */ 750 void seek()(size_t m) 751 { 752 index = m; 753 } 754 755 /** 756 * Returs a slice of the input byte array between the given mark and the 757 * current position. 758 * Params m = the beginning index of the slice to return 759 */ 760 const(ubyte)[] slice()(size_t m) const 761 { 762 return bytes[m .. index]; 763 } 764 765 /** 766 * Implements the range primitive _empty. 767 */ 768 bool empty()() const 769 { 770 return index >= bytes.length; 771 } 772 773 /** 774 * Implements the range primitive _front. 775 */ 776 ubyte front()() const 777 { 778 return bytes[index]; 779 } 780 781 /** 782 * Returns: the current item as well as the items $(D_PARAM p) items ahead. 783 */ 784 const(ubyte)[] peek(size_t p) const 785 { 786 return index + p + 1 > bytes.length 787 ? bytes[index .. $] 788 : bytes[index .. 
index + p + 1]; 789 } 790 791 /** 792 * Returns: true if the range starts with the given byte sequence 793 */ 794 bool startsWith(const(ubyte[]) needle) const 795 { 796 if (needle.length + index > bytes.length) 797 return false; 798 foreach (i; 0 .. needle.length) 799 if (needle[i] != bytes[index + i]) 800 return false; 801 return true; 802 } 803 804 /** 805 * 806 */ 807 ubyte peekAt()(size_t offset) const 808 { 809 return bytes[index + offset]; 810 } 811 812 /** 813 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead. 814 */ 815 bool canPeek()(size_t p) const 816 { 817 return index + p < bytes.length; 818 } 819 820 /** 821 * Implements the range primitive _popFront. 822 */ 823 void popFront()() 824 { 825 index++; 826 column++; 827 } 828 829 /** 830 * Implements the algorithm _popFrontN more efficiently. This function does 831 * not detect or handle newlines. 832 */ 833 void popFrontN()(size_t n) 834 { 835 index += n; 836 column += n; 837 } 838 839 /** 840 * Increments the range's line number and resets the column counter. 841 */ 842 void incrementLine()(size_t i = 1) 843 { 844 column = 1; 845 line += i; 846 } 847 848 /** 849 * The input _bytes. 850 */ 851 const(ubyte)[] bytes; 852 853 /** 854 * The range's current position. 855 */ 856 size_t index; 857 858 /** 859 * The current _column number. 860 */ 861 size_t column; 862 863 /** 864 * The current _line number. 865 */ 866 size_t line; 867 }