1 /** Lexer/Parser Generator for ANTLR (G, G2, G4) and (E)BNF grammars. 2 3 See_Also: https://theantlrguy.atlassian.net/wiki/spaces/ANTLR3/pages/2687036/ANTLR+Cheat+Sheet 4 See_Also: https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form 5 See_Also: https://github.com/antlr/grammars-v4 6 See_Also: https://github.com/antlr/grammars-v4/blob/master/bnf/bnf.g4 7 See_Also: https://stackoverflow.com/questions/53245751/convert-a-form-of-bnf-grammar-to-g4-grammar 8 See_Also: https://bnfc.digitalgrammars.com/ 9 See_Also: https://forum.dlang.org/post/rsmlqfwowpnggwyuibok@forum.dlang.org 10 See_Also: https://www.regular-expressions.info/unicode.html 11 See_Also: https://stackoverflow.com/questions/64654430/meaning-of-plu-in-antlr-grammar/64658336#64658336 12 See_Also: https://stackoverflow.com/questions/28829049/antlr4-any-difference-between-import-and-tokenvocab 13 See_Also: https://github.com/antlr/antlr4/blob/master/doc/grammars.md 14 See_Also: https://github.com/antlr/antlr4/tree/master/doc 15 See_Also: https://slebok.github.io/zoo/index.html 16 17 TODO: 18 19 - Use import std.algorithm.searching : commonPrefix; in alternatives and call it commonPrefixLiteral 20 21 - Add Syntax Tree Nodes as structs with members being sub-nodes. Composition 22 over inheritance. If we use structs over classes more languages, such as 23 Vox, can be supported in the code generation phase. Optionally use 24 extern(C++) classes. Sub-node pointers should be defined as unique 25 pointers with deterministic destruction. 26 27 - Should be allowed instead of warning: 28 grammars-v4/lua/Lua.g4(329,5): Warning: missing left-hand side, token (leftParen) at offset 5967 29 30 - Parallelize grammar parsing and generation of parser files using https://dlang.org/phobos/std_parallelism.html#.parallel 31 After that compilation of parser files should grouped into CPU-count number of groups. 32 33 - Use: https://forum.dlang.org/post/zcvjwdetohmklaxriswk@forum.dlang.org 34 35 - Rewriting (X+)? 
as X* in ANTLR grammars and `commit` to `grammars-v4`. 36 See https://stackoverflow.com/questions/64706408/rewriting-x-as-x-in-antlr-grammars 37 38 - Add errors for missing symbols during code generation 39 40 - Literal indexing: 41 - Add map from string literal to fixed-length (typically lexer) rule 42 - Warn about string literals, such as str(`...`), that are equal to tokens 43 such `ELLIPSIS` in `Python3.g4`. 44 45 - Make `Rule.root` be of type `Matcher` and make 46 - `dcharCountSpan` and 47 - `toMatchInSource` 48 members of `Matcher`. 49 - Remove `Symbol.toMatchInSource` 50 51 - Support `tokens { INDENT_WS, DEDENT_WS, LINE_BREAK_WS }` to get 52 Python3.g4` with TOK.whitespaceIndent, whitespaceDedent, whitespaceLineBreak useWhitespaceClassesFlag 53 See: https://stackoverflow.com/questions/8642154/antlr-what-is-simpliest-way-to-realize-python-like-indent-depending-grammar 54 55 - Unicode regular expressions. 56 Use https://www.regular-expressions.info/unicode.html 57 Use https://forum.dlang.org/post/rsmlqfwowpnggwyuibok@forum.dlang.org 58 59 - Use to detect conflicting rules with `import` and `tokenVocab` 60 61 - Use a region allocator on top of the GC to pre-allocate the nodes. Either 62 copied from std.allocator or Vox. Maybe one region for each file. Calculate 63 the region size from lexer statistics (number of operators, symbols and 64 literals). 65 66 - `not(...)`'s implementation needs to be adjusted. often used in conjunction with `altN`? 67 68 - handle all TODO's in `makeRule` 69 70 - Move parserSourceBegin to gxbnf_rdbase.d 71 72 - Use `TOK.tokenSpecOptions` in parsing. Ignored for now. 
73 74 - Essentially, Packrat parsing just means caching whether sub-expressions 75 match at the current position in the string when they are tested -- this 76 means that if the current attempt to fit the string into an expression 77 fails then attempts to fit other possible expressions can benefit from the 78 known pass/fail of subexpressions at the points in the string where they 79 have already been tested. 80 81 - Deal with differences between `import` and `tokenVocab`. 82 See: https://stackoverflow.com/questions/28829049/antlr4-any-difference-between-import-and-tokenvocab 83 84 - Add `Rule` in generated code that defines opApply for matching that overrides 85 - Detect indirect mutual left-recursion by check if `Rule.lastOffset` (in 86 generated code) is same as current parser offset. Simple-way in generated 87 parsers: enters a rule again without offset change. Requires storing last 88 offset for each non-literal rule. 89 ** Last offset during parsing. 90 * 91 * Used to detect infinite recursion, `size_t.max` indicates no last offset 92 * yet defined for `this` rule. * 93 size_t lastOffset = size_t.max; 94 95 - Warn about `options{greedy=false;}:` and advice to replace with non-greedy variants 96 - Warn about `options{greedy=true;}:` being deprecated 97 98 - Display column range for tokens in messages. Use `head.input.length`. 99 Requires updating FlyCheck. 100 See: `-fdiagnostics-print-source-range-info` at https://clang.llvm.org/docs/UsersManual.html. 101 See: https://clang.llvm.org/diagnostics.html 102 Use GNU-style formatting such as: fix-it:"test.c":{45:3-45:21}:"gtk_widget_show_all". 103 104 - Use: `nxt.git` to scan parsing examples in `grammars-v4` 105 106 - If performance is needed: 107 - Avoid casts and instead compare against `head.tok` for `isA!NodeType` 108 - use `RuleAltN(uint n)` in `makeAlt` 109 - use `SeqN(uint n)` in `makeSeq` 110 111 - Support reading parsers from [Grammar Zoom](https://slebok.github.io/zoo/index.html). 
*/
module nxt.gxbnf;

// version = show;
version = Do_Inline;

enum useStaticTempArrays = false; ///< Use fixed-size (statically allocated) sequence and alternative buffers.

import core.lifetime : move;
import core.stdc.stdio : putchar, printf;

import std.conv : to;
import std.algorithm.comparison : min, max;
import std.algorithm.iteration : map, joiner, substitute;
import std.array : array;
import std.file : tempDir;
import std.path;
import std.exception : enforce;

// `d-deps.el` requires these to be at the top:
import nxt.line_column : offsetLineColumn;
import nxt.fixed_array : FixedArray;
import nxt.dynamic_array : DynamicArray;
import nxt.file_ex : rawReadPath;
import nxt.array_algorithm : startsWith, endsWith, endsWithEither, skipOver, skipOverBack, skipOverAround, canFind, indexOf, indexOfEither;
import nxt.conv_ex : toDefaulted;
import nxt.dbgio;

import std.stdio : File, stdout, write, writeln;

@safe:

alias Input = string;             ///< Grammar input source.
alias Output = DynamicArray!char; ///< Generated parser output source.

/// Map from rule name to rule, for symbol lookup during generation.
alias RulesByName = Rule[Input];

/// Name prefix used for generated matcher member functions.
enum matcherFunctionNamePrefix = `m__`;

/// Token kind. TODO: make this a string type like with std.experimental.lexer
enum TOK
{
    none,

    unknown,             ///< Unknown

    whitespace,          ///< Whitespace

    symbol,              ///< Symbol
    attributeSymbol,     ///< Attribute Symbol (starting with `$`)
    actionSymbol,        ///< Action Symbol (starting with `@`)

    number,              ///< Number

    lineComment,         ///< Single line comment
    blockComment,        ///< Multi-line (block) comment

    leftParen,           ///< Left parenthesis
    rightParen,          ///< Right parenthesis

    action,              ///< Code block

    brackets,            ///< Alternatives within '[' ... ']'

    literal,             ///< Text literal, single or double quoted

    colon,               ///< Colon `:`
    semicolon,           ///< Semicolon `;`
    hash,                ///< Hash `#`
    labelAssignment,     ///< Label assignment `=`
    listLabelAssignment, ///< List label assignment `+=`

    qmark,               ///< Greedy optional or semantic predicate (`?`)
    qmarkQmark,          ///< Non-Greedy optional (`??`)

    star,                ///< Greedy zero or more (`*`)
    starQmark,           ///< Non-Greedy Zero or more (`*?`)

    plus,                ///< Greedy one or more (`+`)
    plusQmark,           ///< Non-Greedy One or more (`+?`)

    pipe,                ///< Alternative (`|`)
    tilde,               ///< Match negation (`~`)
    lt,                  ///< `<`
    gt,                  ///< `>`
    comma,               ///< Comma (`,`)
    exclamation,         ///< Exclude from AST (`!`)
    rootNode,            ///< Root node (`^`)
    wildcard,            ///< Wildcard (`.`)
    dotdot,              ///< Range (`..`)

    rewrite,             ///< Rewrite rule (`->`)

    /** Syntactic predicate rule rewrite (`=>`).
     *
     * Wikipedia: A syntactic predicate specifies the syntactic validity of
     * applying a production in a formal grammar and is analogous to a semantic
     * predicate that specifies the semantic validity of applying a
     * production. It is a simple and effective means of dramatically improving
     * the recognition strength of an LL parser by providing arbitrary
     * lookahead. In their original implementation, syntactic predicates had the
     * form “( α )?” and could only appear on the left edge of a production.
     * The required syntactic condition α could be any valid context-free
     * grammar fragment.
     *
     * See_Also: https://en.wikipedia.org/wiki/Syntactic_predicate
     * See_Also: https://wincent.com/wiki/ANTLR_predicates
     */
    rewriteSyntacticPredicate,

    /** Token spec options:
        "<"
        id ASSIGN optionValue
        ( SEMI id ASSIGN optionValue )*
        ">"
        ;
     */
    tokenSpecOptions,

    _error,              ///< Error token
}

/// Gx token: a lexed slice of the input tagged with its kind.
@safe struct Token
{
nothrow:
    this(in TOK tok, Input input = null) @nogc pure
    {
        this.tok = tok;
        this.input = input;
    }
    Input input; // slice of the lexer's `_input` that this token covers
    TOK tok;     // token kind
}

/// Returns: `true` iff `ch` may start a symbol (identifier, `$`-attribute or `@`-action).
static bool isSymbolStart(in dchar ch) pure nothrow @safe @nogc
{
    import std.uni : isAlpha;
    return (ch.isAlpha ||
            ch == '_' ||
            ch == '$' ||
            ch == '@');
}

/** Gx lexer for all versions of ANTLR grammars (`.g`, `.g2`, `.g4`).
 *
 * Presented as an input range of `Token` (`empty`/`front`/`popFront`).
 *
 * See_Also: `ANTLRv4Lexer.g4`
 */
@safe struct GxLexer
{
    import std.algorithm.comparison : among;

    /** Lex `input` (which must be null-terminated) read from `path`.
     *
     * Params:
     *      input = grammar source, must end with a `'\0'` sentinel
     *      path = originating file path used in diagnostics (null if in-memory)
     *      includeComments = emit comment tokens instead of skipping them
     *      includeWhitespace = emit whitespace tokens instead of skipping them
     */
    this(const Input input,
         const string path = null,
         in bool includeComments = false,
         in bool includeWhitespace = false)
    {
        _input = input;
        this.path = path;

        import nxt.parsing : isNullTerminated;
        enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted

        _includeComments = includeComments;
        _includeWhitespace = includeWhitespace;

        nextFront(); // prime the first token
    }

    @disable this(this); // non-copyable: owns lexing cursor state

    /// Input range primitive: `true` when the `'\0'` terminator has been reached.
    @property bool empty() const pure nothrow scope @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        return _endOfFile;
    }

    /// Input range primitive: current token.
    inout(Token) front() inout pure scope return nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        assert(!empty);
        return _token;
    }

    /// Input range primitive: advance to the next token.
    void popFront() scope nothrow
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        assert(!empty);
        nextFront();
    }

    /// Error unless `front` has kind `tok`; does not advance.
    void frontEnforce(in TOK tok, const scope Input msg = "") nothrow // TODO: @nogc
    {
        if (front.tok != tok)
            errorAtFront(msg ~ ", expected `TOK." ~ tok.toDefaulted!string(null) ~ "`");
    }

    /// Pop `front` and error unless it had kind `tok`.
    void popFrontEnforce(in TOK tok, const scope Input msg) nothrow // TODO: @nogc
    {
        if (frontPop().tok != tok)
            errorAtFront(msg ~ ", expected `TOK." ~ tok.toDefaulted!string(null) ~ "`");
    }

    /// Pop and return `front`, erroring unless it had kind `tok`.
    Token frontPopEnforce(in TOK tok, const scope Input msg = "") nothrow // TODO: @nogc
    {
        const result = frontPop();
        if (result.tok != tok)
            errorAtFront(msg ~ ", expected `TOK." ~ tok.toDefaulted!string(null) ~ "`");
        return result;
    }

    /// Pop and return `front`.
    Token frontPop() scope return nothrow
    {
        version(D_Coverage) {} else version(LDC) version(Do_Inline) pragma(inline, true);
        const result = front;
        popFront();
        return result;
    }

    /// If `front` equals `token`, pop and return it; otherwise return `Token.init`.
    Token skipOverToken(in Token token) scope return nothrow
    {
        if (front == token)
            return frontPop();
        return typeof(return).init;
    }

    /// If `front` has kind `tok`, pop and return it; otherwise return `Token.init`.
    Token skipOverTOK(in TOK tok) scope return nothrow
    {
        if (front.tok == tok)
            return frontPop();
        return typeof(return).init;
    }

    import std.meta : AliasSeq;

    // from std.ascii.isWhite
    alias endOfLineChars = AliasSeq!('\n', // (0x0a)
                                     '\r', // (0x0d)
                                     );
    alias whiteChars = AliasSeq!(' ',  // (0x20)
                                 '\t', // (0x09)
                                 '\n', // (0x0a)
                                 '\v', // (0x0b)
                                 '\r', // (0x0d)
                                 '\f'  // (0x0c)
                                 );

private:

    /// Peek current `char` in input.
    dchar peek0() const pure scope nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        return _input[_offset]; // TODO: decode `dchar`
    }

    /// Peek second `char` in input (one past `peek0`).
    dchar peek1() const pure scope nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        return _input[_offset + 1]; // TODO: decode `dchar`
    }

    /// Peek `n`-th next `char` in input.
    dchar peekN(in size_t n) const pure scope nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        return _input[_offset + n]; // TODO: decode `dchar`
    }

    /// Drop next byte in input.
    void drop1() pure nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        _offset += 1;
    }

    /// Drop next `n` bytes in input.
    void dropN(in size_t n) pure nothrow @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        _offset += n; // TODO: decode `dchar`
    }

    /// Skip over `n` bytes in input, returning the skipped slice.
    Input skipOverN(in size_t n) pure return nothrow @nogc
    {
        version(D_Coverage) {} else pragma(inline);
        const part = _input[_offset .. _offset + n]; // TODO: decode `dchar`
        dropN(n);
        return part;
    }

    /// Skip over next `char`, returning the skipped slice.
    Input skipOver1() pure return nothrow @nogc
    {
        version(D_Coverage) {} else pragma(inline);
        return _input[_offset .. ++_offset]; // TODO: decode `dchar`
    }

    /// Skip over next two `char`s, returning the skipped slice.
    Input skipOver2() pure return nothrow @nogc
    {
        version(D_Coverage) {} else pragma(inline);
        return _input[_offset .. (_offset += 2)]; // TODO: decode `dchar`
    }

    /// Skip line comment (up to, but not including, the end-of-line or `'\0'`).
    void skipLineComment() pure scope nothrow @nogc
    {
        while (!peek0().among!('\0', endOfLineChars))
            _offset += 1; // TODO: decode `dchar`
    }

    /// Get line comment (same scan as `skipLineComment` but returns the slice).
    Input getLineComment() pure return nothrow @nogc
    {
        size_t i;
        while (!peekN(i).among!('\0', endOfLineChars))
            i += 1; // TODO: decode `dchar`
        return skipOverN(i); // TODO: decode `dchar`
    }

    /// Skip block comment, consuming the closing `*/`; errors at end-of-file.
    void skipBlockComment() scope nothrow
    {
        while (!peek0().among!('\0'))
        {
            if (peek0() == '*' &&
                peek1() == '/')
            {
                _offset += 2;
                return;
            }
            _offset += 1;
        }
        errorAtFront("unterminated block comment");
    }

    /// Get symbol (identifier), possibly including a `=`/`+=` label-assignment suffix.
    Input getSymbol() pure return nothrow @nogc
    {
        import std.uni : isAlphaNum; // TODO: decode `dchar`
        size_t i;
        const bool attributeFlag = peek0() == '@';
        if (peek0().isSymbolStart)
            i += 1;
        while (peekN(i).isAlphaNum ||
               peekN(i) == '_' ||
               (attributeFlag && // attribute name
                peekN(i) == ':')) // may include colon qualifier
            i += 1;

        // skip optional whitespace before label assignment
        auto j = i;
        while (peekN(j).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
            j += 1;

        if (peekN(j) == '=')          // label assignment
            return skipOverN(j + 1);
        else if (peekN(j) == '+' &&
                 peekN(j + 1) == '=') // list label assignment
            return skipOverN(j + 2);
        else
            return skipOverN(i);      // plain symbol: don't consume trailing whitespace
    }

    /// Get number (run of decimal digits).
    Input getNumber() pure return nothrow @nogc
    {
        import std.ascii : isDigit;
        size_t i;
        while (peekN(i).isDigit)
            i += 1;
        return skipOverN(i);
    }

    /// Get whitespace run.
    Input getWhitespace() pure return nothrow @nogc
    {
        size_t i;
        while (peekN(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
            i += 1;
        return skipOverN(i);
    }

    /** Skip over an escape sequence starting at lookahead index `i`, advancing `i`.
     *
     * Returns: `true` iff an escape sequence (leading `'\\'`) was present.
     */
    bool skipOverEsc(ref size_t i) nothrow @nogc
    {
        if (peekN(i) == '\\') // TODO: decode `dchar`
        {
            i += 1;
            if (peekN(i) == 'n')
                i += 1;         // TODO: convert to "\n"
            else if (peekN(i) == 't')
                i += 1;         // TODO: convert to "\t"
            else if (peekN(i) == 'r')
                i += 1;         // TODO: convert to ASCII "\r"
            else if (peekN(i) == ']')
                i += 1;         // TODO: convert to ASCII "]"
            else if (peekN(i) == 'u')
            {
                i += 1;
                import std.ascii : isDigit;
                while (peekN(i).isDigit)
                    i += 1;
                // TODO: convert to `dchar`
            }
            else if (peekN(i) == '\0')
                errorAtOffset("unterminated escape sequence at end of file");
            else
                i += 1;
            return true;
        }
        return false;
    }

    /// Get string literal quoted by `terminator`, including the closing quote.
    Input getLiteral(dchar terminator)() return nothrow @nogc
    {
        size_t i = 1; // index 0 is the opening quote
        while (!peekN(i).among!('\0', terminator))
            if (!skipOverEsc(i))
                i += 1;
        if (peekN(i) == '\0')
            errorAtOffset("unterminated string literal at end of file");
        return skipOverN(i + 1); // include terminator
    }

    /// Get token spec options delimited by `<` ... `>`, including the terminator.
    Input getTokenSpecOptions() return nothrow @nogc
    {
        enum dchar terminator = '>';
        size_t i = 1; // index 0 is the opening '<'
        while (!peekN(i).among!('\0', terminator))
            i += 1;
        if (peekN(i) != terminator)
        {
            if (peekN(i) == '\0')
                errorAtOffset("unterminated string literal at end of file");
            else
                errorAtOffset("unterminated token spec option");
        }
        return skipOverN(i + 1); // include terminator '>'
    }

    /// Get bracketed alternatives `[` ... `]`, honoring escape sequences.
    Input getHooks() return nothrow @nogc
    {
        size_t i;
        while (!peekN(i).among!('\0', ']')) // may contain whitespace
            if (!skipOverEsc(i))
                i += 1;
        if (peekN(i) == ']') // skip ']'
            i += 1;
        return skipOverN(i);
    }

    /** Get action code block `{` ... `}`.
     *
     * Tracks nesting of `{}`, `[]` and `()` on a delimiter stack `ds`, while
     * ignoring delimiters that occur inside line/block comments and
     * single/double-quoted strings. Returns the whole balanced block
     * including the outer braces.
     */
    Input getAction() return nothrow @nogc
    {
        size_t i;

        DynamicArray!char ds; // delimiter stack

        bool inBlockComment;
        bool inLineComment;
        bool inChar;
        bool inString;

        const infoFlag = false; // flip to trace delimiter events

        while (!peekN(i).among!('\0'))
        {
            // skip over all escape sequences in quoted
            if (inChar ||
                inString)
                while (skipOverEsc(i)) {}

            if (!inBlockComment &&
                !inLineComment &&
                !inChar &&
                !inString)
            {
                if (peekN(i) == '/' &&
                    peekN(i + 1) == '/')
                {
                    if (infoFlag)
                        infoAtOffset("line comment start", i, ds[]);
                    inLineComment = true;
                    i += 2;
                    continue;
                }
                else if (peekN(i) == '/' &&
                         peekN(i + 1) == '*')
                {
                    if (infoFlag)
                        infoAtOffset("block comment start", i, ds[]);
                    inBlockComment = true;
                    i += 2;
                    continue;
                }
                else if (peekN(i) == '{')
                {
                    if (infoFlag)
                        infoAtOffset("brace open", i, ds[]);
                    ds.put('{');
                }
                else if (peekN(i) == '}')
                {
                    if (infoFlag)
                        infoAtOffset("brace close", i, ds[]);
                    // NOTE(review): guard looks inverted — an empty `ds` skips the
                    // error yet still reaches `popBack()`; presumably unreachable
                    // because `ds` holds the opening '{' until the final break,
                    // but `ds.empty || ds.back != '{'` would be safer — TODO confirm.
                    if (!ds.empty &&
                        ds.back != '{')
                        errorAtOffset("unmatched", i);
                    ds.popBack();
                }
                else if (peekN(i) == '[')
                {
                    if (infoFlag)
                        infoAtOffset("hook open", i, ds[]);
                    ds.put('[');
                }
                else if (peekN(i) == ']')
                {
                    if (infoFlag)
                        infoAtOffset("hook close", i, ds[]);
                    if (!ds.empty &&
                        ds.back != '[') // NOTE(review): same guard pattern as '}' above
                        errorAtOffset("unmatched", i);
                    ds.popBack();
                }
                else if (peekN(i) == '(')
                {
                    if (infoFlag)
                        infoAtOffset("paren open", i, ds[]);
                    ds.put('(');
                }
                else if (peekN(i) == ')')
                {
                    if (infoFlag)
                        infoAtOffset("paren close", i, ds[]);
                    if (!ds.empty &&
                        ds.back != '(') // NOTE(review): same guard pattern as '}' above
                        errorAtOffset("unmatched", i);
                    ds.popBack();
                }
            }

            // block comment close
            if (inBlockComment &&
                peekN(i) == '*' &&
                peekN(i + 1) == '/')
            {
                if (infoFlag)
                    infoAtOffset("block comment close", i, ds[]);
                inBlockComment = false;
                i += 2;
                continue;
            }

            // line comment close
            if (inLineComment &&
                (peekN(i) == '\n' ||
                 peekN(i) == '\r'))
            {
                if (infoFlag)
                    infoAtOffset("line comment close", i, ds[]);
                inLineComment = false;
            }

            // single-quote open/close
            if (!inBlockComment &&
                !inLineComment &&
                !inString &&
                peekN(i) == '\'')
            {
                if (!ds.empty &&
                    ds.back == '\'')
                {
                    if (infoFlag)
                        infoAtOffset("single-quote close", i, ds[]);
                    ds.popBack();
                    inChar = false;
                }
                else
                {
                    if (infoFlag)
                        infoAtOffset("single-quote open", i, ds[]);
                    ds.put('\'');
                    inChar = true;
                }
            }

            // double-quote open/close
            if (!inBlockComment &&
                !inLineComment &&
                !inChar &&
                peekN(i) == '"')
            {
                if (!ds.empty &&
                    ds.back == '"')
                {
                    if (infoFlag)
                        infoAtOffset("double-quote close", i, ds[]);
                    ds.popBack();
                    inString = false;
                }
                else
                {
                    if (infoFlag)
                        infoAtOffset("doubl-quote open", i, ds[]);
                    ds.put('"');
                    inString = true;
                }
            }

            i += 1;

            if (ds.length == 0) // outermost '{' matched: block complete
                break;
        }

        if (inBlockComment)
            errorAtOffset("unterminated block comment", i);
        if (ds.length != 0)
            errorAtOffset("unbalanced code block", i);

        return skipOverN(i);
    }

    /// Lex the next token into `_token` (or set `_endOfFile` at `'\0'`).
    void nextFront() scope nothrow @trusted // TODO: remove `@trusted`
    {
        switch (peek0())
        {
        case '/':
            if (peek1() == '/') // `//`
            {
                _offset += 2;
                skipLineComment();
                if (_includeComments)
                    _token = Token(TOK.lineComment);
                else
                    nextFront();
            }
            else if (peek1() == '*') // `/*`
            {
                _offset += 2;
                skipBlockComment();
                if (_includeComments)
                    _token = Token(TOK.blockComment);
                else
                    return nextFront();
            }
            else
                errorAtOffset("unexpected character");
            break;
        case '(':
            _token = Token(TOK.leftParen, skipOver1());
            break;
        case ')':
            _token = Token(TOK.rightParen, skipOver1());
            break;
        case '{':
            _token = Token(TOK.action, getAction());
            break;
        case '[':
            _token = Token(TOK.brackets, getHooks());
            break;
        case '"':
            _token = Token(TOK.literal, getLiteral!('"')());
            break;
        case '\'':
            _token = Token(TOK.literal, getLiteral!('\'')());
            break;
        case ':':
            _token = Token(TOK.colon, skipOver1());
            break;
        case ';':
            _token = Token(TOK.semicolon, skipOver1());
            break;
        case '#':
            _token = Token(TOK.hash, skipOver1());
            break;
        case '=':
            if (peek1() == '>')
                _token = Token(TOK.rewriteSyntacticPredicate, skipOver2());
            else
                errorAtFront("expected '>' after '='");
            break;
        case '?':
            if (peek1() == '?')
                _token = Token(TOK.qmarkQmark, skipOver2());
            else
                _token = Token(TOK.qmark, skipOver1());
            break;
        case '*':
            if (peek1() == '?')
                _token = Token(TOK.starQmark, skipOver2());
            else
                _token = Token(TOK.star, skipOver1());
            break;
        case '+':
            if (peek1() == '=')
                _token = Token(TOK.listLabelAssignment, skipOver2());
            else if (peek1() == '?')
                _token = Token(TOK.plusQmark, skipOver2());
            else
                _token = Token(TOK.plus, skipOver1());
            break;
        case '|':
            _token = Token(TOK.pipe, skipOver1());
            break;
        case '~':
            _token = Token(TOK.tilde, skipOver1());
            break;
        case '<':
            _token = Token(TOK.tokenSpecOptions, getTokenSpecOptions());
            break;
        case ',':
            _token = Token(TOK.comma, skipOver1());
            break;
        case '!':
            _token = Token(TOK.exclamation, skipOver1());
            break;
        case '^':
            _token = Token(TOK.rootNode, skipOver1());
            break;
        case '.':
            if (peek1() == '.') // `..`
                _token = Token(TOK.dotdot, skipOver2());
            else
                _token = Token(TOK.wildcard, skipOver1());
            break;
        case '-':
            if (peek1() == '>') // `->`
                _token = Token(TOK.rewrite, skipOver2());
            else
                errorAtOffset("unexpected character");
            break;
        case '0':
            ..
        case '9':
            _token = Token(TOK.number, getNumber());
            break;
        case ' ':
        case '\t':
        case '\n':
        case '\v':
        case '\r':
        case '\f':
            // TODO: extend to std.uni
            // import std.uni : isWhite;
            // assert(peek0().isWhite);
            const ws = getWhitespace();
            if (_includeWhitespace)
                _token = Token(TOK.whitespace, ws);
            else
                return nextFront();
            break;
        case '\0':
            _token = Token.init;
            _endOfFile = true;
            return;
        default:
            if (peek0().isSymbolStart)
            {
                const symbol = getSymbol();
                if (symbol.endsWith("+=")) // `getSymbol` may consume a list-label assignment
                {
                    if (_includeListLabelAssignment)
                        _token = Token(TOK.listLabelAssignment, symbol);
                    else
                        return nextFront();
                }
                else if (symbol.endsWith('=')) // `getSymbol` may consume a label assignment
                {
                    if (_includeLabelAssignment)
                        _token = Token(TOK.labelAssignment, symbol);
                    else
                        return nextFront();
                }
                else
                    switch (symbol[0])
                    {
                    case '$':
                        _token = Token(TOK.attributeSymbol, symbol);
                        break;
                    case '@':
                        _token = Token(TOK.actionSymbol, symbol);
                        break;
                    default:
                        _token = Token(TOK.symbol, symbol);
                        break;
                    }
            }
            else
            {
                _token = Token(TOK._error);
                errorAtOffset("unexpected character");
            }
        }
    }

    /// Print an informational diagnostic at the current token.
    void infoAtFront(const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(front, "Info", msg);
    }

    /// Print a warning diagnostic at the current token.
    void warningAtFront(const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(front, "Warning", msg);
    }

    /// Print an error diagnostic at the current token and halt.
    void errorAtFront(const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(front, "Error", msg);
        assert(false); ///< TODO: propagate error instead of assert
    }

    private void infoAtToken(const scope Token token,
                             const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(token, "Info", msg);
    }

    private void warningAtToken(const scope Token token,
                                const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(token, "Warning", msg);
    }

    private void errorAtToken(const scope Token token,
                              const scope Input msg) const nothrow scope @nogc
    {
        messageAtToken(token, "Error", msg);
        assert(false); ///< TODO: propagate error instead of assert
    }

    /// Print a GNU-style `path(line,column): TAG: msg` diagnostic for `token`.
    private void messageAtToken(const scope Token token,
                                const scope string tag,
                                const scope Input msg) const nothrow scope @nogc
    {
        ptrdiff_t offset;
        () @trusted {
            // token.input slices _input, so pointer difference yields its offset
            offset = (token.input.ptr && _input.ptr) ?
                token.input.ptr - _input.ptr : 0; // unsafe
        } ();
        const lc = offsetLineColumn(_input, offset);
        import nxt.conv_ex : toDefaulted;
        const tokString = token.tok.toDefaulted!string("unknown");
        () @trusted {
            printf("%.*s(%u,%u): %s: %.*s, `%.*s` (%.*s) at offset %llu\n", // move this to a member
                   cast(int)path.length, &path[0],
                   lc.line + 1, lc.column + 1,
                   &tag[0],
                   cast(int)msg.length, &msg[0],
                   cast(int)token.input.length, &token.input[0],
                   cast(int)tokString.length, &tokString[0],
                   offset);
        } ();
    }

    // TODO: into warning(const char* format...) like in `dmd` and put in `nxt.parsing` and reuse here and in lispy.d
    void errorAtOffset(const scope Input msg,
                       in size_t i = 0) const nothrow scope @nogc
    {
        messageAtOffset("Error", msg, i);
        assert(false); ///< TODO: propagate error instead of assert
    }

    void warningAtOffset(const scope Input msg,
                         in size_t i = 0) const nothrow scope @nogc
    {
        messageAtOffset("Warning", msg, i);
    }

    void infoAtOffset(const scope Input msg,
                      in size_t i = 0, in const(char)[] ds = null) const nothrow scope @nogc
    {
        messageAtOffset("Info", msg, i, ds);
    }

    /// Print a diagnostic at lookahead offset `i` relative to `_offset`.
    void messageAtOffset(const scope string tag,
                         const scope Input msg,
                         in size_t i = 0,
                         in const(char)[] ds = null) const @trusted nothrow @nogc scope
    {
        const lc = offsetLineColumn(_input, _offset + i);
        // TODO: remove printf
        debug printf("%.*s(%u,%u): %s: %.*s at offset %llu being char `%c` ds:`%.*s`\n",
                     cast(int)path.length, path.ptr,
                     lc.line + 1, lc.column + 1,
                     tag.ptr,
                     cast(int)msg.length, msg.ptr,
                     _offset + i,
                     peekN(i),
                     cast(int)ds.length, ds.ptr);
    }

private:
    size_t _offset;              // current offset in `_input`
    const Input _input;          ///< Input data.
    const string path;           ///< Input file (or null if in-memory).

    Token _token;                // current `front`
    bool _endOfFile;             // signals null terminator found
    bool _includeComments;       // emit comment tokens instead of skipping them
    bool _includeWhitespace;     // emit whitespace tokens instead of skipping them
    bool _includeLabelAssignment;
    bool _includeListLabelAssignment;
    bool _diagnoseLeftRecursion; ///< Diagnose left-recursion.
}

/// Node kind.
enum NODE
{
    grammar, ///< Grammar definition (name).
    rule     ///< Grammar rule.
}

/// Format when printing AST (nodes).
enum Layout : ubyte
{
    source, ///< Try to mimic original source.
    tree    ///< Makes AST-structure clear.
}

enum indentStep = 4; ///< Indentation size in number of spaces.

/// Print formatting state: indentation depth plus layout style.
@safe struct Format
{
    uint indentDepth; ///< Indentation depth.
    Layout layout;
    /// Print `indentDepth` levels of indentation to stdout.
    void showIndent() @safe const nothrow @nogc
    {
        showNSpaces(indentDepth);
    }
}

/// Print `indentDepth` levels of indentation (`indentDepth*indentStep` spaces) to stdout.
void showNSpaces(uint indentDepth) @safe nothrow @nogc
{
    foreach (_; 0 .. indentDepth*indentStep)
        putchar(' ');
}

/// Put `n` spaces to `sink`.
void showNSpaces(scope ref Output sink, uint n) @safe pure nothrow @nogc
{
    foreach (_; 0 .. n)
        sink.put(" ");
}

/// Put `indentDepth` levels of indentation to `sink`.
void showNIndents(scope ref Output sink, uint indentDepth) @safe pure nothrow @nogc
{
    foreach (_; 0 .. indentDepth*indentStep)
        sink.put(" ");
}

/// Put `x` indented at `indentDepth`.
void iput(T)(scope ref Output sink,
             uint indentDepth, T x) @safe pure nothrow @nogc
if (is(typeof(sink.put(x))))
{
    foreach (_; 0 .. indentDepth*indentStep)
        sink.put(" ");
    sink.put(x);
}

private void showChars(in const(char)[] chars) @trusted
{
    printf("%.*s", cast(uint)chars.length, chars.ptr);
}

private void showToken(Token token, in Format fmt)
{
    fmt.showIndent();
    showChars(token.input);
}

/** Lower and upper limit of `dchar` count.
 */
@safe struct DcharCountSpan
{
pure nothrow:
    @disable this(); // be explicit for now as default init is not obvious
    // NOTE(review): `start()` returns the same value as `full()` (lower.min,
    // upper.max); given the member defaults (lower = uint.max, upper = 0) a
    // reduction seed of (lower.max, upper.min) seems intended — TODO confirm
    // against callers.
    static typeof(this) start() @nogc
    {
        return typeof(this)(this.lower.min,
                            this.upper.max);
    }
    /// The full span: zero lower bound, unbounded upper bound.
    static typeof(this) full() @nogc
    {
        return typeof(this)(this.lower.min,
                            this.upper.max);
    }
    this(in uint lower, in uint upper) @nogc
    {
        this.lower = lower;
        this.upper = upper;
    }
    this(in size_t lower, in size_t upper) @nogc
    {
        // narrow with range checks before casting to the 32-bit members
        assert(lower <= this.lower.max);
        assert(upper <= this.upper.max);
        this.lower = cast(typeof(this.lower))lower;
        this.upper = cast(typeof(this.upper))upper;
    }
    /// Construct an exact span (`lower == upper == length`).
    this(in uint length) @nogc
    {
        this(length, length);
    }
    /// ditto
    this(in size_t length) @safe nothrow @nogc
    {
        this(length, length);
    }
    uint lower = uint.max; // default-inverted so min/max folding works
    uint upper = 0;
}

/// AST node.
private abstract class Node
{
    abstract void show(in Format fmt = Format.init) const;
nothrow:
    abstract bool equals(const Node o) const pure @nogc;
    abstract void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const;
    this() @nogc {}
}

import nxt.my_mallocator : Mallocator;

alias NodeArray = DynamicArray!(Node, Mallocator, uint);       // `uint` capacity is enough
alias PatternArray = DynamicArray!(Pattern, Mallocator, uint); // `uint` capacity is enough

/// Returns: `true` iff `a` and `b` are element-wise `equals`.
bool equalsAll(const scope Node[] a,
               const scope Node[] b) pure nothrow @nogc
{
    if (a.length != b.length)
        return false;
    foreach (const i; 0 .. a.length)
        if (!a[i].equals(b[i])) // TODO: use `.ptr` if needed
            return false;
    return true;
}

/// N-ary expression.
@safe abstract class NaryOpPattern : Pattern
{
@safe nothrow:
    this(Token head, PatternArray subs) @nogc
    {
        super(head);
        this.subs = subs.move(); // TODO: remove when compiler does this for us
    }
    override bool equals(const Node o) const @nogc
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            return equalsAll(this.subs[], o_.subs[]);
        return false;
    }
    // NOTE(review): template constructor bodies are only type-checked upon
    // instantiation; `put` of a `const Pattern` into a `PatternArray` of
    // mutable `Pattern` looks suspicious — confirm this overload is
    // actually instantiated anywhere.
    this(uint n)(Token head, Pattern[n] subs) @nogc if (n >= 2)
    {
        super(head);
        foreach (const Pattern sub; subs)
            this.subs.put(sub);
    }
    PatternArray subs; ///< Sub-patterns.
}

/** Sequence.
 *
 * A `Sequence` is empty in case when a rule provides an empty alternative.
 * Such cases `() | ...` should be rewritten to `(...)?` in `makeAlt`.
 */
@safe final class SeqM : NaryOpPattern
{
    override void show(in Format fmt = Format.init) const
    {
        fmt.showIndent();
        foreach (const i, const sub; subs)
        {
            if (i)
                putchar(' ');
            sub.show();
        }
    }
nothrow:
    this(Token head) @nogc
    {
        super(head, PatternArray.init);
    }
    this(PatternArray subs) @nogc
    {
        super(Token.init, subs.move());
    }
    // NOTE(review): `super(subs)` matches no `NaryOpPattern` constructor
    // (no `head` argument, and `Node[n]` is not `Pattern[n]`); this template
    // overload can only compile as long as it is never instantiated —
    // confirm and possibly remove.
    this(uint n)(Node[n] subs) if (n >= 2)
    {
        super(subs);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("seq(");
        foreach (const i, const sub; subs)
        {
            if (i)
                sink.put(", "); // separator
            sub.toMatchInSource(sink, parser);
        }
        sink.put(")");
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        assert(!subs.empty);
        // Sum the sub-spans, using `uint.max` as a saturating "unbounded"
        // sentinel for each bound.
        // Fix: seed `upper` with 0 (not `uint.max`) — the previous seed made
        // the saturation check below keep `upper` unconditionally unbounded,
        // so bounded sequences never got a finite upper bound.
        auto result = typeof(return)(0, 0);
        foreach (const sub; subs)
        {
            const sublr = sub.dcharCountSpan;
            if (result.lower == uint.max ||
                sublr.lower == uint.max)
                result.lower = uint.max;
            else
                result.lower += sublr.lower;
            if (result.upper == uint.max ||
                sublr.upper == uint.max)
                result.upper = uint.max;
            else
                result.upper += sublr.upper;
        }
        return result;
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
        {
            if (head.input == that_.head.input)
            {
                foreach (const index, const sub; subs)
                    if (!sub.opEquals(that_.subs[index]))
                        return false;
                return true;
            }
        }
        return false;
    }
}

/** Make a sequence pattern from `subs`, flattening nested sequences.
 *
 * Returns the single sub-pattern when only one remains after flattening.
 * When `rewriteFlag` is set, warns about `X X*` sequences that should be
 * rewritten as `X+`. Warns (but still constructs) on an empty sequence.
 */
Pattern makeSeq(PatternArray subs,
                const ref GxLexer lexer,
                in bool rewriteFlag = true) nothrow
{
    subs = flattenSubs!SeqM(subs.move());
    if (subs.length == 1)
        return subs[0];
    if (rewriteFlag)
    {
        foreach (const i, const sub; subs)
        {
            if (i + 1 == subs.length) // skip last
                break;
            if (const zom = cast(const GreedyZeroOrMore)subs[i + 1])
                if (zom.sub.equals(sub))
                    lexer.warningAtToken(zom.head, "should be rewritten into `X+`");
        }
    }
    if (subs.empty)
        lexer.warningAtToken(Token(TOK.leftParen, lexer._input[0 .. 0]),
                             "empty sequence");
    return new SeqM(subs.move());
}

/// ditto
Pattern makeSeq(scope Pattern[] subs,
                const ref GxLexer lexer,
                in bool rewriteFlag = true) nothrow
{
    return makeSeq(PatternArray(subs), lexer, rewriteFlag);
}

/// ditto
Pattern makeSeq(scope Node[] subs,
                const ref GxLexer lexer,
                in bool rewriteFlag = true) nothrow
{
    return makeSeq(checkedCastSubs(subs, lexer), lexer, rewriteFlag);
}

/// Downcast every element of `subs` to `Pattern`, reporting non-`Pattern` subs via `lexer`.
private PatternArray checkedCastSubs(scope Node[] subs,
                                     const ref GxLexer lexer) nothrow
{
    auto psubs = typeof(return).withLength(subs.length);
    foreach (const i, sub; subs)
    {
        psubs[i] = cast(Pattern)sub;
        if (!psubs[i])
        {
            lexer.errorAtFront("non-`Pattern` sub");
            debug sub.show();
        }
    }
    return psubs;
}

/// Recursively splice subs of nested `BranchPattern`s (`SeqM` or `AltM`) into one flat array.
PatternArray flattenSubs(BranchPattern)(PatternArray subs) pure nothrow @nogc
if (is(BranchPattern : SeqM) ||
    is(BranchPattern : AltM))
{
    typeof(subs) subs_;
    foreach (sub; subs)
    {
        if (auto sub_ = cast(BranchPattern)sub)
            subs_.insertBack(flattenSubs!(BranchPattern)(sub_.subs.move()));
        else
            subs_.insertBack(sub);
    }
    return subs_.move();
}

/// Rule.
@safe class Rule : Node
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
        showChars(":\n");
        if (root)
            root.show(Format(fmt.indentDepth + 1));
        showChars(" ;\n");
    }
nothrow:
    /** Warn via `lexer` about any direct left-recursion in this rule,
     * i.e. an alternative whose first matched symbol is the rule itself.
     */
    void diagnoseDirectLeftRecursion(const scope ref GxLexer lexer)
    {
        void checkLeft(const scope Pattern root) @safe nothrow
        {
            if (const alt = cast(const AltM)root) // common case
                foreach (const sub; alt.subs[]) // all alternatives
                    checkLeft(sub);
            else if (const seq = cast(const SeqM)root)
            {
                // Fix: a `SeqM` may be empty (rule with an empty
                // alternative, see `makeSeq`); guard before indexing.
                if (!seq.subs.empty)
                    checkLeft(seq.subs[0]); // only first in sequence
            }
            else if (const s = cast(const SymbolRef)root)
                if (head.input == s.head.input)
                    lexer.warningAtToken(s.head, "left-recursion");
        }
        checkLeft(root);
    }
    this(Token head, Pattern root, bool skipFlag) @nogc
    {
        this.head = head;
        this.root = root;
        this.skipFlag = skipFlag;
    }
    override bool equals(const Node o) const @nogc
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            // Fix: `root` may be null (see `show`/`toMatcherInSource`);
            // avoid null dereference and treat two null roots as equal.
            return head == o_.head &&
                   (root is o_.root ||
                    (root !is null &&
                     o_.root !is null &&
                     root.equals(o_.root)));
        return false;
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        // dummy
    }
    /// Emit the D matcher-function definition for this rule into `sink`.
    void toMatcherInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.iput(1, `Match `);
        if (head.input != "EOF")
            sink.put(matcherFunctionNamePrefix);
        sink.put(head.input); sink.put("()\n");
        sink.iput(1, "{\n");
        import std.ascii : isUpper;
        if (head.input[0].isUpper ||
            cast(const FragmentRule)this)
            sink.iput(2, "pragma(inline, true);\n"); // lexer rules and fragments are hot; inline them
        sink.iput(2, `return`);
        if (root)
        {
            sink.put(` `);
            root.toMatchInSource(sink, parser);
        }
        else
            sink.put(` Match.zero()`); // rule without a body matches the empty string
        sink.put(";\n");
        sink.iput(1, "}\n");
    }
    @property bool isFragmentRule() const @nogc
    {
        return false;
    }
    /** Is a lexer (token) rule (beginning with a capital letter) defining a token type.
     *
     * A lexer rule name starts with an uppercase letter distinguishing it from
     * a parser rule.
     *
     * See_Also: https://github.com/antlr/antlr4/blob/master/doc/lexer-rules.md#lexer-rule-elements
     */
    @property final bool isLexerTokenRule() const @nogc
    {
        import std.ascii : isUpper;
        return (head.input.length &&
                head.input[0].isUpper);
    }
    const Token head; ///< Name.
    Pattern root; ///< Root pattern.
    /** Set to `true` if is referenced by a `SymbolRef`.
        Set in `GxParserByStatement.tagReferencedRules`
    */
    bool hasRef = false;
    /** Rule is skipped (ignored). For instance
        WS : [\r\n]+ -> skip ;
    */
    const bool skipFlag;
}

/** A reusable part of a lexer rule that doesn't match (a token) on its own.

    A consequence is that fragment rules matches are never stored as a separate
    parse tree nodes.

    For example:
        INTEGER: DIGIT+
               | '0' [Xx] HEX_DIGIT+
               ;
        fragment DIGIT: [0-9];
        fragment HEX_DIGIT: [0-9A-Fa-f];

    See_Also: https://sodocumentation.net/antlr/topic/3271/lexer-rules-in-v4#fragments
 */
@safe final class FragmentRule : Rule
{
nothrow:
    this(Token head, Pattern root, bool skipFlag) @nogc
    {
        super(head, root, skipFlag);
    }
    @property final override bool isFragmentRule() const @nogc
    {
        return true;
    }
}

/// Alternation (`A | B | ...`) of two or more sub-patterns.
@safe final class AltM : NaryOpPattern
{
    override void show(in Format fmt = Format.init) const
    {
        const wrapFlag = needsWrapping(subs[]);
        if (wrapFlag)
            putchar('(');
        showSubs(fmt);
        if (wrapFlag)
            putchar(')');
    }
    /// Show all alternatives separated by `|` (one per line when indented).
    final void showSubs(in Format fmt) const
    {
        foreach (const i, const sub; subs)
        {
            sub.show(fmt);
            if (i + 1 != subs.length)
            {
                if (fmt.indentDepth)
                    showChars(" |\n");
                else
                    showChars(" | ");
            }
        }
    }
nothrow:
    this(Token head, PatternArray subs) @nogc
    {
        assert(!subs.empty);
        super(head, subs.move());
    }
    // NOTE(review): `super(subs)` passes no `head`; template body is only
    // type-checked upon instantiation — confirm this overload is used.
    this(uint n)(Pattern[n] subs) @nogc if (n >= 2)
    {
        super(subs);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        // preprocess
        bool allSubChars = true; // true if all sub-patterns are characters
        foreach (const sub; subs)
        {
            if (const lit = cast(const StrLiteral)sub)
            {
                // a single-character literal including its two quotes has length 3
                if (lit.head.input.length != 3) // non-character literal
                {
                    allSubChars = false;
                    break;
                }
            }
            else
            {
                allSubChars = false;
                break;
            }
        }

        // prefix: use the compile-time `altK!(...)` form for all-character alternatives
        if (allSubChars)
        {
            switch (subs.length)
            {
            case 2:
                sink.put("alt2!(");
                break;
            case 3:
                sink.put("alt3!(");
                break;
            case 4:
                sink.put("alt4!(");
                break;
            case 5:
                sink.put("alt5!(");
                break;
            default:
                sink.put("altN!(");
                break;
            }
        }
        else
            sink.put("alt(");

        // iterate subs
        foreach (const i, const sub; subs)
        {
            if (allSubChars)
            {
                if (i)
                    sink.put(","); // separator
                const lsub = cast(const StrLiteral)sub;
                // NOTE(review): `head.input` here includes the surrounding
                // quotes and has length 3, so comparison with the 1-char
                // string "'" can never hold — dead branch? Probably meant
                // to escape a quoted single-quote; TODO confirm intent.
                if (lsub.head.input == "'")
                    sink.put(`\'`);
                else
                    sink.put(lsub.head.input);
            }
            else
            {
                if (i)
                {
                    sink.put(",\n");
                    sink.showNIndents(2);
                    sink.showNSpaces(11);
                }
                sub.toMatchInSource(sink, parser);
            }
        }

        // postfix
        sink.put(")");
        if (allSubChars)
            sink.put("()"); // instantiate the template matcher
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return dcharCountSpanOf(subs[]);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
        {
            if (head.input == that_.head.input)
            {
                foreach (const index, const sub; subs)
                    if (!sub.opEquals(that_.subs[index]))
                        return false;
                return true;
            }
        }
        return false;
    }
}

/// Union (min of lowers, max of uppers) of the spans of `subs`; `(0, 0)` when empty.
DcharCountSpan dcharCountSpanOf(const scope Pattern[] subs) @safe pure nothrow @nogc
{
    if (subs.length == 0)
        return typeof(return)(0, 0);
    auto result = typeof(return).start();
    foreach (const sub; subs)
    {
        const sublr = sub.dcharCountSpan;
        result.lower = min(result.lower, sublr.lower);
        result.upper = max(result.upper, sublr.upper);
    }
    return result;
}

/** Make an alternation from `subs`, flattening nested alternations.
 *
 * Returns `null` for no subs and the sole sub for a single one.
 * NOTE(review): `rewriteFlag` is currently unused here — confirm.
 */
Pattern makeAltA(Token head,
                 PatternArray subs,
                 in bool rewriteFlag = true) nothrow
{
    subs = flattenSubs!AltM(subs.move());
    switch (subs.length)
    {
    case 0:
        return null;
    case 1:
        return subs[0];
    default:
        return new AltM(head, subs.move());
    }
}

/// ditto
Pattern makeAltM(Token head,
                 Pattern[] subs,
                 in bool rewriteFlag = true) nothrow
{
    return makeAltA(head, PatternArray(subs), rewriteFlag);
}

/// ditto
Pattern makeAltN(uint n)(Token head,
                         Pattern[n] subs,
                         in bool rewriteFlag = true) nothrow
if (n >= 2)
{
    return makeAltA(head, PatternArray(subs), rewriteFlag);
}

/// Node holding a single token.
@safe class TokenNode : Node
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head) @nogc
    {
        this.head = head;
    }
    override bool equals(const Node o) const pure @nogc
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            return head == o_.head;
        return false;
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put(`tok(`);
        sink.put(head.input[]);
        sink.put(`)`);
    }
    const Token head; ///< The token this node wraps.
}

/// Unary match combinator.
@safe abstract class UnaryOpPattern : Pattern
{
    final override void show(in Format fmt = Format.init) const
    {
        putchar('(');
        sub.show(fmt);
        putchar(')');
        showToken(head, fmt); // the operator (e.g. `?`, `*`, `+`) follows the operand
    }
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        debug assert(head.input.ptr);
        assert(sub);
        super(head);
        this.sub = sub;
    }
    final override bool equals(const Node o) const
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            return (head == o_.head &&
                    this.sub.equals(o_.sub));
        return false;
    }
    Pattern sub; ///< Operand pattern.
}

/// Don't match an instance of type `sub`.
@safe final class NotPattern : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Emit a `not(...)` matcher wrapping the operand.
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("not(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    /// Negation consumes as many `dchar`s as its operand would.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return sub.dcharCountSpan();
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        const rhs = cast(const typeof(this))that;
        if (rhs is null)
            return false;
        return sub.opEquals(rhs.sub);
    }
}

/// Match (greedily) zero or one instances of type `sub`.
@safe final class GreedyZeroOrOne : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub) @nogc
    {
        auto pat = cast(Pattern)sub;
        assert(pat);
        super(head, pat);
    }
    /// Emit a greedy zero-or-one (`gzo`) matcher wrapping the operand.
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("gzo(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    /// Zero repetitions are allowed, so the lower bound is always 0.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        const operandSpan = sub.dcharCountSpan;
        return typeof(return)(0, operandSpan.upper);
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        const rhs = cast(const typeof(this))that;
        if (rhs is null)
            return false;
        return sub.opEquals(rhs.sub);
    }
}

/// Match (greedily) zero or more instances of type `sub`.
@safe final class GreedyZeroOrMore : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("gzm(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    /// Zero-or-more: anywhere from 0 to unbounded `dchar`s.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return).full();
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return sub.opEquals(that_.sub);
        return false;
    }
}

/// Match (greedily) one or more instances of type `sub`.
@safe final class GreedyOneOrMore : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        if (!psub)
        {
            debug sub.show();
            assert(false);
        }
        super(head, psub);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("gom(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    /// One-or-more: at least the operand's lower bound, unbounded above.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return)(sub.dcharCountSpan.lower,
                              typeof(return).upper.max);
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return sub.opEquals(that_.sub);
        return false;
    }
}

/// Unary combinator carrying an optional terminator pattern (for non-greedy matching).
@safe abstract class TerminatedUnaryOpPattern : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub, Pattern terminator = null) @nogc
    {
        debug assert(head.input.ptr);
        super(head, sub);
        this.terminator = terminator;
    }
    Pattern terminator; ///< Pattern ending the non-greedy repetition; may be null.
}

/// Match (non-greedily) zero or one instances of type `sub`.
@safe final class NonGreedyZeroOrOne : TerminatedUnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub, Pattern terminator = null) @nogc
    {
        super(head, sub, terminator);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub, Pattern terminator = null) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub, terminator);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("nzo(");
        sub.toMatchInSource(sink, parser);
        if (terminator)
        {
            sink.put(",");
            terminator.toMatchInSource(sink, parser);
        }
        else
            parser._lexer.warningAtToken(head, "no terminator after non-greedy");
        sink.put(")");
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return)(0, sub.dcharCountSpan.upper);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
        {
            if (!sub.opEquals(that_.sub))
                return false;
            // Fix: `terminator` may be null (see `toMatchInSource`); the
            // previous unconditional `terminator.opEquals(...)` dereferenced
            // null. Two absent terminators compare equal.
            if (terminator is null ||
                that_.terminator is null)
                return terminator is that_.terminator;
            return terminator.opEquals(that_.terminator);
        }
        return false;
    }
}

/// Match (non-greedily) zero or more instances of type `sub`.
@safe final class NonGreedyZeroOrMore : TerminatedUnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub, Pattern terminator = null) @nogc
    {
        debug assert(head.input.ptr);
        super(head, sub, terminator);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub, Pattern terminator = null) @nogc
    {
        debug assert(head.input.ptr);
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub, terminator);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("nzm(");
        sub.toMatchInSource(sink, parser);
        debug assert(head.input.ptr);
        if (terminator)
        {
            sink.put(",");
            terminator.toMatchInSource(sink, parser);
        }
        else
        {
            parser._lexer.warningAtToken(head, "no terminator after non-greedy");
        }
        sink.put(")");
    }
    /// Zero-or-more: anywhere from 0 to unbounded `dchar`s.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return).full();
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
        {
            if (!sub.opEquals(that_.sub))
                return false;
            // Fix: `terminator` may be null (see `toMatchInSource`); the
            // previous unconditional `terminator.opEquals(...)` dereferenced
            // null. Two absent terminators compare equal.
            if (terminator is null ||
                that_.terminator is null)
                return terminator is that_.terminator;
            return terminator.opEquals(that_.terminator);
        }
        return false;
    }
}

/// Match (non-greedily) one or more instances of type `sub`.
@safe final class NonGreedyOneOrMore : TerminatedUnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub, Pattern terminator = null) @nogc
    {
        super(head, sub, terminator);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub, Pattern terminator = null) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub, terminator);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("nom(");
        sub.toMatchInSource(sink, parser);
        if (terminator)
        {
            sink.put(",");
            terminator.toMatchInSource(sink, parser);
        }
        else
            parser._lexer.warningAtToken(head, "no terminator after non-greedy");
        sink.put(")");
    }
    /// One-or-more: at least the operand's lower bound, unbounded above.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return)(sub.dcharCountSpan.lower,
                              typeof(return).upper.max);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
        {
            if (!sub.opEquals(that_.sub))
                return false;
            // Fix: `terminator` may be null (see `toMatchInSource`); the
            // previous unconditional `terminator.opEquals(...)` dereferenced
            // null. Two absent terminators compare equal.
            if (terminator is null ||
                that_.terminator is null)
                return terminator is that_.terminator;
            return terminator.opEquals(that_.terminator);
        }
        return false;
    }
}

/// Match `count` number of instances of type `sub`.
@safe final class GreedyCount : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("cnt(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    /// Operand span scaled by `count`, with `max` preserved as the "unbounded" sentinel.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        const ss = sub.dcharCountSpan;
        return typeof(return)(ss.lower == ss.lower.max ? ss.lower.max : ss.lower * count,
                              ss.upper == ss.upper.max ? ss.upper.max : ss.upper * count);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return (count == that_.count &&
                    sub.opEquals(that_.sub));
        return false;
    }
    // NOTE(review): no constructor sets `count`; it is presumably assigned
    // by the caller after construction (otherwise it stays 0 and
    // `dcharCountSpan` collapses to zero) — TODO confirm.
    ulong count;
}

/// Syntactic predicate (`(...)=>`) rewrite wrapper.
@safe final class RewriteSyntacticPredicate : UnaryOpPattern
{
nothrow:
    this(Token head, Pattern sub) @nogc
    {
        super(head, sub);
    }
    /// Convenience overload accepting a `Node` known to be a `Pattern`.
    this(Token head, Node sub) @nogc
    {
        Pattern psub = cast(Pattern)sub;
        assert(psub);
        super(head, psub);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("syn(");
        sub.toMatchInSource(sink, parser);
        sink.put(")");
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return sub.dcharCountSpan;
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return sub.opEquals(that_.sub);
        return false;
    }
}

/// Reference to some other (non-rule) symbol.
@safe final class OtherSymbol : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        if (head.input != "EOF")
            sink.put(matcherFunctionNamePrefix);
        sink.put(head.input);
        sink.put(`()`);
    }
}

/// Reference to a rule by name.
@safe final class SymbolRef : Pattern
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        if (head.input != "EOF")
            sink.put(matcherFunctionNamePrefix);
sink.put(head.input);
        // warn when the referenced rule does not exist
        if (parser.warnUnknownSymbolFlag &&
            head.input !in parser.rulesByName)
            parser._lexer.warningAtToken(head, "No rule for symbol");
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        // a symbol reference cannot know its span without resolving the rule
        assert(false);
        // return typeof(return).init;
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return head.input == that_.head.input;
        return false;
    }
}

/// Sentinel marking a `(` during parsing.
@safe final class LeftParenSentinel : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Sentinel marking a `|` during parsing.
@safe final class PipeSentinel : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Sentinel marking a `..` during parsing.
@safe final class DotDotSentinel : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Sentinel marking a `~` during parsing.
@safe final class TildeSentinel : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Match any single character (`.`).
@safe final class AnyClass : Pattern
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put(`any()`);
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return)(1);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return true;
        return false;
    }
}

/// True iff `x` (unquoted) is a single ASCII character, possibly backslash-escaped.
private bool isASCIICharacterLiteral(Input x) pure nothrow @nogc
{
    return (x.length == 1 ||
            (x.length == 2 &&
             x[0] == '\\')); // backquoted character
}

/** Decode an (unquoted) `\u`/`\U` Unicode escape in `x` to its code point.
 *
 * Returns 0 when `x` is not a valid single Unicode escape.
 * NOTE(review): a literal `\u0000` also decodes to 0 and is thus
 * indistinguishable from "not a Unicode escape" — TODO confirm callers
 * never need U+0000.
 */
private uint isUnicodeCharacterLiteral(scope Input x) pure nothrow @nogc
{
    if (!x.skipOver('\\'))
        return 0;
    if (!(x.skipOver('u') ||
          x.skipOver('U')))
        return 0;

    x.skipOverAround('{', '}'); // optional

    if (x.skipOver(`0x`) || // optional
        x.skipOver(`0X`)) {}

    while (x.length &&
           x[0] == '0') // trim leading zero
        x = x[1 .. $];

    uint u;
    while (x.length)
    {
        u *= 16; // accumulate hexadecimal digits
        import std.ascii : isDigit, isLower, isUpper;
        const c = x[0];
        if (c.isDigit)
            u += c - '0';
        else if (c.isUpper)
            u += c - 'A' + 10;
        else if (c.isLower)
            u += c - 'a' + 10;
        else
            return 0; // string literal such as '\uD835\uDD38'
        x = x[1 .. $]; // pop front
    }

    return u;
}

/// A `Node` that can be matched against input: knows its `dchar` span and value equality.
@safe abstract class Pattern : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
    /// Lower and upper bound on the number of `dchar`s this pattern matches.
    abstract DcharCountSpan dcharCountSpan() const pure @nogc;
    /// Value equality between patterns.
    abstract bool opEquals(const scope Pattern that) const @nogc;
}

/// Single- or double-quoted string literal.
@safe final class StrLiteral : Pattern
{
nothrow:
    this(Token head) @nogc
    {
        assert(head.input.length >= 2);
        // `head.input` keeps its surrounding quotes
        assert((head.input[0] == '\'' &&
                head.input[$-1] == '\'') ||
               (head.input[0] == '"' &&
                head.input[$-1] == '"'));
        super(head);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        auto inp = unquotedInput; // skipping single-quotes
        if (inp.isASCIICharacterLiteral())
        {
            sink.put(`ch(`);
            sink.putCharLiteral(inp);
            sink.put(`)`);
        }
        else if (const uvalue = inp.isUnicodeCharacterLiteral())
        {
            // ASCII code points match byte-wise; others need dchar decoding
            if (uvalue <= 0x7f)
                sink.put(`ch(`);
            else
                sink.put(`dch(`);
            sink.putCharLiteral(inp);
            sink.put(`)`);
        }
        else
        {
            // choose D string quoting that avoids escaping the content
            if (inp.canFind('`'))
            {
                sink.put(`str("`);
                sink.putStringLiteralDoubleQuoted(inp);
                sink.put(`")`);
            }
            else
            {
                sink.put("str(`");
                sink.putStringLiteralBackQuoted(inp);
                sink.put("`)");
            }
        }
    }
override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        const inp = unquotedInput; // skipping single-quotes
        size_t cnt;
        if (inp.isASCIICharacterLiteral()) // must come first
            cnt = 1;
        else if (const _ = inp.isUnicodeCharacterLiteral()) // must come second
            cnt = 1;
        else if (inp.isASCIIString) // must come third
            cnt = inp.length; // one byte per dchar for ASCII
        else
        {
            // TODO: optimize
            import std.utf : byDchar;
            import std.algorithm.searching : count;
            cnt = inp.byDchar.count;
        }
        return typeof(return)(cnt);
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return head.input == that_.head.input;
        return false;
    }
    /// The literal's contents without its surrounding quotes.
    Input unquotedInput() const pure scope return @nogc
    {
        return head.input[1 .. $ - 1];
    }
}

// TODO: avoid linker error when using version defined in `nxt.string_traits`
/// True iff every byte of `input` is ASCII (single-byte UTF-8).
private bool isASCIIString(scope const(char)[] input) pure nothrow @nogc
{
    foreach (const e; cast(const(ubyte)[])input) // no decoding to `dchar` needed
        // Fix: ASCII is 0x00-0x7F; the previous `e >= 0x7F` wrongly
        // rejected 0x7F (DEL), which is a single-byte code point.
        if (e >= 0x80)
            return false;
    return true;
}

/** Emit `inp` into `sink` escaped for a D double-quoted string literal,
 * unwrapping any `\'` escapes (not needed inside double quotes).
 */
void putStringLiteralDoubleQuoted(scope ref Output sink,
                                  const scope Input inp) pure nothrow @nogc
{
    for (size_t i; i < inp.length; ++i)
    {
        if (inp[i] == '"')
            sink.put(`\"`); // backslash doublequote in D string
        else if (i + 2 <= inp.length &&
                 inp[i .. i + 2] == `\'`)
        {
            i += 1; // one extra char
            sink.put('\'');
        }
        else
            sink.put(inp[i]);
    }
}

/** Emit `inp` into `sink` escaped for a D backquoted (raw) string literal,
 * stripping escapes that raw strings don't need.
 */
void putStringLiteralBackQuoted(scope ref Output sink,
                                const scope Input inp) pure nothrow @nogc
{
    for (size_t i; i < inp.length; ++i)
    {
        if (inp[i] == '`')
            sink.put("\\`"); // backslash backquote in D raw string
        else if (inp[i] == '\\' &&
                 i + 1 < inp.length)
        {
            if (inp[i + 1] == '\\')
                sink.put('\\'); // strip quoting of backslash in D raw string
            else if (inp[i + 1] == '\'')
                sink.put('\''); // strip quoting of single-quote in D raw string
            else if (inp[i + 1] == '"')
                sink.put('"'); // strip quoting of double-quote in D raw string
            else
                sink.put(inp[i]);
            i += 1; // one extra char
        }
        else
            sink.put(inp[i]);
    }
}

/// Unquoted character alternative inside a character class (`[...]`).
@safe final class AltCharLiteral : Pattern
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        if (head.input.startsWith(`\p`) || // https://github.com/antlr/antlr4/pull/1688
            head.input.startsWith(`\P`))
        {
            sink.put(`cc!(`); // Unicode character-class property
        }
        else if (head.input[0] >= 0x80)
        {
            sink.put(`dch(`); // raw non-ASCII character
        }
        else
        {
            const uvalue = head.input.isUnicodeCharacterLiteral();
            // Fix: `uvalue` is unsigned so the previous `uvalue < 0x00` test
            // was always false, making this branch always emit `dch(`.
            // Mirror `StrLiteral.toMatchInSource`: ASCII code points (and
            // plain non-`\u` escapes, for which `uvalue` is 0) match with
            // `ch(`, everything else with `dch(`.
            if (uvalue <= 0x7f)
                sink.put(`ch(`);
            else
                sink.put(`dch(`);
        }
        sink.putCharLiteral(head.input);
        sink.put(`)`);
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return DcharCountSpan(1, 1);
        // if (head.input.isASCIICharacterLiteral)
        //     return DcharCountSpan(1, 1);
        // else
        //     return DcharCountSpan(0, uint.max);
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return head.input == that_.head.input;
return false;
    }
}

/** Emit `inp` (an unquoted character spec) into `sink` as D source:
 * a `char`/`dchar` literal, a `cast(dchar)` expression, or a quoted
 * Unicode-property name for `\p{...}`/`\P{...}`.
 */
void putCharLiteral(scope ref Output sink,
                    scope Input inp) pure nothrow @nogc
{
    if (inp.skipOver(`\u`) ||
        inp.skipOver(`\U`))
    {
        inp.skipOverAround('{', '}');

        // strip leading zeros
        while (inp.length > 2 &&
               inp[0] == '0')
            inp = inp[1 .. $];

        if (inp.length == 2) // if ASCII
        {
            // emit as a `'\u00XX'` escape
            sink.put(`'\u00`);
            sink.put(inp);
            sink.put('\'');
        }
        else
        {
            sink.put(`(cast(dchar)0x`); // TODO: use `dchar(...)` for valid numbers
            sink.put(inp);
            sink.put(`)`);
        }
    }
    else if (inp.skipOver(`\p`) || // https://github.com/antlr/antlr4/pull/1688
             inp.skipOver(`\P`))
    {
        // Unicode property name, passed as a string to `cc!(...)`
        inp.skipOverAround('{', '}');
        sink.put('"');
        sink.put(inp);
        sink.put('"');
    }
    else
    {
        sink.put(`'`);
        if (inp.length == 1 &&
            (inp[0] == '\'' ||
             inp[0] == '\\'))
            sink.put(`\`); // need backquoting
        sink.put(inp);
        sink.put(`'`);
    }
}

version(none) // TODO use
/// Make the narrowest literal node (`AltCharLiteral` or `StrLiteral`) for `head`.
TokenNode makeLiteral(Token head) pure nothrow
{
    assert(head.input.length >= 3);
    if (head.input[1 .. $-1].isASCIICharacterLiteral)
        return new AltCharLiteral(head);
    else
        return new StrLiteral(head);
}

/// True iff showing `subs` requires wrapping parentheses (any non-token sub).
bool needsWrapping(const scope Node[] subs) @safe pure nothrow @nogc
{
    bool wrapFlag;
    foreach (const sub; subs)
        if (!cast(const TokenNode)sub)
            wrapFlag = true;
    return wrapFlag;
}

/// Binary pattern combinator.
/** Base class for patterns built from an infix operator (`head`) and exactly
 * two operand patterns (`subs`).
 */
@safe abstract class BinaryOpPattern : Pattern
{
    override void show(in Format fmt = Format.init) const
    {
        fmt.showIndent();
        subs[0].show(fmt);
        putchar(' ');
        showChars(head.input);
        putchar(' ');
        subs[1].show(fmt);
    }
nothrow:
    this(Token head, Pattern[2] subs) @nogc
    {
        assert(subs[0]);
        assert(subs[1]);
        super(head);
        this.subs = subs;
    }
    final override bool equals(const Node o) const @nogc
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            return (head == o_.head &&
                    equalsAll(this.subs[], o_.subs[]));
        return false;
    }
    Pattern[2] subs; // the two operands
}

/// Match value range between `limits[0]` and `limits[1]`.
@safe final class Range : BinaryOpPattern
{
    override void show(in Format fmt = Format.init) const
    {
        const wrapFlag = needsWrapping(subs[]);
        fmt.showIndent();
        if (wrapFlag)
            putchar('(');
        subs[0].show(fmt);
        showChars(" .. ");
        subs[1].show(fmt);
        if (wrapFlag)
            putchar(')');
    }
nothrow:
    this(Token head, Pattern[2] limits) @nogc
    {
        assert(limits[0]);
        assert(limits[1]);
        super(head, limits);
    }
    /// Emit `rng(lower,upper)`; both bounds must be string or char literals.
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        sink.put("rng(");

        if (const lower = cast(const StrLiteral)subs[0])
            sink.putCharLiteral(lower.unquotedInput);
        else if (const lower = cast(const AltCharLiteral)subs[0])
            sink.putCharLiteral(lower.head.input);
        else
        {
            debug writeln("handle sub[0] of type ", typeid(subs[0]).name);
            debug subs[0].show();
            assert(false); // unsupported lower-bound node type
        }

        sink.put(",");

        if (const upper = cast(const StrLiteral)subs[1])
            sink.putCharLiteral(upper.unquotedInput);
        else if (const upper = cast(const AltCharLiteral)subs[1])
            sink.putCharLiteral(upper.head.input);
        else
            assert(false); // unsupported upper-bound node type

        sink.put(")");
    }
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return dcharCountSpanOf(subs[]);
    }
    final override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return (head.input == that_.head.input &&
                    subs[0].opEquals(that_.subs[0]) &&
                    subs[1].opEquals(that_.subs[1]));
        return false;
    }
}

/** Parse the contents of a character class `alt` (e.g. `[a-z\u0041\-]`) into a
 * `Pattern`: an alternation of `AltCharLiteral`s and `Range`s.
 *
 * A `-` between two entries (not in first position) folds the previous and the
 * next literal into a `Range`. Errors are reported through `lexer`.
 */
Pattern parseCharAltM(const CharAltM alt,
                      const scope ref GxLexer lexer) @safe nothrow
{
    const Input inp = alt.unquotedInput;

    bool inRange;                   // set after seeing an infix '-'
    PatternArray subs;
    for (size_t i; i < inp.length;)
    {
        Input inpi;                 // slice covering the current entry

        if (inp[i] == '-' &&
            !subs.empty) // not first character
        {
            inRange = true;
            i += 1;
            continue;
        }

        if (inp[i] == '\\')
        {
            i += 1; // skip '\\'
            switch (inp[i])
            {
            case ']':
            case '-':
            case '\\':
                inpi = inp[i .. i + 1]; // escaped char stands for itself
                break;
            case 'p':
                if (inp[i + 1] != '{')
                    lexer.errorAtToken(Token(alt.head.tok, inp[i + 1 .. $]),
                                       "expected brace");
                const hit = inp[i + 1 .. $].indexOf('}');
                if (hit >= 0)
                {
                    inpi = inp[i - 1 .. i + 1 + hit + 1]; // include `\p{...}`
                    i += hit + 1;
                }
                else
                    lexer.errorAtToken(Token(alt.head.tok, inp[i + 1 .. $]),
                                       "incorrect unicode escape sequence, missing matching closing brace '}'");
                break;
            case 'u':
                if (inp[i + 1] == '{')
                {
                    const hit = inp[i + 1 .. $].indexOf('}');
                    if (hit >= 0)
                    {
                        inpi = inp[i - 1 .. i + 1 + hit + 1]; // include `\u{...}`
                        i += hit + 1;
                    }
                    else
                        lexer.errorAtToken(Token(alt.head.tok, inp[i + 1 .. $]),
                                           "incorrect unicode escape sequence, missing matching closing brace '}'");
                }
                else
                {
                    /* Unicode code point `\u....` where `....` is the hexadecimal
                       number of the code point you want to match. */
                    import std.ascii : isHexDigit;
                    // NOTE(review): `&&` looks inverted — when `i + 5 > inp.length`
                    // the `inp[i + 4]` probe below is out of bounds before the
                    // error fires; `||` seems intended. Confirm before changing.
                    if (i + 5 > inp.length &&
                        !(inp[i + 1].isHexDigit &&
                          inp[i + 2].isHexDigit &&
                          inp[i + 3].isHexDigit &&
                          inp[i + 4].isHexDigit))
                        lexer.errorAtToken(Token(alt.head.tok, inp[i + 1 .. $]),
                                           "incorrect unicode escape sequence");
                    inpi = inp[i - 1 .. i + 5]; // include `\uXXXX`
                    i += 4;
                }
                break;
            default:
                inpi = inp[i - 1 .. i + 1]; // keep backslash and escaped char
                break;
            }
            i += 1;
        }
        else if (inp[i] >= 0x80) // non-ASCII lead byte: decode full UTF-8 sequence
        {
            import std.typecons : Yes;
            import std.utf : decode;
            const replacementChar = cast(dchar)0x110000;
            const i0 = i;
            const ch = decode!(Yes.useReplacementDchar)(inp, i); // advances `i`
            if (ch == replacementChar)
                lexer.errorAtToken(alt.head, "invalid UTF-sequence `" ~ inp[i0 .. $] ~ "`");
            inpi = inp[i0 .. i];
        }
        else
            inpi = inp[i .. ++i]; // single ASCII char; D evaluates bounds left-to-right

        auto lit = new AltCharLiteral(Token(TOK.literal, inpi));
        if (inRange) // previous literal becomes the lower bound of a range
            subs.insertBack(new Range(Token.init, [subs.backPop(), lit]));
        else
            subs.insertBack(lit);
        inRange = false;
    }
    return makeAltA(alt.head, subs.move()); // potentially flatten
}

/** Character class pattern, e.g. `[a-zA-Z_]`.
 *
 * `head.input` is the bracketed source text; code generation expands it into
 * `alt(...)`/`altN!(...)` of `ch`/`rng` matchers.
 */
@safe final class CharAltM : Pattern
{
nothrow:

    this(Token head) @nogc
    {
        super(head);
    }

    /// Returns: contents of `head.input` with the surrounding `[` `]` removed.
    Input unquotedInput() const @nogc
    {
        Input inp = head.input;
        assert(inp.skipOverAround('[', ']')); // trim
        return inp;
    }

    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
        Input inp = unquotedInput;

        // TODO use `switch` `case` ranges

        if (inp.canFind('-')) // check that range is not backquoted
        {
            size_t altCount;
            const asink = toMatchRangeInSource(inp, altCount);
            if (altCount >= 2)
                sink.put("alt(");
            sink.put(asink[]);
            if (altCount >= 2)
                sink.put(")");
            return;
        }

        // No ranges: emit a compile-time alternation of plain char literals.
        sink.put("altN!(");
        for (size_t i; i < inp.length;)
        {
            if (i)
                sink.put(", "); // separator

            sink.put('\''); // prefix

            // contents:
            if (inp[i] == '\\')
            {
                i += 1;
                switch (inp[i])
                {
                case ']':
                case '-':
                    sink.put(inp[i]); // for instance: `\]` => `]`
                    break;
                case '\\':
                    sink.put(`\\`); // `\\` => `\\`
                    break;
                default:
                    sink.put('\\');
                    if (inp[i] == 'u')
                    {
                        import std.ascii : isHexDigit;
                        // NOTE(review): same suspicious `&&` as in
                        // `parseCharAltM` — `||` seems intended; confirm.
                        if (i + 5 > inp.length &&
                            !(inp[i + 1].isHexDigit &&
                              inp[i + 2].isHexDigit &&
                              inp[i + 3].isHexDigit &&
                              inp[i + 4].isHexDigit))
                            parser._lexer.errorAtToken(Token(head.tok, inp[i + 1 .. $]), "incorrect unicode escape sequence");
                        sink.put(inp[i .. i + 5]); // copy `uXXXX`
                        i += 4;
                    }
                    else
                        sink.put(inp[i]);
                    break;
                }
            }
            else if (inp[i] == '\'')
                sink.put(`\'`);
            else
                sink.put(inp[i]);
            i += 1;

            sink.put('\''); // suffix
        }
        sink.put(")()");
    }

    /** Render `input` (which contains at least one `-` range) as a comma-
     * separated list of `rng('a', 'z')` / `ch('x')` calls.
     *
     * Params:
     *   input = unbracketed character-class text
     *   altCount = receives the number of alternatives emitted
     */
    private Output toMatchRangeInSource(Input input,
                                        out size_t altCount) const // alt count
    {
        typeof(return) sink; // argument sink
        for (size_t i; i < input.length; ++altCount)
        {
            if (i)
                sink.put(", "); // separator

            if (i + 3 <= input.length &&
                input[i] != '\\' &&
                input[i + 1] == '-') // such as: `a-z`
            {
                sink.put("rng('");
                sink.put(input[i]),
                sink.put("', '");
                sink.put(input[i + 2]),
                sink.put("')");
                i += 3;
            }
            else if (i + 13 <= input.length &&
                     input[i] == '\\' &&
                     input[i + 1] == 'u' &&
                     input[i + 5] != '\\' &&
                     input[i + 6] == '-') // such as: `\u0021-\u0031`
            {
                sink.put("rng('");
                sink.put(input[i .. i + 6]),
                sink.put("', '");
                sink.put(input[i + 7 .. i + 7 + 6]),
                sink.put("')");
                i += 13;
            }
            else
            {
                sink.put("ch('");
                if (input[i] == '\'')
                    sink.put(`\'`); // escaped single quote
                else if (input[i] == '\\')
                {
                    i += 1; // skip '\\'
                    switch (input[i])
                    {
                    case ']':
                    case '-':
                        sink.put(input[i]); // for instance: `\]` => `]`
                        break;
                    case '\\':
                        sink.put(`\\`); // `\\` => `\\`
                        break;
                    default:
                        sink.put('\\');
                        sink.put(input[i]);
                        break;
                    }
                }
                else
                    sink.put(input[i]);
                i += 1;
                sink.put("')");
            }
        }
        return sink;
    }
    /// A character class always matches exactly one `dchar`.
    override DcharCountSpan dcharCountSpan() const pure @nogc
    {
        return typeof(return)(1);
    }
    override bool opEquals(const scope Pattern that) const @nogc
    {
        if (auto that_ = cast(const typeof(this))that)
            return head.input == that_.head.input;
        return false;
    }
}

/// A `// ...` comment token kept in the parse stream.
@safe final class LineComment : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// A `/* ... */` comment token kept in the parse stream.
@safe final class BlockComment : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Grammar named `name`.
@safe final class Grammar : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
        putchar(' ');
        showChars(name);
        showChars(";\n");
    }
nothrow:
    this(Token head, Input name) @nogc
    {
        super(head);
        this.name = name;
    }
    final override bool equals(const Node o) const
    {
        if (this is o)
            return true;
        if (const o_ = cast(const typeof(this))o)
            return (head == o_.head &&
                    name == o_.name);
        return false;
    }
    Input name;
}

/// Lexer grammar named `name`.
@safe final class LexerGrammar : TokenNode
{
nothrow:
    this(Token head, Input name) @nogc
    {
        super(head);
        this.name = name;
    }
    Input name;
}

/** Parser grammar named `name`.
 *
 * See_Also: https://theantlrguy.atlassian.net/wiki/spaces/ANTLR3/pages/2687210/Quick+Starter+on+Parser+Grammars+-+No+Past+Experience+Required
 */
@safe final class ParserGrammar : TokenNode
{
nothrow:
    this(Token head, Input name) @nogc
    {
        super(head);
        this.name = name;
    }
    Input name;
}

/// Import of `modules`.
@safe final class Import : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
        putchar(' ');
        foreach (const i, const m ; moduleNames)
        {
            if (i)
                putchar(',');
            showChars(m);
        }
        putchar(';');
        putchar('\n');
    }
nothrow:
    this(Token head, DynamicArray!(Input) moduleNames) @nogc
    {
        super(head);
        move(moduleNames, this.moduleNames);
    }
    DynamicArray!(Input) moduleNames;
}

/// `mode NAME;` statement.
@safe final class Mode : TokenNode
{
nothrow:
    this(Token head, Input name) @nogc
    {
        super(head);
        this.name = name;
    }
    Input name;
}

/// `options { ... }` block; `code` holds the action token.
@safe final class Options : TokenNode
{
nothrow:
    this(Token head, Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    Input name; // NOTE(review): never assigned by this constructor — possibly vestigial
    Token code;
}

/// `header { ... }` block with optional literal `name`.
@safe final class Header : TokenNode
{
nothrow:
    this(Token head, Token name, Token code) @nogc
    {
        super(head);
        this.name = name;
        this.code = code;
    }
    Token name;
    Token code;
}

/// `scope NAME { ... }`: a scoped action attached to symbol `name`.
@safe final class ScopeSymbolAction : TokenNode
{
nothrow:
    this(Token head,
         Input name,
         Token code) @nogc
    {
        super(head);
        this.name = name;
        this.code = code;
    }
    Input name;
    Token code;
}

/// `scope NAME;`: a scope reference without an action body.
@safe final class ScopeSymbol : TokenNode
{
nothrow:
    this(Token head,
         Input name) @nogc
    {
        super(head);
        this.name = name;
    }
    Input name;
}

/// `scope { ... }`: an anonymous scoped action.
@safe final class ScopeAction : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head,
         Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    Token code;
}

/// Attribute symbol followed by an action block.
@safe final class AttributeSymbol : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head, Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    Token code;
}

/// Bare action block `{ ... }`.
@safe final class Action : TokenNode
{
nothrow:
    this(Token head) @nogc
    {
        super(head);
    }
}

/// Action symbol (e.g. `@members`) followed by an action block.
@safe final class ActionSymbol : TokenNode
{
    override void show(in Format fmt = Format.init) const
    {
        showToken(head, fmt);
    }
nothrow:
    this(Token head, Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    Token code;
}

/// `channels { ... }` block.
@safe final class Channels : TokenNode
{
nothrow:
    this(Token head, Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    Token code;
}

/// `tokens { ... }` block.
@safe final class Tokens : TokenNode
{
nothrow:
    this(Token head, Token code) @nogc
    {
        super(head);
        this.code = code;
    }
    /// Deliberately emits nothing: a `tokens` block generates no matcher code.
    override void toMatchInSource(scope ref Output sink, const scope GxParserByStatement parser) const
    {
    }
    Token code;
}

/// `class NAME extends BASE;` statement (ANTLR 2).
@safe final class Class : TokenNode
{
nothrow:
    this(Token head, Input name, Input baseName) @nogc
    {
        super(head);
        this.name = name;
        this.baseName = baseName;
    }
    Input name;
    Input baseName; ///< Base class name.
}

alias Imports = DynamicArray!(Import, Mallocator, uint);
alias Rules = DynamicArray!(Rule, Mallocator, uint);
alias SymbolRefs = DynamicArray!(SymbolRef, Mallocator, uint);

/** Gx parser with range interface over all statements.
 *
 * See: `ANTLRv4Parser.g4`
 */
@safe class GxParserByStatement
{
    this(Input input,
         const string path = null,
         in bool includeComments = false)
    {
        _lexer = GxLexer(input, path, includeComments);
        if (!_lexer.empty)
            _front = nextFront();
    }

    /// Range primitive: `true` when all statements have been consumed.
    @property bool empty() const nothrow scope @nogc
    {
        version(D_Coverage) {} else version(Do_Inline) version(LDC) pragma(inline, true);
        return _front is null;
    }

    /// Range primitive: current top-level statement node.
    inout(Node) front() inout scope return
    {
        version(D_Coverage) {} else version(Do_Inline) pragma(inline, true);
        assert(!empty);
        return _front;
    }

    /// Range primitive: advance to the next statement (or become empty).
    void popFront()
    {
        version(D_Coverage) {} else version(Do_Inline) version(LDC) pragma(inline, true);
        assert(!empty);
        if (_lexer.empty)
            _front = null; // make `this` empty
        else
            _front = nextFront();
    }

    /** Parse one rule body (everything between the `:` after `name` and the
     * terminating `;`) into a `Rule` and register it in `rules`/`rulesByName`.
     *
     * Alternatives separated by `|` are collected in `alts`; within one
     * alternative, nodes are accumulated on the temporary stack `tseq` and
     * postfix/prefix operators (`?`, `*`, `+`, `~`, `..`, …) fold the stack
     * as they are encountered.
     *
     * NOTE(review): `actionSymbol` and `action` parameters are currently unused.
     */
    private Rule makeRule(Token name,
                          in bool isFragmentRule,
                          ActionSymbol actionSymbol = null,
                          Action action = null)
    {
        _lexer.popFrontEnforce(TOK.colon, "no colon");

        bool skipFlag; // set when a `-> skip` rewrite is seen

        static if (useStaticTempArrays)
            FixedArray!(Pattern, 100) alts;
        else
            PatternArray alts;

        while (_lexer.front.tok != TOK.semicolon)
        {
            size_t parentDepth = 0; // parenthesis nesting depth

            // temporary node sequence stack
            static if (useStaticTempArrays)
                FixedArray!(Node, 70) tseq; // doesn't speed up that much
            else
                NodeArray tseq;

            // Push `last` onto `tseq`, first resolving any pending sentinel
            // operator (`..`, `~`) or terminated unary pattern on the stack top.
            void seqPutCheck(Node last)
            {
                if (last is null)
                    return _lexer.warningAtToken(name, "empty sequence");
                if (!_lexer.empty &&
                    _lexer.front.tok == TOK.dotdot)
                    return tseq.put(last); // `..` has higher precedence
                if (!tseq.empty)
                {
                    if (auto dotdot = cast(DotDotSentinel)tseq.back) // binary operator
                    {
                        tseq.popBack(); // pop `DotDotSentinel`
                        return seqPutCheck(new Range(dotdot.head,
                                                     [cast(Pattern)tseq.backPop(), cast(Pattern)last]));
                    }
                    if (auto tilde = cast(TildeSentinel)tseq.back) // prefix unary operator
                    {
                        tseq.popBack(); // pop `TildeSentinel`
                        return seqPutCheck(new NotPattern(tilde.head,
                                                          cast(Pattern)last));
                    }
                    if (auto ng = cast(TerminatedUnaryOpPattern)tseq.back)
                    {
                        Pattern lastp = cast(Pattern)last;
                        assert(lastp);
                        ng.terminator = lastp;
                    }
                }
                return tseq.put(last);
            }

            while ((parentDepth != 0 ||
                    _lexer.front.tok != TOK.pipe) &&
                   _lexer.front.tok != TOK.semicolon)
            {
                // TODO: use static array with length being number of tokens till `TOK.pipe`
                const head = _lexer.frontPop();

                // Fold everything since the nearest '(' or '|' sentinel into one Seq.
                void groupLastSeq() @safe nothrow
                {
                    // find backwards index `ih` in `tseq` at '(' or '|'. TODO: reuse `lastIndexOf`
                    size_t ih = tseq.length;
                    foreach_reverse (const i, const e; tseq)
                    {
                        if (auto sym = cast(const PipeSentinel)e)
                        {
                            ih = i;
                            break;
                        }
                        else if (auto sym = cast(const LeftParenSentinel)e)
                        {
                            ih = i;
                            break;
                        }
                    }
                    if (ih == tseq.length)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    Pattern nseq = makeSeq(tseq[ih + 1 .. $], _lexer);
                    tseq.popBackN(tseq.length - (ih + 1)); // exclude op sentinel
                    tseq.insertBack(nseq); // put it back
                }

                switch (head.tok)
                {
                case TOK.symbol:
                    if (head.input == "options")
                        auto _ = makeRuleOptions(head, true);
                    else
                    {
                        if (_lexer.front.tok == TOK.colon)
                        {
                            _lexer.popFront();
                            continue; // skip element label: SYMBOL '.'. See_Also: https://www.antlr2.org/doc/metalang.html section "Element Labels"
                        }
                        auto sref = new SymbolRef(head);
                        seqPutCheck(sref);
                        symbolRefs.insertBack(sref);
                    }
                    break;
                case TOK.literal:
                    seqPutCheck(new StrLiteral(head));
                    break;
                case TOK.qmark:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    Pattern node;
                    if (auto oom = cast(GreedyOneOrMore)tseq.back)
                    {
                        // See_Also: https://stackoverflow.com/questions/64706408/rewriting-x-as-x-in-antlr-grammars
                        node = new GreedyZeroOrMore(head, oom.sub);
                        tseq.popBack();
                        _lexer.warningAtToken(head, "read `(X+)?` as `X*`");
                    }
                    else
                        node = new GreedyZeroOrOne(head, tseq.backPop());
                    seqPutCheck(node);
                    break;
                case TOK.star:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new GreedyZeroOrMore(head, tseq.backPop()));
                    break;
                case TOK.plus:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new GreedyOneOrMore(head, tseq.backPop()));
                    break;
                case TOK.qmarkQmark:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new NonGreedyZeroOrOne(head, tseq.backPop()));
                    break;
                case TOK.starQmark:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new NonGreedyZeroOrMore(head, tseq.backPop()));
                    break;
                case TOK.plusQmark:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new NonGreedyOneOrMore(head, tseq.backPop()));
                    break;
                case TOK.rewriteSyntacticPredicate:
                    if (tseq.empty)
                        _lexer.errorAtToken(head, "missing left-hand side");
                    seqPutCheck(new RewriteSyntacticPredicate(head, tseq.backPop()));
                    break;
                case TOK.tilde:
                    tseq.put(new TildeSentinel(head)); // resolved later in seqPutCheck
                    break;
                case TOK.pipe:
                    if (tseq.empty)
                    {
                        _lexer.warningAtFront("missing left-hand side");
                        continue;
                    }
                    else if (const symbol = cast(LeftParenSentinel)tseq.back)
                    {
                        _lexer.warningAtToken(symbol.head, "missing left-hand side");
                        continue;
                    }
                    groupLastSeq();
                    tseq.put(new PipeSentinel(head));
                    break;
                case TOK.dotdot:
                    tseq.put(new DotDotSentinel(head)); // resolved later in seqPutCheck
                    break;
                case TOK.wildcard:
                    seqPutCheck(new AnyClass(head));
                    break;
                case TOK.brackets:
                    seqPutCheck(parseCharAltM(new CharAltM(head), _lexer));
                    break;
                case TOK.hash:
                case TOK.rewrite:
                    // ignore `head`
                    if (_lexer.front == Token(TOK.symbol, "skip"))
                    {
                        skipFlag = true;
                        _lexer.popFront();
                    }
                    while (_lexer.front.tok != TOK.pipe &&
                           _lexer.front.tok != TOK.semicolon)
                    {
                        _lexer.warningAtFront("TODO: use rewrite argument");
                        _lexer.popFront(); // ignore for now
                    }
                    break;
                case TOK.leftParen:
                    parentDepth += 1;
                    tseq.put(new LeftParenSentinel(head));
                    break;
                case TOK.rightParen:
                    parentDepth -= 1;

                    groupLastSeq();

                    // find matching '(' if any
                    size_t si = tseq.length; // left paren sentinel index
                    LeftParenSentinel ss; // left parent index Symbol
                    foreach_reverse (const i, Node node; tseq[])
                    {
                        if (auto lp = cast(LeftParenSentinel)node)
                        {
                            si = i;
                            ss = lp;
                            break;
                        }
                    }

                    // NOTE(review): if no `LeftParenSentinel` is found, `si`
                    // stays `tseq.length` and `tseq[si + 1 .. $]` below is out
                    // of bounds — presumably a matching '(' always exists here;
                    // confirm.
                    PatternArray asubs; // TODO: use stack allocation of length tseq[si .. $].length - number of `PipeSentinel`s
                    Token ahead;
                    foreach (e; tseq[si + 1.. $])
                    {
                        if (auto ps = cast(PipeSentinel)e)
                        {
                            // NOTE(review): condition looks inverted — `ahead`
                            // starts as `Token.init`, so with `!=` it is never
                            // assigned; `== Token.init` seems intended to
                            // capture the FIRST '|' as the alternative's head.
                            if (ahead != Token.init)
                                ahead = ps.head; // use first '|' as head of alternative below
                        }
                        else
                        {
                            Pattern pe = cast(Pattern)e;
                            assert(pe);
                            asubs.put(pe);
                        }
                    }
                    if (asubs.empty)
                    {
                        auto nothing = new SeqM(ss.head);
                        tseq.popBack(); // pop '('
                        seqPutCheck(nothing);
                    }
                    else
                    {
                        auto lalt = makeAltA(ahead, asubs.move());
                        tseq.popBackN(tseq.length - si);
                        Node[1] ssubs = [lalt];
                        seqPutCheck(makeSeq(ssubs, _lexer));
                    }
                    break;
                case TOK.action:
                    // ignore action
                    _lexer.skipOverTOK(TOK.qmark); // TODO: handle in a more generic way
                    break;
                case TOK.labelAssignment:
                    // ignore for now: SYMBOL '='
                    if (!cast(OtherSymbol)tseq.back)
                        _lexer.errorAtFront("non-symbol before label assignment");
                    tseq.popBack(); // ignore
                    break;
                case TOK.tokenSpecOptions:
                    // ignore
                    break;
                case TOK.colon:
                    // ignore
                    _lexer.warningAtFront("ignoring colon with no effect");
                    continue;
                case TOK.rootNode:
                    /* AST root operator. When generating abstract syntax trees
                     * (ASTs), token references suffixed with the "^" root
                     * operator force AST nodes to be created and added as the
                     * root of the current tree. This symbol is only effective
                     * when the buildAST option is set. More information about
                     * ASTs is also available. */
                    // ignore
                    break;
                case TOK.exclamation:
                    /* AST exclude operator. When generating abstract syntax
                     * trees, token references suffixed with the "!" exclude
                     * operator are not included in the AST constructed for that
                     * rule. Rule references can also be suffixed with the
                     * exclude operator, which implies that, while the tree for
                     * the referenced rule is constructed, it is not linked into
                     * the tree for the referencing rule. This symbol is only
                     * effective when the buildAST option is set. More
                     * information about ASTs is also available. */
                    // ignore
                    break;
                case TOK.semicolon:
                    assert(false); // unreachable: outer loop condition excludes ';'
                default:
                    _lexer.infoAtFront("TODO: unhandled token type" ~ _lexer.front.to!string);
                    seqPutCheck(new OtherSymbol(head));
                    break;
                }
            }
            if (!tseq.length) // may be empty
            {
                // _lexer.warningAtFront("empty rule sequence");
            }
            static if (useStaticTempArrays)
            {
                alts.put(makeSeq(tseq[], _lexer));
                tseq.clear();
            }
            else
            {
                alts.put(makeSeq(tseq[], _lexer)); // TODO: use `tseq.move()` when tseq is a `PatternArray`
                tseq.clear();
            }
            if (_lexer.front.tok == TOK.pipe)
                _lexer.popFront(); // skip terminator
        }

        _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");

        // needed for ANTLRv2.g2:
        if (!_lexer.empty)
        {
            // if (_lexer.front == Token(TOK.symbol, "exception"))
            //     _lexer.popFront();
            // if (_lexer.front == Token(TOK.symbol, "catch"))
            //     _lexer.popFront();
            if (_lexer.front.tok == TOK.brackets)
                _lexer.popFront();
            if (_lexer.front.tok == TOK.action)
                _lexer.popFront();
        }

        static if (useStaticTempArrays)
        {
            Pattern root = alts.length == 1 ? alts.backPop() : makeAltM(Token.init, alts[]);
            alts.clear();
        }
        else
            Pattern root = alts.length == 1 ? alts.backPop() : makeAltA(Token.init, alts.move());

        Rule rule = (isFragmentRule
                     ? new FragmentRule(name, root, skipFlag)
                     : new Rule(name, root, skipFlag));

        if (_lexer._diagnoseLeftRecursion)
            rule.diagnoseDirectLeftRecursion(_lexer);

        // insert rule
        const warnDuplicateRulePattern = false; // this is normally not an error, for instance, in `oncrpcv2.g4`
        if (warnDuplicateRulePattern)
            foreach (const existingRule; rules)
            {
                if (rule.root.opEquals(existingRule.root))
                {
                    _lexer.warningAtToken(rule.head, "rule with same root pattern"); // TODO: error
                    _lexer.warningAtToken(existingRule.head, "existing definition here"); // TODO: errorSupplemental
                }
            }
        rules.insertBack(rule);
        rulesByName.update(rule.head.input, // See_Also: https://dlang.org/spec/hash-map.html#advanced_updating
                           {
                               return rule;
                           },
                           (const scope ref Rule existingRule)
                           {
                               _lexer.warningAtToken(rule.head, "rule with same name already exists"); // TODO: error
                               _lexer.warningAtToken(existingRule.head, "existing definition here"); // TODO: errorSupplemental
                               return rule;
                           });

        return rule;
    }

    /// Pop a `separator`-separated list of symbols terminated by `terminator`.
    DynamicArray!(Input) makeArgs(in TOK separator,
                                  in TOK terminator)
    {
        typeof(return) result;
        while (true)
        {
            result.put(_lexer.frontPopEnforce(TOK.symbol).input);
            if (_lexer.front.tok != separator)
                break;
            _lexer.popFront();
        }
        _lexer.popFrontEnforce(terminator, "no terminating semicolon");
        return result;
    }

    AttributeSymbol makeAttributeSymbol(Token head) nothrow
    {
        return new AttributeSymbol(head, _lexer.frontPopEnforce(TOK.action, "missing action"));
    }

    ActionSymbol makeActionSymbol(Token head) nothrow
    {
        return new ActionSymbol(head, _lexer.frontPopEnforce(TOK.action, "missing action"));
    }

    /// Parse a `scope` statement into one of the three scope node kinds.
    TokenNode makeScope(Token head)
    {
        if (_lexer.front.tok == TOK.symbol)
        {
            const symbol = _lexer.frontPop().input;
            if (_lexer.front.tok == TOK.action)
                return new ScopeSymbolAction(head, symbol,
                                             _lexer.frontPopEnforce(TOK.action, "missing action"));
            else
            {
                auto result = new ScopeSymbol(head, symbol);
                _lexer.frontPopEnforce(TOK.semicolon,
                                       "missing terminating semicolon");
                return result;
            }
        }
        else
        {
            return new ScopeAction(head,
                                   _lexer.frontPopEnforce(TOK.action, "missing action"));
        }
    }

    /// Parse an `import a, b;` statement and record it in `imports`.
    Import makeImport(Token head)
    {
        auto import_ = new Import(head, makeArgs(TOK.comma, TOK.semicolon));
        this.imports.put(import_);
        return import_;
    }

    /// Parse a `class NAME (extends BASE)?;` statement (ANTLR 2).
    TokenNode makeClass(Token head)
    {
        auto result = new Class(head,
                                _lexer.frontPopEnforce(TOK.symbol, "missing symbol").input,
                                _lexer.skipOverToken(Token(TOK.symbol, "extends")).input ?
                                _lexer.frontPop().input :
                                null);
        _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");
        return result;
    }

    /// Pop the front token if it is the symbol `symbolIdentifier`; else `null`.
    OtherSymbol skipOverOtherSymbol(string symbolIdentifier) return
    {
        if (_lexer.front == Token(TOK.symbol, symbolIdentifier))
            return new typeof(return)(_lexer.frontPop());
        return null;
    }

    /// Skip over scope if any.
    TokenNode skipOverScope()
    {
        if (_lexer.front == Token(TOK.symbol, "scope"))
            return makeScope(_lexer.frontPop());
        return null;
    }

    Options makeRuleOptions(Token head,
                            in bool skipOverColon = false) nothrow
    {
        version(Do_Inline) version(LDC) pragma(inline, true);
        const action = _lexer.frontPopEnforce(TOK.action, "missing action");
        if (skipOverColon)
            _lexer.skipOverTOK(TOK.colon);
        return new Options(head, action);
    }

    Options makeTopOptions(Token head) nothrow
    {
        version(Do_Inline) version(LDC) pragma(inline, true);
        const action = _lexer.frontPopEnforce(TOK.action, "missing action");
        _lexer.skipOverTOK(TOK.colon); // optionally scoped. See_Also: https://stackoverflow.com/questions/64477446/meaning-of-colon-inside-parenthesises/64477817#64477817
        return new Options(head, action);
    }

    Channels makeChannels(Token head) nothrow
    {
        version(Do_Inline) version(LDC) pragma(inline, true);
        return new Channels(head, _lexer.frontPopEnforce(TOK.action, "missing action"));
    }

    Tokens makeTokens(Token head) nothrow
    {
        version(Do_Inline) version(LDC) pragma(inline, true);
        return new Tokens(head, _lexer.frontPopEnforce(TOK.action, "missing action"));
    }

    Header makeHeader(Token head)
    {
        const name = (_lexer.front.tok == TOK.literal ?
                      _lexer.frontPop() :
                      Token.init);
        const action = _lexer.frontPopEnforce(TOK.action, "missing action");
        return new Header(head, name, action);
    }

    Mode makeMode(Token head)
    {
        auto result = new Mode(head, _lexer.frontPop().input);
        _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");
        return result;
    }

    Action makeAction(Token head)
    {
        version(Do_Inline) version(LDC) pragma(inline, true);
        return new Action(head);
    }

    /// Skip over options if any.
    Options skipOverPreRuleOptions()
    {
        if (_lexer.front == Token(TOK.symbol, "options"))
            return makeRuleOptions(_lexer.frontPop());
        return null;
    }

    /// Consume a leading `!` if present. Returns: `true` if consumed.
    bool skipOverExclusion()
    {
        if (_lexer.front.tok == TOK.exclamation)
        {
            _lexer.frontPop();
            return true;
        }
        return false;
    }

    /// Consume a `returns` keyword if present. Returns: `true` if consumed.
    bool skipOverReturns()
    {
        if (_lexer.front == Token(TOK.symbol, "returns"))
        {
            _lexer.frontPop();
            return true;
        }
        return false;
    }

    /// Consume a bracketed hook spec if present. Returns: `true` if consumed.
    bool skipOverHooks()
    {
        if (_lexer.front.tok == TOK.brackets)
        {
            // _lexer.infoAtFront("TODO: use TOK.brackets");
            _lexer.frontPop();
            return true;
        }
        return false;
    }

    Action skipOverAction()
    {
        if (_lexer.front.tok == TOK.action)
            return makeAction(_lexer.frontPop());
        return null;
    }

    ActionSymbol skipOverActionSymbol()
    {
        if (_lexer.front.tok == TOK.actionSymbol)
            return makeActionSymbol(_lexer.frontPop);
        return null;
    }

    /** Dispatch on a leading symbol token: either the start of a rule
     * definition, a grammar declaration, or one of the keyword statements
     * (`options`, `tokens`, `import`, `fragment`, …).
     */
    Node makeRuleOrOther(Token head)
    {
        if (_lexer.front.tok == TOK.colon) // normal case
            return makeRule(head, false); // fast path

        if (head.input == "lexer" ||
            head.input == "parser" ||
            head.input == "grammar")
        {
            bool lexerFlag;
            bool parserFlag;
            if (head.input == "lexer")
            {
                lexerFlag = true;
                _lexer.popFrontEnforce(TOK.symbol, "expected `grammar` after `lexer`"); // TODO: enforce input grammar
            }
            else if (head.input == "parser")
            {
                parserFlag = true;
                _lexer.popFrontEnforce(TOK.symbol, "expected `grammar` after `parser`"); // TODO: enforce input grammar
            }

            if (lexerFlag)
            {
                auto lexerGrammar = new LexerGrammar(head, _lexer.frontPop().input);
                _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");
                return this.grammar = lexerGrammar;
            }
            else if (parserFlag)
            {
                auto parserGrammar = new ParserGrammar(head, _lexer.frontPop().input);
                _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");
                return this.grammar = parserGrammar;
            }
            else
            {
                if (_lexer.front.tok == TOK.colon)
                    return makeRule(head, false); // `grammar` used as a rule name
                else
                {
                    auto grammar = new Grammar(head, _lexer.frontPop().input);
                    _lexer.popFrontEnforce(TOK.semicolon, "no terminating semicolon");
                    this.grammar = grammar;
                    return grammar;
                }
            }
        }

        switch (head.input)
        {
        case `private`:
            _lexer.frontEnforce(TOK.symbol, "expected symbol after `private`");
            return makeRuleOrOther(_lexer.frontPop); // TODO: set private qualifier
        case `protected`:
            _lexer.frontEnforce(TOK.symbol, "expected symbol after `protected`");
            return makeRuleOrOther(_lexer.frontPop); // TODO: set protected qualifier
        case `channels`:
            return makeChannels(head);
        case `tokens`:
            return makeTokens(head);
        case `options`:
            auto options = makeTopOptions(head);
            optionsSet.insertBack(options);
            return options;
        case `header`:
            return makeHeader(head);
        case `mode`:
            return makeMode(head);
        case `class`:
            return makeClass(head);
        case `scope`:
            return makeScope(head);
        case `import`:
            return makeImport(head);
        case `fragment`: // lexer helper rule, not real token for parser.
            return makeRule(_lexer.frontPop(), true);
        default:
            // Not a keyword: skip any rule prelude (hooks, returns, options,
            // scope, actions) until the rule's ':' is reached.
            while (_lexer.front.tok != TOK.colon)
            {
                // TODO: use switch
                if (skipOverExclusion()) // TODO: use
                    continue;
                if (skipOverReturns()) // TODO: use
                    continue;
                if (skipOverHooks()) // TODO: use
                    continue;
                if (const _ = skipOverOtherSymbol("locals")) // TODO: use
                    continue;
                if (const _ = skipOverPreRuleOptions()) // TODO: use
                    continue;
                if (const _ = skipOverScope()) // TODO: use
                    continue;
                if (const _ = skipOverAction()) // TODO: use
                    continue;
                if (const _ = skipOverActionSymbol()) // TODO: use
                    continue;
                break; // no progression so done
            }
            return makeRule(head, false);
        }
    }

    /// Parse and return the next top-level statement node.
    Node nextFront()
    {
        const head = _lexer.frontPop();
        switch (head.tok)
        {
        case TOK.attributeSymbol:
            return makeAttributeSymbol(head);
        case TOK.actionSymbol:
            return makeActionSymbol(head);
        case TOK.blockComment:
            return new BlockComment(head);
        case TOK.lineComment:
            return new LineComment(head);
        case TOK.action:
            return new Action(head);
        case TOK.symbol:
            return makeRuleOrOther(head);
        default:
            _lexer.errorAtFront("TODO: handle");
            assert(false);
        }
    }

    TokenNode grammar; ///< The `grammar`/`lexer grammar`/`parser grammar` declaration, if seen.
    DynamicArray!(Options) optionsSet; ///< Top-level `options` blocks.
    Imports imports; ///< All `import` statements.
    Rules rules; ///< All parsed rules, in source order.
    RulesByName rulesByName; ///< Rules indexed by name.
    SymbolRefs symbolRefs; ///< All of `SymbolRef` instances.
    bool warnUnknownSymbolFlag;
private:
    GxLexer _lexer;
    Node _front; ///< Current statement of the range interface.
    Rule _rootRule; ///< Root rule for grammar.
}

/// Returns: `path` as module name.
string toPathModuleName(string path)
{
    // Rename directory names that collide with D keywords.
    string adjustDirectoryName(const return scope string name) pure nothrow @nogc
    {
        if (name == "asm") // TODO extend to check if a keyword
            return "asm_";
        return name;
    }

    const stripLeadingSlashesFlag = false;
    if (stripLeadingSlashesFlag)
        while (path[0] == '/' ||
               path[0] == '\\')
            path = path[1 .. $]; // strip leading '/'s

    const grammarsV4DirPath = "~/Work/grammars-v4".expandTilde; // TODO: move somewhere suitable

    // Make `path` relative to the grammars root, then map each path
    // separator to "__" and each '-' to '_' so the result is a valid
    // D module identifier, suffixed with "_parser".
    return path.expandTilde
               .relativePath(grammarsV4DirPath)
               .stripExtension
               .pathSplitter()
               .map!(_ => adjustDirectoryName(_))
               .joiner("__") // "/" => "__"
               .substitute('-', '_')
               .to!string ~ "_parser"; // TODO: use lazy ranges that return `char`;
}

/// Gx filer parser.
@safe class GxFileParser : GxParserByStatement
{
    /** Construct from the grammar file at `path`, sharing the parser
     * cache `cachedParsersByModuleName` across imported grammars.
     */
    this(string path,
         GxFileParser[string] cachedParsersByModuleName)
    {
        Input data = cast(Input)rawReadPath(path.expandTilde); // cast to Input because we don't want to keep all file around:
        super(data, path, false);
        this.cachedParsersByModuleName = cachedParsersByModuleName;
    }

    alias RuleNames = DynamicArray!string;

    /** Generate the D source of the parser into `output` and set
     * `moduleName` to the module name derived from the grammar path.
     */
    void generateParserSourceString(scope ref Output output,
                                    out string moduleName)
    {
        const path = _lexer.path;
        moduleName = path.toPathModuleName();

        output.put("/// Automatically generated from `");
        output.put(path);
        output.put("`.\n");
        output.put("module " ~ moduleName ~ q{;

});
        output.put(parserSourceBegin);
        toMatchers(output);
        output.put(parserSourceEnd);

        rootRule();             // TODO: use this
    }

    /** Get root rule.
     *
     * See_Also: https://github.com/antlr/grammars-v4/issues/2097
     * See_Also: https://stackoverflow.com/questions/29879626/antlr4-how-to-find-the-root-rules-in-a-gramar-which-maybe-used-to-find-the-star
     */
    @property Rule rootRule()
    {
        if (_rootRule)
            return _rootRule;   // cached from an earlier call
        retagReferencedRules();
        // The root rule is the unique non-skipped parser rule that no
        // other rule references; warn on unused lexer rules and on
        // multiple candidates along the way.
        foreach (Rule rule; rules)
        {
            if (rule.hasRef ||
                rule.skipFlag)
                continue;
            else if (rule.isFragmentRule)
                _lexer.warningAtToken(rule.head, "unused fragment rule");
            else if (rule.isLexerTokenRule)
                _lexer.warningAtToken(rule.head, "unused (lexical) lexer token rule"); // TODO: don't warn about skipped rules -> skip
            else if (_rootRule)
            {
                _lexer.warningAtToken(rule.head, "second root rule defined");
                _lexer.warningAtToken(_rootRule.head, " existing root rule defined here");
            }
            else
                _rootRule = rule;
        }
        if (!_rootRule)
            _lexer.warningAtToken(grammar.head, "missing root rule, all rule symbols are referenced (cyclic grammar)");
        return _rootRule;
    }

    /** Retag all referenced rules.
     */
    void retagReferencedRules()
    {
        untagReferencedRules();
        tagReferencedRules();
    }

    /** Untag all rules (as unreferenced).
     */
    void untagReferencedRules() nothrow
    {
        foreach (Rule rule; rulesByName.byValue)
            rule.hasRef = false;
    }

    /** Tag all referenced rules in either `this` or imported modules.
3873 */ 3874 void tagReferencedRules() 3875 { 3876 foreach (const symbolRef; symbolRefs) 3877 { 3878 if (symbolRef.head.input == "EOF") 3879 continue; 3880 if (tryTagReferencedRule(symbolRef)) 3881 continue; 3882 foreach (const Import import_; imports) 3883 foreach (const Input moduleName; import_.moduleNames) 3884 { 3885 const string path = _lexer.path; 3886 string cwd = path.dirName; // current working directory 3887 const string ext = path.extension; 3888 typeof(this) sub = findModuleUpwards(cwd, moduleName, ext, cachedParsersByModuleName); 3889 //debug writeln("sub: ", sub); 3890 if (sub.tryTagReferencedRule(symbolRef)) 3891 goto done; 3892 } 3893 _lexer.warningAtToken(symbolRef.head, "no symbol named `" ~ symbolRef.head.input ~ "`"); 3894 done: 3895 } 3896 } 3897 3898 bool tryTagReferencedRule(const SymbolRef symbolRef) nothrow @nogc 3899 { 3900 debug writeln("path: ", _lexer.path, 3901 " symbolRef.head=", symbolRef.head, 3902 " rulesByName.length=", rulesByName.length, 3903 " rules=", rules.length); 3904 assert(rulesByName.length); 3905 if (auto hit = symbolRef.head.input in rulesByName) 3906 { 3907 hit.hasRef = true; 3908 return true; 3909 } 3910 return false; 3911 } 3912 3913 void toMatchers(scope ref Output output) 3914 { 3915 RuleNames doneRuleNames; 3916 toMatchersForRules(doneRuleNames, output); 3917 toMatchersForImports(doneRuleNames, output); 3918 toMatchersForOptionsTokenVocab(doneRuleNames, output); 3919 } 3920 3921 void toMatchersForImportedModule(in const(char)[] moduleName, 3922 scope ref RuleNames doneRuleNames, 3923 scope ref Output output) scope 3924 { 3925 const string path = _lexer.path; 3926 string cwd = path.dirName; // current working directory 3927 const string ext = path.extension; 3928 3929 GxFileParser fp_ = findModuleUpwards(cwd, moduleName, ext, cachedParsersByModuleName); 3930 3931 while (!fp_.empty) 3932 fp_.popFront(); 3933 3934 fp_.toMatchersForImports(doneRuleNames, output); // transitive imports 3935 3936 /** Rules in the 
“main grammar” override rules from imported 3937 grammars to implement inheritance. 3938 See_Also: https://github.com/antlr/antlr4/blob/master/doc/grammars.md#grammar-imports 3939 */ 3940 bool isOverridden(const scope Rule rule) const @safe pure nothrow @nogc 3941 { 3942 return doneRuleNames[].canFind(rule.head.input); 3943 } 3944 3945 foreach (const Rule importedRule; fp_.rules) 3946 { 3947 if (isOverridden(importedRule)) // if `importedRule` has already been defined 3948 { 3949 fp_._lexer.warningAtToken(importedRule.head, "ignoring rule overridden in top grammar"); 3950 continue; 3951 } 3952 importedRule.toMatcherInSource(output, this); 3953 doneRuleNames.put(importedRule.head.input); 3954 } 3955 } 3956 3957 private static GxFileParser findModuleUpwards(const string cwd, 3958 scope const(char)[] moduleName, 3959 scope const string ext, 3960 GxFileParser[string] cachedParsersByModuleName) 3961 { 3962 if (auto existingParser = moduleName in cachedParsersByModuleName) 3963 { 3964 debug writeln("reusing existing parser for module named ", moduleName.idup); 3965 return *existingParser; 3966 } 3967 import std.file : FileException; 3968 string modulePath; 3969 () @trusted { 3970 /* TODO: avoid cast somehow 3971 https://forum.dlang.org/post/ypnssfszqfuhjremnsqo@forum.dlang.org 3972 */ 3973 modulePath = cast(string)chainPath(cwd, moduleName ~ ext).array; 3974 } (); 3975 try 3976 return cachedParsersByModuleName[moduleName.to!string] = new GxFileParser(modulePath, cachedParsersByModuleName); 3977 catch (Exception e) 3978 { 3979 const cwdNext = cwd.dirName; 3980 if (cwdNext == cwd) // stuck at top directory 3981 throw new FileException("couldn't find module named " ~ moduleName); // TODO: add source of import statement 3982 return findModuleUpwards(cwdNext, moduleName, ext, cachedParsersByModuleName); 3983 } 3984 } 3985 3986 void toMatchersForRules(scope ref RuleNames doneRuleNames, scope ref Output output) const scope 3987 { 3988 foreach (const Rule rule; rules) 3989 { 
3990 // rule.show(); 3991 rule.toMatcherInSource(output, this); 3992 doneRuleNames.put(rule.head.input); 3993 } 3994 } 3995 3996 void toMatchersForImports(scope ref RuleNames doneRuleNames, scope ref Output output) scope 3997 { 3998 foreach (const import_; imports) 3999 foreach (const moduleName; import_.moduleNames) 4000 toMatchersForImportedModule(moduleName, doneRuleNames, output); 4001 } 4002 4003 void toMatchersForOptionsTokenVocab(scope ref RuleNames doneRuleNames, scope ref Output output) scope 4004 { 4005 foreach (const options; optionsSet[]) 4006 { 4007 const(char)[] co = options.code.input; 4008 4009 void skipWhitespace() 4010 { 4011 import std.algorithm.comparison : among; 4012 size_t i; 4013 while (co.length && 4014 co[i].among!(GxLexer.whiteChars)) 4015 i += 1; 4016 co = co[i .. $]; 4017 } 4018 4019 co.skipOverAround('{', '}'); 4020 4021 skipWhitespace(); 4022 4023 // See_Also: https://stackoverflow.com/questions/28829049/antlr4-any-difference-between-import-and-tokenvocab 4024 if (co.skipOver("tokenVocab")) 4025 { 4026 skipWhitespace(); 4027 co.skipOver('='); 4028 skipWhitespace(); 4029 if (const ix = co.indexOfEither(" ;")) 4030 { 4031 const module_ = co[0 .. ix]; 4032 toMatchersForImportedModule(module_, doneRuleNames, output); 4033 } 4034 } 4035 } 4036 } 4037 GxFileParser[string] cachedParsersByModuleName; 4038 } 4039 4040 static immutable parserSourceBegin = 4041 `alias Input = const(char)[]; 4042 4043 @safe struct Match 4044 { 4045 @safe pure nothrow @nogc: 4046 static Match zero() 4047 { 4048 return typeof(return)(0); 4049 } 4050 static Match none() 4051 { 4052 return typeof(return)(_length.max); 4053 } 4054 /// Match length in number of UTF-8 chars or 0 if empty. 
4055 @property uint length() 4056 { 4057 return _length; 4058 } 4059 bool opCast(U : bool)() const 4060 { 4061 return _length != _length.max; 4062 } 4063 this(size_t length) 4064 { 4065 assert(length <= _length.max); 4066 this._length = cast(typeof(_length))length; 4067 } 4068 const uint _length; // length == uint.max is no match 4069 } 4070 4071 /// https://forum.dlang.org/post/zcvjwdetohmklaxriswk@forum.dlang.org 4072 version(none) alias Matcher = Match function(lazy Matcher[] matchers...); 4073 4074 @safe struct Parser 4075 { 4076 Input inp; ///< Input. 4077 size_t off; ///< Current offset into inp. 4078 4079 Match EOF() pure nothrow @nogc 4080 { 4081 pragma(inline, true); 4082 if (inp[off] == '\r' && 4083 inp[off + 1] == '\n') // Windows 4084 { 4085 off += 2; 4086 return Match(2); 4087 } 4088 if (inp[off] == '\n' || // Unix/Linux 4089 inp[off] == '\r') // Mac? 4090 { 4091 off += 1; 4092 return Match(1); 4093 } 4094 return Match.none(); 4095 } 4096 4097 Match any() pure nothrow @nogc 4098 { 4099 version(LDC) pragma(inline, true); 4100 if (off == inp.length) // TODO: 4101 return Match.none(); 4102 off += 1; 4103 return Match(1); 4104 } 4105 4106 Match ch(in char x) pure nothrow @nogc 4107 { 4108 version(LDC) pragma(inline, true); 4109 if (off == inp.length) // TODO: 4110 return Match.none(); 4111 if (inp[off] == x) 4112 { 4113 off += 1; 4114 return Match(1); 4115 } 4116 return Match.none(); 4117 } 4118 4119 Match dch(const dchar x) pure nothrow @nogc 4120 { 4121 import std.typecons : Yes; 4122 import std.utf : encode; 4123 char[4] ch4; 4124 const replacementChar = cast(dchar)0x110000; 4125 const n = encode!(Yes.useReplacementDchar)(ch4, replacementChar); 4126 if (ch4[0 .. n] == [239, 191, 189]) // encoding of replacementChar 4127 return Match.none(); 4128 if (off + n > inp.length) // TODO: 4129 return Match.none(); 4130 if (inp[off .. off + n] == ch4[0 .. 
n]) 4131 { 4132 off += n; 4133 return Match(n); 4134 } 4135 return Match.none(); 4136 } 4137 4138 Match cc(string cclass)() pure nothrow @nogc 4139 { 4140 pragma(inline, true); 4141 off += 1; // TODO: switch on cclass 4142 if (off > inp.length) // TODO: 4143 return Match.none(); 4144 return Match(1); 4145 } 4146 4147 /// Match string x. 4148 Match str(const scope string x) pure nothrow @nogc 4149 { 4150 pragma(inline, true); 4151 if (off + x.length <= inp.length && // TODO: optimize by using null-sentinel 4152 inp[off .. off + x.length] == x) // inp[off .. $].startsWith(x) 4153 { 4154 off += x.length; 4155 return Match(x.length); 4156 } 4157 return Match.none(); 4158 } 4159 4160 Match seq(Matchers...)(const scope lazy Matchers matchers) 4161 { 4162 const off0 = off; 4163 static foreach (const matcher; matchers) 4164 {{ // scoped 4165 const match = matcher(); 4166 if (!match) 4167 { 4168 off = off0; // backtrack 4169 return match; // propagate failure 4170 } 4171 }} 4172 return Match(off - off0); 4173 } 4174 4175 Match alt(Matchers...)(const scope lazy Matchers matchers) 4176 { 4177 static foreach (const matcher; matchers) 4178 {{ // scoped 4179 const off0 = off; 4180 if (const match = matcher()) 4181 return match; 4182 else 4183 off = off0; // backtrack 4184 }} 4185 return Match.none(); 4186 } 4187 4188 Match not(Matcher)(const scope lazy Matcher matcher) 4189 { 4190 const off0 = off; 4191 const match = matcher(); 4192 if (!match) 4193 return match; 4194 off = off0; // backtrack 4195 return Match.none(); 4196 } 4197 4198 Match alt2(char a, char b)() pure nothrow @nogc 4199 { 4200 pragma(inline, true); 4201 const x = inp[off]; 4202 if (x == a || 4203 x == b) 4204 { 4205 off += 1; 4206 return Match(1); 4207 } 4208 return Match.none(); 4209 } 4210 4211 Match alt3(char a, char b, char c)() pure nothrow @nogc 4212 { 4213 pragma(inline, true); 4214 const x = inp[off]; 4215 if (x == a || 4216 x == b || 4217 x == c) 4218 { 4219 off += 1; 4220 return Match(1); 4221 } 4222 
return Match.none(); 4223 } 4224 4225 Match alt4(char a, char b, char c, char d)() pure nothrow @nogc 4226 { 4227 pragma(inline, true); 4228 const x = inp[off]; 4229 if (x == a || 4230 x == b || 4231 x == c || 4232 x == d) 4233 { 4234 off += 1; 4235 return Match(1); 4236 } 4237 return Match.none(); 4238 } 4239 4240 Match alt5(char a, char b, char c, char d, char e)() pure nothrow @nogc 4241 { 4242 pragma(inline, true); 4243 const x = inp[off]; 4244 if (x == a || 4245 x == b || 4246 x == c || 4247 x == d || 4248 x == e) 4249 { 4250 off += 1; 4251 return Match(1); 4252 } 4253 return Match.none(); 4254 } 4255 4256 Match altN(chars...)() pure nothrow @nogc // TODO: non-char type in chars 4257 { 4258 pragma(inline, true); 4259 import std.algorithm.comparison : among; // TODO: replace with switch over static foreach to speed up compilation 4260 const x = inp[off]; 4261 if (x.among!(chars)) 4262 { 4263 off += 1; // TODO: skip over number of chars needed to encode hit 4264 return Match(1); 4265 } 4266 return Match.none(); 4267 } 4268 4269 Match rng(in char lower, in char upper) pure nothrow @nogc 4270 { 4271 pragma(inline, true); 4272 const x = inp[off]; 4273 if (lower <= x && 4274 x <= upper) 4275 { 4276 off += 1; 4277 return Match(1); 4278 } 4279 return Match.none(); 4280 } 4281 4282 Match rng(in dchar lower, in dchar upper) pure nothrow @nogc 4283 { 4284 pragma(inline, true); 4285 // TODO: decode dchar at inp[off] 4286 const x = inp[off]; 4287 if (lower <= x && 4288 x <= upper) 4289 { 4290 off += 1; // TODO: handle dchar at inp[off] 4291 return Match(1); 4292 } 4293 return Match.none(); 4294 } 4295 4296 Match gzm(Matcher)(const scope lazy Matcher matcher) 4297 { 4298 const off0 = off; 4299 while (true) 4300 { 4301 const off1 = off; 4302 const match = matcher(); 4303 if (!match) 4304 { 4305 off = off1; // backtrack 4306 break; 4307 } 4308 } 4309 return Match(off - off0); 4310 } 4311 4312 Match gzo(Matcher)(const scope lazy Matcher matcher) 4313 { 4314 const off0 = off; 
4315 const match = matcher(); 4316 if (!match) 4317 { 4318 off = off0; // backtrack 4319 return Match.none(); 4320 } 4321 return Match(off - off0); 4322 } 4323 4324 Match gom(Matcher)(const scope lazy Matcher matcher) 4325 { 4326 const off0 = off; 4327 const match0 = matcher; 4328 if (!match0) 4329 { 4330 off = off0; // backtrack 4331 return Match.none(); 4332 } 4333 while (true) 4334 { 4335 const off1 = off; 4336 const match1 = matcher; 4337 if (!match1) 4338 { 4339 off = off1; // backtrack 4340 break; 4341 } 4342 } 4343 return Match(off - off0); 4344 } 4345 4346 // TODO merge overloads of nzo by using a default type and value for Matcher2 4347 Match nzo(Matcher1)(const scope lazy Matcher1 matcher) 4348 { 4349 const off0 = off; 4350 off = off0; // backtrack 4351 const match = matcher(); 4352 if (!match) 4353 { 4354 off = off0; // backtrack 4355 return Match.none(); 4356 } 4357 return Match(off - off0); 4358 } 4359 Match nzo(Matcher1, Matcher2)(const scope lazy Matcher1 matcher, const scope lazy Matcher2 terminator) 4360 { 4361 const off0 = off; 4362 if (terminator()) 4363 { 4364 off = off0; // backtrack 4365 return Match.zero(); // done 4366 } 4367 off = off0; // backtrack 4368 const match = matcher(); 4369 if (!match) 4370 { 4371 off = off0; // backtrack 4372 return Match.none(); 4373 } 4374 return Match(off - off0); 4375 } 4376 4377 // TODO merge overloads of nzm by using a default type and value for Matcher2 4378 Match nzm(Matcher1)(const scope lazy Matcher1 matcher) 4379 { 4380 const off0 = off; 4381 while (true) 4382 { 4383 const off1 = off; 4384 off = off1; // backtrack 4385 const off2 = off; 4386 const match = matcher(); 4387 if (!match) 4388 { 4389 off = off2; // backtrack 4390 break; 4391 } 4392 } 4393 return Match(off - off0); 4394 } 4395 Match nzm(Matcher1, Matcher2)(const scope lazy Matcher1 matcher, const scope lazy Matcher2 terminator) 4396 { 4397 const off0 = off; 4398 while (true) 4399 { 4400 const off1 = off; 4401 if (terminator()) 4402 { 4403 off 
= off1; // backtrack 4404 return Match(off1 - off0); // done 4405 } 4406 off = off1; // backtrack 4407 const off2 = off; 4408 const match = matcher(); 4409 if (!match) 4410 { 4411 off = off2; // backtrack 4412 break; 4413 } 4414 } 4415 return Match(off - off0); 4416 } 4417 4418 // TODO merge overloads of nom by using a default type and value for Matcher2 4419 Match nom(Matcher1)(const scope lazy Matcher1 matcher) 4420 { 4421 const off0 = off; 4422 bool firstFlag; 4423 while (true) 4424 { 4425 const off1 = off; 4426 off = off1; // backtrack 4427 const off2 = off; 4428 const match = matcher(); 4429 if (!match) 4430 { 4431 off = off2; // backtrack 4432 break; 4433 } 4434 firstFlag = true; 4435 } 4436 if (!firstFlag) 4437 { 4438 off = off0; // backtrack 4439 return Match.none(); 4440 } 4441 return Match(off - off0); 4442 } 4443 Match nom(Matcher1, Matcher2)(const scope lazy Matcher1 matcher, const scope lazy Matcher2 terminator) 4444 { 4445 const off0 = off; 4446 bool firstFlag; 4447 while (true) 4448 { 4449 const off1 = off; 4450 if (terminator()) 4451 { 4452 off = off1; // backtrack 4453 return Match(off1 - off0); // done 4454 } 4455 off = off1; // backtrack 4456 const off2 = off; 4457 const match = matcher(); 4458 if (!match) 4459 { 4460 off = off2; // backtrack 4461 break; 4462 } 4463 firstFlag = true; 4464 } 4465 if (!firstFlag) 4466 { 4467 off = off0; // backtrack 4468 return Match.none(); 4469 } 4470 return Match(off - off0); 4471 } 4472 4473 Match syn(Matcher)(const scope lazy Matcher matcher) 4474 { 4475 return Match.zero(); // pass, backtracking is performed by default 4476 } 4477 4478 `; 4479 4480 static immutable parserSourceEnd = 4481 `} // struct Parser 4482 `; 4483 4484 @safe struct GxFileReader 4485 { 4486 GxFileParser fp; 4487 this(string path, GxFileParser[string] cachedParsersByModuleName) 4488 { 4489 fp = new GxFileParser(path, cachedParsersByModuleName); 4490 while (!fp.empty) 4491 fp.popFront(); 4492 } 4493 4494 string 
createParserSourceFilePath(out string moduleName)
    {
        Output pss;
        fp.generateParserSourceString(pss, moduleName);
        import std.file : write;
        const path = fp._lexer.path;
        // Write the generated parser into the temp dir as `<grammarBase>_parser.d`.
        const ppath = chainPath(tempDir(), path.baseName.stripExtension).array ~ "_parser.d";
        write(ppath, pss[]);
        debug writeln("Wrote ", ppath);
        return ppath.to!(typeof(return));
    }

    ~this() @nogc {}
}

/// Handle to a D source file.
@safe struct SourceFile
{
    this(string name, scope const(char)[] stdioOpenmode = "rb") @safe
    {
        _file = File(name, stdioOpenmode);
    }
    private File _file;
    alias _file this;
}

/// Handle to a compiled object file.
@safe struct ObjectFile
{
    this(string name, scope const(char)[] stdioOpenmode = "rb") @safe
    {
        _file = File(name, stdioOpenmode);
    }
    private File _file;
    alias _file this;
}

/// Handle to an executable file.
@safe struct ExecutableFile
{
    this(string name, scope const(char)[] stdioOpenmode = "rb") @safe
    {
        _file = File(name, stdioOpenmode);
    }
    private File _file;         // fix: `private` for consistency with `SourceFile`/`ObjectFile`
    alias _file this;
}

static immutable mainName = `main`;

static immutable mainSource =
`int ` ~ mainName ~ `(string[] args)
{
    return 0;
}
`;

/** Create a D source file at `path` that imports all `parserModules` and
 * defines an empty `main`, used for link-checking the generated parsers.
 * The file is closed before being returned.
 */
SourceFile createMainFile(string path,
                          const string[] parserPaths,
                          const string[] parserModules)
{
    assert(parserPaths.length == parserModules.length);
    auto file = typeof(return)(path, "w");
    foreach (const index, const ppath; parserPaths)
    {
        file.write(`import `);
        file.write(parserModules[index]);
        file.write(";\n");
    }

    file.write("\n");           // separator
    file.write(mainSource);
    file.close();
    return file;
}

/// Build the D source files `parserPaths`.
string buildSourceFiles(const string[] parserPaths,
                        const string[] parserModules,
                        in bool linkFlag = false)
{
    import std.process : execute;

    const mainFilePath = buildPath(tempDir(), "gxmain.d");
    const string mainDirPath = dirName(mainFilePath); // NOTE(review): unused — confirm before removing

    // NOTE(review): `mainFile` is not used after creation (the file is
    // already written and closed inside `createMainFile`).
    File mainFile = createMainFile(mainFilePath, parserPaths, parserModules);

    const parserName = "parser";
    const outFile = parserName ~ (linkFlag ? "" : ".o");
    // Compile-only (`-c`) unless linking, in which case the generated
    // main file is included to produce an executable.
    const args = (["dmd"] ~
                  (linkFlag ? [] : ["-c"]) ~
                  ["-dip25", "-dip1000", "-vcolumns", "-wi"] ~
                  parserPaths ~
                  (linkFlag ? [mainFilePath] : []) ~
                  ("-of=" ~ outFile));
    writeln("args:", args);
    const dmd = execute(args);
    if (dmd.status == 0)
        writeln("Compilation of ", parserPaths, " successful");
    else
        writeln("Compilation of ", parserPaths, " failed with output:\n",
                dmd.output);
    return outFile;
}

/// Returns: `true` iff `name` has an ANTLR v4 grammar file extension.
private bool isGxFilename(const scope char[] name) @safe pure nothrow @nogc
{
    return name.endsWith(`.g4`);
}

/// Returns: `true` iff grammar file `name` should be parsed (not on the known-failing skip list).
private bool isGxFilenameParsed(const scope char[] name) @safe pure nothrow @nogc
{
    if (!isGxFilename(name))
        return false;
    // Pick specific file:
    // if (name != `Arithmetic.g4`)
    //     return false;
    if (// TODO:
        name == `Python2.g4` ||
        name == `Python3.g4` ||
        name == `AltPython3.g4` ||
        name == `PythonParser.g4` ||
        // TODO:
        name == `ResourcePlanParser.g4` ||
        name == `SelectClauseParser.g4` ||
        name == `IdentifiersParser.g4` ||
        // TODO:
        name == `AspectJParser.g4` || // TODO: find rule for `annotationName` in apex.g4
        name == `AspectJLexer.g4` ||
        // TODO: missing tokens
        name == `FromClauseParser.g4` ||
        name == `TSqlParser.g4` ||
        name == `informix.g4` ||
        name == `icon.g4` ||
        name == `ANTLRv4Parser.g4` ||
        name == `JPA.g4` || // INT_NUMERAL missing
        name == `STParser.g4` ||
        name ==
`STGParser.g4` ||
        // TODO:
        name == `RexxParser.g4` ||
        name == `RexxLexer.g4` ||
        name == `StackTrace.g4` ||
        name == `memcached_protocol.g4`) // skip these known-problematic grammars for now
        return false;
    return true;
}

import std.datetime.stopwatch : StopWatch;
import std.file : dirEntries, SpanMode, getcwd;

enum showProgressFlag = true;   ///< Print per-file progress to `bcx.outFile`.

/// Lex all grammar files under `bcx.rootDirPath`, timing each file and the total.
void lexAllInDirTree(scope ref BuildCtx bcx) @system
{
    scope StopWatch swAll;
    swAll.start();
    foreach (const e; dirEntries(bcx.rootDirPath, SpanMode.breadth))
    {
        const fn = e.name;
        if (fn.isGxFilename)
        {
            static if (showProgressFlag)
                bcx.outFile.writeln("Lexing ", tryRelativePath(bcx.rootDirPath, fn), " ..."); // TODO: read use curren directory
            const data = cast(Input)rawReadPath(fn); // exclude from benchmark
            scope StopWatch swOne;
            swOne.start();
            auto lexer = GxLexer(data, fn, false);
            while (!lexer.empty)
                lexer.popFront();
            static if (showProgressFlag)
                bcx.outFile.writeln("Lexing ", tryRelativePath(bcx.rootDirPath, fn), " took ", swOne.peek());
        }
    }
    bcx.outFile.writeln("Lexing all took ", swAll.peek());
}

/// Build Context
@safe struct BuildCtx
{
    string rootDirPath;         ///< Root of the grammar directory tree.
    File outFile;               ///< Progress/diagnostic output sink.
    bool buildSingleFlag;       ///< Build each parser separately in a separate compilation.
    bool buildAllFlag;          ///< Build all parsers together in a common single compilation.
    bool lexerFlag;             ///< Flag for separate lexing pass.
    bool parserFlag;            ///< Flag for separate parsing pass.
    private GxFileParser[string] cachedParsersByModuleName; ///< Parser cache.
}

/** Parse all grammar files under `bcx.rootDirPath`, generating a D parser
 * for each and optionally compiling them (per `bcx.buildSingleFlag` /
 * `bcx.buildAllFlag`), timing each file and the total.
 */
void parseAllInDirTree(scope ref BuildCtx bcx) @system
{
    scope StopWatch swAll;
    swAll.start();
    DynamicArray!string parserPaths; ///< Paths to generated parsers in D.
    DynamicArray!string parserModules;
    foreach (const e; dirEntries(bcx.rootDirPath, SpanMode.breadth))
    {
        const fn = e.name;
        const dn = fn.dirName;
        const bn = fn.baseName;
        if (bn.isGxFilenameParsed)
        {
            const exDirPath = buildPath(dn, "examples"); // examples directory
            import std.file : exists, isDir;
            const showParseExample = false;
            if (showParseExample &&
                exDirPath.exists &&
                exDirPath.isDir)
                foreach (const exf; dirEntries(exDirPath, SpanMode.breadth))
                    bcx.outFile.writeln("TODO: Parse example file: ", exf);
            static if (showProgressFlag)
                bcx.outFile.writeln("Reading ", tryRelativePath(bcx.rootDirPath, fn), " ...");

            scope StopWatch swOne;
            swOne.start();

            auto reader = GxFileReader(fn, bcx.cachedParsersByModuleName);
            string parserModule;
            const parserPath = reader.createParserSourceFilePath(parserModule);
            if (parserPaths[].canFind(parserPath)) // TODO: remove because this should not happen
                bcx.outFile.writeln("Warning: duplicate entry outFile ", parserPath);
            else
            {
                parserPaths.insertBack(parserPath);
                parserModules.insertBack(parserModule);
            }

            if (bcx.buildSingleFlag)
                const parseExePath = buildSourceFiles([parserPath], [parserModule], true);

            static if (showProgressFlag)
                bcx.outFile.writeln("Reading ", tryRelativePath(bcx.rootDirPath, fn), " took ", swOne.peek());
        }
    }
    if (bcx.buildAllFlag)
        const parseExePath = buildSourceFiles(parserPaths[], parserModules[]);
    bcx.outFile.writeln("Reading all took ", swAll.peek());
}

/// Run the passes selected in `bcx` over its directory tree.
void doTree(scope ref BuildCtx bcx) @system
{
    if (bcx.lexerFlag)
        lexAllInDirTree(bcx);
    if (bcx.parserFlag)
        parseAllInDirTree(bcx);
}

/// Returns: `path` relative to the current working directory when
/// `rootDirPath` lies under it, otherwise `path` unchanged.
string tryRelativePath(scope string rootDirPath,
                       const return scope string path) @safe
{
    const cwd = getcwd();
    if (rootDirPath.startsWith(cwd))
        return path.relativePath(cwd);
    return path;
}

///
version(show)
@system unittest
{
    BuildCtx bcx = { rootDirPath : "~/Work/grammars-v4/".expandTilde,
                     outFile : stdout,
                     buildSingleFlag : true,
                     buildAllFlag : true,
                     lexerFlag : false,
                     parserFlag : true, };
    doTree(bcx);
}