1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp. 2 * 3 * See_Also: https://www.csee.umbc.edu/csee/research/kif/ 4 * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format 5 * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf 6 * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org 7 * 8 * TODO: Try infinite loops with break or goto instead of for loops. 9 * 10 * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside 11 * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled 12 * for `LispParser`? 13 */ 14 module nxt.lispy; 15 16 /** Lisp-like token type. */ 17 enum TOK 18 { 19 unknown, ///< Unknown. 20 21 leftParen, ///< Left parenthesis. 22 rightParen, ///< Right parenthesis. 23 24 symbol, ///< Symbol. 25 26 stringLiteral, ///< String literal. 27 28 comma, ///< Lisp comma expression, `,`. 29 backquote, ///< Lisp backquote expression, `\``. 30 singlequote, ///< Lisp singlequote expression, `'`. 31 32 variable, 33 variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW` 34 functionName, 35 36 number, ///< number as integer or floating point literal. 37 38 comment, ///< Comment (to end of line). 39 whitespace, ///< Whitespace. 40 41 emptyList, ///< Empty list. 42 } 43 44 /** Lisp-like token. */ 45 struct Token 46 { 47 @safe: 48 this(TOK tok, const(char)[] src = null) pure nothrow @nogc 49 { 50 this.tok = tok; 51 this.src = src; 52 } 53 54 @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe 55 { 56 switch (tok) 57 { 58 case TOK.symbol: 59 sink(src); 60 break; 61 case TOK.comma: 62 sink(`,`); 63 break; 64 case TOK.backquote: 65 sink("`"); 66 break; 67 case TOK.singlequote: 68 sink("'"); 69 break; 70 case TOK.stringLiteral: 71 sink(`"`); 72 sink(src); 73 sink(`"`); 74 break; 75 case TOK.emptyList: 76 sink(`()`); 77 break; 78 default: 79 // import std.conv : to; 80 // sink(tok.to!string); 81 if (src) 82 { 83 // sink(`:`); 84 sink(src); 85 } 86 break; 87 } 88 } 89 90 TOK tok; 91 const(char)[] src; // optional source slice 92 } 93 94 /** Lisp-like S-expression. */ 95 struct SExpr 96 { 97 @safe: 98 @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe 99 { 100 if (subs) { sink(`(`); } 101 102 token.toString(sink); 103 104 TOK lastTok = TOK.unknown; 105 foreach (const ref sub; subs) 106 { 107 import std.algorithm.comparison : among; 108 if (!lastTok.among!(TOK.comma, 109 TOK.backquote, 110 TOK.singlequote)) 111 { 112 sink(` `); 113 } 114 sub.toString(sink); 115 lastTok = sub.token.tok; 116 } 117 118 if (subs) { sink(`)`); } 119 } 120 121 Token token; 122 SExpr[] subs; 123 } 124 125 /** Returns: true if `s` is null-terminated (ending with `'\0'`). 126 * 127 * Prior to parsing used to verify input to parsers that make use of 128 * sentinel-based search. 129 * 130 * See_Also: https://en.wikipedia.org/wiki/Sentinel_value 131 */ 132 bool isNullTerminated(scope const(char)[] s) @safe pure nothrow @nogc 133 { 134 version(D_Coverage) {} else pragma(inline, true); 135 return s.length >= 1 && s[$ - 1] == '\0'; 136 } 137 138 /** Parse from `input` into lazy range over top-level expressions (`SExpr`). 139 * 140 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 141 */ 142 struct LispParser // TODO convert to `class` 143 { 144 import std.algorithm.comparison : among; 145 146 alias Input = const(char)[]; 147 148 @safe pure: 149 150 /** Parse `input` into returned array of expressions (`SExpr`). 151 */ 152 this(Input input, 153 bool includeComments = false, 154 bool includeWhitespace = false, 155 bool disallowEmptyLists = false, 156 size_t subExprCountsGuess = 0) @trusted 157 { 158 _input = input; 159 160 // if (subExprCountsGuess) // if guess use region 161 // { 162 // _subExprsStore = new SExpr[subExprCountsGuess]; // region store 163 // } 164 165 import std.exception : enforce; 166 enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted 167 168 _includeComments = includeComments; 169 _includeWhitespace = includeWhitespace; 170 _disallowEmptyLists = disallowEmptyLists; 171 172 nextFront(); 173 } 174 175 @disable this(this); 176 177 @property bool empty() const nothrow scope @nogc 178 { 179 version(D_Coverage) {} else pragma(inline, true); 180 return _endOfFile; 181 } 182 183 ref const(SExpr) front() const scope return 184 { 185 version(D_Coverage) {} else pragma(inline, true); 186 assert(!empty); 187 return _topExprs.back; 188 } 189 190 void popFront() 191 { 192 version(D_Coverage) {} else pragma(inline, true); 193 assert(!empty); 194 _topExprs.popBack(); 195 nextFront(); 196 } 197 198 @property size_t subExprsCount() @safe pure nothrow @nogc 199 { 200 return _subExprsCount; 201 } 202 203 import std.meta : AliasSeq; 204 205 // from std.ascii.isWhite 206 alias endOfLineChars = AliasSeq!('\n', // (0x0a) 207 '\r', // (0x0c) 208 ); 209 alias whiteChars = AliasSeq!(' ', // 0x20 210 '\t', // (0x09) 211 '\n', // (0x0a) 212 '\v', // (0x0b) 213 '\r', // (0x0c) 214 '\f' // (0x0d) 215 ); 216 alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9'); 217 218 private: 219 220 /// Get next `char` in input. 221 char peekNext() const scope nothrow @nogc 222 { 223 version(D_Coverage) {} else pragma(inline, true); 224 return _input[_offset]; // TODO .ptr 225 } 226 227 /// Get next `char` in input. 228 char peekNextNth(size_t n) const nothrow @nogc 229 { 230 version(D_Coverage) {} else pragma(inline, true); 231 return _input[_offset + n]; // TODO .ptr 232 } 233 234 /// Get next n `chars` in input. 235 Input peekNextsN(size_t n) const return nothrow @nogc 236 { 237 version(D_Coverage) {} else pragma(inline, true); 238 return _input[_offset .. _offset + n]; // TODO .ptr 239 } 240 241 /// Drop next byte in input. 242 void dropFront() nothrow @nogc 243 { 244 version(D_Coverage) {} else pragma(inline, true); 245 _offset += 1; 246 } 247 248 /// Drop next `n` bytes in input. 249 void dropFrontN(size_t n) nothrow @nogc 250 { 251 version(D_Coverage) {} else pragma(inline, true); 252 _offset += n; 253 } 254 255 /// Skip over `n` bytes in input. 256 Input skipOverN(size_t n) return nothrow @nogc 257 { 258 version(D_Coverage) {} else pragma(inline); 259 const part = _input[_offset .. _offset + n]; // TODO .ptr 260 dropFrontN(n); 261 return part; 262 } 263 264 /// Skip line comment. 265 void skipLineComment() scope nothrow @nogc 266 { 267 while (!peekNext().among!('\0', endOfLineChars)) 268 { 269 _offset += 1; 270 } 271 } 272 273 /// Get symbol. 274 Input getSymbol() return nothrow @nogc 275 { 276 size_t i = 0; 277 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 278 { 279 ++i; 280 } 281 return skipOverN(i); 282 } 283 284 /// Get numeric literal (number) in integer or decimal form. 285 Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc 286 { 287 size_t i = 0; 288 while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite 289 { 290 ++i; 291 } 292 import std.ascii : isAlpha; 293 if (peekNextNth(i).isAlpha) // if followed by letter 294 { 295 size_t alphaCount = 0; 296 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 297 { 298 alphaCount += peekNextNth(i).isAlpha; 299 ++i; 300 } 301 gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10 302 } 303 304 return skipOverN(i); 305 } 306 307 /// Get whitespace. 308 Input getWhitespace() return nothrow @nogc 309 { 310 size_t i = 0; 311 while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite` 312 { 313 ++i; 314 } 315 return skipOverN(i); 316 } 317 318 /// Get string literal in input. 319 Input getStringLiteral() return nothrow @nogc 320 { 321 dropFront(); 322 size_t i = 0; 323 while (!peekNextNth(i).among!('\0', '"')) 324 { 325 if (peekNextNth(i) == '\\' && 326 peekNextNth(i + 1) == '"') 327 { 328 i += 2; // skip \n 329 continue; 330 } 331 ++i; 332 } 333 const literal = peekNextsN(i); 334 dropFrontN(i); 335 if (peekNext() == '"') { dropFront(); } // pop ending double singlequote 336 return literal; 337 } 338 339 SExpr[] dupTopExprs(SExpr[] exprs) @safe pure nothrow 340 { 341 version(D_Coverage) {} else pragma(inline, true); 342 _subExprsCount += exprs.length; // log it for future optimizations 343 return exprs.dup; // TODO use region allocator stored locally in `LispParser` 344 } 345 346 void nextFront() 347 { 348 import std.range.primitives : empty, front, popFront, popFrontN; 349 import std.uni : isWhite, isAlpha; 350 import std.ascii : isDigit; 351 352 while (true) 353 { 354 switch (_input[_offset]) // TODO .ptr 355 { 356 case ';': 357 if (_includeComments) 358 { 359 assert(0, "TODO don't use skipLineComment"); 360 // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1]))); 361 } 362 else 363 { 364 skipLineComment(); 365 } 366 break; 367 case '(': 368 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1)))); 369 dropFront(); 370 ++_depth; 371 break; 372 case ')': 373 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1]))); 374 dropFront(); 375 --_depth; 376 // NOTE: this is not needed: _topExprs.popBack(); // pop right paren 377 378 assert(!_topExprs.empty); 379 380 // TODO retroIndexOf 381 size_t count; // number of elements between parens 382 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen) 383 { 384 ++count; 385 } 386 if (_disallowEmptyLists) 387 { 388 assert(count != 0); 389 } 390 391 import core.lifetime : move; 392 SExpr newExpr = ((count == 0) ? 393 SExpr(Token(TOK.emptyList)) : 394 SExpr(_topExprs[$ - count].token, 395 dupTopExprs(_topExprs[$ - count + 1 .. $]))); 396 _topExprs.popBackN(1 + count); // forget tokens including leftParen 397 _topExprs.insertBack(newExpr.move); 398 399 if (_depth == 0) // top-level expression done 400 { 401 assert(_topExprs.length >= 1); // we should have at least one `SExpr` 402 return; 403 } 404 405 break; 406 case '"': 407 const stringLiteral = getStringLiteral(); // TODO tokenize 408 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral))); 409 break; 410 case ',': 411 dropFront(); 412 _topExprs.insertBack(SExpr(Token(TOK.comma))); 413 break; 414 case '`': 415 dropFront(); 416 _topExprs.insertBack(SExpr(Token(TOK.backquote))); 417 break; 418 case '\'': 419 dropFront(); 420 _topExprs.insertBack(SExpr(Token(TOK.singlequote))); 421 break; 422 case '?': 423 dropFront(); 424 const variableSymbol = getSymbol(); 425 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol))); 426 break; 427 case '@': 428 dropFront(); 429 const variableListSymbol = getSymbol(); 430 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol))); 431 break; 432 // std.ascii.isDigit: 433 case '0': 434 case '1': 435 case '2': 436 case '3': 437 case '4': 438 case '5': 439 case '6': 440 case '7': 441 case '8': 442 case '9': 443 case '+': 444 case '-': 445 case '.': 446 bool gotSymbol; 447 const numberOrSymbol = getNumberOrSymbol(gotSymbol); 448 if (gotSymbol) 449 { 450 // debug writeln("TODO handle floating point: ", numberOrSymbol); 451 _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol))); 452 } 453 else 454 { 455 _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol))); 456 } 457 break; 458 // from std.ascii.isWhite 459 case ' ': 460 case '\t': 461 case '\n': 462 case '\v': 463 case '\r': 464 case '\f': 465 assert(peekNext.isWhite); 466 getWhitespace(); 467 if (_includeWhitespace) 468 { 469 _topExprs.insertBack(SExpr(Token(TOK.whitespace, null))); 470 } 471 break; 472 case '\0': 473 assert(_depth == 0, "Unbalanced parenthesis at end of file"); 474 _endOfFile = true; 475 return; 476 default: 477 // other 478 if (true// src.front.isAlpha 479 ) 480 { 481 const symbol = getSymbol(); // TODO tokenize 482 import nxt.array_algorithm : endsWith; 483 if (symbol.endsWith(`Fn`)) 484 { 485 _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol))); 486 } 487 else 488 { 489 _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol))); 490 } 491 } 492 else 493 { 494 import std.conv : to; 495 assert(false, 496 `Cannot handle character '` ~ peekNext.to!string ~ 497 `' at charater offset:` ~ _offset.to!string); 498 } 499 break; 500 } 501 } 502 } 503 504 public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc 505 { 506 return expr.ptr - _input.ptr; 507 } 508 509 import nxt.line_column : LineColumn, offsetLineColumn; 510 511 public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc 512 { 513 return offsetLineColumn(_input, offset); 514 } 515 516 public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc 517 { 518 return offsetLineColumn(_input, offsetTo(sexpr.token.src)); 519 } 520 521 public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc 522 { 523 return offsetLineColumn(_input, offsetTo(chars)); 524 } 525 526 private: 527 size_t _offset; // current offset in `_input` 528 const Input _input; // input 529 530 import nxt.fixed_array : FixedArray; 531 // import nxt.dynamic_array : DynamicArray; 532 alias TopExprs = FixedArray!(SExpr, 1024); 533 TopExprs _topExprs; // top s-expressions (stack) 534 size_t _subExprsCount; 535 536 // SExpr[] _subExprsStore; // sub s-expressions (region) 537 // size_t _subExprsOffset = 0; // offset into `_subExprsStore` 538 539 size_t _depth; // parenthesis depth 540 bool _endOfFile; // signals null terminator found 541 bool _includeComments = false; 542 bool _includeWhitespace = false; 543 bool _disallowEmptyLists = false; 544 } 545 546 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`). 547 * 548 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 549 */ 550 struct LispFileParser // TODO convert to `class` 551 { 552 @safe: 553 this(const string filePath) 554 { 555 import std.path : expandTilde; 556 import nxt.file_ex : rawReadPath; 557 size_t subExprsCount = 0; 558 // TODO lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor 559 parser = LispParser(cast(LispParser.Input)filePath.expandTilde.rawReadPath(), 560 false, false, false, subExprsCount); 561 } 562 ~this() @nogc 563 { 564 // TODO write parser.subExprsCount 565 } 566 LispParser parser; 567 ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc 568 { 569 return parser.offsetTo(expr); 570 } 571 alias parser this; 572 } 573 574 /// 575 @safe pure unittest 576 { 577 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 578 auto _topExprs = LispParser(text); 579 assert(!_topExprs.empty); 580 581 assert(_topExprs.front.token.tok == TOK.symbol); 582 assert(_topExprs.front.token.src == `instance`); 583 584 assert(_topExprs.front.subs[0].token.tok == TOK.functionName); 585 assert(_topExprs.front.subs[0].token.src == "AttrFn"); 586 587 assert(_topExprs.front.subs[1].token.tok == TOK.symbol); 588 assert(_topExprs.front.subs[1].token.src == "BinaryFunction"); 589 590 _topExprs.popFront(); 591 assert(_topExprs.empty); 592 593 } 594 595 /// 596 @safe unittest 597 { 598 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 599 auto _topExprs = LispParser(text); 600 import std.conv : to; 601 assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`); 602 assert(!_topExprs.empty); 603 }