1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp. 2 * 3 * See_Also: https://www.csee.umbc.edu/csee/research/kif/ 4 * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format 5 * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf 6 * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org 7 * 8 * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing` 9 * 10 * TODO: Try infinite loops with break or goto instead of for loops. 11 * 12 * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside 13 * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled 14 * for `LispParser`? 15 */ 16 module nxt.lispy; 17 18 @safe: 19 20 /** Lisp-like token type. */ 21 enum TOK 22 { 23 unknown, ///< Unknown. 24 25 leftParen, ///< Left parenthesis. 26 rightParen, ///< Right parenthesis. 27 28 symbol, ///< Symbol. 29 30 stringLiteral, ///< String literal. 31 32 comma, ///< Lisp comma expression, `,`. 33 backquote, ///< Lisp backquote expression, `\``. 34 singlequote, ///< Lisp singlequote expression, `'`. 35 36 variable, 37 variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW` 38 functionName, 39 40 number, ///< number as integer or floating point literal. 41 42 comment, ///< Comment (to end of line). 43 whitespace, ///< Whitespace. 44 45 emptyList, ///< Empty list. 46 } 47 48 /** Lisp-like token. */ 49 struct Token 50 { 51 @safe: 52 this(TOK tok, const(char)[] src = null) pure nothrow @nogc 53 { 54 this.tok = tok; 55 this.src = src; 56 } 57 58 @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe 59 { 60 switch (tok) 61 { 62 case TOK.symbol: 63 sink(src); 64 break; 65 case TOK.comma: 66 sink(`,`); 67 break; 68 case TOK.backquote: 69 sink("`"); 70 break; 71 case TOK.singlequote: 72 sink("'"); 73 break; 74 case TOK.stringLiteral: 75 sink(`"`); 76 sink(src); 77 sink(`"`); 78 break; 79 case TOK.emptyList: 80 sink(`()`); 81 break; 82 default: 83 // import std.conv : to; 84 // sink(tok.to!string); 85 if (src) 86 { 87 // sink(`:`); 88 sink(src); 89 } 90 break; 91 } 92 } 93 94 TOK tok; 95 const(char)[] src; // optional source slice 96 } 97 98 /** Lisp-like S-expression. */ 99 struct SExpr 100 { 101 @safe: 102 @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe 103 { 104 if (subs) { sink(`(`); } 105 106 token.toString(sink); 107 108 TOK lastTok = TOK.unknown; 109 foreach (const ref sub; subs) 110 { 111 import std.algorithm.comparison : among; 112 if (!lastTok.among!(TOK.comma, 113 TOK.backquote, 114 TOK.singlequote)) 115 { 116 sink(` `); 117 } 118 sub.toString(sink); 119 lastTok = sub.token.tok; 120 } 121 122 if (subs) { sink(`)`); } 123 } 124 125 Token token; 126 SExpr[] subs; 127 } 128 129 /** Parse from `input` into lazy range over top-level expressions (`SExpr`). 130 * 131 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 132 */ 133 struct LispParser // TODO: convert to `class` 134 { 135 import std.algorithm.comparison : among; 136 137 alias Input = const(char)[]; 138 139 @safe pure: 140 141 /** Parse `input` into returned array of expressions (`SExpr`). 142 */ 143 this(Input input, 144 bool includeComments = false, 145 bool includeWhitespace = false, 146 bool disallowEmptyLists = false, 147 size_t subExprCountsGuess = 0) @trusted 148 { 149 _input = input; 150 151 // if (subExprCountsGuess) // if guess use region 152 // { 153 // _subExprsStore = new SExpr[subExprCountsGuess]; // region store 154 // } 155 156 import std.exception : enforce; 157 import nxt.parsing : isNullTerminated; 158 enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted 159 160 _includeComments = includeComments; 161 _includeWhitespace = includeWhitespace; 162 _disallowEmptyLists = disallowEmptyLists; 163 164 nextFront(); 165 } 166 167 @disable this(this); 168 169 @property bool empty() const nothrow scope @nogc 170 { 171 version(D_Coverage) {} else pragma(inline, true); 172 return _endOfFile; 173 } 174 175 ref const(SExpr) front() const scope return 176 { 177 version(D_Coverage) {} else pragma(inline, true); 178 assert(!empty); 179 return _topExprs.back; 180 } 181 182 void popFront() 183 { 184 version(D_Coverage) {} else pragma(inline, true); 185 assert(!empty); 186 _topExprs.popBack(); 187 nextFront(); 188 } 189 190 @property size_t subExprsCount() const @safe pure nothrow @nogc 191 { 192 return _subExprsCount; 193 } 194 195 import std.meta : AliasSeq; 196 197 // from std.ascii.isWhite 198 alias endOfLineChars = AliasSeq!('\n', // (0x0a) 199 '\r', // (0x0c) 200 ); 201 alias whiteChars = AliasSeq!(' ', // 0x20 202 '\t', // (0x09) 203 '\n', // (0x0a) 204 '\v', // (0x0b) 205 '\r', // (0x0c) 206 '\f' // (0x0d) 207 ); 208 alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9'); 209 210 private: 211 212 /// Get next `dchar` in input. 213 dchar peekNext() const scope nothrow @nogc 214 { 215 version(D_Coverage) {} else pragma(inline, true); 216 return _input[_offset]; // TODO: .ptr. TODO: decode `dchar` 217 } 218 219 /// Get next `dchar` in input. 220 dchar peekNextNth(size_t n) const nothrow @nogc 221 { 222 version(D_Coverage) {} else pragma(inline, true); 223 return _input[_offset + n]; // TODO: .ptr. TODO: decode `dchar` 224 } 225 226 /// Get next n `chars` in input. 227 Input peekNextsN(size_t n) const return nothrow @nogc 228 { 229 version(D_Coverage) {} else pragma(inline, true); 230 return _input[_offset .. _offset + n]; // TODO: .ptr 231 } 232 233 /// Drop next byte in input. 234 void dropFront() nothrow @nogc 235 { 236 version(D_Coverage) {} else pragma(inline, true); 237 _offset += 1; 238 } 239 240 /// Drop next `n` bytes in input. 241 void dropFrontN(size_t n) nothrow @nogc 242 { 243 version(D_Coverage) {} else pragma(inline, true); 244 _offset += n; 245 } 246 247 /// Skip over `n` bytes in input. 248 Input skipOverN(size_t n) return nothrow @nogc 249 { 250 version(D_Coverage) {} else pragma(inline); 251 const part = _input[_offset .. _offset + n]; // TODO: .ptr 252 dropFrontN(n); 253 return part; 254 } 255 256 /// Skip line comment. 257 void skipLineComment() scope nothrow @nogc 258 { 259 while (!peekNext().among!('\0', endOfLineChars)) 260 { 261 _offset += 1; 262 } 263 } 264 265 /// Get symbol. 266 Input getSymbol() return nothrow @nogc 267 { 268 size_t i = 0; 269 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 270 { 271 ++i; 272 } 273 return skipOverN(i); 274 } 275 276 /// Get numeric literal (number) in integer or decimal form. 277 Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc 278 { 279 size_t i = 0; 280 while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite 281 { 282 ++i; 283 } 284 import std.ascii : isAlpha; 285 if (peekNextNth(i).isAlpha) // if followed by letter 286 { 287 size_t alphaCount = 0; 288 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 289 { 290 alphaCount += peekNextNth(i).isAlpha; 291 ++i; 292 } 293 gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10 294 } 295 296 return skipOverN(i); 297 } 298 299 /// Get whitespace. 300 Input getWhitespace() return nothrow @nogc 301 { 302 size_t i = 0; 303 while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite` 304 { 305 ++i; 306 } 307 return skipOverN(i); 308 } 309 310 /// Get string literal in input. 311 Input getStringLiteral() return nothrow @nogc 312 { 313 dropFront(); 314 size_t i = 0; 315 while (!peekNextNth(i).among!('\0', '"')) 316 { 317 if (peekNextNth(i) == '\\' && 318 peekNextNth(i + 1) == '"') 319 { 320 i += 2; // skip \n 321 continue; 322 } 323 ++i; 324 } 325 const literal = peekNextsN(i); 326 dropFrontN(i); 327 if (peekNext() == '"') { dropFront(); } // pop ending double singlequote 328 return literal; 329 } 330 331 SExpr[] dupTopExprs(SExpr[] exprs) @safe pure nothrow 332 { 333 version(D_Coverage) {} else pragma(inline, true); 334 _subExprsCount += exprs.length; // log it for future optimizations 335 return exprs.dup; // TODO: use region allocator stored locally in `LispParser` 336 } 337 338 void nextFront() 339 { 340 import std.range.primitives : empty, front, popFront, popFrontN; 341 import std.uni : isWhite, isAlpha; 342 import std.ascii : isDigit; 343 344 while (true) 345 { 346 switch (_input[_offset]) // TODO: .ptr 347 { 348 case ';': 349 if (_includeComments) 350 { 351 assert(0, "TODO: don't use skipLineComment"); 352 // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1]))); 353 } 354 else 355 { 356 skipLineComment(); 357 } 358 break; 359 case '(': 360 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1)))); 361 dropFront(); 362 ++_depth; 363 break; 364 case ')': 365 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1]))); 366 dropFront(); 367 --_depth; 368 // NOTE: this is not needed: _topExprs.popBack(); // pop right paren 369 370 assert(!_topExprs.empty); 371 372 // TODO: retroIndexOf 373 size_t count; // number of elements between parens 374 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen) 375 { 376 ++count; 377 } 378 if (_disallowEmptyLists) 379 { 380 assert(count != 0); 381 } 382 383 import core.lifetime : move; 384 SExpr newExpr = ((count == 0) ? 385 SExpr(Token(TOK.emptyList)) : 386 SExpr(_topExprs[$ - count].token, 387 dupTopExprs(_topExprs[$ - count + 1 .. $]))); 388 _topExprs.popBackN(1 + count); // forget tokens including leftParen 389 _topExprs.insertBack(newExpr.move); 390 391 if (_depth == 0) // top-level expression done 392 { 393 assert(_topExprs.length >= 1); // we should have at least one `SExpr` 394 return; 395 } 396 397 break; 398 case '"': 399 const stringLiteral = getStringLiteral(); // TODO: tokenize 400 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral))); 401 break; 402 case ',': 403 dropFront(); 404 _topExprs.insertBack(SExpr(Token(TOK.comma))); 405 break; 406 case '`': 407 dropFront(); 408 _topExprs.insertBack(SExpr(Token(TOK.backquote))); 409 break; 410 case '\'': 411 dropFront(); 412 _topExprs.insertBack(SExpr(Token(TOK.singlequote))); 413 break; 414 case '?': 415 dropFront(); 416 const variableSymbol = getSymbol(); 417 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol))); 418 break; 419 case '@': 420 dropFront(); 421 const variableListSymbol = getSymbol(); 422 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol))); 423 break; 424 // std.ascii.isDigit: 425 case '0': 426 .. 427 case '9': 428 case '+': 429 case '-': 430 case '.': 431 bool gotSymbol; 432 const numberOrSymbol = getNumberOrSymbol(gotSymbol); 433 if (gotSymbol) 434 { 435 // debug writeln("TODO: handle floating point: ", numberOrSymbol); 436 _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol))); 437 } 438 else 439 { 440 _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol))); 441 } 442 break; 443 // from std.ascii.isWhite 444 case ' ': 445 case '\t': 446 case '\n': 447 case '\v': 448 case '\r': 449 case '\f': 450 assert(peekNext.isWhite); 451 getWhitespace(); 452 if (_includeWhitespace) 453 { 454 _topExprs.insertBack(SExpr(Token(TOK.whitespace, null))); 455 } 456 break; 457 case '\0': 458 assert(_depth == 0, "Unbalanced parenthesis at end of file"); 459 _endOfFile = true; 460 return; 461 default: 462 // other 463 if (true// src.front.isAlpha 464 ) 465 { 466 const symbol = getSymbol(); // TODO: tokenize 467 import nxt.array_algorithm : endsWith; 468 if (symbol.endsWith(`Fn`)) 469 { 470 _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol))); 471 } 472 else 473 { 474 _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol))); 475 } 476 } 477 else 478 { 479 import std.conv : to; 480 assert(false, 481 `Cannot handle character '` ~ peekNext.to!string ~ 482 `' at charater offset:` ~ _offset.to!string); 483 } 484 break; 485 } 486 } 487 } 488 489 public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc 490 { 491 return expr.ptr - _input.ptr; 492 } 493 494 import nxt.line_column : LineColumn, offsetLineColumn; 495 496 public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc 497 { 498 return offsetLineColumn(_input, offset); 499 } 500 501 public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc 502 { 503 return offsetLineColumn(_input, offsetTo(sexpr.token.src)); 504 } 505 506 public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc 507 { 508 return offsetLineColumn(_input, offsetTo(chars)); 509 } 510 511 private: 512 size_t _offset; // current offset in `_input` 513 const Input _input; // input 514 515 import nxt.fixed_array : FixedArray; 516 // import nxt.dynamic_array : DynamicArray; 517 alias TopExprs = FixedArray!(SExpr, 1024); 518 TopExprs _topExprs; // top s-expressions (stack) 519 size_t _subExprsCount; 520 521 // SExpr[] _subExprsStore; // sub s-expressions (region) 522 // size_t _subExprsOffset = 0; // offset into `_subExprsStore` 523 524 size_t _depth; // parenthesis depth 525 bool _endOfFile; // signals null terminator found 526 bool _includeComments = false; 527 bool _includeWhitespace = false; 528 bool _disallowEmptyLists = false; 529 } 530 531 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`). 532 * 533 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 534 */ 535 struct LispFileParser // TODO: convert to `class` 536 { 537 @safe: 538 this(const string filePath) 539 { 540 import std.path : expandTilde; 541 import nxt.file_ex : rawReadPath; 542 const size_t subExprsCount = 0; 543 // TODO: lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor 544 const path = filePath.expandTilde; 545 const data = cast(LispParser.Input)rawReadPath(path); // cast to Input because we don't want to keep all file around: 546 parser = LispParser(data, false, false, false, subExprsCount); 547 } 548 ~this() @nogc 549 { 550 // TODO: write parser.subExprsCount 551 } 552 LispParser parser; 553 ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc 554 { 555 return parser.offsetTo(expr); 556 } 557 alias parser this; 558 } 559 560 /// 561 @safe pure unittest 562 { 563 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 564 auto _topExprs = LispParser(text); 565 assert(!_topExprs.empty); 566 567 assert(_topExprs.front.token.tok == TOK.symbol); 568 assert(_topExprs.front.token.src == `instance`); 569 570 assert(_topExprs.front.subs[0].token.tok == TOK.functionName); 571 assert(_topExprs.front.subs[0].token.src == "AttrFn"); 572 573 assert(_topExprs.front.subs[1].token.tok == TOK.symbol); 574 assert(_topExprs.front.subs[1].token.src == "BinaryFunction"); 575 576 _topExprs.popFront(); 577 assert(_topExprs.empty); 578 579 } 580 581 /// 582 @safe unittest 583 { 584 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 585 auto _topExprs = LispParser(text); 586 import std.conv : to; 587 assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`); 588 assert(!_topExprs.empty); 589 }