1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp. 2 * 3 * See_Also: https://www.csee.umbc.edu/csee/research/kif/ 4 * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format 5 * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf 6 * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org 7 * 8 * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing` 9 * 10 * TODO: Try infinite loops with break or goto instead of for loops. 11 * 12 * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside 13 * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled 14 * for `LispParser`? 15 */ 16 module nxt.lispy; 17 18 @safe: 19 20 /** Lisp-like token type. */ 21 enum TOK 22 { 23 unknown, ///< Unknown. 24 25 leftParen, ///< Left parenthesis. 26 rightParen, ///< Right parenthesis. 27 28 symbol, ///< Symbol. 29 30 stringLiteral, ///< String literal. 31 32 comma, ///< Lisp comma expression, `,`. 33 backquote, ///< Lisp backquote expression, `\``. 34 singlequote, ///< Lisp singlequote expression, `'`. 35 36 variable, 37 variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW` 38 functionName, 39 40 number, ///< number as integer or floating point literal. 41 42 comment, ///< Comment (to end of line). 43 whitespace, ///< Whitespace. 44 45 emptyList, ///< Empty list. 46 } 47 48 /** Lisp-like token. */ 49 struct Token 50 { 51 this(TOK tok, const(char)[] src = null) @safe pure nothrow @nogc 52 { 53 this.tok = tok; 54 this.src = src; 55 } 56 57 @property final void toString(Sink)(ref scope Sink sink) const @safe 58 { 59 switch (tok) 60 { 61 case TOK.symbol: 62 sink(src); 63 break; 64 case TOK.comma: 65 sink(`,`); 66 break; 67 case TOK.backquote: 68 sink("`"); 69 break; 70 case TOK.singlequote: 71 sink("'"); 72 break; 73 case TOK.stringLiteral: 74 sink(`"`); 75 sink(src); 76 sink(`"`); 77 break; 78 case TOK.emptyList: 79 sink(`()`); 80 break; 81 default: 82 // import std.conv : to; 83 // sink(tok.to!string); 84 if (src) 85 { 86 // sink(`:`); 87 sink(src); 88 } 89 break; 90 } 91 } 92 93 TOK tok; 94 const(char)[] src; // optional source slice 95 } 96 97 /** Lisp-like S-expression. */ 98 struct SExpr 99 { 100 final void toString(Sink)(ref scope Sink sink) const @safe @property 101 { 102 if (subs) { sink(`(`); } 103 104 token.toString(sink); 105 106 TOK lastTok = TOK.unknown; 107 foreach (const ref sub; subs) 108 { 109 import std.algorithm.comparison : among; 110 if (!lastTok.among!(TOK.comma, 111 TOK.backquote, 112 TOK.singlequote)) 113 { 114 sink(` `); 115 } 116 sub.toString(sink); 117 lastTok = sub.token.tok; 118 } 119 120 if (subs) { sink(`)`); } 121 } 122 123 /* This overload is needed in order for `Sexpr.init.to!string` to be @safe pure, I 124 * don’t know why. Perhaps because of incomplete attribute inference in compiler. */ 125 final string toString() const @property @safe pure 126 { 127 import std.conv : to; 128 return this.to!(typeof(return)); 129 } 130 131 Token token; 132 SExpr[] subs; 133 } 134 135 /** Parse from `input` into lazy range over top-level expressions (`SExpr`). 136 * 137 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 138 */ 139 struct LispParser // TODO: convert to `class` 140 { 141 import std.algorithm.comparison : among; 142 143 alias Input = const(char)[]; 144 145 struct Config 146 { 147 uint subExprCountsGuess = 0; 148 bool includeComments; // TODO: use bitfield 149 bool includeWhitespace; // TODO: use bitfield 150 bool disallowEmptyLists; // TODO: use bitfield 151 } 152 153 @safe pure: 154 155 /** Parse `input` into returned array of expressions (`SExpr`). 156 */ 157 this(Input input, Config config = Config.init) @trusted 158 { 159 _input = input; 160 // if (subExprCountsGuess) // if guess use region 161 // { 162 // _subExprsStore = new SExpr[subExprCountsGuess]; // region store 163 // } 164 import std.exception : enforce; 165 import nxt.parsing : isNullTerminated; 166 enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted 167 _config = config; 168 nextFront(); 169 } 170 171 @disable this(this); 172 173 pragma(inline, true) 174 bool empty() @property const nothrow scope @nogc => _endOfFile; 175 176 pragma(inline, true) 177 ref const(SExpr) front() @property const scope return in(!empty) => _topExprs.back; 178 179 pragma(inline, true) 180 void popFront() in(!empty) 181 { 182 _topExprs.popBack(); 183 nextFront(); 184 } 185 186 @property size_t subExprsCount() const @safe pure nothrow @nogc => _subExprsCount; 187 188 import std.meta : AliasSeq; 189 190 // from std.ascii.isWhite 191 alias endOfLineChars = AliasSeq!('\n', // (0x0a) 192 '\r', // (0x0c) 193 ); 194 alias whiteChars = AliasSeq!(' ', // 0x20 195 '\t', // (0x09) 196 '\n', // (0x0a) 197 '\v', // (0x0b) 198 '\r', // (0x0c) 199 '\f' // (0x0d) 200 ); 201 alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9'); 202 203 private: 204 205 /// Get next `dchar` in input. 206 pragma(inline, true) 207 dchar peekNext() const scope nothrow @nogc 208 => _input[_offset]; // TODO: .ptr. TODO: decode `dchar` 209 210 /// Get next `dchar` in input. 211 pragma(inline, true) 212 dchar peekNextNth(size_t n) const scope nothrow @nogc 213 => _input[_offset + n]; // TODO: .ptr. TODO: decode `dchar` 214 215 /// Get next n `chars` in input. 216 pragma(inline, true) 217 Input peekNextsN(size_t n) const return scope nothrow @nogc 218 => _input[_offset .. _offset + n]; // TODO: .ptr 219 220 /// Drop next byte in input. 221 pragma(inline, true) 222 void dropFront() scope nothrow @nogc { _offset += 1; } 223 224 /// Drop next `n` bytes in input. 225 pragma(inline, true) 226 void dropFrontN(size_t n) scope nothrow @nogc { _offset += n; } 227 228 /// Skip over `n` bytes in input. 229 pragma(inline, true) 230 Input skipOverN(size_t n) return scope nothrow @nogc 231 { 232 const part = _input[_offset .. _offset + n]; // TODO: .ptr 233 dropFrontN(n); 234 return part; 235 } 236 237 /// Skip line comment. 238 void skipLineComment() scope nothrow @nogc 239 { 240 while (!peekNext().among!('\0', endOfLineChars)) 241 _offset += 1; 242 } 243 244 /// Get symbol. 245 Input getSymbol() return nothrow @nogc 246 { 247 size_t i = 0; 248 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 249 ++i; 250 return skipOverN(i); 251 } 252 253 /// Get numeric literal (number) in integer or decimal form. 254 Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc 255 { 256 size_t i = 0; 257 while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite 258 ++i; 259 import std.ascii : isAlpha; 260 if (peekNextNth(i).isAlpha) // if followed by letter 261 { 262 size_t alphaCount = 0; 263 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 264 { 265 alphaCount += peekNextNth(i).isAlpha; 266 ++i; 267 } 268 gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10 269 } 270 271 return skipOverN(i); 272 } 273 274 /// Get whitespace. 275 Input getWhitespace() return nothrow @nogc 276 { 277 size_t i = 0; 278 while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite` 279 ++i; 280 return skipOverN(i); 281 } 282 283 /// Get string literal in input. 284 Input getStringLiteral() return nothrow @nogc 285 { 286 dropFront(); 287 size_t i = 0; 288 while (!peekNextNth(i).among!('\0', '"')) 289 { 290 if (peekNextNth(i) == '\\' && 291 peekNextNth(i + 1) == '"') 292 { 293 i += 2; // skip \n 294 continue; 295 } 296 ++i; 297 } 298 const literal = peekNextsN(i); 299 dropFrontN(i); 300 if (peekNext() == '"') { dropFront(); } // pop ending double singlequote 301 return literal; 302 } 303 304 SExpr[] dupTopExprs(scope SExpr[] exprs) scope @safe pure nothrow 305 { 306 pragma(inline, true); 307 _subExprsCount += exprs.length; // log it for future optimizations 308 return exprs.dup; // TODO: use region allocator stored locally in `LispParser` 309 } 310 311 void nextFront() 312 { 313 import std.range.primitives : empty, front, popFront, popFrontN; 314 import std.uni : isWhite, isAlpha; 315 import std.ascii : isDigit; 316 317 while (true) 318 { 319 switch (_input[_offset]) // TODO: .ptr 320 { 321 case ';': 322 if (_config.includeComments) 323 { 324 assert(0, "TODO: don't use skipLineComment"); 325 // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1]))); 326 } 327 else 328 skipLineComment(); 329 break; 330 case '(': 331 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1)))); 332 dropFront(); 333 ++_depth; 334 break; 335 case ')': 336 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1]))); 337 dropFront(); 338 --_depth; 339 // NOTE: this is not needed: _topExprs.popBack(); // pop right paren 340 341 assert(!_topExprs.empty); 342 343 // TODO: retroIndexOf 344 size_t count; // number of elements between parens 345 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen) 346 ++count; 347 if (_config.disallowEmptyLists) 348 assert(count != 0); 349 350 import core.lifetime : move; 351 SExpr newExpr = ((count == 0) ? 352 SExpr(Token(TOK.emptyList)) : 353 SExpr(_topExprs[$ - count].token, 354 dupTopExprs(_topExprs[$ - count + 1 .. $]))); 355 _topExprs.popBackN(1 + count); // forget tokens including leftParen 356 _topExprs.insertBack(newExpr.move); 357 358 if (_depth == 0) // top-level expression done 359 { 360 assert(_topExprs.length >= 1); // we should have at least one `SExpr` 361 return; 362 } 363 364 break; 365 case '"': 366 const stringLiteral = getStringLiteral(); // TODO: tokenize 367 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral))); 368 break; 369 case ',': 370 dropFront(); 371 _topExprs.insertBack(SExpr(Token(TOK.comma))); 372 break; 373 case '`': 374 dropFront(); 375 _topExprs.insertBack(SExpr(Token(TOK.backquote))); 376 break; 377 case '\'': 378 dropFront(); 379 _topExprs.insertBack(SExpr(Token(TOK.singlequote))); 380 break; 381 case '?': 382 dropFront(); 383 const variableSymbol = getSymbol(); 384 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol))); 385 break; 386 case '@': 387 dropFront(); 388 const variableListSymbol = getSymbol(); 389 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol))); 390 break; 391 // std.ascii.isDigit: 392 case '0': 393 .. 394 case '9': 395 case '+': 396 case '-': 397 case '.': 398 bool gotSymbol; 399 const numberOrSymbol = getNumberOrSymbol(gotSymbol); 400 if (gotSymbol) 401 { 402 // debug writeln("TODO: handle floating point: ", numberOrSymbol); 403 _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol))); 404 } 405 else 406 { 407 _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol))); 408 } 409 break; 410 // from std.ascii.isWhite 411 case ' ': 412 case '\t': 413 case '\n': 414 case '\v': 415 case '\r': 416 case '\f': 417 assert(peekNext.isWhite); 418 getWhitespace(); 419 if (_config.includeWhitespace) 420 { 421 _topExprs.insertBack(SExpr(Token(TOK.whitespace, null))); 422 } 423 break; 424 case '\0': 425 assert(_depth == 0, "Unbalanced parenthesis at end of file"); 426 _endOfFile = true; 427 return; 428 default: 429 // other 430 if (true// src.front.isAlpha 431 ) 432 { 433 const symbol = getSymbol(); // TODO: tokenize 434 import nxt.array_algorithm : endsWith; 435 if (symbol.endsWith(`Fn`)) 436 { 437 _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol))); 438 } 439 else 440 { 441 _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol))); 442 } 443 } 444 else 445 { 446 import std.conv : to; 447 assert(false, 448 `Cannot handle character '` ~ peekNext.to!string ~ 449 `' at charater offset:` ~ _offset.to!string); 450 } 451 break; 452 } 453 } 454 } 455 456 public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc 457 => expr.ptr - _input.ptr; 458 459 import nxt.line_column : LineColumn, offsetLineColumn; 460 461 public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc 462 => offsetLineColumn(_input, offset); 463 464 public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc 465 => offsetLineColumn(_input, offsetTo(sexpr.token.src)); 466 467 public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc 468 => offsetLineColumn(_input, offsetTo(chars)); 469 470 private: 471 size_t _offset; // current offset in `_input` 472 const Input _input; // input 473 474 import nxt.container.static_array : StaticArray; 475 // import nxt.container.dynamic_array : DynamicArray; 476 alias TopExprs = StaticArray!(SExpr, 1024); 477 TopExprs _topExprs; // top s-expressions (stack) 478 size_t _subExprsCount; 479 480 // SExpr[] _subExprsStore; // sub s-expressions (region) 481 // size_t _subExprsOffset = 0; // offset into `_subExprsStore` 482 483 size_t _depth; // parenthesis depth 484 bool _endOfFile; // signals null terminator found 485 Config _config; 486 } 487 488 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`). 489 * 490 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 491 */ 492 struct LispFileParser // TODO: convert to `class` 493 { 494 @safe: 495 this(const string filePath) 496 { 497 import std.path : expandTilde; 498 import nxt.file_ex : rawReadPath; 499 const size_t subExprsCount = 0; 500 // TODO: lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor 501 const path = filePath.expandTilde; 502 const data = cast(LispParser.Input)rawReadPath(path); // cast to Input because we don't want to keep all file around: 503 parser = LispParser(data, LispParser.Config(subExprsCount)); 504 } 505 ~this() nothrow @nogc 506 { 507 // TODO: write parser.subExprsCount 508 } 509 LispParser parser; 510 ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc => parser.offsetTo(expr); 511 alias parser this; 512 } 513 514 /// 515 @safe pure unittest 516 { 517 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 518 auto _topExprs = LispParser(text); 519 assert(!_topExprs.empty); 520 521 assert(_topExprs.front.token.tok == TOK.symbol); 522 assert(_topExprs.front.token.src == `instance`); 523 524 assert(_topExprs.front.subs[0].token.tok == TOK.functionName); 525 assert(_topExprs.front.subs[0].token.src == "AttrFn"); 526 527 assert(_topExprs.front.subs[1].token.tok == TOK.symbol); 528 assert(_topExprs.front.subs[1].token.src == "BinaryFunction"); 529 530 _topExprs.popFront(); 531 assert(_topExprs.empty); 532 533 } 534 535 /// 536 @safe pure unittest 537 { 538 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 539 auto _topExprs = LispParser(text); 540 import std.conv : to; 541 assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`); 542 assert(!_topExprs.empty); 543 }