1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp. 2 * 3 * See_Also: https://www.csee.umbc.edu/csee/research/kif/ 4 * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format 5 * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf 6 * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org 7 * 8 * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing` 9 * 10 * TODO: Try infinite loops with break or goto instead of for loops. 11 * 12 * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside 13 * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled 14 * for `LispParser`? 15 */ 16 module nxt.lispy; 17 18 import nxt.path : FilePath, expandTilde, exists; 19 20 @safe: 21 22 /** Lisp-like token type. */ 23 enum TOK { 24 unknown, ///< Unknown. 25 26 leftParen, ///< Left parenthesis. 27 rightParen, ///< Right parenthesis. 28 29 symbol, ///< Symbol. 30 31 stringLiteral, ///< String literal. 32 33 comma, ///< Lisp comma expression, `,`. 34 backquote, ///< Lisp backquote expression, `\``. 35 singlequote, ///< Lisp singlequote expression, `'`. 36 37 variable, 38 variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW` 39 functionName, 40 41 number, ///< number as integer or floating point literal. 42 43 comment, ///< Comment (to end of line). 44 whitespace, ///< Whitespace. 45 46 emptyList, ///< Empty list. 47 } 48 49 /** Lisp-like token. */ 50 struct Token 51 { 52 this(TOK tok, const(char)[] src = null) pure nothrow @safe @nogc { 53 this.tok = tok; 54 this.src = src; 55 } 56 57 @property final void toString(Sink)(ref scope Sink sink) const @safe { 58 switch (tok) { 59 case TOK.symbol: 60 sink(src); 61 break; 62 case TOK.comma: 63 sink(`,`); 64 break; 65 case TOK.backquote: 66 sink("`"); 67 break; 68 case TOK.singlequote: 69 sink("'"); 70 break; 71 case TOK.stringLiteral: 72 sink(`"`); 73 sink(src); 74 sink(`"`); 75 break; 76 case TOK.emptyList: 77 sink(`()`); 78 break; 79 default: 80 // import std.conv : to; 81 // sink(tok.to!string); 82 if (src) 83 { 84 // sink(`:`); 85 sink(src); 86 } 87 break; 88 } 89 } 90 91 TOK tok; 92 const(char)[] src; // optional source slice 93 } 94 95 /** Lisp-like S-expression. */ 96 struct SExpr { 97 final void toString(Sink)(ref scope Sink sink) const @safe @property { 98 if (subs) { sink(`(`); } 99 100 token.toString(sink); 101 102 TOK lastTok = TOK.unknown; 103 foreach (const ref sub; subs) { 104 import std.algorithm.comparison : among; 105 if (!lastTok.among!(TOK.comma, 106 TOK.backquote, 107 TOK.singlequote)) 108 sink(` `); 109 sub.toString(sink); 110 lastTok = sub.token.tok; 111 } 112 113 if (subs) { sink(`)`); } 114 } 115 116 /* This overload is needed in order for `Sexpr.init.to!string` to be @safe pure, I 117 * don’t know why. Perhaps because of incomplete attribute inference in compiler. */ 118 final string toString() const @property @safe pure { 119 import std.conv : to; 120 return this.to!(typeof(return)); 121 } 122 123 Token token; 124 SExpr[] subs; 125 } 126 127 /** Parse from `input` into lazy range over top-level expressions (`SExpr`). 128 * 129 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 130 */ 131 struct LispParser /+ TODO: convert to `class` +/ 132 { 133 import std.algorithm.comparison : among; 134 135 alias Input = const(char)[]; 136 137 struct Config { 138 uint subExprCountsGuess = 0; 139 bool includeComments; /+ TODO: use bitfield +/ 140 bool includeWhitespace; /+ TODO: use bitfield +/ 141 bool disallowEmptyLists; /+ TODO: use bitfield +/ 142 } 143 144 @safe pure: 145 146 /** Parse `input` into returned array of expressions (`SExpr`). 147 */ 148 this(Input input, Config config = Config.init) @trusted { 149 _input = input; 150 // if (subExprCountsGuess) // if guess use region 151 // { 152 // _subExprsStore = new SExpr[subExprCountsGuess]; // region store 153 // } 154 import std.exception : enforce; 155 import nxt.parsing : isNullTerminated; 156 enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted 157 _config = config; 158 nextFront(); 159 } 160 161 this(this) @disable; 162 163 pragma(inline, true) 164 bool empty() @property const nothrow scope @nogc => _endOfFile; 165 166 pragma(inline, true) 167 ref const(SExpr) front() @property const scope return in(!empty) => _topExprs.back; 168 169 pragma(inline, true) 170 void popFront() in(!empty) { 171 _topExprs.popBack(); 172 nextFront(); 173 } 174 175 @property size_t subExprsCount() const pure nothrow @safe @nogc => _subExprsCount; 176 177 import std.meta : AliasSeq; 178 179 // from std.ascii.isWhite 180 alias endOfLineChars = AliasSeq!('\n', // (0x0a) 181 '\r', // (0x0c) 182 ); 183 alias whiteChars = AliasSeq!(' ', // 0x20 184 '\t', // (0x09) 185 '\n', // (0x0a) 186 '\v', // (0x0b) 187 '\r', // (0x0c) 188 '\f' // (0x0d) 189 ); 190 alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9'); 191 192 private: 193 194 /// Get next `dchar` in input. 195 pragma(inline, true) 196 dchar peekNext() const scope nothrow @nogc 197 => _input[_offset]; /+ TODO: .ptr. TODO: decode `dchar` +/ 198 199 /// Get next `dchar` in input. 200 pragma(inline, true) 201 dchar peekNextNth(size_t n) const scope nothrow @nogc 202 => _input[_offset + n]; /+ TODO: .ptr. TODO: decode `dchar` +/ 203 204 /// Get next n `chars` in input. 205 pragma(inline, true) 206 Input peekNextsN(size_t n) const return scope nothrow @nogc 207 => _input[_offset .. _offset + n]; /+ TODO: .ptr +/ 208 209 /// Drop next byte in input. 210 pragma(inline, true) 211 void dropFront() scope nothrow @nogc { _offset += 1; } 212 213 /// Drop next `n` bytes in input. 214 pragma(inline, true) 215 void dropFrontN(size_t n) scope nothrow @nogc { _offset += n; } 216 217 /// Skip over `n` bytes in input. 218 pragma(inline, true) 219 Input skipOverN(size_t n) return scope nothrow @nogc { 220 const part = _input[_offset .. _offset + n]; /+ TODO: .ptr +/ 221 dropFrontN(n); 222 return part; 223 } 224 225 /// Skip line comment. 226 void skipLineComment() scope nothrow @nogc { 227 while (!peekNext().among!('\0', endOfLineChars)) 228 _offset += 1; 229 } 230 231 /// Get symbol. 232 Input getSymbol() return nothrow @nogc { 233 size_t i = 0; 234 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 235 ++i; 236 return skipOverN(i); 237 } 238 239 /// Get numeric literal (number) in integer or decimal form. 240 Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc { 241 size_t i = 0; 242 while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite 243 ++i; 244 import std.ascii : isAlpha; 245 if (peekNextNth(i).isAlpha) // if followed by letter 246 { 247 size_t alphaCount = 0; 248 while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite 249 { 250 alphaCount += peekNextNth(i).isAlpha; 251 ++i; 252 } 253 gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10 254 } 255 256 return skipOverN(i); 257 } 258 259 /// Get whitespace. 260 Input getWhitespace() return nothrow @nogc { 261 size_t i = 0; 262 while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite` 263 ++i; 264 return skipOverN(i); 265 } 266 267 /// Get string literal in input. 268 Input getStringLiteral() return nothrow @nogc { 269 dropFront(); 270 size_t i = 0; 271 while (!peekNextNth(i).among!('\0', '"')) 272 { 273 if (peekNextNth(i) == '\\' && 274 peekNextNth(i + 1) == '"') 275 { 276 i += 2; // skip \n 277 continue; 278 } 279 ++i; 280 } 281 const literal = peekNextsN(i); 282 dropFrontN(i); 283 if (peekNext() == '"') { dropFront(); } // pop ending double singlequote 284 return literal; 285 } 286 287 SExpr[] dupTopExprs(scope SExpr[] exprs) scope @safe pure nothrow { 288 pragma(inline, true); 289 _subExprsCount += exprs.length; // log it for future optimizations 290 return exprs.dup; /+ TODO: use region allocator stored locally in `LispParser` +/ 291 } 292 293 void nextFront() { 294 import std.range.primitives : empty, front, popFront, popFrontN; 295 import std.uni : isWhite, isAlpha; 296 import std.ascii : isDigit; 297 298 while (true) { 299 switch (_input[_offset]) /+ TODO: .ptr +/ 300 { 301 case ';': 302 if (_config.includeComments) { 303 assert(0, "TODO: don't use skipLineComment"); 304 // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1]))); 305 } 306 else 307 skipLineComment(); 308 break; 309 case '(': 310 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1)))); 311 dropFront(); 312 ++_depth; 313 break; 314 case ')': 315 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1]))); 316 dropFront(); 317 --_depth; 318 // NOTE: this is not needed: _topExprs.popBack(); // pop right paren 319 320 assert(!_topExprs.empty); 321 322 /+ TODO: retroIndexOf +/ 323 size_t count; // number of elements between parens 324 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen) 325 ++count; 326 if (_config.disallowEmptyLists) 327 assert(count != 0); 328 329 import core.lifetime : move; 330 SExpr newExpr = ((count == 0) ? 331 SExpr(Token(TOK.emptyList)) : 332 SExpr(_topExprs[$ - count].token, 333 dupTopExprs(_topExprs[$ - count + 1 .. $]))); 334 _topExprs.popBackN(1 + count); // forget tokens including leftParen 335 _topExprs.insertBack(newExpr.move); 336 337 if (_depth == 0) { // top-level expression done 338 assert(_topExprs.length >= 1); // we should have at least one `SExpr` 339 return; 340 } 341 342 break; 343 case '"': 344 const stringLiteral = getStringLiteral(); /+ TODO: tokenize +/ 345 () @trusted { 346 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral))); 347 }(); 348 break; 349 case ',': 350 dropFront(); 351 _topExprs.insertBack(SExpr(Token(TOK.comma))); 352 break; 353 case '`': 354 dropFront(); 355 _topExprs.insertBack(SExpr(Token(TOK.backquote))); 356 break; 357 case '\'': 358 dropFront(); 359 _topExprs.insertBack(SExpr(Token(TOK.singlequote))); 360 break; 361 case '?': 362 dropFront(); 363 const variableSymbol = getSymbol(); 364 () @trusted { 365 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol))); 366 }(); 367 break; 368 case '@': 369 dropFront(); 370 const variableListSymbol = getSymbol(); 371 () @trusted { 372 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol))); 373 }(); 374 break; 375 // std.ascii.isDigit: 376 case '0': 377 .. 378 case '9': 379 case '+': 380 case '-': 381 case '.': 382 bool gotSymbol; 383 const numberOrSymbol = getNumberOrSymbol(gotSymbol); 384 if (gotSymbol) { 385 // debug writeln("TODO: handle floating point: ", numberOrSymbol); 386 () @trusted { 387 _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol))); 388 }(); 389 } else { 390 () @trusted { 391 _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol))); 392 }(); 393 } 394 break; 395 // from std.ascii.isWhite 396 case ' ': 397 case '\t': 398 case '\n': 399 case '\v': 400 case '\r': 401 case '\f': 402 assert(peekNext.isWhite); 403 getWhitespace(); 404 if (_config.includeWhitespace) 405 _topExprs.insertBack(SExpr(Token(TOK.whitespace, null))); 406 break; 407 case '\0': 408 assert(_depth == 0, "Unbalanced parenthesis at end of file"); 409 _endOfFile = true; 410 return; 411 default: 412 // other 413 if (true// src.front.isAlpha 414 ) 415 { 416 const symbol = getSymbol(); /+ TODO: tokenize +/ 417 import nxt.algorithm.searching : endsWith; 418 if (symbol.endsWith(`Fn`)) { 419 () @trusted { 420 _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol))); 421 }(); 422 } else { 423 () @trusted { 424 _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol))); 425 }(); 426 } 427 } 428 else 429 { 430 import std.conv : to; 431 assert(false, 432 `Cannot handle character '` ~ peekNext.to!string ~ 433 `' at charater offset:` ~ _offset.to!string); 434 } 435 break; 436 } 437 } 438 } 439 440 public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc 441 => expr.ptr - _input.ptr; 442 443 import nxt.line_column : LineColumn, scanLineColumnToOffset; 444 import nxt.offset : Offset; 445 446 public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc 447 => scanLineColumnToOffset(_input, Offset(offset)); 448 449 public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc 450 => scanLineColumnToOffset(_input, Offset(offsetTo(sexpr.token.src))); 451 452 public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc 453 => scanLineColumnToOffset(_input, Offset(offsetTo(chars))); 454 455 private: 456 size_t _offset; // current offset in `_input` 457 const Input _input; // input 458 459 import nxt.container.static_array : StaticArray; 460 // import nxt.container.dynamic_array : DynamicArray; 461 alias TopExprs = StaticArray!(SExpr, 1024); 462 TopExprs _topExprs; // top s-expressions (stack) 463 size_t _subExprsCount; 464 465 // SExpr[] _subExprsStore; // sub s-expressions (region) 466 // size_t _subExprsOffset = 0; // offset into `_subExprsStore` 467 468 size_t _depth; // parenthesis depth 469 bool _endOfFile; // signals null terminator found 470 Config _config; 471 } 472 473 /// 474 pure @safe unittest { 475 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 476 auto parser = LispParser(text); 477 assert(!parser.empty); 478 479 assert(parser.front.token.tok == TOK.symbol); 480 assert(parser.front.token.src == `instance`); 481 482 assert(parser.front.subs[0].token.tok == TOK.functionName); 483 assert(parser.front.subs[0].token.src == "AttrFn"); 484 485 assert(parser.front.subs[1].token.tok == TOK.symbol); 486 assert(parser.front.subs[1].token.src == "BinaryFunction"); 487 488 parser.popFront(); 489 assert(parser.empty); 490 491 } 492 493 /// 494 pure @safe unittest { 495 const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0"; 496 auto parser = LispParser(text); 497 import std.conv : to; 498 assert(parser.front.to!string == `(instance AttrFn BinaryFunction)`); 499 assert(!parser.empty); 500 } 501 502 /** Parse the contents of `file` into lazy range over top-level expressions (`SExpr`). 503 * 504 * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org 505 */ 506 struct LispFileParser /+ TODO: convert to `class` +/ 507 { 508 import nxt.path : FilePath, expandTilde; 509 @safe: 510 this(FilePath file) { 511 const size_t subExprsCount = 0; 512 /+ TODO: lookup `subExprsCount` using `file` extended attr or hash and pass to constructor +/ 513 import nxt.file : rawReadZ; 514 const data = cast(LispParser.Input)rawReadZ(file.expandTilde); // cast to Input because we don't want to keep all file around: 515 parser = LispParser(data, LispParser.Config(subExprsCount)); 516 } 517 ~this() nothrow @nogc { 518 /+ TODO: write parser.subExprsCount +/ 519 } 520 LispParser parser; 521 ptrdiff_t offsetTo(scope const char[] expr) const pure nothrow @safe @nogc => parser.offsetTo(expr); 522 alias parser this; 523 } 524 525 /// Optional integration test if path exists. 526 @safe unittest { 527 const path = FilePath(`~/Work/knet/knowledge/xlg.el`); 528 if (path.exists) { 529 auto parser = LispFileParser(path); 530 assert(!parser.empty); 531 } 532 }