1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp.
2  *
3  * See_Also: https://www.csee.umbc.edu/csee/research/kif/
4  * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format
5  * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf
6  * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org
7  *
8  * TODO: Try infinite loops with break or goto instead of for loops.
9  *
10  * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside
11  * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled
12  * for `LispParser`?
13  */
14 module nxt.lispy;
15 
16 /** Lisp-like token type. */
17 enum TOK
18 {
19     unknown,                    ///< Unknown.
20 
21     leftParen,                  ///< Left parenthesis.
22     rightParen,                 ///< Right parenthesis.
23 
24     symbol,                     ///< Symbol.
25 
26     stringLiteral,              ///< String literal.
27 
28     comma,                      ///< Lisp comma expression, `,`.
29     backquote,                  ///< Lisp backquote expression, `\``.
30     singlequote,                ///< Lisp singlequote expression, `'`.
31 
32     variable,
33     variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW`
34     functionName,
35 
36     number,                     ///< number as integer or floating point literal.
37 
38     comment,                    ///< Comment (to end of line).
39     whitespace,                 ///< Whitespace.
40 
41     emptyList,                  ///< Empty list.
42 }
43 
44 /** Lisp-like token. */
45 struct Token
46 {
47 @safe:
48     this(TOK tok, const(char)[] src = null) pure nothrow @nogc
49     {
50         this.tok = tok;
51         this.src = src;
52     }
53 
54     @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe
55     {
56         switch (tok)
57         {
58         case TOK.symbol:
59             sink(src);
60             break;
61         case TOK.comma:
62             sink(`,`);
63             break;
64         case TOK.backquote:
65             sink("`");
66             break;
67         case TOK.singlequote:
68             sink("'");
69             break;
70         case TOK.stringLiteral:
71             sink(`"`);
72             sink(src);
73             sink(`"`);
74             break;
75         case TOK.emptyList:
76             sink(`()`);
77             break;
78         default:
79             // import std.conv : to;
80             // sink(tok.to!string);
81             if (src)
82             {
83                 // sink(`:`);
84                 sink(src);
85             }
86             break;
87         }
88     }
89 
90     TOK tok;
91     const(char)[] src;          // optional source slice
92 }
93 
94 /** Lisp-like S-expression. */
95 struct SExpr
96 {
97 @safe:
98     @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe
99     {
100         if (subs) { sink(`(`); }
101 
102         token.toString(sink);
103 
104         TOK lastTok = TOK.unknown;
105         foreach (const ref sub; subs)
106         {
107             import std.algorithm.comparison : among;
108             if (!lastTok.among!(TOK.comma,
109                                 TOK.backquote,
110                                 TOK.singlequote))
111             {
112                 sink(` `);
113             }
114             sub.toString(sink);
115             lastTok = sub.token.tok;
116         }
117 
118         if (subs) { sink(`)`); }
119     }
120 
121     Token token;
122     SExpr[] subs;
123 }
124 
125 /** Returns: true if `s` is null-terminated (ending with `'\0'`).
126  *
127  * Prior to parsing used to verify input to parsers that make use of
128  * sentinel-based search.
129  *
130  * See_Also: https://en.wikipedia.org/wiki/Sentinel_value
131  */
132 bool isNullTerminated(scope const(char)[] s) @safe pure nothrow @nogc
133 {
134     version(D_Coverage) {} else pragma(inline, true);
135     return s.length >= 1 && s[$ - 1] == '\0';
136 }
137 
138 /** Parse from `input` into lazy range over top-level expressions (`SExpr`).
139  *
140  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
141  */
142 struct LispParser               // TODO convert to `class`
143 {
144     import std.algorithm.comparison : among;
145 
146     alias Input = const(char)[];
147 
148 @safe pure:
149 
150     /** Parse `input` into returned array of expressions (`SExpr`).
151      */
152     this(Input input,
153          bool includeComments = false,
154          bool includeWhitespace = false,
155          bool disallowEmptyLists = false,
156          size_t subExprCountsGuess = 0) @trusted
157     {
158         _input = input;
159 
160         // if (subExprCountsGuess) // if guess use region
161         // {
162         //     _subExprsStore = new SExpr[subExprCountsGuess]; // region store
163         // }
164 
165         import std.exception : enforce;
166         enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted
167 
168         _includeComments = includeComments;
169         _includeWhitespace = includeWhitespace;
170         _disallowEmptyLists = disallowEmptyLists;
171 
172         nextFront();
173     }
174 
175     @disable this(this);
176 
177     @property bool empty() const nothrow scope @nogc
178     {
179         version(D_Coverage) {} else pragma(inline, true);
180         return _endOfFile;
181     }
182 
183     ref const(SExpr) front() const scope return
184     {
185         version(D_Coverage) {} else pragma(inline, true);
186         assert(!empty);
187         return _topExprs.back;
188     }
189 
190     void popFront()
191     {
192         version(D_Coverage) {} else pragma(inline, true);
193         assert(!empty);
194         _topExprs.popBack();
195         nextFront();
196     }
197 
198     @property size_t subExprsCount() @safe pure nothrow @nogc
199     {
200         return _subExprsCount;
201     }
202 
203     import std.meta : AliasSeq;
204 
205     // from std.ascii.isWhite
206     alias endOfLineChars = AliasSeq!('\n', // (0x0a)
207                                      '\r', // (0x0c)
208         );
209     alias whiteChars = AliasSeq!(' ', // 0x20
210                                  '\t', // (0x09)
211                                  '\n', // (0x0a)
212                                  '\v', // (0x0b)
213                                  '\r', // (0x0c)
214                                  '\f' // (0x0d)
215         );
216     alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
217 
218 private:
219 
220     /// Get next `char` in input.
221     char peekNext() const scope nothrow @nogc
222     {
223         version(D_Coverage) {} else pragma(inline, true);
224         return _input[_offset];    // TODO .ptr
225     }
226 
227     /// Get next `char` in input.
228     char peekNextNth(size_t n) const nothrow @nogc
229     {
230         version(D_Coverage) {} else pragma(inline, true);
231         return _input[_offset + n]; // TODO .ptr
232     }
233 
234     /// Get next n `chars` in input.
235     Input peekNextsN(size_t n) const return nothrow @nogc
236     {
237         version(D_Coverage) {} else pragma(inline, true);
238         return _input[_offset .. _offset + n]; // TODO .ptr
239     }
240 
241     /// Drop next byte in input.
242     void dropFront() nothrow @nogc
243     {
244         version(D_Coverage) {} else pragma(inline, true);
245         _offset += 1;
246     }
247 
248     /// Drop next `n` bytes in input.
249     void dropFrontN(size_t n) nothrow @nogc
250     {
251         version(D_Coverage) {} else pragma(inline, true);
252         _offset += n;
253     }
254 
255     /// Skip over `n` bytes in input.
256     Input skipOverN(size_t n) return nothrow @nogc
257     {
258         version(D_Coverage) {} else pragma(inline);
259         const part = _input[_offset .. _offset + n]; // TODO .ptr
260         dropFrontN(n);
261         return part;
262     }
263 
264     /// Skip line comment.
265     void skipLineComment() scope nothrow @nogc
266     {
267         while (!peekNext().among!('\0', endOfLineChars))
268         {
269             _offset += 1;
270         }
271     }
272 
273     /// Get symbol.
274     Input getSymbol() return nothrow @nogc
275     {
276         size_t i = 0;
277         while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
278         {
279             ++i;
280         }
281         return skipOverN(i);
282     }
283 
284     /// Get numeric literal (number) in integer or decimal form.
285     Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc
286     {
287         size_t i = 0;
288         while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite
289         {
290             ++i;
291         }
292         import std.ascii : isAlpha;
293         if (peekNextNth(i).isAlpha) // if followed by letter
294         {
295             size_t alphaCount = 0;
296             while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
297             {
298                 alphaCount += peekNextNth(i).isAlpha;
299                 ++i;
300             }
301             gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10
302         }
303 
304         return skipOverN(i);
305     }
306 
307     /// Get whitespace.
308     Input getWhitespace() return nothrow @nogc
309     {
310         size_t i = 0;
311         while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
312         {
313             ++i;
314         }
315         return skipOverN(i);
316     }
317 
318     /// Get string literal in input.
319     Input getStringLiteral() return nothrow @nogc
320     {
321         dropFront();
322         size_t i = 0;
323         while (!peekNextNth(i).among!('\0', '"'))
324         {
325             if (peekNextNth(i) == '\\' &&
326                 peekNextNth(i + 1) == '"')
327             {
328                 i += 2;         // skip \n
329                 continue;
330             }
331             ++i;
332         }
333         const literal = peekNextsN(i);
334         dropFrontN(i);
335         if (peekNext() == '"') { dropFront(); } // pop ending double singlequote
336         return literal;
337     }
338 
339     SExpr[] dupTopExprs(SExpr[] exprs) @safe pure nothrow
340     {
341         version(D_Coverage) {} else pragma(inline, true);
342         _subExprsCount += exprs.length; // log it for future optimizations
343         return exprs.dup; // TODO use region allocator stored locally in `LispParser`
344     }
345 
346     void nextFront()
347     {
348         import std.range.primitives : empty, front, popFront, popFrontN;
349         import std.uni : isWhite, isAlpha;
350         import std.ascii : isDigit;
351 
352         while (true)
353         {
354             switch (_input[_offset]) // TODO .ptr
355             {
356             case ';':
357                 if (_includeComments)
358                 {
359                     assert(0, "TODO don't use skipLineComment");
360                     // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1])));
361                 }
362                 else
363                 {
364                     skipLineComment();
365                 }
366                 break;
367             case '(':
368                 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1))));
369                 dropFront();
370                 ++_depth;
371                 break;
372             case ')':
373                 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1])));
374                 dropFront();
375                 --_depth;
376                 // NOTE: this is not needed: _topExprs.popBack();   // pop right paren
377 
378                 assert(!_topExprs.empty);
379 
380                 // TODO retroIndexOf
381                 size_t count; // number of elements between parens
382                 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen)
383                 {
384                     ++count;
385                 }
386                 if (_disallowEmptyLists)
387                 {
388                     assert(count != 0);
389                 }
390 
391                 import core.lifetime : move;
392                 SExpr newExpr = ((count == 0) ?
393                                  SExpr(Token(TOK.emptyList)) :
394                                  SExpr(_topExprs[$ - count].token,
395                                        dupTopExprs(_topExprs[$ - count + 1 .. $])));
396                 _topExprs.popBackN(1 + count); // forget tokens including leftParen
397                 _topExprs.insertBack(newExpr.move);
398 
399                 if (_depth == 0) // top-level expression done
400                 {
401                     assert(_topExprs.length >= 1); // we should have at least one `SExpr`
402                     return;
403                 }
404 
405                 break;
406             case '"':
407                 const stringLiteral = getStringLiteral(); // TODO tokenize
408                 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral)));
409                 break;
410             case ',':
411                 dropFront();
412                 _topExprs.insertBack(SExpr(Token(TOK.comma)));
413                 break;
414             case '`':
415                 dropFront();
416                 _topExprs.insertBack(SExpr(Token(TOK.backquote)));
417                 break;
418             case '\'':
419                 dropFront();
420                 _topExprs.insertBack(SExpr(Token(TOK.singlequote)));
421                 break;
422             case '?':
423                 dropFront();
424                 const variableSymbol = getSymbol();
425                 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol)));
426                 break;
427             case '@':
428                 dropFront();
429                 const variableListSymbol = getSymbol();
430                 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol)));
431                 break;
432                 // std.ascii.isDigit:
433             case '0':
434             case '1':
435             case '2':
436             case '3':
437             case '4':
438             case '5':
439             case '6':
440             case '7':
441             case '8':
442             case '9':
443             case '+':
444             case '-':
445             case '.':
446                 bool gotSymbol;
447                 const numberOrSymbol = getNumberOrSymbol(gotSymbol);
448                 if (gotSymbol)
449                 {
450                     // debug writeln("TODO handle floating point: ", numberOrSymbol);
451                     _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol)));
452                 }
453                 else
454                 {
455                     _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol)));
456                 }
457                 break;
458                 // from std.ascii.isWhite
459             case ' ':
460             case '\t':
461             case '\n':
462             case '\v':
463             case '\r':
464             case '\f':
465                 assert(peekNext.isWhite);
466                 getWhitespace();
467                 if (_includeWhitespace)
468                 {
469                     _topExprs.insertBack(SExpr(Token(TOK.whitespace, null)));
470                 }
471                 break;
472             case '\0':
473                 assert(_depth == 0, "Unbalanced parenthesis at end of file");
474                 _endOfFile = true;
475                 return;
476             default:
477                 // other
478                 if (true// src.front.isAlpha
479                     )
480                 {
481                     const symbol = getSymbol(); // TODO tokenize
482                     import nxt.array_algorithm : endsWith;
483                     if (symbol.endsWith(`Fn`))
484                     {
485                         _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol)));
486                     }
487                     else
488                     {
489                         _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol)));
490                     }
491                 }
492                 else
493                 {
494                     import std.conv : to;
495                     assert(false,
496                            `Cannot handle character '` ~ peekNext.to!string ~
497                            `' at charater offset:` ~ _offset.to!string);
498                 }
499                 break;
500             }
501         }
502     }
503 
504     public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc
505     {
506         return expr.ptr - _input.ptr;
507     }
508 
509     import nxt.line_column : LineColumn, offsetLineColumn;
510 
511     public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc
512     {
513         return offsetLineColumn(_input, offset);
514     }
515 
516     public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc
517     {
518         return offsetLineColumn(_input, offsetTo(sexpr.token.src));
519     }
520 
521     public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc
522     {
523         return offsetLineColumn(_input, offsetTo(chars));
524     }
525 
526 private:
527     size_t _offset;             // current offset in `_input`
528     const Input _input;         // input
529 
530     import nxt.fixed_array : FixedArray;
531     // import nxt.dynamic_array : DynamicArray;
532     alias TopExprs = FixedArray!(SExpr, 1024);
533     TopExprs _topExprs;           // top s-expressions (stack)
534     size_t _subExprsCount;
535 
536     // SExpr[] _subExprsStore;     // sub s-expressions (region)
537     // size_t _subExprsOffset = 0; // offset into `_subExprsStore`
538 
539     size_t _depth;              // parenthesis depth
540     bool _endOfFile;            // signals null terminator found
541     bool _includeComments = false;
542     bool _includeWhitespace = false;
543     bool _disallowEmptyLists = false;
544 }
545 
546 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`).
547  *
548  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
549  */
550 struct LispFileParser           // TODO convert to `class`
551 {
552 @safe:
553     this(const string filePath)
554     {
555         import std.path : expandTilde;
556         import nxt.file_ex : rawReadPath;
557         size_t subExprsCount = 0;
558         // TODO lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor
559         parser = LispParser(cast(LispParser.Input)filePath.expandTilde.rawReadPath(),
560                             false, false, false, subExprsCount);
561     }
562     ~this() @nogc
563     {
564         // TODO write parser.subExprsCount
565     }
566     LispParser parser;
567     ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc
568     {
569         return parser.offsetTo(expr);
570     }
571     alias parser this;
572 }
573 
574 ///
575 @safe pure unittest
576 {
577     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
578     auto _topExprs = LispParser(text);
579     assert(!_topExprs.empty);
580 
581     assert(_topExprs.front.token.tok == TOK.symbol);
582     assert(_topExprs.front.token.src == `instance`);
583 
584     assert(_topExprs.front.subs[0].token.tok == TOK.functionName);
585     assert(_topExprs.front.subs[0].token.src == "AttrFn");
586 
587     assert(_topExprs.front.subs[1].token.tok == TOK.symbol);
588     assert(_topExprs.front.subs[1].token.src == "BinaryFunction");
589 
590     _topExprs.popFront();
591     assert(_topExprs.empty);
592 
593 }
594 
595 ///
596 @safe unittest
597 {
598     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
599     auto _topExprs = LispParser(text);
600     import std.conv : to;
601     assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`);
602     assert(!_topExprs.empty);
603 }