1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp.
2  *
3  * See_Also: https://www.csee.umbc.edu/csee/research/kif/
4  * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format
5  * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf
6  * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org
7  *
8  * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing`
9  *
10  * TODO: Try infinite loops with break or goto instead of for loops.
11  *
12  * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside
13  * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled
14  * for `LispParser`?
15  */
16 module nxt.lispy;
17 
18 @safe:
19 
20 /** Lisp-like token type. */
21 enum TOK
22 {
23     unknown,                    ///< Unknown.
24 
25     leftParen,                  ///< Left parenthesis.
26     rightParen,                 ///< Right parenthesis.
27 
28     symbol,                     ///< Symbol.
29 
30     stringLiteral,              ///< String literal.
31 
32     comma,                      ///< Lisp comma expression, `,`.
33     backquote,                  ///< Lisp backquote expression, `\``.
34     singlequote,                ///< Lisp singlequote expression, `'`.
35 
36     variable,
37     variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW`
38     functionName,
39 
40     number,                     ///< number as integer or floating point literal.
41 
42     comment,                    ///< Comment (to end of line).
43     whitespace,                 ///< Whitespace.
44 
45     emptyList,                  ///< Empty list.
46 }
47 
48 /** Lisp-like token. */
49 struct Token
50 {
51 @safe:
52     this(TOK tok, const(char)[] src = null) pure nothrow @nogc
53     {
54         this.tok = tok;
55         this.src = src;
56     }
57 
58     @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe
59     {
60         switch (tok)
61         {
62         case TOK.symbol:
63             sink(src);
64             break;
65         case TOK.comma:
66             sink(`,`);
67             break;
68         case TOK.backquote:
69             sink("`");
70             break;
71         case TOK.singlequote:
72             sink("'");
73             break;
74         case TOK.stringLiteral:
75             sink(`"`);
76             sink(src);
77             sink(`"`);
78             break;
79         case TOK.emptyList:
80             sink(`()`);
81             break;
82         default:
83             // import std.conv : to;
84             // sink(tok.to!string);
85             if (src)
86             {
87                 // sink(`:`);
88                 sink(src);
89             }
90             break;
91         }
92     }
93 
94     TOK tok;
95     const(char)[] src;          // optional source slice
96 }
97 
98 /** Lisp-like S-expression. */
99 struct SExpr
100 {
101 @safe:
102     @property final void toString(scope void delegate(scope const(char)[]) @safe sink) const @safe
103     {
104         if (subs) { sink(`(`); }
105 
106         token.toString(sink);
107 
108         TOK lastTok = TOK.unknown;
109         foreach (const ref sub; subs)
110         {
111             import std.algorithm.comparison : among;
112             if (!lastTok.among!(TOK.comma,
113                                 TOK.backquote,
114                                 TOK.singlequote))
115             {
116                 sink(` `);
117             }
118             sub.toString(sink);
119             lastTok = sub.token.tok;
120         }
121 
122         if (subs) { sink(`)`); }
123     }
124 
125     Token token;
126     SExpr[] subs;
127 }
128 
129 /** Parse from `input` into lazy range over top-level expressions (`SExpr`).
130  *
131  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
132  */
133 struct LispParser               // TODO: convert to `class`
134 {
135     import std.algorithm.comparison : among;
136 
137     alias Input = const(char)[];
138 
139 @safe pure:
140 
141     /** Parse `input` into returned array of expressions (`SExpr`).
142      */
143     this(Input input,
144          bool includeComments = false,
145          bool includeWhitespace = false,
146          bool disallowEmptyLists = false,
147          size_t subExprCountsGuess = 0) @trusted
148     {
149         _input = input;
150 
151         // if (subExprCountsGuess) // if guess use region
152         // {
153         //     _subExprsStore = new SExpr[subExprCountsGuess]; // region store
154         // }
155 
156         import std.exception : enforce;
157         import nxt.parsing : isNullTerminated;
158         enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted
159 
160         _includeComments = includeComments;
161         _includeWhitespace = includeWhitespace;
162         _disallowEmptyLists = disallowEmptyLists;
163 
164         nextFront();
165     }
166 
167     @disable this(this);
168 
169     @property bool empty() const nothrow scope @nogc
170     {
171         version(D_Coverage) {} else pragma(inline, true);
172         return _endOfFile;
173     }
174 
175     ref const(SExpr) front() const scope return
176     {
177         version(D_Coverage) {} else pragma(inline, true);
178         assert(!empty);
179         return _topExprs.back;
180     }
181 
182     void popFront()
183     {
184         version(D_Coverage) {} else pragma(inline, true);
185         assert(!empty);
186         _topExprs.popBack();
187         nextFront();
188     }
189 
190     @property size_t subExprsCount() const @safe pure nothrow @nogc
191     {
192         return _subExprsCount;
193     }
194 
195     import std.meta : AliasSeq;
196 
197     // from std.ascii.isWhite
198     alias endOfLineChars = AliasSeq!('\n', // (0x0a)
199                                      '\r', // (0x0c)
200         );
201     alias whiteChars = AliasSeq!(' ', // 0x20
202                                  '\t', // (0x09)
203                                  '\n', // (0x0a)
204                                  '\v', // (0x0b)
205                                  '\r', // (0x0c)
206                                  '\f' // (0x0d)
207         );
208     alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
209 
210 private:
211 
212     /// Get next `dchar` in input.
213     dchar peekNext() const scope nothrow @nogc
214     {
215         version(D_Coverage) {} else pragma(inline, true);
216         return _input[_offset];    // TODO: .ptr. TODO: decode `dchar`
217     }
218 
219     /// Get next `dchar` in input.
220     dchar peekNextNth(size_t n) const nothrow @nogc
221     {
222         version(D_Coverage) {} else pragma(inline, true);
223         return _input[_offset + n]; // TODO: .ptr. TODO: decode `dchar`
224     }
225 
226     /// Get next n `chars` in input.
227     Input peekNextsN(size_t n) const return nothrow @nogc
228     {
229         version(D_Coverage) {} else pragma(inline, true);
230         return _input[_offset .. _offset + n]; // TODO: .ptr
231     }
232 
233     /// Drop next byte in input.
234     void dropFront() nothrow @nogc
235     {
236         version(D_Coverage) {} else pragma(inline, true);
237         _offset += 1;
238     }
239 
240     /// Drop next `n` bytes in input.
241     void dropFrontN(size_t n) nothrow @nogc
242     {
243         version(D_Coverage) {} else pragma(inline, true);
244         _offset += n;
245     }
246 
247     /// Skip over `n` bytes in input.
248     Input skipOverN(size_t n) return nothrow @nogc
249     {
250         version(D_Coverage) {} else pragma(inline);
251         const part = _input[_offset .. _offset + n]; // TODO: .ptr
252         dropFrontN(n);
253         return part;
254     }
255 
256     /// Skip line comment.
257     void skipLineComment() scope nothrow @nogc
258     {
259         while (!peekNext().among!('\0', endOfLineChars))
260         {
261             _offset += 1;
262         }
263     }
264 
265     /// Get symbol.
266     Input getSymbol() return nothrow @nogc
267     {
268         size_t i = 0;
269         while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
270         {
271             ++i;
272         }
273         return skipOverN(i);
274     }
275 
276     /// Get numeric literal (number) in integer or decimal form.
277     Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc
278     {
279         size_t i = 0;
280         while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite
281         {
282             ++i;
283         }
284         import std.ascii : isAlpha;
285         if (peekNextNth(i).isAlpha) // if followed by letter
286         {
287             size_t alphaCount = 0;
288             while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
289             {
290                 alphaCount += peekNextNth(i).isAlpha;
291                 ++i;
292             }
293             gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10
294         }
295 
296         return skipOverN(i);
297     }
298 
299     /// Get whitespace.
300     Input getWhitespace() return nothrow @nogc
301     {
302         size_t i = 0;
303         while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
304         {
305             ++i;
306         }
307         return skipOverN(i);
308     }
309 
310     /// Get string literal in input.
311     Input getStringLiteral() return nothrow @nogc
312     {
313         dropFront();
314         size_t i = 0;
315         while (!peekNextNth(i).among!('\0', '"'))
316         {
317             if (peekNextNth(i) == '\\' &&
318                 peekNextNth(i + 1) == '"')
319             {
320                 i += 2;         // skip \n
321                 continue;
322             }
323             ++i;
324         }
325         const literal = peekNextsN(i);
326         dropFrontN(i);
327         if (peekNext() == '"') { dropFront(); } // pop ending double singlequote
328         return literal;
329     }
330 
331     SExpr[] dupTopExprs(SExpr[] exprs) @safe pure nothrow
332     {
333         version(D_Coverage) {} else pragma(inline, true);
334         _subExprsCount += exprs.length; // log it for future optimizations
335         return exprs.dup; // TODO: use region allocator stored locally in `LispParser`
336     }
337 
338     void nextFront()
339     {
340         import std.range.primitives : empty, front, popFront, popFrontN;
341         import std.uni : isWhite, isAlpha;
342         import std.ascii : isDigit;
343 
344         while (true)
345         {
346             switch (_input[_offset]) // TODO: .ptr
347             {
348             case ';':
349                 if (_includeComments)
350                 {
351                     assert(0, "TODO: don't use skipLineComment");
352                     // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1])));
353                 }
354                 else
355                 {
356                     skipLineComment();
357                 }
358                 break;
359             case '(':
360                 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1))));
361                 dropFront();
362                 ++_depth;
363                 break;
364             case ')':
365                 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1])));
366                 dropFront();
367                 --_depth;
368                 // NOTE: this is not needed: _topExprs.popBack();   // pop right paren
369 
370                 assert(!_topExprs.empty);
371 
372                 // TODO: retroIndexOf
373                 size_t count; // number of elements between parens
374                 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen)
375                 {
376                     ++count;
377                 }
378                 if (_disallowEmptyLists)
379                 {
380                     assert(count != 0);
381                 }
382 
383                 import core.lifetime : move;
384                 SExpr newExpr = ((count == 0) ?
385                                  SExpr(Token(TOK.emptyList)) :
386                                  SExpr(_topExprs[$ - count].token,
387                                        dupTopExprs(_topExprs[$ - count + 1 .. $])));
388                 _topExprs.popBackN(1 + count); // forget tokens including leftParen
389                 _topExprs.insertBack(newExpr.move);
390 
391                 if (_depth == 0) // top-level expression done
392                 {
393                     assert(_topExprs.length >= 1); // we should have at least one `SExpr`
394                     return;
395                 }
396 
397                 break;
398             case '"':
399                 const stringLiteral = getStringLiteral(); // TODO: tokenize
400                 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral)));
401                 break;
402             case ',':
403                 dropFront();
404                 _topExprs.insertBack(SExpr(Token(TOK.comma)));
405                 break;
406             case '`':
407                 dropFront();
408                 _topExprs.insertBack(SExpr(Token(TOK.backquote)));
409                 break;
410             case '\'':
411                 dropFront();
412                 _topExprs.insertBack(SExpr(Token(TOK.singlequote)));
413                 break;
414             case '?':
415                 dropFront();
416                 const variableSymbol = getSymbol();
417                 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol)));
418                 break;
419             case '@':
420                 dropFront();
421                 const variableListSymbol = getSymbol();
422                 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol)));
423                 break;
424                 // std.ascii.isDigit:
425             case '0':
426                 ..
427             case '9':
428             case '+':
429             case '-':
430             case '.':
431                 bool gotSymbol;
432                 const numberOrSymbol = getNumberOrSymbol(gotSymbol);
433                 if (gotSymbol)
434                 {
435                     // debug writeln("TODO: handle floating point: ", numberOrSymbol);
436                     _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol)));
437                 }
438                 else
439                 {
440                     _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol)));
441                 }
442                 break;
443                 // from std.ascii.isWhite
444             case ' ':
445             case '\t':
446             case '\n':
447             case '\v':
448             case '\r':
449             case '\f':
450                 assert(peekNext.isWhite);
451                 getWhitespace();
452                 if (_includeWhitespace)
453                 {
454                     _topExprs.insertBack(SExpr(Token(TOK.whitespace, null)));
455                 }
456                 break;
457             case '\0':
458                 assert(_depth == 0, "Unbalanced parenthesis at end of file");
459                 _endOfFile = true;
460                 return;
461             default:
462                 // other
463                 if (true// src.front.isAlpha
464                     )
465                 {
466                     const symbol = getSymbol(); // TODO: tokenize
467                     import nxt.array_algorithm : endsWith;
468                     if (symbol.endsWith(`Fn`))
469                     {
470                         _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol)));
471                     }
472                     else
473                     {
474                         _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol)));
475                     }
476                 }
477                 else
478                 {
479                     import std.conv : to;
480                     assert(false,
481                            `Cannot handle character '` ~ peekNext.to!string ~
482                            `' at charater offset:` ~ _offset.to!string);
483                 }
484                 break;
485             }
486         }
487     }
488 
489     public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc
490     {
491         return expr.ptr - _input.ptr;
492     }
493 
494     import nxt.line_column : LineColumn, offsetLineColumn;
495 
496     public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc
497     {
498         return offsetLineColumn(_input, offset);
499     }
500 
501     public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc
502     {
503         return offsetLineColumn(_input, offsetTo(sexpr.token.src));
504     }
505 
506     public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc
507     {
508         return offsetLineColumn(_input, offsetTo(chars));
509     }
510 
511 private:
512     size_t _offset;             // current offset in `_input`
513     const Input _input;         // input
514 
515     import nxt.fixed_array : FixedArray;
516     // import nxt.dynamic_array : DynamicArray;
517     alias TopExprs = FixedArray!(SExpr, 1024);
518     TopExprs _topExprs;           // top s-expressions (stack)
519     size_t _subExprsCount;
520 
521     // SExpr[] _subExprsStore;     // sub s-expressions (region)
522     // size_t _subExprsOffset = 0; // offset into `_subExprsStore`
523 
524     size_t _depth;              // parenthesis depth
525     bool _endOfFile;            // signals null terminator found
526     bool _includeComments = false;
527     bool _includeWhitespace = false;
528     bool _disallowEmptyLists = false;
529 }
530 
531 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`).
532  *
533  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
534  */
535 struct LispFileParser           // TODO: convert to `class`
536 {
537 @safe:
538     this(const string filePath)
539     {
540         import std.path : expandTilde;
541         import nxt.file_ex : rawReadPath;
542         const size_t subExprsCount = 0;
543         // TODO: lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor
544         const path = filePath.expandTilde;
545         const data = cast(LispParser.Input)rawReadPath(path); // cast to Input because we don't want to keep all file around:
546         parser = LispParser(data, false, false, false, subExprsCount);
547     }
548     ~this() @nogc
549     {
550         // TODO: write parser.subExprsCount
551     }
552     LispParser parser;
553     ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc
554     {
555         return parser.offsetTo(expr);
556     }
557     alias parser this;
558 }
559 
560 ///
561 @safe pure unittest
562 {
563     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
564     auto _topExprs = LispParser(text);
565     assert(!_topExprs.empty);
566 
567     assert(_topExprs.front.token.tok == TOK.symbol);
568     assert(_topExprs.front.token.src == `instance`);
569 
570     assert(_topExprs.front.subs[0].token.tok == TOK.functionName);
571     assert(_topExprs.front.subs[0].token.src == "AttrFn");
572 
573     assert(_topExprs.front.subs[1].token.tok == TOK.symbol);
574     assert(_topExprs.front.subs[1].token.src == "BinaryFunction");
575 
576     _topExprs.popFront();
577     assert(_topExprs.empty);
578 
579 }
580 
581 ///
582 @safe unittest
583 {
584     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
585     auto _topExprs = LispParser(text);
586     import std.conv : to;
587     assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`);
588     assert(!_topExprs.empty);
589 }