1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp.
2  *
3  * See_Also: https://www.csee.umbc.edu/csee/research/kif/
4  * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format
5  * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf
6  * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org
7  *
8  * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing`
9  *
10  * TODO: Try infinite loops with break or goto instead of for loops.
11  *
12  * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside
13  * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled
14  * for `LispParser`?
15  */
16 module nxt.lispy;
17 
18 @safe:
19 
20 /** Lisp-like token type. */
21 enum TOK
22 {
23     unknown,                    ///< Unknown.
24 
25     leftParen,                  ///< Left parenthesis.
26     rightParen,                 ///< Right parenthesis.
27 
28     symbol,                     ///< Symbol.
29 
30     stringLiteral,              ///< String literal.
31 
32     comma,                      ///< Lisp comma expression, `,`.
33     backquote,                  ///< Lisp backquote expression, `\``.
34     singlequote,                ///< Lisp singlequote expression, `'`.
35 
36     variable,
37     variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW`
38     functionName,
39 
40     number,                     ///< number as integer or floating point literal.
41 
42     comment,                    ///< Comment (to end of line).
43     whitespace,                 ///< Whitespace.
44 
45     emptyList,                  ///< Empty list.
46 }
47 
48 /** Lisp-like token. */
49 struct Token
50 {
51     this(TOK tok, const(char)[] src = null) @safe pure nothrow @nogc
52     {
53         this.tok = tok;
54         this.src = src;
55     }
56 
57     @property final void toString(Sink)(ref scope Sink sink) const @safe
58     {
59         switch (tok)
60         {
61         case TOK.symbol:
62             sink(src);
63             break;
64         case TOK.comma:
65             sink(`,`);
66             break;
67         case TOK.backquote:
68             sink("`");
69             break;
70         case TOK.singlequote:
71             sink("'");
72             break;
73         case TOK.stringLiteral:
74             sink(`"`);
75             sink(src);
76             sink(`"`);
77             break;
78         case TOK.emptyList:
79             sink(`()`);
80             break;
81         default:
82             // import std.conv : to;
83             // sink(tok.to!string);
84             if (src)
85             {
86                 // sink(`:`);
87                 sink(src);
88             }
89             break;
90         }
91     }
92 
93     TOK tok;
94     const(char)[] src;          // optional source slice
95 }
96 
97 /** Lisp-like S-expression. */
98 struct SExpr
99 {
100     final void toString(Sink)(ref scope Sink sink) const @safe @property
101     {
102         if (subs) { sink(`(`); }
103 
104         token.toString(sink);
105 
106         TOK lastTok = TOK.unknown;
107         foreach (const ref sub; subs)
108         {
109             import std.algorithm.comparison : among;
110             if (!lastTok.among!(TOK.comma,
111                                 TOK.backquote,
112                                 TOK.singlequote))
113             {
114                 sink(` `);
115             }
116             sub.toString(sink);
117             lastTok = sub.token.tok;
118         }
119 
120         if (subs) { sink(`)`); }
121     }
122 
123 	/* This overload is needed in order for `Sexpr.init.to!string` to be @safe pure, I
124 	 * don’t know why. Perhaps because of incomplete attribute inference in compiler. */
125     final string toString() const @property @safe pure
126 	{
127 		import std.conv : to;
128 		return this.to!(typeof(return));
129 	}
130 
131     Token token;
132     SExpr[] subs;
133 }
134 
135 /** Parse from `input` into lazy range over top-level expressions (`SExpr`).
136  *
137  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
138  */
139 struct LispParser               // TODO: convert to `class`
140 {
141     import std.algorithm.comparison : among;
142 
143     alias Input = const(char)[];
144 
145 	struct Config
146 	{
147 		uint subExprCountsGuess = 0;
148 		bool includeComments;	 // TODO: use bitfield
149 		bool includeWhitespace;	 // TODO: use bitfield
150 		bool disallowEmptyLists; // TODO: use bitfield
151 	}
152 
153 @safe pure:
154 
155     /** Parse `input` into returned array of expressions (`SExpr`).
156      */
157     this(Input input, Config config = Config.init) @trusted
158     {
159         _input = input;
160         // if (subExprCountsGuess) // if guess use region
161         // {
162         //     _subExprsStore = new SExpr[subExprCountsGuess]; // region store
163         // }
164         import std.exception : enforce;
165         import nxt.parsing : isNullTerminated;
166         enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted
167 		_config = config;
168         nextFront();
169     }
170 
171     @disable this(this);
172 
173 	pragma(inline, true)
174 	bool empty() @property const nothrow scope @nogc => _endOfFile;
175 
176 	pragma(inline, true)
177     ref const(SExpr) front() @property const scope return in(!empty) => _topExprs.back;
178 
179 	pragma(inline, true)
180     void popFront() in(!empty)
181     {
182         _topExprs.popBack();
183         nextFront();
184     }
185 
186     @property size_t subExprsCount() const @safe pure nothrow @nogc => _subExprsCount;
187 
188     import std.meta : AliasSeq;
189 
190     // from std.ascii.isWhite
191     alias endOfLineChars = AliasSeq!('\n', // (0x0a)
192                                      '\r', // (0x0c)
193         );
194     alias whiteChars = AliasSeq!(' ', // 0x20
195                                  '\t', // (0x09)
196                                  '\n', // (0x0a)
197                                  '\v', // (0x0b)
198                                  '\r', // (0x0c)
199                                  '\f' // (0x0d)
200         );
201     alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
202 
203 private:
204 
205     /// Get next `dchar` in input.
206 	pragma(inline, true)
207     dchar peekNext() const scope nothrow @nogc
208 		=> _input[_offset]; // TODO: .ptr. TODO: decode `dchar`
209 
210     /// Get next `dchar` in input.
211 	pragma(inline, true)
212     dchar peekNextNth(size_t n) const scope nothrow @nogc
213 		=> _input[_offset + n]; // TODO: .ptr. TODO: decode `dchar`
214 
215     /// Get next n `chars` in input.
216 	pragma(inline, true)
217     Input peekNextsN(size_t n) const return scope nothrow @nogc
218 		=> _input[_offset .. _offset + n]; // TODO: .ptr
219 
220     /// Drop next byte in input.
221 	pragma(inline, true)
222     void dropFront() scope nothrow @nogc { _offset += 1; }
223 
224     /// Drop next `n` bytes in input.
225 	pragma(inline, true)
226     void dropFrontN(size_t n) scope nothrow @nogc { _offset += n; }
227 
228     /// Skip over `n` bytes in input.
229 	pragma(inline, true)
230     Input skipOverN(size_t n) return scope nothrow @nogc
231     {
232         const part = _input[_offset .. _offset + n]; // TODO: .ptr
233         dropFrontN(n);
234         return part;
235     }
236 
237     /// Skip line comment.
238     void skipLineComment() scope nothrow @nogc
239     {
240         while (!peekNext().among!('\0', endOfLineChars))
241             _offset += 1;
242     }
243 
244     /// Get symbol.
245     Input getSymbol() return nothrow @nogc
246     {
247         size_t i = 0;
248         while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
249             ++i;
250         return skipOverN(i);
251     }
252 
253     /// Get numeric literal (number) in integer or decimal form.
254     Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc
255     {
256         size_t i = 0;
257         while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite
258             ++i;
259         import std.ascii : isAlpha;
260         if (peekNextNth(i).isAlpha) // if followed by letter
261         {
262             size_t alphaCount = 0;
263             while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
264             {
265                 alphaCount += peekNextNth(i).isAlpha;
266                 ++i;
267             }
268             gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10
269         }
270 
271         return skipOverN(i);
272     }
273 
274     /// Get whitespace.
275     Input getWhitespace() return nothrow @nogc
276     {
277         size_t i = 0;
278         while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
279             ++i;
280         return skipOverN(i);
281     }
282 
283     /// Get string literal in input.
284     Input getStringLiteral() return nothrow @nogc
285     {
286         dropFront();
287         size_t i = 0;
288         while (!peekNextNth(i).among!('\0', '"'))
289         {
290             if (peekNextNth(i) == '\\' &&
291                 peekNextNth(i + 1) == '"')
292             {
293                 i += 2;         // skip \n
294                 continue;
295             }
296             ++i;
297         }
298         const literal = peekNextsN(i);
299         dropFrontN(i);
300         if (peekNext() == '"') { dropFront(); } // pop ending double singlequote
301         return literal;
302     }
303 
304     SExpr[] dupTopExprs(scope SExpr[] exprs) scope @safe pure nothrow
305     {
306 		pragma(inline, true);
307         _subExprsCount += exprs.length; // log it for future optimizations
308         return exprs.dup; // TODO: use region allocator stored locally in `LispParser`
309     }
310 
311     void nextFront()
312     {
313         import std.range.primitives : empty, front, popFront, popFrontN;
314         import std.uni : isWhite, isAlpha;
315         import std.ascii : isDigit;
316 
317         while (true)
318         {
319             switch (_input[_offset]) // TODO: .ptr
320             {
321             case ';':
322                 if (_config.includeComments)
323                 {
324                     assert(0, "TODO: don't use skipLineComment");
325                     // _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1])));
326                 }
327                 else
328                     skipLineComment();
329                 break;
330             case '(':
331                 _topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1))));
332                 dropFront();
333                 ++_depth;
334                 break;
335             case ')':
336                 // NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1])));
337                 dropFront();
338                 --_depth;
339                 // NOTE: this is not needed: _topExprs.popBack();   // pop right paren
340 
341                 assert(!_topExprs.empty);
342 
343                 // TODO: retroIndexOf
344                 size_t count; // number of elements between parens
345                 while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen)
346                     ++count;
347                 if (_config.disallowEmptyLists)
348                     assert(count != 0);
349 
350                 import core.lifetime : move;
351                 SExpr newExpr = ((count == 0) ?
352                                  SExpr(Token(TOK.emptyList)) :
353                                  SExpr(_topExprs[$ - count].token,
354                                        dupTopExprs(_topExprs[$ - count + 1 .. $])));
355                 _topExprs.popBackN(1 + count); // forget tokens including leftParen
356                 _topExprs.insertBack(newExpr.move);
357 
358                 if (_depth == 0) // top-level expression done
359                 {
360                     assert(_topExprs.length >= 1); // we should have at least one `SExpr`
361                     return;
362                 }
363 
364                 break;
365             case '"':
366                 const stringLiteral = getStringLiteral(); // TODO: tokenize
367                 _topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral)));
368                 break;
369             case ',':
370                 dropFront();
371                 _topExprs.insertBack(SExpr(Token(TOK.comma)));
372                 break;
373             case '`':
374                 dropFront();
375                 _topExprs.insertBack(SExpr(Token(TOK.backquote)));
376                 break;
377             case '\'':
378                 dropFront();
379                 _topExprs.insertBack(SExpr(Token(TOK.singlequote)));
380                 break;
381             case '?':
382                 dropFront();
383                 const variableSymbol = getSymbol();
384                 _topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol)));
385                 break;
386             case '@':
387                 dropFront();
388                 const variableListSymbol = getSymbol();
389                 _topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol)));
390                 break;
391                 // std.ascii.isDigit:
392             case '0':
393                 ..
394             case '9':
395             case '+':
396             case '-':
397             case '.':
398                 bool gotSymbol;
399                 const numberOrSymbol = getNumberOrSymbol(gotSymbol);
400                 if (gotSymbol)
401                 {
402                     // debug writeln("TODO: handle floating point: ", numberOrSymbol);
403                     _topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol)));
404                 }
405                 else
406                 {
407                     _topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol)));
408                 }
409                 break;
410                 // from std.ascii.isWhite
411             case ' ':
412             case '\t':
413             case '\n':
414             case '\v':
415             case '\r':
416             case '\f':
417                 assert(peekNext.isWhite);
418                 getWhitespace();
419                 if (_config.includeWhitespace)
420                 {
421                     _topExprs.insertBack(SExpr(Token(TOK.whitespace, null)));
422                 }
423                 break;
424             case '\0':
425                 assert(_depth == 0, "Unbalanced parenthesis at end of file");
426                 _endOfFile = true;
427                 return;
428             default:
429                 // other
430                 if (true// src.front.isAlpha
431                     )
432                 {
433                     const symbol = getSymbol(); // TODO: tokenize
434                     import nxt.array_algorithm : endsWith;
435                     if (symbol.endsWith(`Fn`))
436                     {
437                         _topExprs.insertBack(SExpr(Token(TOK.functionName, symbol)));
438                     }
439                     else
440                     {
441                         _topExprs.insertBack(SExpr(Token(TOK.symbol, symbol)));
442                     }
443                 }
444                 else
445                 {
446                     import std.conv : to;
447                     assert(false,
448                            `Cannot handle character '` ~ peekNext.to!string ~
449                            `' at charater offset:` ~ _offset.to!string);
450                 }
451                 break;
452             }
453         }
454     }
455 
456     public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc
457     	=> expr.ptr - _input.ptr;
458 
459     import nxt.line_column : LineColumn, offsetLineColumn;
460 
461     public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc
462     	=> offsetLineColumn(_input, offset);
463 
464     public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc
465 		=> offsetLineColumn(_input, offsetTo(sexpr.token.src));
466 
467     public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc
468 		=> offsetLineColumn(_input, offsetTo(chars));
469 
470 private:
471     size_t _offset;             // current offset in `_input`
472     const Input _input;         // input
473 
474     import nxt.container.static_array : StaticArray;
475     // import nxt.container.dynamic_array : DynamicArray;
476     alias TopExprs = StaticArray!(SExpr, 1024);
477     TopExprs _topExprs;           // top s-expressions (stack)
478     size_t _subExprsCount;
479 
480     // SExpr[] _subExprsStore;     // sub s-expressions (region)
481     // size_t _subExprsOffset = 0; // offset into `_subExprsStore`
482 
483     size_t _depth;              // parenthesis depth
484     bool _endOfFile;            // signals null terminator found
485 	Config _config;
486 }
487 
488 /** Parse the contents of `filePath` into lazy range over top-level expressions (`SExpr`).
489  *
490  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
491  */
492 struct LispFileParser           // TODO: convert to `class`
493 {
494 @safe:
495     this(const string filePath)
496     {
497         import std.path : expandTilde;
498         import nxt.file_ex : rawReadPath;
499         const size_t subExprsCount = 0;
500         // TODO: lookup `subExprsCount` using `filePath` extended attr or hash and pass to constructor
501         const path = filePath.expandTilde;
502         const data = cast(LispParser.Input)rawReadPath(path); // cast to Input because we don't want to keep all file around:
503         parser = LispParser(data, LispParser.Config(subExprsCount));
504     }
505     ~this() nothrow @nogc
506     {
507         // TODO: write parser.subExprsCount
508     }
509     LispParser parser;
510     ptrdiff_t offsetTo(scope const char[] expr) const @safe pure nothrow @nogc => parser.offsetTo(expr);
511     alias parser this;
512 }
513 
514 ///
515 @safe pure unittest
516 {
517     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
518     auto _topExprs = LispParser(text);
519     assert(!_topExprs.empty);
520 
521     assert(_topExprs.front.token.tok == TOK.symbol);
522     assert(_topExprs.front.token.src == `instance`);
523 
524     assert(_topExprs.front.subs[0].token.tok == TOK.functionName);
525     assert(_topExprs.front.subs[0].token.src == "AttrFn");
526 
527     assert(_topExprs.front.subs[1].token.tok == TOK.symbol);
528     assert(_topExprs.front.subs[1].token.src == "BinaryFunction");
529 
530     _topExprs.popFront();
531     assert(_topExprs.empty);
532 
533 }
534 
535 ///
536 @safe pure unittest
537 {
538     const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
539     auto _topExprs = LispParser(text);
540     import std.conv : to;
541     assert(_topExprs.front.to!string == `(instance AttrFn BinaryFunction)`);
542     assert(!_topExprs.empty);
543 }