1 /** Lexer and parser of Lisp-like languages, including SUO-KIF and Emacs-Lisp.
2  *
3  * See_Also: https://www.csee.umbc.edu/csee/research/kif/
4  * See_Also: https://en.wikipedia.org/wiki/Knowledge_Interchange_Format
5  * See_Also: http://sigmakee.cvs.sourceforge.net/viewvc/sigmakee/sigma/suo-kif.pdf
6  * See_Also: http://forum.dlang.org/post/prsxfcmkngfwomygmthi@forum.dlang.org
7  *
8  * TODO: Break out lexer and parser parts to to `nxt.`lexing and `nxt.parsing`
9  *
10  * TODO: Try infinite loops with break or goto instead of for loops.
11  *
12  * TODO: Should we add `LispFile.bySExpr` to allow use of `offsetTo` inside
13  * `LispFileParser lfp; foreach (lfp.bySExpr)` now that copy-ctor is disabled
14  * for `LispParser`?
15  */
16 module nxt.lispy;
17 
18 import nxt.path : FilePath, expandTilde, exists;
19 
20 @safe:
21 
22 /** Lisp-like token type. */
23 enum TOK {
24 	unknown,					///< Unknown.
25 
26 	leftParen,				  ///< Left parenthesis.
27 	rightParen,				 ///< Right parenthesis.
28 
29 	symbol,					 ///< Symbol.
30 
31 	stringLiteral,			  ///< String literal.
32 
33 	comma,					  ///< Lisp comma expression, `,`.
34 	backquote,				  ///< Lisp backquote expression, `\``.
35 	singlequote,				///< Lisp singlequote expression, `'`.
36 
37 	variable,
38 	variableList, ///< one or more variables (parameters) starting with an at-sign, for instance `@ROW`
39 	functionName,
40 
41 	number,					 ///< number as integer or floating point literal.
42 
43 	comment,					///< Comment (to end of line).
44 	whitespace,				 ///< Whitespace.
45 
46 	emptyList,				  ///< Empty list.
47 }
48 
49 /** Lisp-like token. */
50 struct Token
51 {
52 	this(TOK tok, const(char)[] src = null) pure nothrow @safe @nogc {
53 		this.tok = tok;
54 		this.src = src;
55 	}
56 
57 	@property final void toString(Sink)(ref scope Sink sink) const @safe {
58 		switch (tok) {
59 		case TOK.symbol:
60 			sink(src);
61 			break;
62 		case TOK.comma:
63 			sink(`,`);
64 			break;
65 		case TOK.backquote:
66 			sink("`");
67 			break;
68 		case TOK.singlequote:
69 			sink("'");
70 			break;
71 		case TOK.stringLiteral:
72 			sink(`"`);
73 			sink(src);
74 			sink(`"`);
75 			break;
76 		case TOK.emptyList:
77 			sink(`()`);
78 			break;
79 		default:
80 			// import std.conv : to;
81 			// sink(tok.to!string);
82 			if (src)
83 			{
84 				// sink(`:`);
85 				sink(src);
86 			}
87 			break;
88 		}
89 	}
90 
91 	TOK tok;
92 	const(char)[] src;		  // optional source slice
93 }
94 
95 /** Lisp-like S-expression. */
96 struct SExpr {
97 	final void toString(Sink)(ref scope Sink sink) const @safe @property {
98 		if (subs) { sink(`(`); }
99 
100 		token.toString(sink);
101 
102 		TOK lastTok = TOK.unknown;
103 		foreach (const ref sub; subs) {
104 			import std.algorithm.comparison : among;
105 			if (!lastTok.among!(TOK.comma,
106 								TOK.backquote,
107 								TOK.singlequote))
108 				sink(` `);
109 			sub.toString(sink);
110 			lastTok = sub.token.tok;
111 		}
112 
113 		if (subs) { sink(`)`); }
114 	}
115 
116 	/* This overload is needed in order for `Sexpr.init.to!string` to be @safe pure, I
117 	 * don’t know why. Perhaps because of incomplete attribute inference in compiler. */
118 	final string toString() const @property @safe pure {
119 		import std.conv : to;
120 		return this.to!(typeof(return));
121 	}
122 
123 	Token token;
124 	SExpr[] subs;
125 }
126 
127 /** Parse from `input` into lazy range over top-level expressions (`SExpr`).
128  *
129  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
130  */
131 struct LispParser			   /+ TODO: convert to `class` +/
132 {
133 	import std.algorithm.comparison : among;
134 
135 	alias Input = const(char)[];
136 
137 	struct Config {
138 		uint subExprCountsGuess = 0;
139 		bool includeComments;	 /+ TODO: use bitfield +/
140 		bool includeWhitespace;	 /+ TODO: use bitfield +/
141 		bool disallowEmptyLists; /+ TODO: use bitfield +/
142 	}
143 
144 @safe pure:
145 
146 	/** Parse `input` into returned array of expressions (`SExpr`).
147 	 */
148 	this(Input input, Config config = Config.init) @trusted {
149 		_input = input;
150 		// if (subExprCountsGuess) // if guess use region
151 		// {
152 		//	 _subExprsStore = new SExpr[subExprCountsGuess]; // region store
153 		// }
154 		import std.exception : enforce;
155 		import nxt.parsing : isNullTerminated;
156 		enforce(_input.isNullTerminated, "Input isn't null-terminated"); // input cannot be trusted
157 		_config = config;
158 		nextFront();
159 	}
160 
161 	this(this) @disable;
162 
163 	pragma(inline, true)
164 	bool empty() @property const nothrow scope @nogc => _endOfFile;
165 
166 	pragma(inline, true)
167 	ref const(SExpr) front() @property const scope return in(!empty) => _topExprs.back;
168 
169 	pragma(inline, true)
170 	void popFront() in(!empty) {
171 		_topExprs.popBack();
172 		nextFront();
173 	}
174 
175 	@property size_t subExprsCount() const pure nothrow @safe @nogc => _subExprsCount;
176 
177 	import std.meta : AliasSeq;
178 
179 	// from std.ascii.isWhite
180 	alias endOfLineChars = AliasSeq!('\n', // (0x0a)
181 									 '\r', // (0x0c)
182 		);
183 	alias whiteChars = AliasSeq!(' ', // 0x20
184 								 '\t', // (0x09)
185 								 '\n', // (0x0a)
186 								 '\v', // (0x0b)
187 								 '\r', // (0x0c)
188 								 '\f' // (0x0d)
189 		);
190 	alias digitChars = AliasSeq!('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
191 
192 private:
193 
194 	/// Get next `dchar` in input.
195 	pragma(inline, true)
196 	dchar peekNext() const scope nothrow @nogc
197 		=> _input[_offset]; /+ TODO: .ptr. TODO: decode `dchar` +/
198 
199 	/// Get next `dchar` in input.
200 	pragma(inline, true)
201 	dchar peekNextNth(size_t n) const scope nothrow @nogc
202 		=> _input[_offset + n]; /+ TODO: .ptr. TODO: decode `dchar` +/
203 
204 	/// Get next n `chars` in input.
205 	pragma(inline, true)
206 	Input peekNextsN(size_t n) const return scope nothrow @nogc
207 		=> _input[_offset .. _offset + n]; /+ TODO: .ptr +/
208 
209 	/// Drop next byte in input.
210 	pragma(inline, true)
211 	void dropFront() scope nothrow @nogc { _offset += 1; }
212 
213 	/// Drop next `n` bytes in input.
214 	pragma(inline, true)
215 	void dropFrontN(size_t n) scope nothrow @nogc { _offset += n; }
216 
217 	/// Skip over `n` bytes in input.
218 	pragma(inline, true)
219 	Input skipOverN(size_t n) return scope nothrow @nogc {
220 		const part = _input[_offset .. _offset + n]; /+ TODO: .ptr +/
221 		dropFrontN(n);
222 		return part;
223 	}
224 
225 	/// Skip line comment.
226 	void skipLineComment() scope nothrow @nogc {
227 		while (!peekNext().among!('\0', endOfLineChars))
228 			_offset += 1;
229 	}
230 
231 	/// Get symbol.
232 	Input getSymbol() return nothrow @nogc {
233 		size_t i = 0;
234 		while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
235 			++i;
236 		return skipOverN(i);
237 	}
238 
239 	/// Get numeric literal (number) in integer or decimal form.
240 	Input getNumberOrSymbol(out bool gotSymbol) return nothrow @nogc {
241 		size_t i = 0;
242 		while ((peekNextNth(i).among!('+', '-', '.', digitChars))) // NOTE this is faster than !src[i].isWhite
243 			++i;
244 		import std.ascii : isAlpha;
245 		if (peekNextNth(i).isAlpha) // if followed by letter
246 		{
247 			size_t alphaCount = 0;
248 			while ((!peekNextNth(i).among!('\0', '(', ')', '"', whiteChars))) // NOTE this is faster than !src[i].isWhite
249 			{
250 				alphaCount += peekNextNth(i).isAlpha;
251 				++i;
252 			}
253 			gotSymbol = alphaCount >= 2; // at least two letters, excluding floating point such as 1.0e+10
254 		}
255 
256 		return skipOverN(i);
257 	}
258 
259 	/// Get whitespace.
260 	Input getWhitespace() return nothrow @nogc {
261 		size_t i = 0;
262 		while (peekNextNth(i).among!(whiteChars)) // NOTE this is faster than `src[i].isWhite`
263 			++i;
264 		return skipOverN(i);
265 	}
266 
267 	/// Get string literal in input.
268 	Input getStringLiteral() return nothrow @nogc {
269 		dropFront();
270 		size_t i = 0;
271 		while (!peekNextNth(i).among!('\0', '"'))
272 		{
273 			if (peekNextNth(i) == '\\' &&
274 				peekNextNth(i + 1) == '"')
275 			{
276 				i += 2;		 // skip \n
277 				continue;
278 			}
279 			++i;
280 		}
281 		const literal = peekNextsN(i);
282 		dropFrontN(i);
283 		if (peekNext() == '"') { dropFront(); } // pop ending double singlequote
284 		return literal;
285 	}
286 
287 	SExpr[] dupTopExprs(scope SExpr[] exprs) scope @safe pure nothrow {
288 		pragma(inline, true);
289 		_subExprsCount += exprs.length; // log it for future optimizations
290 		return exprs.dup; /+ TODO: use region allocator stored locally in `LispParser` +/
291 	}
292 
293 	void nextFront() {
294 		import std.range.primitives : empty, front, popFront, popFrontN;
295 		import std.uni : isWhite, isAlpha;
296 		import std.ascii : isDigit;
297 
298 		while (true) {
299 			switch (_input[_offset]) /+ TODO: .ptr +/
300 			{
301 			case ';':
302 				if (_config.includeComments) {
303 					assert(0, "TODO: don't use skipLineComment");
304 					// _topExprs.insertBack(SExpr(Token(TOK.comment, src[0 .. 1])));
305 				}
306 				else
307 					skipLineComment();
308 				break;
309 			case '(':
310 				_topExprs.insertBack(SExpr(Token(TOK.leftParen, peekNextsN(1))));
311 				dropFront();
312 				++_depth;
313 				break;
314 			case ')':
315 				// NOTE: this is not needed: _topExprs.insertBack(SExpr(Token(TOK.rightParen, src[0 .. 1])));
316 				dropFront();
317 				--_depth;
318 				// NOTE: this is not needed: _topExprs.popBack();   // pop right paren
319 
320 				assert(!_topExprs.empty);
321 
322 				/+ TODO: retroIndexOf +/
323 				size_t count; // number of elements between parens
324 				while (_topExprs[$ - 1 - count].token.tok != TOK.leftParen)
325 					++count;
326 				if (_config.disallowEmptyLists)
327 					assert(count != 0);
328 
329 				import core.lifetime : move;
330 				SExpr newExpr = ((count == 0) ?
331 								 SExpr(Token(TOK.emptyList)) :
332 								 SExpr(_topExprs[$ - count].token,
333 									   dupTopExprs(_topExprs[$ - count + 1 .. $])));
334 				_topExprs.popBackN(1 + count); // forget tokens including leftParen
335 				_topExprs.insertBack(newExpr.move);
336 
337 				if (_depth == 0) {				   // top-level expression done
338 					assert(_topExprs.length >= 1); // we should have at least one `SExpr`
339 					return;
340 				}
341 
342 				break;
343 			case '"':
344 				const stringLiteral = getStringLiteral(); /+ TODO: tokenize +/
345 				() @trusted {
346 					_topExprs.insertBack(SExpr(Token(TOK.stringLiteral, stringLiteral)));
347 				}();
348 				break;
349 			case ',':
350 				dropFront();
351 				_topExprs.insertBack(SExpr(Token(TOK.comma)));
352 				break;
353 			case '`':
354 				dropFront();
355 				_topExprs.insertBack(SExpr(Token(TOK.backquote)));
356 				break;
357 			case '\'':
358 				dropFront();
359 				_topExprs.insertBack(SExpr(Token(TOK.singlequote)));
360 				break;
361 			case '?':
362 				dropFront();
363 				const variableSymbol = getSymbol();
364 				() @trusted {
365 					_topExprs.insertBack(SExpr(Token(TOK.variable, variableSymbol)));
366 				}();
367 				break;
368 			case '@':
369 				dropFront();
370 				const variableListSymbol = getSymbol();
371 				() @trusted {
372 					_topExprs.insertBack(SExpr(Token(TOK.variableList, variableListSymbol)));
373 				}();
374 				break;
375 				// std.ascii.isDigit:
376 			case '0':
377 				..
378 			case '9':
379 			case '+':
380 			case '-':
381 			case '.':
382 				bool gotSymbol;
383 				const numberOrSymbol = getNumberOrSymbol(gotSymbol);
384 				if (gotSymbol) {
385 					// debug writeln("TODO: handle floating point: ", numberOrSymbol);
386 					() @trusted {
387 						_topExprs.insertBack(SExpr(Token(TOK.symbol, numberOrSymbol)));
388 					}();
389 				} else {
390 					() @trusted {
391 						_topExprs.insertBack(SExpr(Token(TOK.number, numberOrSymbol)));
392 					}();
393 				}
394 				break;
395 				// from std.ascii.isWhite
396 			case ' ':
397 			case '\t':
398 			case '\n':
399 			case '\v':
400 			case '\r':
401 			case '\f':
402 				assert(peekNext.isWhite);
403 				getWhitespace();
404 				if (_config.includeWhitespace)
405 					_topExprs.insertBack(SExpr(Token(TOK.whitespace, null)));
406 				break;
407 			case '\0':
408 				assert(_depth == 0, "Unbalanced parenthesis at end of file");
409 				_endOfFile = true;
410 				return;
411 			default:
412 				// other
413 				if (true// src.front.isAlpha
414 					)
415 				{
416 					const symbol = getSymbol(); /+ TODO: tokenize +/
417 					import nxt.algorithm.searching : endsWith;
418 					if (symbol.endsWith(`Fn`)) {
419 						() @trusted {
420 							_topExprs.insertBack(SExpr(Token(TOK.functionName, symbol)));
421 						}();
422 					} else {
423 						() @trusted {
424 							_topExprs.insertBack(SExpr(Token(TOK.symbol, symbol)));
425 						}();
426 					}
427 				}
428 				else
429 				{
430 					import std.conv : to;
431 					assert(false,
432 						   `Cannot handle character '` ~ peekNext.to!string ~
433 						   `' at charater offset:` ~ _offset.to!string);
434 				}
435 				break;
436 			}
437 		}
438 	}
439 
440 	public ptrdiff_t offsetTo(scope const char[] expr) const @trusted pure nothrow @nogc
441 		=> expr.ptr - _input.ptr;
442 
443 	import nxt.line_column : LineColumn, scanLineColumnToOffset;
444 	import nxt.offset : Offset;
445 
446 	public LineColumn offsetToLineColumn(size_t offset) const @trusted pure nothrow @nogc
447 		=> scanLineColumnToOffset(_input, Offset(offset));
448 
449 	public LineColumn sexprToLineColumn(scope const SExpr sexpr) const @trusted pure nothrow @nogc
450 		=> scanLineColumnToOffset(_input, Offset(offsetTo(sexpr.token.src)));
451 
452 	public LineColumn charsToLineColumn(scope const(char)[] chars) const @trusted pure nothrow @nogc
453 		=> scanLineColumnToOffset(_input, Offset(offsetTo(chars)));
454 
455 private:
456 	size_t _offset;			 // current offset in `_input`
457 	const Input _input;		 // input
458 
459 	import nxt.container.static_array : StaticArray;
460 	// import nxt.container.dynamic_array : DynamicArray;
461 	alias TopExprs = StaticArray!(SExpr, 1024);
462 	TopExprs _topExprs;		   // top s-expressions (stack)
463 	size_t _subExprsCount;
464 
465 	// SExpr[] _subExprsStore;	 // sub s-expressions (region)
466 	// size_t _subExprsOffset = 0; // offset into `_subExprsStore`
467 
468 	size_t _depth;			  // parenthesis depth
469 	bool _endOfFile;			// signals null terminator found
470 	Config _config;
471 }
472 
473 ///
474 pure @safe unittest {
475 	const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
476 	auto parser = LispParser(text);
477 	assert(!parser.empty);
478 
479 	assert(parser.front.token.tok == TOK.symbol);
480 	assert(parser.front.token.src == `instance`);
481 
482 	assert(parser.front.subs[0].token.tok == TOK.functionName);
483 	assert(parser.front.subs[0].token.src == "AttrFn");
484 
485 	assert(parser.front.subs[1].token.tok == TOK.symbol);
486 	assert(parser.front.subs[1].token.src == "BinaryFunction");
487 
488 	parser.popFront();
489 	assert(parser.empty);
490 
491 }
492 
493 ///
494 pure @safe unittest {
495 	const text = ";;a comment\n(instance AttrFn BinaryFunction);;another comment\0";
496 	auto parser = LispParser(text);
497 	import std.conv : to;
498 	assert(parser.front.to!string == `(instance AttrFn BinaryFunction)`);
499 	assert(!parser.empty);
500 }
501 
502 /** Parse the contents of `file` into lazy range over top-level expressions (`SExpr`).
503  *
504  * See_Also: https://forum.dlang.org/post/okqdldjnoyrtuizevqeo@forum.dlang.org
505  */
506 struct LispFileParser		   /+ TODO: convert to `class` +/
507 {
508 	import nxt.path : FilePath, expandTilde;
509 @safe:
510 	this(FilePath file) {
511 		const size_t subExprsCount = 0;
512 		/+ TODO: lookup `subExprsCount` using `file` extended attr or hash and pass to constructor +/
513 		import nxt.file : rawReadZ;
514 		const data = cast(LispParser.Input)rawReadZ(file.expandTilde); // cast to Input because we don't want to keep all file around:
515 		parser = LispParser(data, LispParser.Config(subExprsCount));
516 	}
517 	~this() nothrow @nogc {
518 		/+ TODO: write parser.subExprsCount +/
519 	}
520 	LispParser parser;
521 	ptrdiff_t offsetTo(scope const char[] expr) const pure nothrow @safe @nogc => parser.offsetTo(expr);
522 	alias parser this;
523 }
524 
525 /// Optional integration test if path exists.
526 @safe unittest {
527 	const path = FilePath(`~/Work/knet/knowledge/xlg.el`);
528 	if (path.exists) {
529 		auto parser = LispFileParser(path);
530 		assert(!parser.empty);
531 	}
532 }