1 /** Ada Lexer.  */
2 module nxt.ada_lexer;
3 
4 import std.typecons;
5 import std.meta;
6 import std.array;
7 import std.algorithm;
8 import std.range;
9 
10 import nxt.lexer;
11 import nxt.ada_defs;
12 import nxt.stringcache;
13 
/// Operators (fixed-spelling tokens, from `nxt.ada_defs`)
private enum operators = ada_defs.operators;

/// Keywords (the Ada 2012 reserved words)
private enum keywords = ada_defs.keywords2012;

/// Other tokens — these carry dynamic text rather than a fixed spelling
private enum dynamicTokens = [
	`string`, `number`, `identifier`, `comment`, `whitespace`
	];
24 
/// Flat list of (prefix, handler-name) pairs: when the input starts with the
/// given prefix, the `Lexer` mixin dispatches to the named member function.
private enum pseudoTokenHandlers = [
	`"`, `lexStringLiteral`,
	`0`, `lexNumber`,
	`1`, `lexNumber`,
	`2`, `lexNumber`,
	`3`, `lexNumber`,
	`4`, `lexNumber`,
	`5`, `lexNumber`,
	`6`, `lexNumber`,
	`7`, `lexNumber`,
	`8`, `lexNumber`,
	`9`, `lexNumber`,
	` `, `lexWhitespace`,
	`\t`, `lexWhitespace`,
	`\r`, `lexWhitespace`,
	`\n`, `lexWhitespace`,
	`--`, `lexComment`,
	];
43 
/// Token ID type for the Ada lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
46 
/**
 * Function used for converting an IdType to its string representation.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
57 
/**
 * Template used to refer to Ada token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import nxt.ada_lexer;
 * IdType t = tok!"number";
 * ---
 */
public template tok(string token)
{
	alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}
73 
/// Extra member declarations mixed into the `Token` struct: attached
/// leading/trailing comment text, plus ordering by source index so tokens
/// can be binary-searched by position.
private enum extraFields = q{
	string comment;
	string trailingComment;

	int opCmp(size_t i) const pure nothrow @safe {
		if (index < i) return -1;
		if (index > i) return 1;
		return 0;
	}

	int opCmp(ref const typeof(this) other) const pure nothrow @safe {
		return opCmp(other.index);
	}
};
88 
/// The token type in the Ada lexer (with the `extraFields` members mixed in)
public alias Token = lexer.TokenStructure!(IdType, extraFields);
91 
/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
	string fileName;	// source file name; not read by the visible lexer code — presumably for diagnostics, TODO confirm downstream use
}
99 
/**
 * Returns: an array of tokens lexed from the given source code to the output range. All
 * whitespace tokens are skipped and comments are attached to the token nearest
 * to them.
 *
 * NOTE(review): the comment classification below tests for the D doc-comment
 * prefixes "///", "/++" and "/**". Ada comments all begin with "--", so for
 * Ada input every comment classifies as `notDoc` and is silently dropped;
 * the attachment machinery never triggers. This looks like a leftover from
 * the D lexer this module was derived from — confirm intended behavior.
 */
const(Token)[] getTokensForParser(ubyte[] sourceCode, const LexerConfig config,
								  StringCache* cache)
{
//	import std.stdio;
	// Documentation flavour of a comment's text.
	enum CommentType : ubyte
	{
		notDoc,		// ordinary comment — discarded
		line,		// "///" line doc comment
		block		// "/++" or "/**" block doc comment
	}

	static CommentType commentType(string comment) pure nothrow @safe
	{
		if (comment.length < 3)
			return CommentType.notDoc;
		if (comment[0 ..3] == "///")
			return CommentType.line;
		if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
			return CommentType.block;
		return CommentType.notDoc;
	}

	auto output = appender!(typeof(return))();
	auto lexer = AdaLexer(sourceCode, config, cache);
	string blockComment;	// pending doc comment awaiting the next real token
	size_t tokenCount;
	while (!lexer.empty)
	{
		switch (lexer.front.type)
		{
			case tok!"whitespace":
				lexer.popFront();
			break;
			case tok!"comment":
				final switch (commentType(lexer.front.text))
			{
				case CommentType.block:
					blockComment = lexer.front.text;
					lexer.popFront();
					break;
				case CommentType.line:
					// A line doc comment on the same line as the previously
					// emitted token becomes that token's trailing comment;
					// the cast strips const so the stored token can be patched.
					if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
					{
						// writeln("attaching comment");
						(cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
					}
					else
					{
						// Otherwise accumulate it into the pending comment
						// that will be attached to the next token emitted.
						blockComment = cache.intern(blockComment.length == 0 ? lexer.front.text
													: blockComment ~ "\n" ~ lexer.front.text);
					}
					lexer.popFront();
					break;
				case CommentType.notDoc:
					lexer.popFront();
					break;
			}
				break;
			default:
				// A real token: attach any pending doc comment and emit it.
				Token t = lexer.front;
				lexer.popFront();
				tokenCount++;
				t.comment = blockComment;
				blockComment = null;
				output.put(t);
				break;
		}
	}

	return output.data;
}
176 
/**
 * The Ada lexer.
 *
 * Built on the generic `Lexer` mixin from `nxt.lexer`, specialized with the
 * Ada operator/keyword tables and the pseudo-token handlers defined above.
 */
public struct AdaLexer
{
	import core.vararg;

	mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
				 keywords, pseudoTokenHandlers);

	@disable this();

	/**
	 * Params:
	 *	 range = the bytes that compose the source code that will be lexed.
	 *	 config = the lexer configuration to use.
	 *	 cache = the string interning cache for de-duplicating identifiers and
	 *		 other token text.
	 */
	this(ubyte[] range, const LexerConfig config, StringCache* cache)
	{
		this.range = LexerRange(range);
		this.config = config;
		this.cache = cache;
		popFront();				// prime `front` so the range is immediately usable
	}

	/// Advance to the next token (input-range primitive).
	public void popFront() pure
	{
		_popFront();
	}

	/// Returns: `true` iff the byte at the front of the range begins
	/// whitespace: space, tab, CR, LF, or the UTF-8 encoding of the
	/// U+2028/U+2029 line/paragraph separators (lead byte 0xE2).
	bool isWhitespace() pure const nothrow
	{
		switch (range.front)
		{
			case ' ':
			case '\r':
			case '\n':
			case '\t':
				return true;
			case 0xe2:			// possible U+2028/U+2029 (E2 80 A8 / E2 80 A9)
				auto peek = range.peek(2);
				return peek.length == 2
				&& peek[0] == 0x80
				&& (peek[1] == 0xa8 || peek[1] == 0xa9);
			default:
				return false;
		}
	}

	/// Pop one character while keeping the line counter in sync across the
	/// newline encodings (CRLF, CR, LF, U+2028, U+2029).
	void popFrontWhitespaceAware() pure nothrow
	{
		switch (range.front)
		{
			case '\r':
				range.popFront();
				if (!range.empty && range.front == '\n')
				{
					range.popFront();	// CRLF counts as a single newline
					range.incrementLine();
				}
				else
				range.incrementLine();
				return;
			case '\n':
				range.popFront();
				range.incrementLine();
				return;
			case 0xe2:
				auto lookahead = range.peek(3);
				if (lookahead.length == 3 && lookahead[1] == 0x80
					&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
				{
					range.popFront();
					range.popFront();
					range.popFront();
					range.incrementLine();
					return;
				}
				else
				{
					range.popFront();
					return;
				}
			default:
				range.popFront();
				return;
		}
	}

	/// Lex an Ada string literal.  Ada has no backslash escapes; a doubled
	/// quotation mark inside the literal denotes one embedded quote.
	/// Returns an error token (`tok!""`) on an unterminated literal.
	/// https://en.wikibooks.org/wiki/Ada_Programming/Lexical_elements#String_literals
	Token lexStringLiteral() pure nothrow @safe
	{
		mixin (tokenStart);
		ubyte quote = range.front;
		range.popFront();
		while (true)
		{
			if (range.empty)
				return Token(tok!"", null, 0, 0, 0);	// unterminated literal
			if (range.front == quote)
			{
				range.popFront();
				if (!range.empty && range.front == quote)
					range.popFront();	// "" => embedded quote, keep scanning
				else
					break;				// real closing quote
			}
			else
				range.popFront();
		}
		return Token(tok!"string", cache.intern(range.slice(mark)), line,
					 column, index);
	}

	/// Lex a run of whitespace into a single `whitespace` token, keeping the
	/// line counter accurate (the previous version never called
	/// `incrementLine`, so every token after a newline reported a stale line).
	Token lexWhitespace() pure nothrow @safe
	{
		import std.ascii: isWhite;
		mixin (tokenStart);
		while (!range.empty && isWhite(range.front))
		{
			switch (range.front)
			{
				case '\r':
					range.popFront();
					if (!range.empty && range.front == '\n')
						range.popFront();	// CRLF is one newline
					range.incrementLine();
					break;
				case '\n':
					range.popFront();
					range.incrementLine();
					break;
				default:
					range.popFront();
					break;
			}
		}
		string text = cache.intern(range.slice(mark));
		return Token(tok!"whitespace", text, line, column, index);
	}

	/// Consume an exponent part (`[eE][+-]?digits`); the caller has already
	/// seen the 'e'/'E', which this pops first.  Stops at the first character
	/// that cannot continue the exponent.
	void lexExponent() pure nothrow @safe
	{
		range.popFront();		// the 'e' / 'E'
		bool foundSign = false;
		bool foundDigit = false;
		while (!range.empty)
		{
			switch (range.front)
			{
				case '-':
				case '+':
					if (foundSign)
						return;
					foundSign = true;
					range.popFront();
					break;
				case '0': .. case '9':
					foundDigit = true;
					range.popFront();
					break;
				default:
					return;
			}
		}
	}

	/// Lex a numeric literal (integer or real, with optional exponent).
	/// NOTE(review): Ada based literals (`16#FF#`) and underscores in
	/// numerals (`1_000`) are not handled here — confirm whether that is
	/// intentional.
	Token lexNumber() pure nothrow
	{
		mixin (tokenStart);
		bool foundDot = range.front == '.';
		if (foundDot)
			range.popFront();
	decimalLoop: while (!range.empty)
		{
			switch (range.front)
			{
				case '0': .. case '9':
					range.popFront();
					break;
				case 'e':
				case 'E':
					lexExponent();
					break decimalLoop;
				case '.':
					// ".." is a range operator, not part of the number.
					if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
						break decimalLoop;
					else
					{
						// The following bit of silliness tries to tell the
						// difference between "int dot identifier" and
						// "double identifier".
						if (range.canPeek(1))
						{
							switch (range.peekAt(1))
							{
								case '0': .. case '9':
									goto doubleLiteral;
								default:
									break decimalLoop;
							}
						}
						else
						{
						doubleLiteral:
							range.popFront();
							foundDot = true;
						}
					}
					break;
				default:
					break decimalLoop;
			}
		}
		return Token(tok!"number", cache.intern(range.slice(mark)),
					 line, column, index);
	}

	/// Lex an Ada comment: starts with "--" and runs to the end of the line.
	/// (Ada has no block comments.  The previous implementation scanned for a
	/// D-style "*/" terminator and therefore consumed the remainder of the
	/// file on every Ada comment.)
	/// https://en.wikibooks.org/wiki/Ada_Programming/Lexical_elements#Comments
	Token lexComment() pure nothrow
	{
		mixin (tokenStart);
		IdType type = tok!"comment";
		range.popFrontN(2);		// skip the leading "--"
		while (!range.empty)
		{
			if (range.front == '\r' || range.front == '\n')
				break;			// leave the newline for lexWhitespace
			range.popFront();
		}
		return Token(type, cache.intern(range.slice(mark)), line, column,
					 index);
	}

	/// NOTE(review): leftover from the D lexer this module derives from; no
	/// pseudo-token maps to it.  Lexes a comment running to end of line.
	Token lexSlashSlashComment() pure nothrow
	{
		mixin (tokenStart);
		IdType type = tok!"comment";
		range.popFrontN(2);
		while (!range.empty)
		{
			if (range.front == '\r' || range.front == '\n')
				break;
			range.popFront();
		}
		return Token(type, cache.intern(range.slice(mark)), line, column,
					 index);
	}

	/// Lex an identifier: consume bytes until a separating character, hashing
	/// as we go so the cache lookup can reuse the incrementally computed hash.
	Token lexIdentifier() pure nothrow
	{
		mixin (tokenStart);
		uint hash = 0;
		if (isSeparating(0) || range.empty)
		{
			error("Invalid identifier");
			range.popFront();
		}
		while (!range.empty && !isSeparating(0))
		{
			hash = StringCache.hashStep(range.front, hash);
			range.popFront();
		}
		return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
					 column, index);
	}

	/// Returns: `true` iff the current position starts a newline (LF, CR, or
	/// U+2028/U+2029).
	bool isNewline() pure @safe nothrow
	{
		if (range.front == '\n') return true;
		if (range.front == '\r') return true;
		return (range.front & 0x80) && range.canPeek(2)
		&& (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
	}

	/// Returns: `true` iff the character `offset` bytes ahead terminates an
	/// identifier (i.e. is not a letter/digit/underscore-like character).
	bool isSeparating(size_t offset) pure nothrow @safe
	{
		if (!range.canPeek(offset)) return true;
		auto c = range.peekAt(offset);
		if (c >= 'A' && c <= 'Z') return false;
		if (c >= 'a' && c <= 'z') return false;
		if (c <= 0x2f) return true;
		if (c >= ':' && c <= '@') return true;
		if (c >= '[' && c <= '^') return true;
		if (c >= '{' && c <= '~') return true;
		if (c == '`') return true;
		if (c & 0x80)
		{
			// Look ahead on a COPY so the lexer's own position is untouched.
			// (The previous code popped the member `range` itself, silently
			// skipping `offset` bytes of input from inside a predicate.)
			auto r = range;
			r.popFrontN(offset);
			return (r.canPeek(2) && (r.peek(2) == "\u2028"
									 || r.peek(2) == "\u2029"));
		}
		return false;
	}

	/// Mixed into each handler: captures the token's start position and a
	/// mark for slicing its text out of the input afterwards.
	enum tokenStart = q{
		size_t index = range.index;
		size_t column = range.column;
		size_t line = range.line;
		auto mark = range.mark();
	};

	/// Record an error message at the current position.
	void error(string message) pure nothrow @safe
	{
		messages ~= Message(range.line, range.column, message, true);
	}

	/// Record a warning message at the current position.
	void warning(string message) pure nothrow @safe
	{
		messages ~= Message(range.line, range.column, message, false);
		assert (messages.length > 0);
	}

	/// A diagnostic produced while lexing.
	struct Message
	{
		size_t line;
		size_t column;
		string message;
		bool isError;
	}

	Message[] messages;		// diagnostics accumulated during lexing
	StringCache* cache;		// interning cache for token text
	LexerConfig config;		// configuration passed at construction
}
502 
/// Lex `range` using a default configuration and a freshly allocated
/// string-interning cache.
public auto byToken(ubyte[] range)
{
	auto cache = new StringCache(StringCache.defaultBucketCount);
	return AdaLexer(range, LexerConfig.init, cache);
}
509 
/// Lex `range` using a default configuration, sharing the caller's
/// string-interning cache.
public auto byToken(ubyte[] range, StringCache* cache)
{
	return AdaLexer(range, LexerConfig.init, cache);
}
515 
/// Lex `range` with an explicit configuration and string-interning cache.
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
	auto lexer = AdaLexer(range, config, cache);
	return lexer;
}
520 
unittest {
	// A bare identifier followed by ';' lexes into exactly two tokens.
	auto cache = new StringCache(StringCache.defaultBucketCount);
	const toks = getTokensForParser(cast(ubyte[])`X;`, LexerConfig(), cache);
	assert(toks.map!(t => t.type)
		   .equal([tok!`identifier`, tok!`;`]));
}
527 
unittest {
	// An assignment with a string literal lexes into four tokens.
	auto cache = new StringCache(StringCache.defaultBucketCount);
	const toks = getTokensForParser(cast(ubyte[])`x = "a";`, LexerConfig(), cache);
	assert(toks.map!(t => t.type)
		   .equal([tok!`identifier`, tok!`=`, tok!`string`, tok!`;`]));
}