/** Ada Lexer. */
module nxt.ada_lexer;

import std.typecons;
import std.meta;
import std.array;
import std.algorithm;
import std.range;

import nxt.lexer;
import nxt.ada_defs;
import nxt.stringcache;

/// Operators
private enum operators = ada_defs.operators;

/// Keywords
private enum keywords = ada_defs.keywords2012;

/// Other tokens
private enum dynamicTokens = [
    `string`, `number`, `identifier`, `comment`, `whitespace`
];

/**
 * Flat list of (token prefix, name of the `AdaLexer` member that lexes it)
 * pairs handed to the `Lexer` mixin.
 *
 * The control characters are written as double-quoted strings on purpose:
 * backquoted (WYSIWYG) strings do not process escape sequences, so a
 * backquoted `\t` would be the two characters backslash and `t` rather than
 * a tab.
 */
private enum pseudoTokenHandlers = [
    `"`, `lexStringLiteral`,
    `0`, `lexNumber`,
    `1`, `lexNumber`,
    `2`, `lexNumber`,
    `3`, `lexNumber`,
    `4`, `lexNumber`,
    `5`, `lexNumber`,
    `6`, `lexNumber`,
    `7`, `lexNumber`,
    `8`, `lexNumber`,
    `9`, `lexNumber`,
    ` `, `lexWhitespace`,
    "\t", `lexWhitespace`,
    "\r", `lexWhitespace`,
    "\n", `lexWhitespace`,
    `--`, `lexComment`,
];

/// Token ID type for the Ada lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to Ada token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import nxt.ada_lexer;
 * IdType t = tok!"number";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

/// Extra members mixed into every `Token`: attached comments and ordering by `index`.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};

/// The token type in the Ada lexer
public alias Token = lexer.TokenStructure!(IdType, extraFields);

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    string fileName;
}

/**
 * Lexes `sourceCode` and collects the tokens a parser needs.
 *
 * Whitespace tokens are skipped. Comment tokens are never emitted; those
 * classified as documentation comments are stored in the `comment` field of
 * the next emitted token, or in `trailingComment` of the previous token when
 * they start on the same line as it.
 *
 * Params:
 *   sourceCode = the bytes of the source text to lex.
 *   config = the lexer configuration to use.
 *   cache = the string interning cache used for token text.
 *
 * Returns: an array of the non-whitespace, non-comment tokens of `sourceCode`.
 */
const(Token)[] getTokensForParser(ubyte[] sourceCode, const LexerConfig config,
    StringCache* cache)
{
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    // NOTE(review): these are the documentation-comment markers of D. Every
    // comment produced by `AdaLexer` starts with "--" (the only registered
    // comment handler), so this currently always yields `notDoc` and no
    // comment is ever attached. Decide on an Ada doc-comment convention.
    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 ..3] == "///")
            return CommentType.line;
        if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
            return CommentType.block;
        return CommentType.notDoc;
    }

    auto output = appender!(typeof(return))();
    auto lexer = AdaLexer(sourceCode, config, cache);
    string blockComment; // documentation pending for the next emitted token
    size_t tokenCount;   // number of tokens put into `output` so far
    while (!lexer.empty)
    {
        switch (lexer.front.type)
        {
        case tok!"whitespace":
            lexer.popFront();
            break;
        case tok!"comment":
            final switch (commentType(lexer.front.text))
            {
            case CommentType.block:
                blockComment = lexer.front.text;
                lexer.popFront();
                break;
            case CommentType.line:
                if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
                {
                    // Same line as the previous token: it trails that token.
                    (cast() output.data[tokenCount - 1]).trailingComment = lexer.front.text;
                }
                else
                {
                    // Consecutive line comments are joined into one block.
                    blockComment = cache.intern(blockComment.length == 0 ? lexer.front.text
                        : blockComment ~ "\n" ~ lexer.front.text);
                }
                lexer.popFront();
                break;
            case CommentType.notDoc:
                lexer.popFront();
                break;
            }
            break;
        default:
            Token t = lexer.front;
            lexer.popFront();
            tokenCount++;
            t.comment = blockComment;
            blockComment = null;
            output.put(t);
            break;
        }
    }

    return output.data;
}

/**
 * The Ada lexer.
 */
public struct AdaLexer
{
    import core.vararg;

    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     */
    this(ubyte[] range, const LexerConfig config, StringCache* cache)
    {
        this.range = LexerRange(range);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    /// Advances to the next token.
    public void popFront() pure
    {
        _popFront();
    }

    /// Returns: `true` if the byte at the front of the input starts a whitespace character.
    bool isWhitespace() pure const nothrow
    {
        switch (range.front)
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
            return true;
        case 0xe2:
            // NOTE(review): intended to detect U+2028 / U+2029 (E2 80 A8 / A9).
            // The indexing used here disagrees with how the other members of
            // this struct originally read `range.peek` — confirm what
            // `peek(n)` returns and adjust.
            auto peek = range.peek(2);
            return peek.length == 2
                && peek[0] == 0x80
                && (peek[1] == 0xa8 || peek[1] == 0xa9);
        default:
            return false;
        }
    }

    /**
     * Consumes one character from the input, incrementing the line counter
     * when that character is a line break ("\r", "\n", "\r\n", U+2028 or
     * U+2029).
     */
    void popFrontWhitespaceAware() pure nothrow
    {
        switch (range.front)
        {
        case '\r':
            range.popFront();
            // "\r\n" counts as a single line break.
            if (!range.empty && range.front == '\n')
                range.popFront();
            range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            if (atUnicodeLineSeparator(0))
            {
                range.popFront();
                range.popFront();
                range.popFront();
                range.incrementLine();
            }
            else
                range.popFront();
            return;
        default:
            range.popFront();
            return;
        }
    }

    /**
     * Lexes a string literal.
     *
     * Ada has no backslash escapes; a quotation mark inside a literal is
     * written as two consecutive quotation marks.
     *
     * Returns: a `string` token, or an empty `tok!""` token (after recording
     * an error) when the input ends before the closing quotation mark.
     *
     * See_Also: https://en.wikibooks.org/wiki/Ada_Programming/Lexical_elements#String_literals
     */
    Token lexStringLiteral() pure nothrow @safe
    {
        mixin (tokenStart);
        ubyte quote = range.front;
        range.popFront();
        while (true)
        {
            if (range.empty)
            {
                error("Unterminated string literal");
                return Token(tok!"", null, 0, 0, 0);
            }
            if (range.front == quote)
            {
                range.popFront();
                // A doubled quote is an embedded quote, not the terminator.
                if (!range.empty && range.front == quote)
                {
                    range.popFront();
                    continue;
                }
                break;
            }
            range.popFront();
        }
        return Token(tok!"string", cache.intern(range.slice(mark)), line,
            column, index);
    }

    /**
     * Lexes a run of ASCII whitespace into a single `whitespace` token.
     *
     * Characters are consumed with `popFrontWhitespaceAware` so that line
     * breaks advance the line counter; that helper is not `@safe`, hence this
     * function is not either.
     */
    Token lexWhitespace() pure nothrow
    {
        import std.ascii: isWhite;
        mixin (tokenStart);
        while (!range.empty && isWhite(range.front))
            popFrontWhitespaceAware();
        string text = cache.intern(range.slice(mark));
        return Token(tok!"whitespace", text, line, column, index);
    }

    /**
     * Consumes an exponent part: the exponent marker at the front of the
     * input, an optional sign directly after it, and the following digits.
     */
    void lexExponent() pure nothrow @safe
    {
        range.popFront(); // the 'e' / 'E' marker
        // A sign is only part of the exponent directly after the marker, so
        // that "1e5-3" ends the literal before the '-'.
        if (!range.empty && (range.front == '-' || range.front == '+'))
            range.popFront();
        while (!range.empty && range.front >= '0' && range.front <= '9')
            range.popFront();
    }

    /**
     * Lexes a decimal number: digits, at most one '.' that is followed by a
     * digit, and an optional exponent.
     */
    Token lexNumber() pure nothrow
    {
        mixin (tokenStart);
        bool foundDot = range.front == '.';
        if (foundDot)
            range.popFront();
        decimalLoop: while (!range.empty)
        {
            switch (range.front)
            {
            case '0': .. case '9':
                range.popFront();
                break;
            case 'e':
            case 'E':
                lexExponent();
                break decimalLoop;
            case '.':
                // The dot belongs to the literal only if it is the first one
                // and a digit follows; this keeps "1..10" and "1.name" apart
                // from "1.5".
                if (foundDot || !range.canPeek(1))
                    break decimalLoop;
                const next = range.peekAt(1);
                if (next < '0' || next > '9')
                    break decimalLoop;
                range.popFront();
                foundDot = true;
                break;
            default:
                break decimalLoop;
            }
        }
        return Token(tok!"number", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /**
     * Lexes an Ada comment: the two-character "--" introducer and everything
     * up to, but not including, the end of the line.
     */
    Token lexComment() pure
    {
        return lexLineComment();
    }

    /**
     * Lexes a comment that has a two-character introducer and runs to the end
     * of the line. Behaves exactly like `lexComment`.
     */
    Token lexSlashSlashComment() pure nothrow
    {
        return lexLineComment();
    }

    /// Skips a two-byte comment introducer, then consumes up to the next '\r' or '\n'.
    private Token lexLineComment() pure nothrow
    {
        mixin (tokenStart);
        range.popFrontN(2);
        while (!range.empty && range.front != '\r' && range.front != '\n')
            range.popFront();
        return Token(tok!"comment", cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Lexes an identifier: all bytes up to the next separating character.
     *
     * If the input is empty or starts with a separating character an error is
     * recorded and, when possible, that one character is consumed so the
     * lexer always makes progress.
     */
    Token lexIdentifier() pure nothrow
    {
        mixin (tokenStart);
        uint hash = 0;
        if (isSeparating(0) || range.empty)
        {
            error("Invalid identifier");
            if (!range.empty)
                range.popFront();
        }
        while (!range.empty && !isSeparating(0))
        {
            hash = StringCache.hashStep(range.front, hash);
            range.popFront();
        }
        return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
            column, index);
    }

    /// Returns: `true` if the front of the input is '\n', '\r', U+2028 or U+2029.
    bool isNewline() pure @safe nothrow
    {
        if (range.front == '\n') return true;
        if (range.front == '\r') return true;
        return atUnicodeLineSeparator(0);
    }

    /**
     * Tells whether the byte `offset` positions ahead of the front ends an
     * identifier or keyword. The input is not modified.
     *
     * Returns: `true` at the end of input, for ASCII punctuation, whitespace
     * and control characters, and for U+2028 / U+2029; `false` for ASCII
     * letters, digits, '_' and any other non-ASCII byte.
     */
    bool isSeparating(size_t offset) pure nothrow @safe
    {
        if (!range.canPeek(offset)) return true;
        auto c = range.peekAt(offset);
        if (c >= 'A' && c <= 'Z') return false;
        if (c >= 'a' && c <= 'z') return false;
        if (c <= 0x2f) return true;
        if (c >= ':' && c <= '@') return true;
        if (c >= '[' && c <= '^') return true;
        if (c >= '{' && c <= '~') return true;
        if (c == '`') return true;
        if (c & 0x80)
            return atUnicodeLineSeparator(offset);
        return false;
    }

    /**
     * Returns: `true` if the three bytes starting `offset` positions ahead of
     * the front are the UTF-8 encoding of U+2028 (E2 80 A8) or U+2029
     * (E2 80 A9). Relies on `canPeek(k)` guarding `peekAt(k)`.
     */
    private bool atUnicodeLineSeparator(size_t offset) pure nothrow @safe
    {
        if (!range.canPeek(offset + 2))
            return false;
        return range.peekAt(offset) == 0xe2
            && range.peekAt(offset + 1) == 0x80
            && (range.peekAt(offset + 2) == 0xa8 || range.peekAt(offset + 2) == 0xa9);
    }

    /// Mixed into each `lex*` function to record where the token starts.
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };

    /// Records an error message at the current input position.
    void error(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, true);
    }

    /// Records a warning message at the current input position.
    void warning(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, false);
        assert (messages.length > 0);
    }

    /// A diagnostic produced while lexing.
    struct Message
    {
        size_t line;
        size_t column;
        string message;
        bool isError;
    }

    Message[] messages;
    StringCache* cache;
    LexerConfig config;
}

/// Returns: an `AdaLexer` over `range` using a default configuration and a freshly allocated cache.
public auto byToken(ubyte[] range)
{
    LexerConfig config;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    return AdaLexer(range, config, cache);
}

/// Returns: an `AdaLexer` over `range` using a default configuration and the given `cache`.
public auto byToken(ubyte[] range, StringCache* cache)
{
    LexerConfig config;
    return AdaLexer(range, config, cache);
}

/// Returns: an `AdaLexer` over `range` using the given `config` and `cache`.
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
    return AdaLexer(range, config, cache);
}

unittest {
    assert(getTokensForParser(cast(ubyte[])`X;`, LexerConfig(), new StringCache(StringCache.defaultBucketCount))
           .map!`a.type`()
           .equal([tok!`identifier`,
                   tok!`;`]));
}

unittest {
    assert(getTokensForParser(cast(ubyte[])`x = "a";`, LexerConfig(), new StringCache(StringCache.defaultBucketCount))
           .map!`a.type`()
           .equal([tok!`identifier`,
                   tok!`=`,
                   tok!`string`,
                   tok!`;`]));
}

// A doubled quotation mark is part of the string literal, not its end.
unittest {
    assert(getTokensForParser(cast(ubyte[])`x = "a""b";`, LexerConfig(), new StringCache(StringCache.defaultBucketCount))
           .map!`a.type`()
           .equal([tok!`identifier`,
                   tok!`=`,
                   tok!`string`,
                   tok!`;`]));
}

// A "--" comment ends at the end of its line.
unittest {
    assert(getTokensForParser(cast(ubyte[])"X; -- note\nY;", LexerConfig(), new StringCache(StringCache.defaultBucketCount))
           .map!`a.type`()
           .equal([tok!`identifier`,
                   tok!`;`,
                   tok!`identifier`,
                   tok!`;`]));
}