1 /*
   2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.incubator.code.parser.impl;
  27 
  28 import jdk.incubator.code.parser.impl.Tokens.TokenKind;
  29 import java.util.ArrayList;
  30 import java.util.List;
  31 
  32 /**
  33  * The lexical analyzer maps an input stream consisting of UTF-8 characters and unicode
  34  * escape sequences into a token sequence.
  35  */
  36 final class JavaBasedTokenizer extends UnicodeReader {
    /**
     * Debug switch: if true, token information is printed after each token is read.
     * NOTE(review): the code consulting this flag is not in this part of the file.
     */
    private static final boolean scannerDebug = false;

    /**
     * Sentinel meaning "no position recorded"; scanDigits() uses it to track whether
     * a leading or trailing underscore was seen.
     */
    private static final int NOT_FOUND = -1;

    /**
     * The log used for error reporting. Copied from the scanner factory by the constructor.
     */
    private final Log log;

    /**
     * The token table consulted by checkIdent(). Copied from the scanner factory by
     * the constructor.
     */
    private final Tokens tokens;

    /**
     * The kind of the token being scanned. Assigned by the scan methods and forced to
     * ERROR by lexError().
     */
    Tokens.TokenKind tk;

    /**
     * The radix of the numeric literal being scanned. Reset to 0 at the start of
     * readToken() and assigned by the number scanning methods.
     */
    int radix;

    /**
     * The name of the token being scanned. Reset to null at the start of readToken()
     * and assigned by checkIdent().
     */
    String name;

    /**
     * The position where a lexical error occurred. Starts out as Position.NOPOS and
     * is updated by lexError().
     */
    int errPos = Position.NOPOS;

    /**
     * True if the literal being scanned contains escape sequences. Reset at the start
     * of readToken() and set by scanLitChar().
     */
    boolean hasEscapeSequences;

    /**
     * Buffer in which the text of the current token is accumulated. Emptied at the
     * start of readToken().
     */
    StringBuilder sb;
  86 
  87     /**
  88      * Construct a Java token scanner from the input character array.
  89      *
  90      * @param fac    the factory which created this Scanner
  91      * @param array  the input character array.
  92      * @param length The length of the meaningful content in the array.
  93      */
  94     JavaBasedTokenizer(Scanner.Factory fac, char[] array, int length) {
  95         super(fac, array, length);
  96         this.log = fac.log;
  97         this.tokens = fac.tokens;
  98         this.sb = new StringBuilder(256);
  99     }
 100 
 101     /**
 102      * Report an error at the given position using the provided arguments.
 103      *
 104      * @param pos position in input buffer.
 105      * @param key error key to report.
 106      */
 107     void lexError(int pos, Errors.Error key) {
 108         tk = Tokens.TokenKind.ERROR;
 109         errPos = pos;
 110         log.error(pos, key);
 111     }
 112 
 113     /**
 114      * Add a character to the literal buffer.
 115      *
 116      * @param ch character to add.
 117      */
 118     void put(char ch) {
 119         sb.append(ch);
 120     }
 121 
 122     /**
 123      * Add a codepoint to the literal buffer.
 124      *
 125      * @param codePoint codepoint to add.
 126      */
 127     void putCodePoint(int codePoint) {
 128         sb.appendCodePoint(codePoint);
 129     }
 130 
 131     /**
 132      * Add current character or codepoint to the literal buffer.
 133      */
 134     void put() {
 135         if (isSurrogate()) {
 136             putCodePoint(getCodepoint());
 137         } else {
 138             put(get());
 139         }
 140     }
 141 
 142     /**
 143      * Add a string to the literal buffer.
 144      */
 145     void put(String string) {
 146         sb.append(string);
 147     }
 148 
 149     /**
 150      * Add current character or codepoint to the literal buffer then return next character.
 151      */
 152     char putThenNext() {
 153         put();
 154 
 155         return next();
 156     }
 157 
 158     /**
 159      * If the specified character ch matches the current character then add current character
 160      * to the literal buffer and then advance.
 161      *
 162      * @param ch character to match.
 163      * @return true if ch matches current character.
 164      */
 165     boolean acceptThenPut(char ch) {
 166         if (is(ch)) {
 167             put(get());
 168             next();
 169 
 170             return true;
 171         }
 172 
 173         return false;
 174     }
 175 
 176     /**
 177      * If either ch1 or ch2 matches the current character then add current character
 178      * to the literal buffer and then advance.
 179      *
 180      * @param ch1 first character to match.
 181      * @param ch2 second character to match.
 182      * @return true if either ch1 or ch2 matches current character.
 183      */
 184     boolean acceptOneOfThenPut(char ch1, char ch2) {
 185         if (isOneOf(ch1, ch2)) {
 186             put(get());
 187             next();
 188 
 189             return true;
 190         }
 191 
 192         return false;
 193     }
 194 
 195     /**
 196      * Test if the current character is a line terminator.
 197      *
 198      * @return true if current character is a line terminator.
 199      */
 200     private boolean isEOLN() {
 201         return isOneOf('\n', '\r');
 202     }
 203 
 204     /**
 205      * Processes the current character and places in the literal buffer. If the current
 206      * character is a backslash then the next character is validated as a proper
 207      * escape character. Conversion of escape sequences takes place at end of nextToken().
 208      *
 209      * @param pos position of the first character in literal.
 210      */
 211     private void scanLitChar(int pos) {
 212         if (acceptThenPut('\\')) {
 213             hasEscapeSequences = true;
 214 
 215             switch (get()) {
 216                 case '0':
 217                 case '1':
 218                 case '2':
 219                 case '3':
 220                 case '4':
 221                 case '5':
 222                 case '6':
 223                 case '7':
 224                     char leadch = get();
 225                     putThenNext();
 226 
 227                     if (inRange('0', '7')) {
 228                         putThenNext();
 229 
 230                         if (leadch <= '3' && inRange('0', '7')) {
 231                             putThenNext();
 232                         }
 233                     }
 234                     break;
 235 
 236                 case 'b':
 237                 case 't':
 238                 case 'n':
 239                 case 'f':
 240                 case 'r':
 241                 case '\'':
 242                 case '\"':
 243                 case '\\':
 244                     putThenNext();
 245                     break;
 246 
 247                 case '\n':
 248                 case '\r':
 249                     lexError(position(), Errors.IllegalEscChar);
 250                     break;
 251 
 252                 default:
 253                     lexError(position(), Errors.IllegalEscChar);
 254                     break;
 255             }
 256         } else {
 257             putThenNext();
 258         }
 259     }
 260 
 261     /**
 262      * Scan a string literal.
 263      *
 264      * @param pos position of the first character in literal.
 265      */
 266     private void scanString(int pos) {
 267         // Assume the best.
 268         tk = Tokens.TokenKind.STRINGLITERAL;
 269         // Skip first quote.
 270         next();
 271 
 272         // While characters are available.
 273         while (isAvailable()) {
 274             if (accept('\"')) {
 275                 return;
 276             }
 277 
 278             if (isEOLN()) {
 279                 // Line terminator in string literal is an error.
 280                 // Fall out to unclosed string literal error.
 281                 break;
 282             } else {
 283                 // Add character to string buffer.
 284                 scanLitChar(pos);
 285             }
 286         }
 287 
 288         lexError(pos, Errors.UnclosedStrLit);
 289     }
 290 
 291     /**
 292      * Scan sequence of digits.
 293      *
 294      * @param pos        position of the first character in literal.
 295      * @param digitRadix radix of numeric literal.
 296      */
 297     private void scanDigits(int pos, int digitRadix) {
 298         int leadingUnderscorePos = is('_') ? position() : NOT_FOUND;
 299         int trailingUnderscorePos;
 300 
 301         do {
 302             if (!is('_')) {
 303                 put();
 304                 trailingUnderscorePos = NOT_FOUND;
 305             } else {
 306                 trailingUnderscorePos = position();
 307             }
 308 
 309             next();
 310         } while (digit(pos, digitRadix) >= 0 || is('_'));
 311 
 312         if (leadingUnderscorePos != NOT_FOUND) {
 313             lexError(leadingUnderscorePos, Errors.IllegalUnderscore);
 314         } else if (trailingUnderscorePos != NOT_FOUND) {
 315             lexError(trailingUnderscorePos, Errors.IllegalUnderscore);
 316         }
 317     }
 318 
 319     /**
 320      * Read fractional part of hexadecimal floating point number.
 321      *
 322      * @param pos position of the first character in literal.
 323      */
 324     private void scanHexExponentAndSuffix(int pos) {
 325         if (acceptOneOfThenPut('p', 'P')) {
 326             skipIllegalUnderscores();
 327             acceptOneOfThenPut('+', '-');
 328             skipIllegalUnderscores();
 329 
 330             if (digit(pos, 10) >= 0) {
 331                 scanDigits(pos, 10);
 332             } else {
 333                 lexError(pos, Errors.MalformedFpLit);
 334             }
 335         } else {
 336             lexError(pos, Errors.MalformedFpLit);
 337         }
 338 
 339         if (acceptOneOfThenPut('f', 'F')) {
 340             tk = Tokens.TokenKind.FLOATLITERAL;
 341             radix = 16;
 342         } else {
 343             acceptOneOfThenPut('d', 'D');
 344             tk = Tokens.TokenKind.DOUBLELITERAL;
 345             radix = 16;
 346         }
 347     }
 348 
 349     /**
 350      * Read fractional part of floating point number.
 351      *
 352      * @param pos position of the first character in literal.
 353      */
 354     private void scanFraction(int pos) {
 355         skipIllegalUnderscores();
 356 
 357         if (digit(pos, 10) >= 0) {
 358             scanDigits(pos, 10);
 359         }
 360 
 361         int index = sb.length();
 362 
 363         if (acceptOneOfThenPut('e', 'E')) {
 364             skipIllegalUnderscores();
 365             acceptOneOfThenPut('+', '-');
 366             skipIllegalUnderscores();
 367 
 368             if (digit(pos, 10) >= 0) {
 369                 scanDigits(pos, 10);
 370                 return;
 371             }
 372 
 373             lexError(pos, Errors.MalformedFpLit);
 374             sb.setLength(index);
 375         }
 376     }
 377 
 378     /**
 379      * Read fractional part and 'd' or 'f' suffix of floating point number.
 380      *
 381      * @param pos position of the first character in literal.
 382      */
 383     private void scanFractionAndSuffix(int pos) {
 384         radix = 10;
 385         scanFraction(pos);
 386 
 387         if (acceptOneOfThenPut('f', 'F')) {
 388             tk = Tokens.TokenKind.FLOATLITERAL;
 389         } else {
 390             acceptOneOfThenPut('d', 'D');
 391             tk = Tokens.TokenKind.DOUBLELITERAL;
 392         }
 393     }
 394 
 395     /**
 396      * Read fractional part and 'd' or 'f' suffix of hexadecimal floating point number.
 397      *
 398      * @param pos position of the first character in literal.
 399      */
 400     private void scanHexFractionAndSuffix(int pos, boolean seendigit) {
 401         radix = 16;
 402         assert is('.');
 403         putThenNext();
 404         skipIllegalUnderscores();
 405 
 406         if (digit(pos, 16) >= 0) {
 407             seendigit = true;
 408             scanDigits(pos, 16);
 409         }
 410 
 411         if (!seendigit)
 412             lexError(pos, Errors.InvalidHexNumber);
 413         else
 414             scanHexExponentAndSuffix(pos);
 415     }
 416 
 417     /**
 418      * Skip over underscores and report as a error if found.
 419      */
 420     private void skipIllegalUnderscores() {
 421         if (is('_')) {
 422             lexError(position(), Errors.IllegalUnderscore);
 423             skip('_');
 424         }
 425     }
 426 
 427     /**
 428      * Read a number. (Spec. 3.10)
 429      *
 430      * @param pos   position of the first character in literal.
 431      * @param radix the radix of the number; one of 2, 8, 10, 16.
 432      */
 433     private void scanNumber(int pos, int radix) {
 434         // for octal, allow base-10 digit in case it's a float literal
 435         this.radix = radix;
 436         int digitRadix = (radix == 8 ? 10 : radix);
 437         int firstDigit = digit(pos, Math.max(10, digitRadix));
 438         boolean seendigit = firstDigit >= 0;
 439         boolean seenValidDigit = firstDigit >= 0 && firstDigit < digitRadix;
 440 
 441         if (seendigit) {
 442             scanDigits(pos, digitRadix);
 443         }
 444 
 445         if (radix == 16 && is('.')) {
 446             scanHexFractionAndSuffix(pos, seendigit);
 447         } else if (seendigit && radix == 16 && isOneOf('p', 'P')) {
 448             scanHexExponentAndSuffix(pos);
 449         } else if (digitRadix == 10 && is('.')) {
 450             putThenNext();
 451             scanFractionAndSuffix(pos);
 452         } else if (digitRadix == 10 && isOneOf('e', 'E', 'f', 'F', 'd', 'D')) {
 453             scanFractionAndSuffix(pos);
 454         } else {
 455             if (!seenValidDigit) {
 456                 switch (radix) {
 457                     case 2:
 458                         lexError(pos, Errors.InvalidBinaryNumber);
 459                         break;
 460                     case 16:
 461                         lexError(pos, Errors.InvalidHexNumber);
 462                         break;
 463                 }
 464             }
 465             // If it is not a floating point literal,
 466             // the octal number should be rescanned correctly.
 467             if (radix == 8) {
 468                 sb.setLength(0);
 469                 reset(pos);
 470                 scanDigits(pos, 8);
 471             }
 472 
 473             if (acceptOneOf('l', 'L')) {
 474                 tk = Tokens.TokenKind.LONGLITERAL;
 475             } else {
 476                 tk = Tokens.TokenKind.INTLITERAL;
 477             }
 478         }
 479     }
 480 
 481     /**
 482      * Determines if the sequence in the literal buffer is a token (keyword, operator.)
 483      */
 484     private void checkIdent(Tokens.TokenKind identifier) {
 485         name = sb.toString();
 486         tk = tokens.lookupKind(name, identifier);
 487     }
 488 
 489     /**
 490      * Read an identifier. (Spec. 3.8)
 491      */
 492     private void scanIdent() {
 493         scanIdent(Tokens.TokenKind.IDENTIFIER);
 494     }
 495 
 496     /**
 497      * Read an identifier. (Spec. 3.8)
 498      */
 499     private void scanIdent(Tokens.TokenKind identifier) {
 500         putThenNext();
 501 
 502         do {
 503             switch (get()) {
 504                 case 'A':
 505                 case 'B':
 506                 case 'C':
 507                 case 'D':
 508                 case 'E':
 509                 case 'F':
 510                 case 'G':
 511                 case 'H':
 512                 case 'I':
 513                 case 'J':
 514                 case 'K':
 515                 case 'L':
 516                 case 'M':
 517                 case 'N':
 518                 case 'O':
 519                 case 'P':
 520                 case 'Q':
 521                 case 'R':
 522                 case 'S':
 523                 case 'T':
 524                 case 'U':
 525                 case 'V':
 526                 case 'W':
 527                 case 'X':
 528                 case 'Y':
 529                 case 'Z':
 530                 case 'a':
 531                 case 'b':
 532                 case 'c':
 533                 case 'd':
 534                 case 'e':
 535                 case 'f':
 536                 case 'g':
 537                 case 'h':
 538                 case 'i':
 539                 case 'j':
 540                 case 'k':
 541                 case 'l':
 542                 case 'm':
 543                 case 'n':
 544                 case 'o':
 545                 case 'p':
 546                 case 'q':
 547                 case 'r':
 548                 case 's':
 549                 case 't':
 550                 case 'u':
 551                 case 'v':
 552                 case 'w':
 553                 case 'x':
 554                 case 'y':
 555                 case 'z':
 556                 case '$':
 557                 case '_':
 558                 case '0':
 559                 case '1':
 560                 case '2':
 561                 case '3':
 562                 case '4':
 563                 case '5':
 564                 case '6':
 565                 case '7':
 566                 case '8':
 567                 case '9':
 568                     break;
 569 
 570                 case '\u0000':
 571                 case '\u0001':
 572                 case '\u0002':
 573                 case '\u0003':
 574                 case '\u0004':
 575                 case '\u0005':
 576                 case '\u0006':
 577                 case '\u0007':
 578                 case '\u0008':
 579                 case '\u000E':
 580                 case '\u000F':
 581                 case '\u0010':
 582                 case '\u0011':
 583                 case '\u0012':
 584                 case '\u0013':
 585                 case '\u0014':
 586                 case '\u0015':
 587                 case '\u0016':
 588                 case '\u0017':
 589                 case '\u0018':
 590                 case '\u0019':
 591                 case '\u001B':
 592                 case '\u007F':
 593                     next();
 594                     continue;
 595 
 596                 case '\u001A': // EOI is also a legal identifier part
 597                     if (isAvailable()) {
 598                         next();
 599                         continue;
 600                     }
 601 
 602                     checkIdent(identifier);
 603                     return;
 604 
 605                 default:
 606                     boolean isJavaIdentifierPart;
 607 
 608                     if (isASCII()) {
 609                         // all ASCII range chars already handled, above
 610                         isJavaIdentifierPart = false;
 611                     } else {
 612                         if (Character.isIdentifierIgnorable(get())) {
 613                             next();
 614                             continue;
 615                         }
 616 
 617                         isJavaIdentifierPart = isSurrogate()
 618                                 ? Character.isJavaIdentifierPart(getCodepoint())
 619                                 : Character.isJavaIdentifierPart(get());
 620                     }
 621 
 622                     if (!isJavaIdentifierPart) {
 623                         checkIdent(identifier);
 624                         return;
 625                     }
 626             }
 627 
 628             putThenNext();
 629         } while (true);
 630     }
 631 
 632     /**
 633      * Read token (main entrypoint.)
 634      */
 635     public Tokens.Token readToken() {
 636         sb.setLength(0);
 637         name = null;
 638         radix = 0;
 639         hasEscapeSequences = false;
 640 
 641         int pos = 0;
 642         List<Tokens.Comment> comments = null;
 643 
 644         try {
 645             loop:
 646             while (true) {
 647                 pos = position();
 648 
 649                 switch (get()) {
 650                     case ' ':  // (Spec 3.6)
 651                     case '\t': // (Spec 3.6)
 652                     case '\f': // (Spec 3.6)
 653                         skipWhitespace();
 654                         processWhiteSpace(pos, position());
 655                         break;
 656 
 657                     case '\n': // (Spec 3.4)
 658                         next();
 659                         processLineTerminator(pos, position());
 660                         break;
 661 
 662                     case '\r': // (Spec 3.4)
 663                         next();
 664                         accept('\n');
 665                         processLineTerminator(pos, position());
 666                         break;
 667 
 668                     case 'A':
 669                     case 'B':
 670                     case 'C':
 671                     case 'D':
 672                     case 'E':
 673                     case 'F':
 674                     case 'G':
 675                     case 'H':
 676                     case 'I':
 677                     case 'J':
 678                     case 'K':
 679                     case 'L':
 680                     case 'M':
 681                     case 'N':
 682                     case 'O':
 683                     case 'P':
 684                     case 'Q':
 685                     case 'R':
 686                     case 'S':
 687                     case 'T':
 688                     case 'U':
 689                     case 'V':
 690                     case 'W':
 691                     case 'X':
 692                     case 'Y':
 693                     case 'Z':
 694                     case 'a':
 695                     case 'b':
 696                     case 'c':
 697                     case 'd':
 698                     case 'e':
 699                     case 'f':
 700                     case 'g':
 701                     case 'h':
 702                     case 'i':
 703                     case 'j':
 704                     case 'k':
 705                     case 'l':
 706                     case 'm':
 707                     case 'n':
 708                     case 'o':
 709                     case 'p':
 710                     case 'q':
 711                     case 'r':
 712                     case 's':
 713                     case 't':
 714                     case 'u':
 715                     case 'v':
 716                     case 'w':
 717                     case 'x':
 718                     case 'y':
 719                     case 'z':
 720                     case '$':
 721                     case '_': // (Spec. 3.8)
 722                         scanIdent();
 723                         break loop;
 724 
 725                     case '%':
 726                         scanIdent(Tokens.TokenKind.VALUE_IDENTIFIER);
 727                         break loop;
 728 
 729                     case '0': // (Spec. 3.10)
 730                         next();
 731 
 732                         if (acceptOneOf('x', 'X')) {
 733                             skipIllegalUnderscores();
 734                             scanNumber(pos, 16);
 735                         } else if (acceptOneOf('b', 'B')) {
 736                             skipIllegalUnderscores();
 737                             scanNumber(pos, 2);
 738                         } else {
 739                             put('0');
 740 
 741                             if (is('_')) {
 742                                 int savePos = position();
 743                                 skip('_');
 744 
 745                                 if (digit(pos, 10) < 0) {
 746                                     lexError(savePos, Errors.IllegalUnderscore);
 747                                 }
 748                             }
 749 
 750                             scanNumber(pos, 8);
 751                         }
 752                         break loop;
 753 
 754                     case '1':
 755                     case '2':
 756                     case '3':
 757                     case '4':
 758                     case '5':
 759                     case '6':
 760                     case '7':
 761                     case '8':
 762                     case '9':  // (Spec. 3.10)
 763                         scanNumber(pos, 10);
 764                         break loop;
 765 
 766                     case '.': // (Spec. 3.12)
 767                         next();
 768                         int savePos = position();
 769 
 770                         if (accept('.')) {
 771                             lexError(savePos, Errors.IllegalDot);
 772                         } else if (digit(pos, 10) >= 0) {
 773                             put('.');
 774                             scanFractionAndSuffix(pos); // (Spec. 3.10)
 775                         } else {
 776                             tk = Tokens.TokenKind.DOT;
 777                         }
 778                         break loop;
 779 
 780                     case ',': // (Spec. 3.12)
 781                         next();
 782                         tk = Tokens.TokenKind.COMMA;
 783                         break loop;
 784 
 785                     case '(': // (Spec. 3.12)
 786                         next();
 787                         tk = Tokens.TokenKind.LPAREN;
 788                         break loop;
 789 
 790                     case ')': // (Spec. 3.12)
 791                         next();
 792                         tk = Tokens.TokenKind.RPAREN;
 793                         break loop;
 794 
 795                     case '[': // (Spec. 3.12)
 796                         next();
 797                         tk = Tokens.TokenKind.LBRACKET;
 798                         break loop;
 799 
 800                     case ']': // (Spec. 3.12)
 801                         next();
 802                         tk = Tokens.TokenKind.RBRACKET;
 803                         break loop;
 804 
 805                     case '{': // (Spec. 3.12)
 806                         next();
 807                         tk = Tokens.TokenKind.LBRACE;
 808                         break loop;
 809 
 810                     case '}': // (Spec. 3.12)
 811                         next();
 812                         tk = Tokens.TokenKind.RBRACE;
 813                         break loop;
 814 
 815                     case '?':
 816                         next();
 817                         tk = Tokens.TokenKind.QUES;
 818                         break loop;
 819 
 820                     case ';':
 821                         next();
 822                         tk = Tokens.TokenKind.SEMI;
 823                         break loop;
 824 
 825                     case ':':
 826                         next();
 827                         if (accept(':')) {
 828                             tk = Tokens.TokenKind.COLCOL;
 829                         } else {
 830                             tk = Tokens.TokenKind.COLON;
 831                         }
 832                         break loop;
 833 
 834                     case '&':
 835                         next();
 836                         tk = Tokens.TokenKind.AMP;
 837                         break loop;
 838 
 839                     case '@':
 840                         next();
 841                         tk = Tokens.TokenKind.MONKEYS_AT;
 842                         break loop;
 843 
 844                     case '^':
 845                         next();
 846                         tk = Tokens.TokenKind.CARET;
 847                         break loop;
 848 
 849                     case '=':
 850                         next();
 851                         tk = Tokens.TokenKind.EQ;
 852                         break loop;
 853 
 854                     case '<':
 855                         next();
 856                         tk = Tokens.TokenKind.LT;
 857                         break loop;
 858 
 859                     case '>':
 860                         next();
 861                         tk = Tokens.TokenKind.GT;
 862                         break loop;
 863 
 864                     case '#':
 865                         next();
 866                         tk = TokenKind.HASH;
 867                         break loop;
 868 
 869                     case '+':
 870                         next();
 871                         tk = Tokens.TokenKind.PLUS;
 872                         break loop;
 873 
 874                     case '-':
 875                         next();
 876                         if (accept('>')) {
 877                             tk = Tokens.TokenKind.ARROW;
 878                         } else {
 879                             tk = Tokens.TokenKind.SUB;
 880                         }
 881                         break loop;
 882 
 883                     case '/':
 884                         next();
 885 
 886                         if (accept('/')) { // (Spec. 3.7)
 887                             skipToEOLN();
 888 
 889                             if (isAvailable()) {
 890                                 comments = appendComment(comments, processComment(pos, position(), Tokens.Comment.CommentStyle.LINE));
 891                             }
 892                             break;
 893                         } else if (accept('*')) { // (Spec. 3.7)
 894                             while (isAvailable()) {
 895                                 if (accept('*')) {
 896                                     if (is('/')) {
 897                                         break;
 898                                     }
 899                                 } else {
 900                                     next();
 901                                 }
 902                             }
 903 
 904                             if (accept('/')) {
 905                                 comments = appendComment(comments, processComment(pos, position(), Tokens.Comment.CommentStyle.BLOCK));
 906 
 907                                 break;
 908                             } else {
 909                                 lexError(pos, Errors.UnclosedComment);
 910 
 911                                 break loop;
 912                             }
 913                         } else {
 914                             lexError(pos, Errors.UnclosedComment);
 915                         }
 916                         break loop;
 917 
 918                     case '\'': // (Spec. 3.10)
 919                         next();
 920 
 921                         if (accept('\'')) {
 922                             lexError(pos, Errors.EmptyCharLit);
 923                         } else {
 924                             if (isEOLN()) {
 925                                 lexError(pos, Errors.IllegalLineEndInCharLit);
 926                             }
 927 
 928                             scanLitChar(pos);
 929 
 930                             if (accept('\'')) {
 931                                 tk = Tokens.TokenKind.CHARLITERAL;
 932                             } else {
 933                                 lexError(pos, Errors.UnclosedCharLit);
 934                             }
 935                         }
 936                         break loop;
 937 
 938                     case '\"': // (Spec. 3.10)
 939                         scanString(pos);
 940                         break loop;
 941 
 942                     default:
 943                         boolean isJavaIdentifierStart;
 944 
 945                         if (isASCII()) {
 946                             // all ASCII range chars already handled, above
 947                             isJavaIdentifierStart = false;
 948                         } else {
 949                             isJavaIdentifierStart = isSurrogate()
 950                                     ? Character.isJavaIdentifierStart(getCodepoint())
 951                                     : Character.isJavaIdentifierStart(get());
 952                         }
 953 
 954                         if (isJavaIdentifierStart) {
 955                             scanIdent();
 956                         } else if (digit(pos, 10) >= 0) {
 957                             scanNumber(pos, 10);
 958                         } else if (is((char) EOI) || !isAvailable()) {
 959                             tk = Tokens.TokenKind.EOF;
 960                             pos = position();
 961                         } else {
 962                             String arg;
 963 
 964                             if (isSurrogate()) {
 965                                 int codePoint = getCodepoint();
 966                                 char hi = Character.highSurrogate(codePoint);
 967                                 char lo = Character.lowSurrogate(codePoint);
 968                                 arg = String.format("\\u%04x\\u%04x", (int) hi, (int) lo);
 969                             } else {
 970                                 char ch = get();
 971                                 arg = (32 < ch && ch < 127) ? String.format("%s", ch) :
 972                                         String.format("\\u%04x", (int) ch);
 973                             }
 974 
 975                             lexError(pos, Errors.IllegalChar(arg));
 976                             next();
 977                         }
 978                         break loop;
 979                 }
 980             }
 981 
 982             int endPos = position();
 983 
 984             if (tk.tag == Tokens.Token.Tag.DEFAULT) {
 985                 return new Tokens.Token(tk, pos, endPos, comments);
 986             } else if (tk.tag == Tokens.Token.Tag.NAMED) {
 987                 return new Tokens.NamedToken(tk, pos, endPos, name, comments);
 988             } else {
 989                 // Get characters from string buffer.
 990                 String string = sb.toString();
 991 
 992                 // Translate escape sequences if present.
 993                 if (hasEscapeSequences) {
 994                     try {
 995                         string = string.translateEscapes();
 996                     } catch (Exception ex) {
 997                         // Error already reported, just use untranslated string.
 998                     }
 999                 }
1000 
1001                 if (tk.tag == Tokens.Token.Tag.STRING) {
1002                     // Build string token.
1003                     return new Tokens.StringToken(tk, pos, endPos, string, comments);
1004                 } else {
1005                     // Build numeric token.
1006                     return new Tokens.NumericToken(tk, pos, endPos, string, radix, comments);
1007                 }
1008             }
1009         } finally {
1010             int endPos = position();
1011 
1012             if (scannerDebug) {
1013                 System.out.println("nextToken(" + pos
1014                         + "," + endPos + ")=|" +
1015                         new String(getRawCharacters(pos, endPos))
1016                         + "| " + tk.name());
1017             }
1018         }
1019     }
1020 
1021     /**
1022      * Appends a comment to the list of comments preceding the current token.
1023      *
1024      * @param comments existing list of comments.
1025      * @param comment  comment to append.
1026      * @return new list with comment prepended to the existing list.
1027      */
1028     List<Tokens.Comment> appendComment(List<Tokens.Comment> comments, Tokens.Comment comment) {
1029         if (comments == null) {
1030             comments = new ArrayList<>();
1031         }
1032         // prepend
1033         comments.add(0, comment);
1034         return comments;
1035     }
1036 
1037     /**
1038      * Return the position where a lexical error occurred.
1039      *
1040      * @return position in the input buffer of where the error occurred.
1041      */
1042     public int errPos() {
1043         return errPos;
1044     }
1045 
1046     /**
1047      * Set the position where a lexical error occurred.
1048      *
1049      * @param pos position in the input buffer of where the error occurred.
1050      */
1051     public void errPos(int pos) {
1052         errPos = pos;
1053     }
1054 
1055     /**
1056      * Called when a complete comment has been scanned. pos and endPos
1057      * will mark the comment boundary.
1058      *
1059      * @param pos    position of the opening / in the input buffer.
1060      * @param endPos position + 1 of the closing / in the input buffer.
1061      * @param style  style of comment.
1062      * @return the constructed BasicComment.
1063      */
1064     Tokens.Comment processComment(int pos, int endPos, Tokens.Comment.CommentStyle style) {
1065         if (scannerDebug) {
1066             System.out.println("processComment(" + pos
1067                     + "," + endPos + "," + style + ")=|"
1068                     + new String(getRawCharacters(pos, endPos))
1069                     + "|");
1070         }
1071 
1072         char[] buf = getRawCharacters(pos, endPos);
1073         return new BasicComment(style, new String(buf));
1074     }
1075 
1076     /**
1077      * Called when a complete whitespace run has been scanned. pos and endPos
1078      * will mark the whitespace boundary.
1079      * <p>
1080      * (Spec 3.6)
1081      *
1082      * @param pos    position in input buffer of first whitespace character.
1083      * @param endPos position + 1 in input buffer of last whitespace character.
1084      */
1085     void processWhiteSpace(int pos, int endPos) {
1086         if (scannerDebug) {
1087             System.out.println("processWhitespace(" + pos
1088                     + "," + endPos + ")=|" +
1089                     new String(getRawCharacters(pos, endPos))
1090                     + "|");
1091         }
1092     }
1093 
1094     /**
1095      * Called when a line terminator has been processed.
1096      *
1097      * @param pos    position in input buffer of first character in sequence.
1098      * @param endPos position + 1 in input buffer of last character in sequence.
1099      */
1100     void processLineTerminator(int pos, int endPos) {
1101         if (scannerDebug) {
1102             System.out.println("processTerminator(" + pos
1103                     + "," + endPos + ")=|" +
1104                     new String(getRawCharacters(pos, endPos))
1105                     + "|");
1106         }
1107     }
1108 
1109     /**
1110      * Build a map for translating between line numbers and positions in the input.
1111      *
1112      * @return a LineMap
1113      */
1114     public Position.LineMap getLineMap() {
1115         return Position.makeLineMap(getRawCharacters(), length(), false);
1116     }
1117 
    /**
     * Minimal {@link Tokens.Comment} implementation that pairs the style of a
     * comment with its text. Instances are created by processComment, which
     * passes the raw characters of the scanned comment as the text.
     *
     * @param style Style of comment
     *              LINE starting with //
     *              BLOCK starting with /*
     * @param text  Text of the comment.
     */
    record BasicComment(Tokens.Comment.CommentStyle style, String text) implements Tokens.Comment {
    }
1130 }