/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.incubator.code.parser.impl;

import jdk.incubator.code.parser.impl.Tokens.TokenKind;
import java.util.ArrayList;
import java.util.List;

/**
 * The lexical analyzer maps an input stream consisting of UTF-8 characters and unicode
 * escape sequences into a token sequence.
 * <p>
 * Each call to {@link #readToken()} scans one token; the fields {@code tk}, {@code radix},
 * {@code name}, {@code hasEscapeSequences} and {@code sb} hold the state of the token
 * currently being scanned.
 */
final class JavaBasedTokenizer extends UnicodeReader {
    /**
     * If true then prints token information after each readToken().
     */
    private static final boolean scannerDebug = false;

    /**
     * Sentinel for non-value.
     */
    private static final int NOT_FOUND = -1;

    /**
     * The log to be used for error reporting. Copied from scanner factory.
     */
    private final Log log;

    /**
     * The token factory. Copied from scanner factory.
     */
    private final Tokens tokens;

    /**
     * The token kind, set by readToken().
     */
    Tokens.TokenKind tk;

    /**
     * The token's radix, set by readToken().
     */
    int radix;

    /**
     * The token's name, set by readToken().
     */
    String name;

    /**
     * The position where a lexical error occurred.
     */
    int errPos = Position.NOPOS;

    /**
     * true if the literal contains escape sequences, set by readToken().
     */
    boolean hasEscapeSequences;

    /**
     * Buffer for building literals, used by readToken().
     */
    StringBuilder sb;

    /**
     * Construct a Java token scanner from the input character array.
     *
     * @param fac    the factory which created this Scanner
     * @param array  the input character array.
     * @param length The length of the meaningful content in the array.
     */
    JavaBasedTokenizer(Scanner.Factory fac, char[] array, int length) {
        super(fac, array, length);
        this.log = fac.log;
        this.tokens = fac.tokens;
        this.sb = new StringBuilder(256);
    }

    /**
     * Report an error at the given position. Marks the current token as
     * {@code ERROR} and records the error position before logging.
     *
     * @param pos position in input buffer.
     * @param key error key to report.
     */
    void lexError(int pos, Errors.Error key) {
        tk = TokenKind.ERROR;
        errPos = pos;
        log.error(pos, key);
    }

    /**
     * Add a character to the literal buffer.
     *
     * @param ch character to add.
     */
    void put(char ch) {
        sb.append(ch);
    }

    /**
     * Add a codepoint to the literal buffer.
     *
     * @param codePoint codepoint to add.
     */
    void putCodePoint(int codePoint) {
        sb.appendCodePoint(codePoint);
    }

    /**
     * Add current character or codepoint to the literal buffer.
     */
    void put() {
        if (isSurrogate()) {
            putCodePoint(getCodepoint());
        } else {
            put(get());
        }
    }

    /**
     * Add a string to the literal buffer.
     *
     * @param string string to add.
     */
    void put(String string) {
        sb.append(string);
    }

    /**
     * Add current character or codepoint to the literal buffer then return next character.
     *
     * @return the value returned by {@code next()}.
     */
    char putThenNext() {
        put();

        return next();
    }

    /**
     * If the specified character ch matches the current character then add current character
     * to the literal buffer and then advance.
     *
     * @param ch character to match.
     * @return true if ch matches current character.
     */
    boolean acceptThenPut(char ch) {
        if (is(ch)) {
            put(get());
            next();

            return true;
        }

        return false;
    }

    /**
     * If either ch1 or ch2 matches the current character then add current character
     * to the literal buffer and then advance.
     *
     * @param ch1 first character to match.
     * @param ch2 second character to match.
     * @return true if either ch1 or ch2 matches current character.
     */
    boolean acceptOneOfThenPut(char ch1, char ch2) {
        if (isOneOf(ch1, ch2)) {
            put(get());
            next();

            return true;
        }

        return false;
    }

    /**
     * Test if the current character is a line terminator.
     *
     * @return true if current character is a line terminator.
     */
    private boolean isEOLN() {
        return isOneOf('\n', '\r');
    }

    /**
     * Processes the current character and places it in the literal buffer. If the current
     * character is a backslash then the next character is validated as a proper
     * escape character. Conversion of escape sequences takes place at end of readToken().
     *
     * @param pos position of the first character in literal (not used by this method).
     */
    private void scanLitChar(int pos) {
        if (acceptThenPut('\\')) {
            hasEscapeSequences = true;

            switch (get()) {
                case '0', '1', '2', '3', '4', '5', '6', '7':
                    // Octal escape: up to three digits, the third only if the
                    // leading digit is in the range 0..3.
                    char leadch = get();
                    putThenNext();

                    if (inRange('0', '7')) {
                        putThenNext();

                        if (leadch <= '3' && inRange('0', '7')) {
                            putThenNext();
                        }
                    }
                    break;

                case 'b', 't', 'n', 'f', 'r', '\'', '\"', '\\':
                    putThenNext();
                    break;

                default:
                    // Anything else, including a line terminator, is not a
                    // valid escape character.
                    lexError(position(), Errors.IllegalEscChar);
                    break;
            }
        } else {
            putThenNext();
        }
    }

    /**
     * Scan a string literal.
     *
     * @param pos position of the first character in literal.
     */
    private void scanString(int pos) {
        // Assume the best.
        tk = TokenKind.STRINGLITERAL;
        // Skip first quote.
        next();

        // While characters are available.
        while (isAvailable()) {
            if (accept('\"')) {
                return;
            }

            if (isEOLN()) {
                // Line terminator in string literal is an error.
                // Fall out to unclosed string literal error.
                break;
            } else {
                // Add character to string buffer.
                scanLitChar(pos);
            }
        }

        lexError(pos, Errors.UnclosedStrLit);
    }

    /**
     * Scan sequence of digits. Underscores between digits are skipped (not added to
     * the literal buffer); a leading or trailing underscore is reported as an error.
     *
     * @param pos        position of the first character in literal.
     * @param digitRadix radix of numeric literal.
     */
    private void scanDigits(int pos, int digitRadix) {
        int leadingUnderscorePos = is('_') ? position() : NOT_FOUND;
        int trailingUnderscorePos;

        do {
            if (!is('_')) {
                put();
                trailingUnderscorePos = NOT_FOUND;
            } else {
                trailingUnderscorePos = position();
            }

            next();
        } while (digit(pos, digitRadix) >= 0 || is('_'));

        if (leadingUnderscorePos != NOT_FOUND) {
            lexError(leadingUnderscorePos, Errors.IllegalUnderscore);
        } else if (trailingUnderscorePos != NOT_FOUND) {
            lexError(trailingUnderscorePos, Errors.IllegalUnderscore);
        }
    }

    /**
     * Read the binary exponent ('p' or 'P', optional sign, decimal digits) and the
     * optional 'f' or 'd' suffix of a hexadecimal floating point number. A missing
     * exponent is reported as a malformed floating point literal.
     *
     * @param pos position of the first character in literal.
     */
    private void scanHexExponentAndSuffix(int pos) {
        if (acceptOneOfThenPut('p', 'P')) {
            skipIllegalUnderscores();
            acceptOneOfThenPut('+', '-');
            skipIllegalUnderscores();

            if (digit(pos, 10) >= 0) {
                scanDigits(pos, 10);
            } else {
                lexError(pos, Errors.MalformedFpLit);
            }
        } else {
            lexError(pos, Errors.MalformedFpLit);
        }

        if (acceptOneOfThenPut('f', 'F')) {
            tk = TokenKind.FLOATLITERAL;
            radix = 16;
        } else {
            acceptOneOfThenPut('d', 'D');
            tk = TokenKind.DOUBLELITERAL;
            radix = 16;
        }
    }

    /**
     * Read fractional part of floating point number, followed by an optional
     * decimal exponent ('e' or 'E', optional sign, digits).
     *
     * @param pos position of the first character in literal.
     */
    private void scanFraction(int pos) {
        skipIllegalUnderscores();

        if (digit(pos, 10) >= 0) {
            scanDigits(pos, 10);
        }

        // Remember the buffer length so a malformed exponent can be discarded.
        int index = sb.length();

        if (acceptOneOfThenPut('e', 'E')) {
            skipIllegalUnderscores();
            acceptOneOfThenPut('+', '-');
            skipIllegalUnderscores();

            if (digit(pos, 10) >= 0) {
                scanDigits(pos, 10);
                return;
            }

            lexError(pos, Errors.MalformedFpLit);
            sb.setLength(index);
        }
    }

    /**
     * Read fractional part and 'd' or 'f' suffix of floating point number.
     *
     * @param pos position of the first character in literal.
     */
    private void scanFractionAndSuffix(int pos) {
        radix = 10;
        scanFraction(pos);

        if (acceptOneOfThenPut('f', 'F')) {
            tk = TokenKind.FLOATLITERAL;
        } else {
            acceptOneOfThenPut('d', 'D');
            tk = TokenKind.DOUBLELITERAL;
        }
    }

    /**
     * Read fractional part and 'd' or 'f' suffix of hexadecimal floating point number.
     * The current character must be the '.'.
     *
     * @param pos       position of the first character in literal.
     * @param seendigit true if a digit was already seen before the '.'.
     */
    private void scanHexFractionAndSuffix(int pos, boolean seendigit) {
        radix = 16;
        assert is('.');
        putThenNext();
        skipIllegalUnderscores();

        if (digit(pos, 16) >= 0) {
            seendigit = true;
            scanDigits(pos, 16);
        }

        if (!seendigit) {
            lexError(pos, Errors.InvalidHexNumber);
        } else {
            scanHexExponentAndSuffix(pos);
        }
    }

    /**
     * Skip over underscores and report as an error if found.
     */
    private void skipIllegalUnderscores() {
        if (is('_')) {
            lexError(position(), Errors.IllegalUnderscore);
            skip('_');
        }
    }

    /**
     * Read a number. (Spec. 3.10)
     *
     * @param pos   position of the first character in literal.
     * @param radix the radix of the number; one of 2, 8, 10, 16.
     */
    private void scanNumber(int pos, int radix) {
        // for octal, allow base-10 digit in case it's a float literal
        this.radix = radix;
        int digitRadix = (radix == 8 ? 10 : radix);
        int firstDigit = digit(pos, Math.max(10, digitRadix));
        boolean seendigit = firstDigit >= 0;
        boolean seenValidDigit = firstDigit >= 0 && firstDigit < digitRadix;

        if (seendigit) {
            scanDigits(pos, digitRadix);
        }

        if (radix == 16 && is('.')) {
            scanHexFractionAndSuffix(pos, seendigit);
        } else if (seendigit && radix == 16 && isOneOf('p', 'P')) {
            scanHexExponentAndSuffix(pos);
        } else if (digitRadix == 10 && is('.')) {
            putThenNext();
            scanFractionAndSuffix(pos);
        } else if (digitRadix == 10 && isOneOf('e', 'E', 'f', 'F', 'd', 'D')) {
            scanFractionAndSuffix(pos);
        } else {
            if (!seenValidDigit) {
                switch (radix) {
                    case 2:
                        lexError(pos, Errors.InvalidBinaryNumber);
                        break;
                    case 16:
                        lexError(pos, Errors.InvalidHexNumber);
                        break;
                }
            }
            // If it is not a floating point literal,
            // the octal number should be rescanned correctly.
            if (radix == 8) {
                sb.setLength(0);
                reset(pos);
                scanDigits(pos, 8);
            }

            if (acceptOneOf('l', 'L')) {
                tk = TokenKind.LONGLITERAL;
            } else {
                tk = TokenKind.INTLITERAL;
            }
        }
    }

    /**
     * Determines if the sequence in the literal buffer is a token (keyword, operator.)
     * Sets {@code name} from the buffer and {@code tk} from the token factory lookup.
     *
     * @param identifier the identifier kind passed to the lookup.
     */
    private void checkIdent(Tokens.TokenKind identifier) {
        name = sb.toString();
        tk = tokens.lookupKind(name, identifier);
    }

    /**
     * Read an identifier. (Spec. 3.8)
     */
    private void scanIdent() {
        scanIdent(TokenKind.IDENTIFIER);
    }

    /**
     * Read an identifier. (Spec. 3.8)
     *
     * @param identifier the identifier kind handed to {@code checkIdent} once the
     *                   end of the identifier is reached.
     */
    private void scanIdent(Tokens.TokenKind identifier) {
        putThenNext();

        do {
            switch (get()) {
                case 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                     'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                     '$', '_',
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
                    // Ordinary identifier part: fall out of the switch and add it.
                    break;

                case '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005',
                     '\u0006', '\u0007', '\u0008', '\u000E', '\u000F', '\u0010',
                     '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016',
                     '\u0017', '\u0018', '\u0019', '\u001B', '\u007F':
                    // These control characters are skipped without being added.
                    next();
                    continue;

                case '\u001A': // EOI is also a legal identifier part
                    if (isAvailable()) {
                        next();
                        continue;
                    }

                    checkIdent(identifier);
                    return;

                default:
                    boolean isJavaIdentifierPart;

                    if (isASCII()) {
                        // all ASCII range chars already handled, above
                        isJavaIdentifierPart = false;
                    } else {
                        if (Character.isIdentifierIgnorable(get())) {
                            next();
                            continue;
                        }

                        isJavaIdentifierPart = isSurrogate()
                                ? Character.isJavaIdentifierPart(getCodepoint())
                                : Character.isJavaIdentifierPart(get());
                    }

                    if (!isJavaIdentifierPart) {
                        checkIdent(identifier);
                        return;
                    }
            }

            putThenNext();
        } while (true);
    }

    /**
     * Read token (main entrypoint.)
     * <p>
     * Resets the per-token state, then loops over whitespace, line terminators and
     * comments until a token (or an error) has been scanned. The kind of token object
     * returned is selected by the tag of the scanned token kind.
     *
     * @return the scanned token, carrying any comments collected before it.
     */
    public Tokens.Token readToken() {
        sb.setLength(0);
        name = null;
        radix = 0;
        hasEscapeSequences = false;

        int pos = 0;
        List<Tokens.Comment> comments = null;

        try {
            loop:
            while (true) {
                pos = position();

                switch (get()) {
                    case ' ', '\t', '\f': // (Spec 3.6)
                        skipWhitespace();
                        processWhiteSpace(pos, position());
                        break;

                    case '\n': // (Spec 3.4)
                        next();
                        processLineTerminator(pos, position());
                        break;

                    case '\r': // (Spec 3.4)
                        next();
                        accept('\n');
                        processLineTerminator(pos, position());
                        break;

                    case 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                         '$', '_': // (Spec. 3.8)
                        scanIdent();
                        break loop;

                    case '%':
                        scanIdent(TokenKind.VALUE_IDENTIFIER);
                        break loop;

                    case '0': // (Spec. 3.10)
                        next();

                        if (acceptOneOf('x', 'X')) {
                            skipIllegalUnderscores();
                            scanNumber(pos, 16);
                        } else if (acceptOneOf('b', 'B')) {
                            skipIllegalUnderscores();
                            scanNumber(pos, 2);
                        } else {
                            put('0');

                            if (is('_')) {
                                int underscorePos = position();
                                skip('_');

                                if (digit(pos, 10) < 0) {
                                    lexError(underscorePos, Errors.IllegalUnderscore);
                                }
                            }

                            scanNumber(pos, 8);
                        }
                        break loop;

                    case '1', '2', '3', '4', '5', '6', '7', '8', '9': // (Spec. 3.10)
                        scanNumber(pos, 10);
                        break loop;

                    case '.': // (Spec. 3.12)
                        next();
                        int dotPos = position();

                        if (accept('.')) {
                            lexError(dotPos, Errors.IllegalDot);
                        } else if (digit(pos, 10) >= 0) {
                            put('.');
                            scanFractionAndSuffix(pos); // (Spec. 3.10)
                        } else {
                            tk = TokenKind.DOT;
                        }
                        break loop;

                    case ',': // (Spec. 3.12)
                        next();
                        tk = TokenKind.COMMA;
                        break loop;

                    case '(': // (Spec. 3.12)
                        next();
                        tk = TokenKind.LPAREN;
                        break loop;

                    case ')': // (Spec. 3.12)
                        next();
                        tk = TokenKind.RPAREN;
                        break loop;

                    case '[': // (Spec. 3.12)
                        next();
                        tk = TokenKind.LBRACKET;
                        break loop;

                    case ']': // (Spec. 3.12)
                        next();
                        tk = TokenKind.RBRACKET;
                        break loop;

                    case '{': // (Spec. 3.12)
                        next();
                        tk = TokenKind.LBRACE;
                        break loop;

                    case '}': // (Spec. 3.12)
                        next();
                        tk = TokenKind.RBRACE;
                        break loop;

                    case '?':
                        next();
                        tk = TokenKind.QUES;
                        break loop;

                    case ';':
                        next();
                        tk = TokenKind.SEMI;
                        break loop;

                    case ':':
                        next();
                        if (accept(':')) {
                            tk = TokenKind.COLCOL;
                        } else {
                            tk = TokenKind.COLON;
                        }
                        break loop;

                    case '&':
                        next();
                        tk = TokenKind.AMP;
                        break loop;

                    case '@':
                        next();
                        tk = TokenKind.MONKEYS_AT;
                        break loop;

                    case '^':
                        next();
                        tk = TokenKind.CARET;
                        break loop;

                    case '=':
                        next();
                        tk = TokenKind.EQ;
                        break loop;

                    case '<':
                        next();
                        tk = TokenKind.LT;
                        break loop;

                    case '>':
                        next();
                        tk = TokenKind.GT;
                        break loop;

                    case '#':
                        next();
                        tk = TokenKind.HASH;
                        break loop;

                    case '+':
                        next();
                        tk = TokenKind.PLUS;
                        break loop;

                    case '-':
                        next();
                        if (accept('>')) {
                            tk = TokenKind.ARROW;
                        } else {
                            tk = TokenKind.SUB;
                        }
                        break loop;

                    case '/':
                        next();

                        if (accept('/')) { // (Spec. 3.7)
                            skipToEOLN();

                            if (isAvailable()) {
                                comments = appendComment(comments, processComment(pos, position(), Tokens.Comment.CommentStyle.LINE));
                            }
                            // Comment consumed; keep looking for a token.
                            break;
                        } else if (accept('*')) { // (Spec. 3.7)
                            // Advance until positioned at the '/' of a closing "*/"
                            // or until input is exhausted.
                            while (isAvailable()) {
                                if (accept('*')) {
                                    if (is('/')) {
                                        break;
                                    }
                                } else {
                                    next();
                                }
                            }

                            if (accept('/')) {
                                comments = appendComment(comments, processComment(pos, position(), Tokens.Comment.CommentStyle.BLOCK));

                                // Comment consumed; keep looking for a token.
                                break;
                            } else {
                                lexError(pos, Errors.UnclosedComment);

                                break loop;
                            }
                        } else {
                            // A '/' that does not start a comment is reported
                            // with the unclosed-comment error key.
                            lexError(pos, Errors.UnclosedComment);
                        }
                        break loop;

                    case '\'': // (Spec. 3.10)
                        next();

                        if (accept('\'')) {
                            lexError(pos, Errors.EmptyCharLit);
                        } else {
                            if (isEOLN()) {
                                lexError(pos, Errors.IllegalLineEndInCharLit);
                            }

                            scanLitChar(pos);

                            if (accept('\'')) {
                                tk = TokenKind.CHARLITERAL;
                            } else {
                                lexError(pos, Errors.UnclosedCharLit);
                            }
                        }
                        break loop;

                    case '\"': // (Spec. 3.10)
                        scanString(pos);
                        break loop;

                    default:
                        boolean isJavaIdentifierStart;

                        if (isASCII()) {
                            // all ASCII range chars already handled, above
                            isJavaIdentifierStart = false;
                        } else {
                            isJavaIdentifierStart = isSurrogate()
                                    ? Character.isJavaIdentifierStart(getCodepoint())
                                    : Character.isJavaIdentifierStart(get());
                        }

                        if (isJavaIdentifierStart) {
                            scanIdent();
                        } else if (digit(pos, 10) >= 0) {
                            scanNumber(pos, 10);
                        } else if (is((char) EOI) || !isAvailable()) {
                            tk = TokenKind.EOF;
                            pos = position();
                        } else {
                            // Build a printable rendering of the offending character
                            // for the error message.
                            String arg;

                            if (isSurrogate()) {
                                int codePoint = getCodepoint();
                                char hi = Character.highSurrogate(codePoint);
                                char lo = Character.lowSurrogate(codePoint);
                                arg = String.format("\\u%04x\\u%04x", (int) hi, (int) lo);
                            } else {
                                char ch = get();
                                arg = (32 < ch && ch < 127) ? String.format("%s", ch) :
                                        String.format("\\u%04x", (int) ch);
                            }

                            lexError(pos, Errors.IllegalChar(arg));
                            next();
                        }
                        break loop;
                }
            }

            int endPos = position();

            if (tk.tag == Tokens.Token.Tag.DEFAULT) {
                return new Tokens.Token(tk, pos, endPos, comments);
            } else if (tk.tag == Tokens.Token.Tag.NAMED) {
                return new Tokens.NamedToken(tk, pos, endPos, name, comments);
            } else {
                // Get characters from string buffer.
                String string = sb.toString();

                // Translate escape sequences if present.
                if (hasEscapeSequences) {
                    try {
                        string = string.translateEscapes();
                    } catch (IllegalArgumentException ignored) {
                        // Malformed escape sequence: error already reported,
                        // just use untranslated string.
                    }
                }

                if (tk.tag == Tokens.Token.Tag.STRING) {
                    // Build string token.
                    return new Tokens.StringToken(tk, pos, endPos, string, comments);
                } else {
                    // Build numeric token.
                    return new Tokens.NumericToken(tk, pos, endPos, string, radix, comments);
                }
            }
        } finally {
            int endPos = position();

            if (scannerDebug) {
                System.out.println("nextToken(" + pos
                        + "," + endPos + ")=|" +
                        new String(getRawCharacters(pos, endPos))
                        + "| " + tk.name());
            }
        }
    }

    /**
     * Adds a comment to the list of comments preceding the current token. The comment
     * is inserted at the front of the list, so the most recently scanned comment
     * comes first.
     *
     * @param comments existing list of comments, or null if there is none yet.
     * @param comment  comment to add.
     * @return the list (created if {@code comments} was null) with the comment
     * prepended.
     */
    List<Tokens.Comment> appendComment(List<Tokens.Comment> comments, Tokens.Comment comment) {
        if (comments == null) {
            comments = new ArrayList<>();
        }
        // prepend
        comments.add(0, comment);
        return comments;
    }

    /**
     * Return the position where a lexical error occurred.
     *
     * @return position in the input buffer of where the error occurred.
     */
    public int errPos() {
        return errPos;
    }

    /**
     * Set the position where a lexical error occurred.
     *
     * @param pos position in the input buffer of where the error occurred.
     */
    public void errPos(int pos) {
        errPos = pos;
    }

    /**
     * Called when a complete comment has been scanned. pos and endPos
     * will mark the comment boundary.
     *
     * @param pos    position of the opening / in the input buffer.
     * @param endPos position + 1 of the closing / in the input buffer.
     * @param style  style of comment.
     * @return the constructed BasicComment.
     */
    Tokens.Comment processComment(int pos, int endPos, Tokens.Comment.CommentStyle style) {
        if (scannerDebug) {
            System.out.println("processComment(" + pos
                    + "," + endPos + "," + style + ")=|"
                    + new String(getRawCharacters(pos, endPos))
                    + "|");
        }

        char[] buf = getRawCharacters(pos, endPos);
        return new BasicComment(style, new String(buf));
    }

    /**
     * Called when a complete whitespace run has been scanned. pos and endPos
     * will mark the whitespace boundary.
     * <p>
     * (Spec 3.6)
     *
     * @param pos    position in input buffer of first whitespace character.
     * @param endPos position + 1 in input buffer of last whitespace character.
     */
    void processWhiteSpace(int pos, int endPos) {
        if (scannerDebug) {
            System.out.println("processWhitespace(" + pos
                    + "," + endPos + ")=|" +
                    new String(getRawCharacters(pos, endPos))
                    + "|");
        }
    }

    /**
     * Called when a line terminator has been processed.
     *
     * @param pos    position in input buffer of first character in sequence.
     * @param endPos position + 1 in input buffer of last character in sequence.
     */
    void processLineTerminator(int pos, int endPos) {
        if (scannerDebug) {
            System.out.println("processTerminator(" + pos
                    + "," + endPos + ")=|" +
                    new String(getRawCharacters(pos, endPos))
                    + "|");
        }
    }

    /**
     * Build a map for translating between line numbers and positions in the input.
     *
     * @return a LineMap
     */
    public Position.LineMap getLineMap() {
        return Position.makeLineMap(getRawCharacters(), length(), false);
    }

    /**
     * Comment implementation created by {@code processComment}; it holds the comment
     * style and the raw characters of the comment.
     *
     * @param style Style of comment
     *              LINE starting with //
     *              BLOCK starting with /*
     * @param text  the raw characters of the comment taken from the input buffer.
     */
    record BasicComment(Tokens.Comment.CommentStyle style, String text) implements Tokens.Comment {
    }
}