/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.incubator.code.parser.impl;

import java.util.Arrays;

/**
 * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
 * one by one as contained in the input stream, handling unicode escape sequences accordingly.
 *
 * <p><b>This is NOT part of any supported API.
 * If you write code that depends on this, you do so at your own risk.
 * This code and its internal interfaces are subject to change or
 * deletion without notice.</b></p>
 */
sealed class UnicodeReader permits JavaBasedTokenizer {
    /**
     * End of input character. Used as a sentinel to denote the
     * character one beyond the last defined character in a
     * source file.
     */
    static final byte EOI = 0x1A;

    /**
     * Buffer containing characters from source file. May contain extraneous characters
     * beyond this.length.
     */
    private final char[] buffer;

    /**
     * Length of meaningful content in buffer.
     */
    private final int length;

    /**
     * Character buffer index of character currently being observed.
     */
    private int position;

    /**
     * Number of characters combined to provide character currently being observed. Typically
     * one, but may be more when combinations of surrogate pairs and unicode escape sequences
     * are read.
     */
    private int width;

    /**
     * Character currently being observed. When a surrogate pair has been read, this holds the
     * last code unit fetched (the low member of the pair); the combined value is available
     * in {@code codepoint}.
     */
    private char character;

    /**
     * Codepoint of character currently being observed. Typically equivalent to the character
     * but will have a value greater than 0xFFFF when a surrogate pair was read.
     */
    private int codepoint;

    /**
     * true if the last character was a backslash. This is used to handle the special case
     * when a backslash precedes a unicode escape. In that case, the second backslash
     * is treated as a backslash and not part of a unicode escape.
     */
    private boolean wasBackslash;

    /**
     * true if the last character was derived from a unicode escape sequence.
     */
    private boolean wasUnicodeEscape;

    /**
     * Log for error reporting.
     */
    private final Log log;

    /**
     * Constructor. Positions the reader on the first codepoint of the input.
     *
     * @param sf     scan factory; supplies the log used for error reporting.
     * @param array  array containing contents of source.
     * @param length length of meaningful content in buffer.
     */
    UnicodeReader(Scanner.Factory sf, char[] array, int length) {
        this.buffer = array;
        this.length = length;
        this.position = 0;
        this.width = 0;
        this.character = '\0';
        this.codepoint = 0;
        this.wasBackslash = false;
        this.wasUnicodeEscape = false;
        this.log = sf.log;

        // Prime the reader so get()/getCodepoint() are valid immediately.
        nextCodePoint();
    }

    /**
     * Returns the length of the buffer. This is length of meaningful content in buffer and
     * not the length of the buffer array.
     *
     * @return length of the buffer.
     */
    protected int length() {
        return length;
    }

    /**
     * Return true if current position is within the meaningful part of the buffer.
     *
     * @return true if current position is within the meaningful part of the buffer.
     */
    protected boolean isAvailable() {
        return position < length;
    }

    /**
     * Fetches the next 16-bit character from the buffer and places it in this.character.
     * At or past the end of meaningful content the character becomes {@link #EOI} and the
     * width is left unchanged.
     */
    private void nextCodeUnit() {
        // Index of next character in buffer.
        int index = position + width;

        // If past end of buffer.
        if (length <= index) {
            // End of file is marked with EOI.
            character = EOI;
        } else {
            // Next character in buffer.
            character = buffer[index];
            // Increment length of codepoint.
            width++;
        }
    }

    /**
     * Fetches the next 16-bit character from the buffer. If a unicode escape
     * is detected then converts the unicode escape to a character. Broken unicode
     * escapes are skipped (after {@link #unicodeEscape()} has reported them).
     */
    private void nextUnicodeInputCharacter() {
        // Position to next codepoint.
        position += width;
        // Codepoint has no characters yet.
        width = 0;

        // Fetch next character.
        nextCodeUnit();

        // A backslash directly after a plain (non-escape) backslash cannot start an escape.
        if (character == '\\' && (!wasBackslash || wasUnicodeEscape)) {
            // Is a backslash and may be a unicode escape.
            switch (unicodeEscape()) {
                case BACKSLASH -> {
                    wasUnicodeEscape = false;
                    wasBackslash = !wasBackslash;
                }
                case VALID_ESCAPE -> {
                    wasUnicodeEscape = true;
                    wasBackslash = character == '\\' && !wasBackslash;
                }
                case BROKEN_ESCAPE -> nextUnicodeInputCharacter(); //skip broken unicode escapes
            }
        } else {
            wasBackslash = false;
            wasUnicodeEscape = false;
        }

        // Codepoint and character match if not surrogate.
        codepoint = (int) character;
    }

    /**
     * Fetches the next code point from the buffer. If a unicode escape is recognized
     * then converts unicode escape to a character. If two characters are a surrogate pair
     * then converts to a codepoint.
     */
    private void nextCodePoint() {
        // Next unicode character.
        nextUnicodeInputCharacter();

        // Return early if ASCII or not a surrogate pair.
        if (isASCII() || !Character.isHighSurrogate(character)) {
            return;
        }

        // Capture high surrogate, position and escape-tracking state, so that a failed
        // lookahead can be undone completely.
        char hi = character;
        int savePosition = position;
        int saveWidth = width;
        boolean saveWasBackslash = wasBackslash;
        boolean saveWasUnicodeEscape = wasUnicodeEscape;

        // Get potential low surrogate.
        nextUnicodeInputCharacter();
        char lo = character;

        if (Character.isLowSurrogate(lo)) {
            // Start codepoint at start of high surrogate.
            position = savePosition;
            width += saveWidth;
            // Compute codepoint.
            codepoint = Character.toCodePoint(hi, lo);
        } else {
            // Restore to treat high surrogate as just a character.
            position = savePosition;
            width = saveWidth;
            character = hi;
            codepoint = (int) hi;
            // The peeked character will be read again by the next advance; the flags must
            // describe the high surrogate (the last character actually consumed), otherwise
            // a following "\\uXXXX" would be misread as backslash + unicode escape.
            wasBackslash = saveWasBackslash;
            wasUnicodeEscape = saveWasUnicodeEscape;
            // Could potentially report an error here (old code did not.)
        }
    }

    /**
     * Attempts to convert a unicode escape into a character. Called when the current
     * character is a backslash that may begin an escape.
     *
     * <p>On {@code BACKSLASH} the current character is a backslash of width one. On
     * {@code VALID_ESCAPE} the current character is the decoded value and the width covers
     * the whole escape. On {@code BROKEN_ESCAPE} an error has been logged and the width
     * covers the characters examined before the invalid digit.
     *
     * @return {@code BACKSLASH} if the backslash is not followed by {@code u},
     *         {@code VALID_ESCAPE} if one or more {@code u} and four hex digits follow,
     *         otherwise {@code BROKEN_ESCAPE}.
     */
    private UnicodeEscapeResult unicodeEscape() {
        // Start of unicode escape (past backslash.)
        int start = position + width;

        // Default to backslash result, unless proven otherwise.
        character = '\\';
        width = 1;

        // Skip multiple 'u'.
        int index;
        for (index = start; index < length; index++) {
            if (buffer[index] != 'u') {
                break;
            }
        }

        // Needs to have been at least one u.
        if (index == start) {
            return UnicodeEscapeResult.BACKSLASH;
        }

        int code = 0;

        for (int i = 0; i < 4; i++) {
            // Translate and merge digit (-1 when not a hex digit or past end of input).
            int digit = index < length ? Character.digit(buffer[index], 16) : -1;
            code = code << 4 | digit;

            // If invalid digit (merging -1 makes the accumulated value negative).
            if (code < 0) {
                break;
            }

            // On to next character.
            index++;
        }

        // Skip digits even if error.
        width = index - position;

        // If all digits are good.
        if (code >= 0) {
            character = (char) code;
            return UnicodeEscapeResult.VALID_ESCAPE;
        } else {
            log.error(index, Errors.IllegalUnicodeEsc);
            return UnicodeEscapeResult.BROKEN_ESCAPE;
        }
    }

    /**
     * Outcome of {@link #unicodeEscape()}.
     */
    private enum UnicodeEscapeResult {
        BACKSLASH,
        VALID_ESCAPE,
        BROKEN_ESCAPE
    }

    /**
     * Return the current position in the character buffer.
     *
     * @return current position in the character buffer.
     */
    protected int position() {
        return position;
    }

    /**
     * Reset the reader to the specified position. The backslash and unicode escape
     * tracking flags are cleared before the codepoint at that position is read.
     * Warning: Do not use when previous character was an ASCII or unicode backslash.
     *
     * @param pos buffer index at which reading resumes.
     */
    protected void reset(int pos) {
        position = pos;
        width = 0;
        wasBackslash = false;
        wasUnicodeEscape = false;
        nextCodePoint();
    }

    /**
     * Return the current character at the current position.
     *
     * @return current character at the current position.
     */
    protected char get() {
        return character;
    }

    /**
     * Return the current codepoint at the current position.
     *
     * @return current codepoint at the current position.
     */
    protected int getCodepoint() {
        return codepoint;
    }

    /**
     * Returns true if the current codepoint lies beyond 0xFFFF, i.e. it was composed
     * from a surrogate pair.
     *
     * @return true if the current codepoint is greater than 0xFFFF.
     */
    protected boolean isSurrogate() {
        return 0xFFFF < codepoint;
    }

    /**
     * Returns true if the current character is ASCII.
     *
     * @return true if the current character is ASCII.
     */
    protected boolean isASCII() {
        return character <= 0x7F;
    }

    /**
     * Advances the current character to the next character.
     *
     * @return next character.
     */
    protected char next() {
        nextCodePoint();

        return character;
    }

    /**
     * Compare character. Returns true if a match.
     *
     * @param ch character to match.
     * @return true if a match.
     */
    protected boolean is(char ch) {
        return character == ch;
    }

    /**
     * Match one of the arguments. Returns true if a match.
     *
     * @param ch1 first character to match.
     * @param ch2 second character to match.
     * @return true if the current character equals either argument.
     */
    protected boolean isOneOf(char ch1, char ch2) {
        return is(ch1) || is(ch2);
    }

    /**
     * Match one of the arguments. Returns true if a match.
     *
     * @param ch1 first character to match.
     * @param ch2 second character to match.
     * @param ch3 third character to match.
     * @return true if the current character equals any argument.
     */
    protected boolean isOneOf(char ch1, char ch2, char ch3) {
        return is(ch1) || is(ch2) || is(ch3);
    }

    /**
     * Match one of the arguments. Returns true if a match.
     *
     * @return true if the current character equals any of the six arguments.
     */
    protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
        return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
    }

    /**
     * Tests to see if current character is in the range of lo to hi characters (inclusive).
     *
     * @param lo lowest character in range.
     * @param hi highest character in range.
     * @return true if the current character is in range.
     */
    protected boolean inRange(char lo, char hi) {
        return lo <= character && character <= hi;
    }

    /**
     * Compare character and advance if a match. Returns true if a match.
     *
     * @param ch character to match.
     * @return true if a match.
     */
    protected boolean accept(char ch) {
        if (is(ch)) {
            next();

            return true;
        }

        return false;
    }

    /**
     * Match one of the arguments and advance if a match. Returns true if a match.
     *
     * @param ch1 first character to match.
     * @param ch2 second character to match.
     * @return true if a match.
     */
    protected boolean acceptOneOf(char ch1, char ch2) {
        if (isOneOf(ch1, ch2)) {
            next();

            return true;
        }

        return false;
    }

    /**
     * Match one of the arguments and advance if a match. Returns true if a match.
     *
     * @param ch1 first character to match.
     * @param ch2 second character to match.
     * @param ch3 third character to match.
     * @return true if a match.
     */
    protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
        if (isOneOf(ch1, ch2, ch3)) {
            next();

            return true;
        }

        return false;
    }

    /**
     * Skip over all occurrences of character.
     *
     * @param ch character to accept.
     */
    protected void skip(char ch) {
        while (accept(ch)) {
            // next
        }
    }

    /**
     * Skip over ASCII white space characters (space, tab and form feed).
     */
    protected void skipWhitespace() {
        while (acceptOneOf(' ', '\t', '\f')) {
            // next
        }
    }

    /**
     * Skip to end of line. Stops on a carriage return or line feed (without consuming it)
     * or when the meaningful content is exhausted.
     */
    protected void skipToEOLN() {
        while (isAvailable()) {
            if (isOneOf('\r', '\n')) {
                break;
            }

            next();
        }
    }

    /**
     * Compare string and advance if a match. Returns true if a match.
     * Warning: Do not use when previous character was a backslash
     * (confuses state of wasBackslash.)
     *
     * @param string string to match character for character.
     * @return true if a match; false (with the reader left where it started) otherwise.
     */
    protected boolean accept(String string) {
        // Quick test.
        if (string.length() == 0 || !is(string.charAt(0))) {
            return false;
        }

        // Be prepared to retreat if not a match.
        int savedPosition = position;

        nextCodePoint();

        // Check each character.
        for (int i = 1; i < string.length(); i++) {
            if (!is(string.charAt(i))) {
                // Restart if not a match.
                reset(savedPosition);

                return false;
            }

            nextCodePoint();
        }

        return true;
    }

    /**
     * Convert the current character from its base (8, 10, or 16) to its value. Does not
     * advance character. A non-ASCII digit is reported as an error at the current
     * {@link #position()} and the current character is replaced by the equivalent
     * ASCII digit.
     *
     * @param pos        starting position (not consulted by this implementation).
     * @param digitRadix base of number being converted.
     * @return value of digit, or -1 if the current character is not a digit in that base.
     */
    protected int digit(int pos, int digitRadix) {
        int result;

        // Just an ASCII digit.
        if (inRange('0', '9')) {
            // Fast common case.
            result = character - '0';

            return result < digitRadix ? result : -1;
        }

        // Handle other digits.
        result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
                                 Character.digit(character, digitRadix);

        if (result >= 0 && !isASCII()) {
            log.error(position(), Errors.IllegalNonasciiDigit);
            character = "0123456789abcdef".charAt(result);
        }

        return result;
    }

    /**
     * Returns the input buffer. Unicode escape sequences are not translated.
     * When the backing array is exactly as long as the meaningful content the backing
     * array itself is returned; otherwise a copy truncated to the meaningful length.
     *
     * @return the input buffer.
     */
    public char[] getRawCharacters() {
        return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
    }

    /**
     * Returns a copy of a character array subset of the input buffer.
     * The returned array begins at the {@code beginIndex} and
     * extends to the character at index {@code endIndex - 1}.
     * Thus the length of the substring is {@code endIndex-beginIndex}.
     * This behavior is like
     * {@code String.substring(beginIndex, endIndex)}.
     * Unicode escape sequences are not translated.
     *
     * @param beginIndex the beginning index, inclusive.
     * @param endIndex the ending index, exclusive.
     * @return a new array holding the requested range of the backing array.
     * @throws ArrayIndexOutOfBoundsException if {@code beginIndex} is negative or greater
     *         than the length of the backing array
     * @throws IllegalArgumentException if {@code beginIndex > endIndex}
     */
    public char[] getRawCharacters(int beginIndex, int endIndex) {
        return Arrays.copyOfRange(buffer, beginIndex, endIndex);
    }
}