1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package jdk.incubator.code.extern.impl;
27
28 import java.util.Arrays;
29
30 /**
31 * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
32 * one by one as contained in the input stream, handling unicode escape sequences accordingly.
33 *
34 * <p><b>This is NOT part of any supported API.
35 * If you write code that depends on this, you do so at your own risk.
36 * This code and its internal interfaces are subject to change or
37 * deletion without notice.</b></p>
38 */
39 sealed class UnicodeReader permits JavaBasedTokenizer {
40 /**
41 * End of input character. Used as a sentinel to denote the
42 * character one beyond the last defined character in a
43 * source file.
44 */
45 static final byte EOI = 0x1A;
46
47 /**
48 * Buffer containing characters from source file. May contain extraneous characters
49 * beyond this.length.
50 */
51 private final char[] buffer;
52
53 /**
54 * Length of meaningful content in buffer.
55 */
56 private final int length;
57
58 /**
59 * Character buffer index of character currently being observed.
60 */
61 private int position;
62
63 /**
64 * Number of characters combined to provide character currently being observed. Typically
65 * one, but may be more when combinations of surrogate pairs and unicode escape sequences
66 * are read.
67 */
68 private int width;
69
70 /**
71 * Character currently being observed. If a surrogate pair is read then will be the high
72 * member of the pair.
73 */
74 private char character;
75
76 /**
77 * Codepoint of character currently being observed. Typically equivalent to the character
78 * but will have a value greater that 0xFFFF when a surrogate pair.
79 */
80 private int codepoint;
81
82 /**
83 * true if the last character was a backslash. This is used to handle the special case
84 * when a backslash precedes an unicode escape. In that case, the second backslash
85 * is treated as a backslash and not part of an unicode escape.
86 */
87 private boolean wasBackslash;
88
89 /**
90 * true if the last character was derived from an unicode escape sequence.
91 */
92 private boolean wasUnicodeEscape;
93
94 /**
95 * Log for error reporting.
96 */
97 private final Log log;
98
99 /**
100 * Constructor.
101 *
102 * @param sf scan factory.
103 * @param array array containing contents of source.
104 * @param length length of meaningful content in buffer.
105 */
106 UnicodeReader(Scanner.Factory sf, char[] array, int length) {
107 this.buffer = array;
108 this.length = length;
109 this.position = 0;
110 this.width = 0;
111 this.character = '\0';
112 this.codepoint = 0;
113 this.wasBackslash = false;
114 this.wasUnicodeEscape = false;
115 this.log = sf.log;
116
117 nextCodePoint();
118 }
119
120 /**
121 * Returns the length of the buffer. This is length of meaningful content in buffer and
122 * not the length of the buffer array.
123 *
124 * @return length of the buffer.
125 */
126 protected int length() {
127 return length;
128 }
129
130 /**
131 * Return true if current position is within the meaningful part of the buffer.
132 *
133 * @return true if current position is within the meaningful part of the buffer.
134 */
135 protected boolean isAvailable() {
136 return position < length;
137 }
138
139 /**
140 * Fetches the next 16-bit character from the buffer and places it in this.character.
141 */
142 private void nextCodeUnit() {
143 // Index of next character in buffer.
144 int index = position + width;
145
146 // If past end of buffer.
147 if (length <= index) {
148 // End of file is marked with EOI.
149 character = EOI;
150 } else {
151 // Next character in buffer.
152 character = buffer[index];
153 // Increment length of codepoint.
154 width++;
155 }
156 }
157
158 /**
159 * Fetches the next 16-bit character from the buffer. If an unicode escape
160 * is detected then converts the unicode escape to a character.
161 */
162 private void nextUnicodeInputCharacter() {
163 // Position to next codepoint.
164 position += width;
165 // Codepoint has no characters yet.
166 width = 0;
167
168 // Fetch next character.
169 nextCodeUnit();
170
171 if (character == '\\' && (!wasBackslash || wasUnicodeEscape)) {
172 // Is a backslash and may be an unicode escape.
173 switch (unicodeEscape()) {
174 case BACKSLASH -> {
175 wasUnicodeEscape = false;
176 wasBackslash = !wasBackslash;
177 }
178 case VALID_ESCAPE -> {
179 wasUnicodeEscape = true;
180 wasBackslash = character == '\\' && !wasBackslash;
181 }
182 case BROKEN_ESCAPE -> nextUnicodeInputCharacter(); //skip broken unicode escapes
183 }
184 } else {
185 wasBackslash = false;
186 wasUnicodeEscape = false;
187 }
188
189 // Codepoint and character match if not surrogate.
190 codepoint = (int) character;
191 }
192
193 /**
194 * Fetches the nextcode point from the buffer. If an unicode escape is recognized
195 * then converts unicode escape to a character. If two characters are a surrogate pair
196 * then converts to a codepoint.
197 */
198 private void nextCodePoint() {
199 // Next unicode character.
200 nextUnicodeInputCharacter();
201
202 // Return early if ASCII or not a surrogate pair.
203 if (isASCII() || !Character.isHighSurrogate(character)) {
204 return;
205 }
206
207 // Capture high surrogate and position.
208 char hi = character;
209 int savePosition = position;
210 int saveWidth = width;
211
212 // Get potential low surrogate.
213 nextUnicodeInputCharacter();
214 char lo = character;
215
216 if (Character.isLowSurrogate(lo)) {
217 // Start codepoint at start of high surrogate.
218 position = savePosition;
219 width += saveWidth;
220 // Compute codepoint.
221 codepoint = Character.toCodePoint(hi, lo);
222 } else {
223 // Restore to treat high surrogate as just a character.
224 position = savePosition;
225 width = saveWidth;
226 character = hi;
227 codepoint = (int) hi;
228 // Could potential report an error here (old code did not.)
229 }
230 }
231
232 /**
233 * Converts an unicode escape into a character.
234 *
235 * @return true if was an unicode escape.
236 */
237 private UnicodeEscapeResult unicodeEscape() {
238 // Start of unicode escape (past backslash.)
239 int start = position + width;
240
241 // Default to backslash result, unless proven otherwise.
242 character = '\\';
243 width = 1;
244
245 // Skip multiple 'u'.
246 int index;
247 for (index = start; index < length; index++) {
248 if (buffer[index] != 'u') {
249 break;
250 }
251 }
252
253 // Needs to have been at least one u.
254 if (index == start) {
255 return UnicodeEscapeResult.BACKSLASH;
256 }
257
258 int code = 0;
259
260 for (int i = 0; i < 4; i++) {
261 // Translate and merge digit.
262 int digit = index < length ? Character.digit(buffer[index], 16) : -1;
263 code = code << 4 | digit;
264
265 // If invalid digit.
266 if (code < 0) {
267 break;
268 }
269
270 // On to next character.
271 index++;
272 }
273
274 // Skip digits even if error.
275 width = index - position;
276
277 // If all digits are good.
278 if (code >= 0) {
279 character = (char) code;
280 return UnicodeEscapeResult.VALID_ESCAPE;
281 } else {
282 log.error(index, Errors.IllegalUnicodeEsc);
283 return UnicodeEscapeResult.BROKEN_ESCAPE;
284 }
285 }
286
287 private enum UnicodeEscapeResult {
288 BACKSLASH,
289 VALID_ESCAPE,
290 BROKEN_ESCAPE
291 }
292
293 /**
294 * Return the current position in the character buffer.
295 *
296 * @return current position in the character buffer.
297 */
298 protected int position() {
299 return position;
300 }
301
302
303 /**
304 * Reset the reader to the specified position.
305 * Warning: Do not use when previous character was an ASCII or unicode backslash.
306 *
307 * @param pos
308 */
309 protected void reset(int pos) {
310 position = pos;
311 width = 0;
312 wasBackslash = false;
313 wasUnicodeEscape = false;
314 nextCodePoint();
315 }
316
317 /**
318 * Return the current character in at the current position.
319 *
320 * @return current character in at the current position.
321 */
322 protected char get() {
323 return character;
324 }
325
326 /**
327 * Return the current codepoint in at the current position.
328 *
329 * @return current codepoint in at the current position.
330 */
331 protected int getCodepoint() {
332 return codepoint;
333 }
334
335 /**
336 * Returns true if the current codepoint is a surrogate.
337 *
338 * @return true if the current codepoint is a surrogate.
339 */
340 protected boolean isSurrogate() {
341 return 0xFFFF < codepoint;
342 }
343
344 /**
345 * Returns true if the current character is ASCII.
346 *
347 * @return true if the current character is ASCII.
348 */
349 protected boolean isASCII() {
350 return character <= 0x7F;
351 }
352
353 /**
354 * Advances the current character to the next character.
355 *
356 * @return next character.
357 */
358 protected char next() {
359 nextCodePoint();
360
361 return character;
362 }
363
364 /**
365 * Compare character. Returns true if a match.
366 *
367 * @param ch character to match.
368 * @return true if a match.
369 */
370 protected boolean is(char ch) {
371 return character == ch;
372 }
373
374 /**
375 * Match one of the arguments. Returns true if a match.
376 */
377 protected boolean isOneOf(char ch1, char ch2) {
378 return is(ch1) || is(ch2);
379 }
380
381 protected boolean isOneOf(char ch1, char ch2, char ch3) {
382 return is(ch1) || is(ch2) || is(ch3);
383 }
384
385 protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
386 return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
387 }
388
389 /**
390 * Tests to see if current character is in the range of lo to hi characters (inclusive).
391 *
392 * @param lo lowest character in range.
393 * @param hi highest character in range.
394 * @return true if the current character is in range.
395 */
396 protected boolean inRange(char lo, char hi) {
397 return lo <= character && character <= hi;
398 }
399
400 /**
401 * Compare character and advance if a match. Returns true if a match.
402 *
403 * @param ch character to match.
404 * @return true if a match.
405 */
406 protected boolean accept(char ch) {
407 if (is(ch)) {
408 next();
409
410 return true;
411 }
412
413 return false;
414 }
415
416 /**
417 * Match one of the arguments and advance if a match. Returns true if a match.
418 */
419 protected boolean acceptOneOf(char ch1, char ch2) {
420 if (isOneOf(ch1, ch2)) {
421 next();
422
423 return true;
424 }
425
426 return false;
427 }
428
429 protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
430 if (isOneOf(ch1, ch2, ch3)) {
431 next();
432
433 return true;
434 }
435
436 return false;
437 }
438
439 /**
440 * Skip over all occurrences of character.
441 *
442 * @param ch character to accept.
443 */
444 protected void skip(char ch) {
445 while (accept(ch)) {
446 // next
447 }
448 }
449
450 /**
451 * Skip over ASCII white space characters.
452 */
453 protected void skipWhitespace() {
454 while (acceptOneOf(' ', '\t', '\f')) {
455 // next
456 }
457 }
458
459 /**
460 * Skip to end of line.
461 */
462 protected void skipToEOLN() {
463 while (isAvailable()) {
464 if (isOneOf('\r', '\n')) {
465 break;
466 }
467
468 next();
469 }
470
471 }
472
473 /**
474 * Compare string and advance if a match. Returns true if a match.
475 * Warning: Do not use when previous character was a backslash
476 * (confuses state of wasBackslash.)
477 *
478 * @param string string to match character for character.
479 * @return true if a match.
480 */
481 protected boolean accept(String string) {
482 // Quick test.
483 if (string.length() == 0 || !is(string.charAt(0))) {
484 return false;
485 }
486
487 // Be prepared to retreat if not a match.
488 int savedPosition = position;
489
490 nextCodePoint();
491
492 // Check each character.
493 for (int i = 1; i < string.length(); i++) {
494 if (!is(string.charAt(i))) {
495 // Restart if not a match.
496 reset(savedPosition);
497
498 return false;
499 }
500
501 nextCodePoint();
502 }
503
504 return true;
505 }
506
507 /**
508 * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
509 * advance character.
510 *
511 * @param pos starting position.
512 * @param digitRadix base of number being converted.
513 * @return value of digit.
514 */
515 protected int digit(int pos, int digitRadix) {
516 int result;
517
518 // Just an ASCII digit.
519 if (inRange('0', '9')) {
520 // Fast common case.
521 result = character - '0';
522
523 return result < digitRadix ? result : -1;
524 }
525
526 // Handle other digits.
527 result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
528 Character.digit(character, digitRadix);
529
530 if (result >= 0 && !isASCII()) {
531 log.error(position(), Errors.IllegalNonasciiDigit);
532 character = "0123456789abcdef".charAt(result);
533 }
534
535 return result;
536 }
537
538 /**
539 * Returns the input buffer. Unicode escape sequences are not translated.
540 *
541 * @return the input buffer.
542 */
543 public char[] getRawCharacters() {
544 return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
545 }
546
547 /**
548 * Returns a copy of a character array subset of the input buffer.
549 * The returned array begins at the {@code beginIndex} and
550 * extends to the character at index {@code endIndex - 1}.
551 * Thus the length of the substring is {@code endIndex-beginIndex}.
552 * This behavior is like
553 * {@code String.substring(beginIndex, endIndex)}.
554 * Unicode escape sequences are not translated.
555 *
556 * @param beginIndex the beginning index, inclusive.
557 * @param endIndex the ending index, exclusive.
558 * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
559 * array bounds
560 */
561 public char[] getRawCharacters(int beginIndex, int endIndex) {
562 return Arrays.copyOfRange(buffer, beginIndex, endIndex);
563 }
564 }