New src/jdk.incubator.code/share/classes/jdk/incubator/code/extern/impl/UnicodeReader.java

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 package jdk.incubator.code.extern.impl;
 27 
 28 import java.util.Arrays;
 29 
 30 /**
 31  * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
 32  * one by one as contained in the input stream, handling unicode escape sequences accordingly.
 33  *
 34  * <p><b>This is NOT part of any supported API.
 35  * If you write code that depends on this, you do so at your own risk.
 36  * This code and its internal interfaces are subject to change or
 37  * deletion without notice.</b></p>
 38  */
 39 sealed class UnicodeReader permits JavaBasedTokenizer {
 40     /**
 41      * End of input character.  Used as a sentinel to denote the
 42      * character one beyond the last defined character in a
 43      * source file.
 44      */
 45     static final byte EOI = 0x1A;
 46 
 47     /**
 48      * Buffer containing characters from source file. May contain extraneous characters
 49      * beyond this.length.
 50      */
 51     private final char[] buffer;
 52 
 53     /**
 54      * Length of meaningful content in buffer.
 55      */
 56     private final int length;
 57 
 58     /**
 59      * Character buffer index of character currently being observed.
 60      */
 61     private int position;
 62 
 63     /**
 64      * Number of characters combined to provide character currently being observed. Typically
 65      * one, but may be more when combinations of surrogate pairs and unicode escape sequences
 66      * are read.
 67      */
 68     private int width;
 69 
 70     /**
 71      * Character currently being observed. If a surrogate pair is read then will be the high
 72      * member of the pair.
 73      */
 74     private char character;
 75 
 76     /**
 77      * Codepoint of character currently being observed. Typically equivalent to the character
 78      * but will have a value greater that 0xFFFF when a surrogate pair.
 79      */
 80     private int codepoint;
 81 
 82     /**
 83      * true if the last character was a backslash. This is used to handle the special case
 84      * when a backslash precedes an unicode escape. In that case, the second backslash
 85      * is treated as a backslash and not part of an unicode escape.
 86      */
 87     private boolean wasBackslash;
 88 
 89     /**
 90      * true if the last character was derived from an unicode escape sequence.
 91      */
 92     private boolean wasUnicodeEscape;
 93 
 94     /**
 95      * Log for error reporting.
 96      */
 97     private final Log log;
 98 
 99     /**
100      * Constructor.
101      *
102      * @param sf     scan factory.
103      * @param array  array containing contents of source.
104      * @param length length of meaningful content in buffer.
105      */
106     UnicodeReader(Scanner.Factory sf, char[] array, int length) {
107         this.buffer = array;
108         this.length = length;
109         this.position = 0;
110         this.width = 0;
111         this.character = '\0';
112         this.codepoint = 0;
113         this.wasBackslash = false;
114         this.wasUnicodeEscape = false;
115         this.log = sf.log;
116 
117         nextCodePoint();
118     }
119 
120     /**
121      * Returns the length of the buffer. This is length of meaningful content in buffer and
122      * not the length of the buffer array.
123      *
124      * @return length of the buffer.
125      */
126     protected int length() {
127         return length;
128     }
129 
130     /**
131      * Return true if current position is within the meaningful part of the buffer.
132      *
133      * @return true if current position is within the meaningful part of the buffer.
134      */
135     protected boolean isAvailable() {
136         return position < length;
137     }
138 
139     /**
140      * Fetches the next 16-bit character from the buffer and places it in this.character.
141      */
142     private void nextCodeUnit() {
143         // Index of next character in buffer.
144         int index = position + width;
145 
146         // If past end of buffer.
147         if (length <= index) {
148             // End of file is marked with EOI.
149             character = EOI;
150         } else {
151             // Next character in buffer.
152             character = buffer[index];
153             // Increment length of codepoint.
154             width++;
155         }
156     }
157 
158     /**
159      * Fetches the next 16-bit character from the buffer. If an unicode escape
160      * is detected then converts the unicode escape to a character.
161      */
162     private void nextUnicodeInputCharacter() {
163         // Position to next codepoint.
164         position += width;
165         // Codepoint has no characters yet.
166         width = 0;
167 
168         // Fetch next character.
169         nextCodeUnit();
170 
171         if (character == '\\' && (!wasBackslash || wasUnicodeEscape)) {
172             // Is a backslash and may be an unicode escape.
173             switch (unicodeEscape()) {
174                 case BACKSLASH -> {
175                     wasUnicodeEscape = false;
176                     wasBackslash = !wasBackslash;
177                 }
178                 case VALID_ESCAPE -> {
179                     wasUnicodeEscape = true;
180                     wasBackslash = character == '\\' && !wasBackslash;
181                 }
182                 case BROKEN_ESCAPE -> nextUnicodeInputCharacter(); //skip broken unicode escapes
183             }
184         } else {
185             wasBackslash = false;
186             wasUnicodeEscape = false;
187         }
188 
189         // Codepoint and character match if not surrogate.
190         codepoint = (int) character;
191     }
192 
193     /**
194      * Fetches the nextcode point from the buffer. If an unicode escape is recognized
195      * then converts unicode escape to a character. If two characters are a surrogate pair
196      * then converts to a codepoint.
197      */
198     private void nextCodePoint() {
199         // Next unicode character.
200         nextUnicodeInputCharacter();
201 
202         // Return early if ASCII or not a surrogate pair.
203         if (isASCII() || !Character.isHighSurrogate(character)) {
204             return;
205         }
206 
207         // Capture high surrogate and position.
208         char hi = character;
209         int savePosition = position;
210         int saveWidth = width;
211 
212         // Get potential low surrogate.
213         nextUnicodeInputCharacter();
214         char lo = character;
215 
216         if (Character.isLowSurrogate(lo)) {
217             // Start codepoint at start of high surrogate.
218             position = savePosition;
219             width += saveWidth;
220             // Compute codepoint.
221             codepoint = Character.toCodePoint(hi, lo);
222         } else {
223             // Restore to treat high surrogate as just a character.
224             position = savePosition;
225             width = saveWidth;
226             character = hi;
227             codepoint = (int) hi;
228             // Could potential report an error here (old code did not.)
229         }
230     }
231 
232     /**
233      * Converts an unicode escape into a character.
234      *
235      * @return true if was an unicode escape.
236      */
237     private UnicodeEscapeResult unicodeEscape() {
238         // Start of unicode escape (past backslash.)
239         int start = position + width;
240 
241         // Default to backslash result, unless proven otherwise.
242         character = '\\';
243         width = 1;
244 
245         // Skip multiple 'u'.
246         int index;
247         for (index = start; index < length; index++) {
248             if (buffer[index] != 'u') {
249                 break;
250             }
251         }
252 
253         // Needs to have been at least one u.
254         if (index == start) {
255             return UnicodeEscapeResult.BACKSLASH;
256         }
257 
258         int code = 0;
259 
260         for (int i = 0; i < 4; i++) {
261             // Translate and merge digit.
262             int digit = index < length ? Character.digit(buffer[index], 16) : -1;
263             code = code << 4 | digit;
264 
265             // If invalid digit.
266             if (code < 0) {
267                 break;
268             }
269 
270             // On to next character.
271             index++;
272         }
273 
274         // Skip digits even if error.
275         width = index - position;
276 
277         // If all digits are good.
278         if (code >= 0) {
279             character = (char) code;
280             return UnicodeEscapeResult.VALID_ESCAPE;
281         } else {
282             log.error(index, Errors.IllegalUnicodeEsc);
283             return UnicodeEscapeResult.BROKEN_ESCAPE;
284         }
285     }
286 
287     private enum UnicodeEscapeResult {
288         BACKSLASH,
289         VALID_ESCAPE,
290         BROKEN_ESCAPE
291     }
292 
293     /**
294      * Return the current position in the character buffer.
295      *
296      * @return current position in the character buffer.
297      */
298     protected int position() {
299         return position;
300     }
301 
302 
303     /**
304      * Reset the reader to the specified position.
305      * Warning: Do not use when previous character was an ASCII or unicode backslash.
306      *
307      * @param pos
308      */
309     protected void reset(int pos) {
310         position = pos;
311         width = 0;
312         wasBackslash = false;
313         wasUnicodeEscape = false;
314         nextCodePoint();
315     }
316 
317     /**
318      * Return the current character in at the current position.
319      *
320      * @return current character in at the current position.
321      */
322     protected char get() {
323         return character;
324     }
325 
326     /**
327      * Return the current codepoint in at the current position.
328      *
329      * @return current codepoint in at the current position.
330      */
331     protected int getCodepoint() {
332         return codepoint;
333     }
334 
335     /**
336      * Returns true if the current codepoint is a surrogate.
337      *
338      * @return true if the current codepoint is a surrogate.
339      */
340     protected boolean isSurrogate() {
341         return 0xFFFF < codepoint;
342     }
343 
344     /**
345      * Returns true if the current character is ASCII.
346      *
347      * @return true if the current character is ASCII.
348      */
349     protected boolean isASCII() {
350         return character <= 0x7F;
351     }
352 
353     /**
354      * Advances the current character to the next character.
355      *
356      * @return next character.
357      */
358     protected char next() {
359         nextCodePoint();
360 
361         return character;
362     }
363 
364     /**
365      * Compare character. Returns true if a match.
366      *
367      * @param ch character to match.
368      * @return true if a match.
369      */
370     protected boolean is(char ch) {
371         return character == ch;
372     }
373 
374     /**
375      * Match one of the arguments. Returns true if a match.
376      */
377     protected boolean isOneOf(char ch1, char ch2) {
378         return is(ch1) || is(ch2);
379     }
380 
381     protected boolean isOneOf(char ch1, char ch2, char ch3) {
382         return is(ch1) || is(ch2) || is(ch3);
383     }
384 
385     protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
386         return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
387     }
388 
389     /**
390      * Tests to see if current character is in the range of lo to hi characters (inclusive).
391      *
392      * @param lo lowest character in range.
393      * @param hi highest character in range.
394      * @return true if the current character is in range.
395      */
396     protected boolean inRange(char lo, char hi) {
397         return lo <= character && character <= hi;
398     }
399 
400     /**
401      * Compare character and advance if a match. Returns true if a match.
402      *
403      * @param ch character to match.
404      * @return true if a match.
405      */
406     protected boolean accept(char ch) {
407         if (is(ch)) {
408             next();
409 
410             return true;
411         }
412 
413         return false;
414     }
415 
416     /**
417      * Match one of the arguments and advance if a match. Returns true if a match.
418      */
419     protected boolean acceptOneOf(char ch1, char ch2) {
420         if (isOneOf(ch1, ch2)) {
421             next();
422 
423             return true;
424         }
425 
426         return false;
427     }
428 
429     protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
430         if (isOneOf(ch1, ch2, ch3)) {
431             next();
432 
433             return true;
434         }
435 
436         return false;
437     }
438 
439     /**
440      * Skip over all occurrences of character.
441      *
442      * @param ch character to accept.
443      */
444     protected void skip(char ch) {
445         while (accept(ch)) {
446             // next
447         }
448     }
449 
450     /**
451      * Skip over ASCII white space characters.
452      */
453     protected void skipWhitespace() {
454         while (acceptOneOf(' ', '\t', '\f')) {
455             // next
456         }
457     }
458 
459     /**
460      * Skip to end of line.
461      */
462     protected void skipToEOLN() {
463         while (isAvailable()) {
464             if (isOneOf('\r', '\n')) {
465                 break;
466             }
467 
468             next();
469         }
470 
471     }
472 
473     /**
474      * Compare string and advance if a match. Returns true if a match.
475      * Warning: Do not use when previous character was a backslash
476      * (confuses state of wasBackslash.)
477      *
478      * @param string string to match character for character.
479      * @return true if a match.
480      */
481     protected boolean accept(String string) {
482         // Quick test.
483         if (string.length() == 0 || !is(string.charAt(0))) {
484             return false;
485         }
486 
487         // Be prepared to retreat if not a match.
488         int savedPosition = position;
489 
490         nextCodePoint();
491 
492         // Check each character.
493         for (int i = 1; i < string.length(); i++) {
494             if (!is(string.charAt(i))) {
495                 // Restart if not a match.
496                 reset(savedPosition);
497 
498                 return false;
499             }
500 
501             nextCodePoint();
502         }
503 
504         return true;
505     }
506 
507     /**
508      * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
509      * advance character.
510      *
511      * @param pos        starting position.
512      * @param digitRadix base of number being converted.
513      * @return value of digit.
514      */
515     protected int digit(int pos, int digitRadix) {
516         int result;
517 
518         // Just an ASCII digit.
519         if (inRange('0', '9')) {
520             // Fast common case.
521             result = character - '0';
522 
523             return result < digitRadix ? result : -1;
524         }
525 
526         // Handle other digits.
527         result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
528                 Character.digit(character, digitRadix);
529 
530         if (result >= 0 && !isASCII()) {
531             log.error(position(), Errors.IllegalNonasciiDigit);
532             character = "0123456789abcdef".charAt(result);
533         }
534 
535         return result;
536     }
537 
538     /**
539      * Returns the input buffer. Unicode escape sequences are not translated.
540      *
541      * @return the input buffer.
542      */
543     public char[] getRawCharacters() {
544         return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
545     }
546 
547     /**
548      * Returns a copy of a character array subset of the input buffer.
549      * The returned array begins at the {@code beginIndex} and
550      * extends to the character at index {@code endIndex - 1}.
551      * Thus the length of the substring is {@code endIndex-beginIndex}.
552      * This behavior is like
553      * {@code String.substring(beginIndex, endIndex)}.
554      * Unicode escape sequences are not translated.
555      *
556      * @param beginIndex the beginning index, inclusive.
557      * @param endIndex   the ending index, exclusive.
558      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
559      *                                        array bounds
560      */
561     public char[] getRawCharacters(int beginIndex, int endIndex) {
562         return Arrays.copyOfRange(buffer, beginIndex, endIndex);
563     }
564 }