1 /*
   2  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import java.nio.CharBuffer;
  29 import java.util.Arrays;
  30 
  31 import com.sun.tools.javac.file.JavacFileManager;
  32 import com.sun.tools.javac.resources.CompilerProperties.Errors;
  33 import com.sun.tools.javac.util.ArrayUtils;
  34 import com.sun.tools.javac.util.Log;
  35 import com.sun.tools.javac.util.Name;
  36 import com.sun.tools.javac.util.Names;
  37 
  38 import static com.sun.tools.javac.util.LayoutCharacters.*;
  39 
/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
 * characters contained in the input stream, translating unicode escapes as it goes.
 * Additionally, it provides features for saving chars into a buffer and for
 * retrieving them at a later stage.
  44  *
  45  *  <p><b>This is NOT part of any supported API.
  46  *  If you write code that depends on this, you do so at your own risk.
  47  *  This code and its internal interfaces are subject to change or
  48  *  deletion without notice.</b>
  49  */
  50 public class UnicodeReader {
  51 
  52     /** The input buffer, index of next character to be read,
  53      *  index of one past last character in buffer.
  54      */
  55     protected char[] buf;
  56     protected int bp;
  57     protected final int buflen;
  58 
  59     /** The current character.
  60      */
  61     protected char ch;
  62 
  63     /** The buffer index of the last converted unicode character
  64      */
  65     protected int unicodeConversionBp = -1;
  66 
  67     protected Log log;
  68     protected Names names;
  69 
  70     /** A character buffer for saved chars.
  71      */
  72     protected char[] sbuf = new char[128];
  73     protected int realLength;
  74     protected int sp;
  75 
  76     /**
  77      * Create a scanner from the input array.  This method might
  78      * modify the array.  To avoid copying the input array, ensure
  79      * that {@code inputLength < input.length} or
  80      * {@code input[input.length -1]} is a white space character.
  81      *
  82      * @param sf the factory which created this Scanner
  83      * @param buffer the input, might be modified
  84      * Must be positive and less than or equal to input.length.
  85      */
  86     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
  87         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
  88     }
  89 
  90     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
  91         log = sf.log;
  92         names = sf.names;
  93         realLength = inputLength;
  94         if (inputLength == input.length) {
  95             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
  96                 inputLength--;
  97             } else {
  98                 input = Arrays.copyOf(input, inputLength + 1);
  99             }
 100         }
 101         buf = input;
 102         buflen = inputLength;
 103         buf[buflen] = EOI;
 104         bp = -1;
 105         scanChar();
 106     }
 107 
 108     /** Read next character.
 109      */
 110     protected void scanChar() {
 111         if (bp < buflen) {
 112             ch = buf[++bp];
 113             if (ch == '\\') {
 114                 convertUnicode();
 115             }
 116         }
 117     }
 118 
 119     /** Read next character in comment, skipping over double '\' characters.
 120      */
 121     protected void scanCommentChar() {
 122         scanChar();
 123         if (ch == '\\') {
 124             if (peekChar() == '\\' && !isUnicode()) {
 125                 skipChar();
 126             } else {
 127                 convertUnicode();
 128             }
 129         }
 130     }
 131 
 132     /** Append a character to sbuf.
 133      */
 134     protected void putChar(char ch, boolean scan) {
 135         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
 136         sbuf[sp++] = ch;
 137         if (scan)
 138             scanChar();
 139     }
 140 
 141     protected void putChar(char ch) {
 142         putChar(ch, false);
 143     }
 144 
 145     protected void putChar(boolean scan) {
 146         putChar(ch, scan);
 147     }
 148 
 149     Name name() {
 150         return names.fromChars(sbuf, 0, sp);
 151     }
 152 
 153     String chars() {
 154         return new String(sbuf, 0, sp);
 155     }
 156 
 157     /** Convert unicode escape; bp points to initial '\' character
 158      *  (Spec 3.3).
 159      */
 160     protected void convertUnicode() {
 161         if (ch == '\\' && unicodeConversionBp != bp ) {
 162             bp++; ch = buf[bp];
 163             if (ch == 'u') {
 164                 do {
 165                     bp++; ch = buf[bp];
 166                 } while (ch == 'u');
 167                 int limit = bp + 3;
 168                 if (limit < buflen) {
 169                     int d = digit(bp, 16);
 170                     int code = d;
 171                     while (bp < limit && d >= 0) {
 172                         bp++; ch = buf[bp];
 173                         d = digit(bp, 16);
 174                         code = (code << 4) + d;
 175                     }
 176                     if (d >= 0) {
 177                         ch = (char)code;
 178                         unicodeConversionBp = bp;
 179                         return;
 180                     }
 181                 }
 182                 log.error(bp, Errors.IllegalUnicodeEsc);
 183             } else {
 184                 bp--;
 185                 ch = '\\';
 186             }
 187         }
 188     }
 189 
 190     /** Are surrogates supported?
 191      */
 192     final static boolean surrogatesSupported = surrogatesSupported();
 193     private static boolean surrogatesSupported() {
 194         try {
 195             Character.isHighSurrogate('a');
 196             return true;
 197         } catch (NoSuchMethodError ex) {
 198             return false;
 199         }
 200     }
 201 
 202     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
 203      *  the next character is a low surrogate, returns the code point
 204      *  constructed from these surrogates. Otherwise, returns -1.
 205      *  This method will not consume any of the characters.
 206      */
 207     protected int peekSurrogates() {
 208         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
 209             char high = ch;
 210             int prevBP = bp;
 211 
 212             scanChar();
 213 
 214             char low = ch;
 215 
 216             ch = high;
 217             bp = prevBP;
 218 
 219             if (Character.isLowSurrogate(low)) {
 220                 return Character.toCodePoint(high, low);
 221             }
 222         }
 223 
 224         return -1;
 225     }
 226 
 227     /** Convert an ASCII digit from its base (8, 10, or 16)
 228      *  to its value.
 229      */
 230     protected int digit(int pos, int base) {
 231         char c = ch;
 232         if ('0' <= c && c <= '9')
 233             return Character.digit(c, base); //a fast common case
 234         int codePoint = peekSurrogates();
 235         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
 236         if (result >= 0 && c > 0x7f) {
 237             log.error(pos + 1, Errors.IllegalNonasciiDigit);
 238             if (codePoint >= 0)
 239                 scanChar();
 240             ch = "0123456789abcdef".charAt(result);
 241         }
 242         return result;
 243     }
 244 
 245     protected boolean isUnicode() {
 246         return unicodeConversionBp == bp;
 247     }
 248 
 249     protected void skipChar() {
 250         bp++;
 251     }
 252 
 253     protected char peekChar() {
 254         return buf[bp + 1];
 255     }
 256 
 257     /**
 258      * Returns a copy of the input buffer, up to its inputLength.
 259      * Unicode escape sequences are not translated.
 260      */
 261     public char[] getRawCharacters() {
 262         char[] chars = new char[buflen];
 263         System.arraycopy(buf, 0, chars, 0, buflen);
 264         return chars;
 265     }
 266 
 267     /**
 268      * Returns a copy of a character array subset of the input buffer.
 269      * The returned array begins at the {@code beginIndex} and
 270      * extends to the character at index {@code endIndex - 1}.
 271      * Thus the length of the substring is {@code endIndex-beginIndex}.
 272      * This behavior is like
 273      * {@code String.substring(beginIndex, endIndex)}.
 274      * Unicode escape sequences are not translated.
 275      *
 276      * @param beginIndex the beginning index, inclusive.
 277      * @param endIndex the ending index, exclusive.
 278      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
 279      *         array bounds
 280      */
 281     public char[] getRawCharacters(int beginIndex, int endIndex) {
 282         int length = endIndex - beginIndex;
 283         char[] chars = new char[length];
 284         System.arraycopy(buf, beginIndex, chars, 0, length);
 285         return chars;
 286     }
 287 }