1 /*
   2  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import java.nio.CharBuffer;
  29 import java.util.Arrays;
  30 
  31 import com.sun.tools.javac.file.JavacFileManager;
  32 import com.sun.tools.javac.resources.CompilerProperties.Errors;
  33 import com.sun.tools.javac.util.ArrayUtils;
  34 import com.sun.tools.javac.util.Log;
  35 import com.sun.tools.javac.util.Name;
  36 import com.sun.tools.javac.util.Names;
  37 
  38 import static com.sun.tools.javac.util.LayoutCharacters.*;
  39 
  40 /** The char reader used by the javac lexer/tokenizer. Returns the sequence of
  41  * characters contained in the input stream, handling unicode escape accordingly.
  42  * Additionally, it provides features for saving chars into a buffer and to retrieve
  43  * them at a later stage.
  44  *
  45  *  <p><b>This is NOT part of any supported API.
  46  *  If you write code that depends on this, you do so at your own risk.
  47  *  This code and its internal interfaces are subject to change or
  48  *  deletion without notice.</b>
  49  */
  50 public class UnicodeReader {
  51 
  52     /** The input buffer, index of next character to be read,
  53      *  index of one past last character in buffer.
  54      */
  55     protected char[] buf;
  56     protected int bp;
  57     protected final int buflen;
  58 
  59     /** The current character.
  60      */
  61     protected char ch;
  62 
  63     /** The buffer index of the last converted unicode character
  64      */
  65     protected int unicodeConversionBp = -1;
  66 
  67     protected Log log;
  68     protected Names names;
  69 
  70     /** A character buffer for saved chars.
  71      */
  72     protected char[] sbuf = new char[128];
  73     protected int realLength;
  74     protected int sp;
  75 
  76     /**
  77      * Create a scanner from the input array.  This method might
  78      * modify the array.  To avoid copying the input array, ensure
  79      * that {@code inputLength < input.length} or
  80      * {@code input[input.length -1]} is a white space character.
  81      *
  82      * @param sf the factory which created this Scanner
  83      * @param buffer the input, might be modified
  84      * Must be positive and less than or equal to input.length.
  85      */
  86     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
  87         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
  88     }
  89 
  90     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
  91         log = sf.log;
  92         names = sf.names;
  93         realLength = inputLength;
  94         if (inputLength == input.length) {
  95             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
  96                 inputLength--;
  97             } else {
  98                 input = Arrays.copyOf(input, inputLength + 1);
  99             }
 100         }
 101         buf = input;
 102         buflen = inputLength;
 103         buf[buflen] = EOI;
 104         bp = -1;
 105         scanChar();
 106     }
 107 
 108     /** Read next character.
 109      */
 110     protected void scanChar() {
 111         if (bp < buflen) {
 112             ch = buf[++bp];
 113             if (ch == '\\') {
 114                 convertUnicode();
 115             }
 116         }
 117     }
 118 
 119     /** Read next character in comment, skipping over double '\' characters.
 120      */
 121     protected void scanCommentChar() {
 122         scanChar();
 123         if (ch == '\\') {
 124             if (peekChar() == '\\' && !isUnicode()) {
 125                 skipChar();
 126             } else {
 127                 convertUnicode();
 128             }
 129         }
 130     }
 131 
 132     /** Append a character to sbuf.
 133      */
 134     protected void putChar(char ch, boolean scan) {
 135         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
 136         sbuf[sp++] = ch;
 137         if (scan)
 138             scanChar();
 139     }
 140 
 141     protected void putChar(char ch) {
 142         putChar(ch, false);
 143     }
 144 
 145     protected void putChar(boolean scan) {
 146         putChar(ch, scan);
 147     }
 148 
 149     Name name() {
 150         return names.fromChars(sbuf, 0, sp);
 151     }
 152 
 153     String chars() {
 154         return new String(sbuf, 0, sp);
 155     }
 156 
 157     /** Add 'count' copies of the character 'ch' to the string buffer.
 158      */
 159     protected void repeat(char ch, int count) {
 160         for ( ; 0 < count; count--) {
 161             putChar(ch, false);
 162         }
 163     }
 164 
 165     /** Reset the scan buffer pointer to 'pos'.
 166      */
 167     protected void reset(int pos) {
 168         bp = pos - 1;
 169         scanChar();
 170     }
 171 
 172     /** Convert unicode escape; bp points to initial '\' character
 173      *  (Spec 3.3).
 174      */
 175     protected void convertUnicode() {
 176         if (ch == '\\' && unicodeConversionBp != bp ) {
 177             bp++; ch = buf[bp];
 178             if (ch == 'u') {
 179                 do {
 180                     bp++; ch = buf[bp];
 181                 } while (ch == 'u');
 182                 int limit = bp + 3;
 183                 if (limit < buflen) {
 184                     int d = digit(bp, 16);
 185                     int code = d;
 186                     while (bp < limit && d >= 0) {
 187                         bp++; ch = buf[bp];
 188                         d = digit(bp, 16);
 189                         code = (code << 4) + d;
 190                     }
 191                     if (d >= 0) {
 192                         ch = (char)code;
 193                         unicodeConversionBp = bp;
 194                         return;
 195                     }
 196                 }
 197                 log.error(bp, Errors.IllegalUnicodeEsc);
 198             } else {
 199                 bp--;
 200                 ch = '\\';
 201             }
 202         }
 203     }
 204 
 205     /** Are surrogates supported?
 206      */
 207     final static boolean surrogatesSupported = surrogatesSupported();
 208     private static boolean surrogatesSupported() {
 209         try {
 210             Character.isHighSurrogate('a');
 211             return true;
 212         } catch (NoSuchMethodError ex) {
 213             return false;
 214         }
 215     }
 216 
 217     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
 218      *  the next character is a low surrogate, returns the code point
 219      *  constructed from these surrogates. Otherwise, returns -1.
 220      *  This method will not consume any of the characters.
 221      */
 222     protected int peekSurrogates() {
 223         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
 224             char high = ch;
 225             int prevBP = bp;
 226 
 227             scanChar();
 228 
 229             char low = ch;
 230 
 231             ch = high;
 232             bp = prevBP;
 233 
 234             if (Character.isLowSurrogate(low)) {
 235                 return Character.toCodePoint(high, low);
 236             }
 237         }
 238 
 239         return -1;
 240     }
 241 
 242     /** Convert an ASCII digit from its base (8, 10, or 16)
 243      *  to its value.
 244      */
 245     protected int digit(int pos, int base) {
 246         char c = ch;
 247         if ('0' <= c && c <= '9')
 248             return Character.digit(c, base); //a fast common case
 249         int codePoint = peekSurrogates();
 250         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
 251         if (result >= 0 && c > 0x7f) {
 252             log.error(pos + 1, Errors.IllegalNonasciiDigit);
 253             if (codePoint >= 0)
 254                 scanChar();
 255             ch = "0123456789abcdef".charAt(result);
 256         }
 257         return result;
 258     }
 259 
 260     protected boolean isUnicode() {
 261         return unicodeConversionBp == bp;
 262     }
 263 
 264     protected void skipChar() {
 265         bp++;
 266     }
 267 
 268     protected char peekChar() {
 269         return buf[bp + 1];
 270     }
 271 
 272     /**
 273      * Returns a copy of the input buffer, up to its inputLength.
 274      * Unicode escape sequences are not translated.
 275      */
 276     public char[] getRawCharacters() {
 277         char[] chars = new char[buflen];
 278         System.arraycopy(buf, 0, chars, 0, buflen);
 279         return chars;
 280     }
 281 
 282     /**
 283      * Returns a copy of a character array subset of the input buffer.
 284      * The returned array begins at the {@code beginIndex} and
 285      * extends to the character at index {@code endIndex - 1}.
 286      * Thus the length of the substring is {@code endIndex-beginIndex}.
 287      * This behavior is like
 288      * {@code String.substring(beginIndex, endIndex)}.
 289      * Unicode escape sequences are not translated.
 290      *
 291      * @param beginIndex the beginning index, inclusive.
 292      * @param endIndex the ending index, exclusive.
 293      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
 294      *         array bounds
 295      */
 296     public char[] getRawCharacters(int beginIndex, int endIndex) {
 297         int length = endIndex - beginIndex;
 298         char[] chars = new char[length];
 299         System.arraycopy(buf, beginIndex, chars, 0, length);
 300         return chars;
 301     }
 302 }