1 /* 2 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package oracle.code.json.impl; 27 28 import oracle.code.json.*; 29 30 import java.util.ArrayList; 31 import java.util.HashMap; 32 import java.util.LinkedHashMap; 33 34 /** 35 * Parses a JSON Document char[] into a tree of JsonValues. JsonObject and JsonArray 36 * nodes create their data structures which maintain the connection to children. 37 * JsonNumber and JsonString contain only a start and end offset, which 38 * are used to lazily procure their underlying value/string on demand. Singletons 39 * are used for JsonBoolean and JsonNull. 40 */ 41 public final class JsonParser { 42 43 // Access to the underlying JSON contents 44 private final char[] doc; 45 // Current offset during parsing 46 private int offset; 47 // For exception message on failure 48 private int line; 49 private int lineStart; 50 private StringBuilder builder; 51 52 public JsonParser(char[] doc) { 53 this.doc = doc; 54 } 55 56 // Parses the lone JsonValue root 57 public JsonValue parseRoot() { 58 JsonValue root = parseValue(); 59 if (hasInput()) { 60 throw failure("Unexpected character(s)"); 61 } 62 return root; 63 } 64 65 /* 66 * Parse any one of the JSON value types: object, array, number, string, 67 * true, false, or null. 68 * JSON-text = ws value ws 69 * See https://datatracker.ietf.org/doc/html/rfc8259#section-3 70 */ 71 private JsonValue parseValue() { 72 skipWhitespaces(); 73 if (!hasInput()) { 74 throw failure("Missing JSON value"); 75 } 76 var val = switch (doc[offset]) { 77 case '{' -> parseObject(); 78 case '[' -> parseArray(); 79 case '"' -> parseString(); 80 case 't' -> parseTrue(); 81 case 'f' -> parseFalse(); 82 case 'n' -> parseNull(); 83 // While JSON Number does not support leading '+', '.', or 'e' 84 // we still accept, so that we can provide a better error message 85 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '+', 'e', '.' 86 -> parseNumber(); 87 default -> throw failure("Unexpected character(s)"); 88 }; 89 skipWhitespaces(); 90 return val; 91 } 92 93 /* 94 * The parsed JsonObject contains a map which holds all lazy member mappings. 95 * No offsets are required as member values hold their own offsets. 96 * See https://datatracker.ietf.org/doc/html/rfc8259#section-4 97 */ 98 private JsonObject parseObject() { 99 // @@@ Do not preserve encounter order, requires adjustment to the API 100 // var members = new LinkedHashMap<String, JsonValue>(); 101 var members = new HashMap<String, JsonValue>(); 102 offset++; // Walk past the '{' 103 skipWhitespaces(); 104 // Check for empty case 105 if (currCharEquals('}')) { 106 offset++; 107 return new JsonObjectImpl(members); 108 } 109 while (hasInput()) { 110 // Get the member name, which should be unescaped 111 // Why not parse the name as a JsonString and then return its value()? 112 // Would requires 2 passes; we should build the String as we parse. 113 var name = parseName(); 114 115 if (members.containsKey(name)) { 116 throw failure("The duplicate member name: '%s' was already parsed".formatted(name)); 117 } 118 119 // Move from name to ':' 120 skipWhitespaces(); 121 if (!currCharEquals(':')) { 122 throw failure( 123 "Expected ':' after the member name"); 124 } 125 126 // Move from ':' to JsonValue 127 offset++; 128 members.put(name, parseValue()); 129 // Ensure current char is either ',' or '}' 130 if (currCharEquals('}')) { 131 offset++; 132 return new JsonObjectImpl(members); 133 } else if (currCharEquals(',')) { 134 // Add the comma, and move to the next key 135 offset++; 136 skipWhitespaces(); 137 } else { 138 // Neither ',' nor '}' so fail 139 break; 140 } 141 } 142 throw failure("Object was not closed with '}'"); 143 } 144 145 /* 146 * Member name equality and storage in the map should be done with the 147 * unescaped String value. 148 * See https://datatracker.ietf.org/doc/html/rfc8259#section-8.3 149 */ 150 private String parseName() { 151 if (!currCharEquals('"')) { 152 throw failure("Invalid member name"); 153 } 154 offset++; // Move past the starting quote 155 var escape = false; 156 boolean useBldr = false; 157 var start = offset; 158 for (; hasInput(); offset++) { 159 var c = doc[offset]; 160 if (escape) { 161 var escapeLength = 0; 162 switch (c) { 163 // Allowed JSON escapes 164 case '"', '\\', '/' -> {} 165 case 'b' -> c = '\b'; 166 case 'f' -> c = '\f'; 167 case 'n' -> c = '\n'; 168 case 'r' -> c = '\r'; 169 case 't' -> c = '\t'; 170 case 'u' -> { 171 if (offset + 4 < doc.length) { 172 escapeLength = 4; 173 offset++; // Move to first char in sequence 174 c = codeUnit(); 175 // Move to the last hex digit, since outer loop will increment offset 176 offset += 3; 177 } else { 178 throw failure("Invalid Unicode escape sequence"); 179 } 180 } 181 default -> throw failure("Illegal escape"); 182 } 183 if (!useBldr) { 184 initBuilder(); 185 // Append everything up to the first escape sequence 186 builder.append(doc, start, offset - escapeLength - 1 - start); 187 useBldr = true; 188 } 189 escape = false; 190 } else if (c == '\\') { 191 escape = true; 192 continue; 193 } else if (c == '\"') { 194 offset++; 195 if (useBldr) { 196 var name = builder.toString(); 197 builder.setLength(0); 198 return name; 199 } else { 200 return new String(doc, start, offset - start - 1); 201 } 202 } else if (c < ' ') { 203 throw failure("Unescaped control code"); 204 } 205 if (useBldr) { 206 builder.append(c); 207 } 208 } 209 throw failure("Closing quote missing"); 210 } 211 212 /* 213 * The parsed JsonArray contains a List which holds all lazy children 214 * elements. No offsets are required as children values hold their own offsets. 215 * See https://datatracker.ietf.org/doc/html/rfc8259#section-5 216 */ 217 private JsonArray parseArray() { 218 var list = new ArrayList<JsonValue>(); 219 offset++; // Walk past the '[' 220 skipWhitespaces(); 221 // Check for empty case 222 if (currCharEquals(']')) { 223 offset++; 224 return new JsonArrayImpl(list); 225 } 226 for (; hasInput(); offset++) { 227 // Get the JsonValue 228 list.add(parseValue()); 229 // Ensure current char is either ']' or ',' 230 if (currCharEquals(']')) { 231 offset++; 232 return new JsonArrayImpl(list); 233 } else if (!currCharEquals(',')) { 234 break; 235 } 236 } 237 throw failure("Array was not closed with ']'"); 238 } 239 240 /* 241 * The parsed JsonString will contain offsets correlating to the beginning 242 * and ending quotation marks. All Unicode characters are allowed except the 243 * following that require escaping: quotation mark, reverse solidus, and the 244 * control characters (U+0000 through U+001F). Any character may be escaped 245 * either through a Unicode escape sequence or two-char sequence. 246 * See https://datatracker.ietf.org/doc/html/rfc8259#section-7 247 */ 248 private JsonString parseString() { 249 int start = offset; 250 offset++; // Move past the starting quote 251 var escape = false; 252 for (; hasInput(); offset++) { 253 var c = doc[offset]; 254 if (escape) { 255 switch (c) { 256 // Allowed JSON escapes 257 case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {} 258 case 'u' -> { 259 if (offset + 4 < doc.length) { 260 offset++; // Move to first char in sequence 261 checkEscapeSequence(); 262 offset += 3; // Move to the last hex digit, outer loop increments 263 } else { 264 throw failure("Invalid Unicode escape sequence"); 265 } 266 } 267 default -> throw failure("Illegal escape"); 268 } 269 escape = false; 270 } else if (c == '\\') { 271 escape = true; 272 } else if (c == '\"') { 273 return new JsonStringImpl(doc, start, offset += 1); 274 } else if (c < ' ') { 275 throw failure("Unescaped control code"); 276 } 277 } 278 throw failure("Closing quote missing"); 279 } 280 281 /* 282 * Parsing true, false, and null return singletons. These JsonValues 283 * do not require offsets to lazily compute their values. 284 */ 285 private JsonBooleanImpl parseTrue() { 286 if (charsEqual("rue", offset + 1)) { 287 offset += 4; 288 return JsonBooleanImpl.TRUE; 289 } 290 throw failure("Expected true"); 291 } 292 293 private JsonBooleanImpl parseFalse() { 294 if (charsEqual( "alse", offset + 1)) { 295 offset += 5; 296 return JsonBooleanImpl.FALSE; 297 } 298 throw failure("Expected false"); 299 } 300 301 private JsonNullImpl parseNull() { 302 if (charsEqual("ull", offset + 1)) { 303 offset += 4; 304 return JsonNullImpl.NULL; 305 } 306 throw failure("Expected null"); 307 } 308 309 /* 310 * The parsed JsonNumber contains offsets correlating to the first and last 311 * allowed chars permitted in the JSON numeric grammar: 312 * number = [ minus ] int [ frac ] [ exp ] 313 * See https://datatracker.ietf.org/doc/html/rfc8259#section-6 314 */ 315 private JsonNumberImpl parseNumber() { 316 boolean sawDecimal = false; 317 boolean sawExponent = false; 318 boolean sawZero = false; 319 boolean sawWhitespace = false; 320 boolean havePart = false; 321 boolean sawInvalid = false; 322 boolean sawSign = false; 323 var start = offset; 324 for (; hasInput() && !sawWhitespace && !sawInvalid; offset++) { 325 switch (doc[offset]) { 326 case '-' -> { 327 if (offset != start && !sawExponent || sawSign) { 328 throw failure("Invalid '-' position"); 329 } 330 sawSign = true; 331 } 332 case '+' -> { 333 if (!sawExponent || havePart || sawSign) { 334 throw failure("Invalid '+' position"); 335 } 336 sawSign = true; 337 } 338 case '0' -> { 339 if (!havePart) { 340 sawZero = true; 341 } 342 havePart = true; 343 } 344 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> { 345 if (!sawDecimal && !sawExponent && sawZero) { 346 throw failure("Invalid '0' position"); 347 } 348 havePart = true; 349 } 350 case '.' -> { 351 if (sawDecimal) { 352 throw failure("Invalid '.' position"); 353 } else { 354 if (!havePart) { 355 throw failure("Invalid '.' position"); 356 } 357 sawDecimal = true; 358 havePart = false; 359 } 360 } 361 case 'e', 'E' -> { 362 if (sawExponent) { 363 throw failure("Invalid '[e|E]' position"); 364 } else { 365 if (!havePart) { 366 throw failure("Invalid '[e|E]' position"); 367 } 368 sawExponent = true; 369 havePart = false; 370 sawSign = false; 371 } 372 } 373 case ' ', '\t', '\r', '\n' -> { 374 sawWhitespace = true; 375 offset --; 376 } 377 default -> { 378 offset--; 379 sawInvalid = true; 380 } 381 } 382 } 383 if (!havePart) { 384 throw failure("Input expected after '[.|e|E]'"); 385 } 386 return new JsonNumberImpl(doc, start, offset); 387 } 388 389 // Utility functions 390 391 // Called when a SB is required to un-escape a member name 392 private void initBuilder() { 393 if (builder == null) { 394 builder = new StringBuilder(); 395 } 396 } 397 398 // Validate unicode escape sequence 399 // This method does not increment offset 400 private void checkEscapeSequence() { 401 for (int index = 0; index < 4; index++) { 402 char c = doc[offset + index]; 403 if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) { 404 throw failure("Invalid Unicode escape sequence"); 405 } 406 } 407 } 408 409 // Unescapes the Unicode escape sequence and produces a char 410 private char codeUnit() { 411 try { 412 return Utils.codeUnit(doc, offset); 413 } catch (IllegalArgumentException _) { 414 // Catch and re-throw as JPE with correct row/col 415 throw failure("Invalid Unicode escape sequence"); 416 } 417 } 418 419 // Returns true if the parser has not yet reached the end of the Document 420 private boolean hasInput() { 421 return offset < doc.length; 422 } 423 424 // Walk to the next non-white space char from the current offset 425 private void skipWhitespaces() { 426 while (hasInput()) { 427 if (notWhitespace()) { 428 break; 429 } 430 offset++; 431 } 432 } 433 434 // see https://datatracker.ietf.org/doc/html/rfc8259#section-2 435 private boolean notWhitespace() { 436 return switch (doc[offset]) { 437 case ' ', '\t','\r' -> false; 438 case '\n' -> { 439 // Increments the row and col 440 line += 1; 441 lineStart = offset + 1; 442 yield false; 443 } 444 default -> true; 445 }; 446 } 447 448 private JsonParseException failure(String message) { 449 var errMsg = composeParseExceptionMessage( 450 message, line, lineStart, offset); 451 return new JsonParseException(errMsg, line, offset - lineStart); 452 } 453 454 // returns true if the char at the specified offset equals the input char 455 // and is within bounds of the char[] 456 private boolean currCharEquals(char c) { 457 return hasInput() && c == doc[offset]; 458 } 459 460 // Returns true if the substring starting at the given offset equals the 461 // input String and is within bounds of the JSON document 462 private boolean charsEqual(String str, int o) { 463 if (o + str.length() - 1 < doc.length) { 464 for (int index = 0; index < str.length(); index++) { 465 if (doc[o] != str.charAt(index)) { 466 return false; // char does not match 467 } 468 o++; 469 } 470 return true; // all chars match 471 } 472 return false; // not within bounds 473 } 474 475 // Utility method to compose parse exception message 476 private String composeParseExceptionMessage(String message, int line, int lineStart, int offset) { 477 return "%s: (%s) at Row %d, Col %d." 478 .formatted(message, new String(doc, offset, Math.min(offset + 8, doc.length) - offset), 479 line, offset - lineStart); 480 } 481 }