1 /*
2 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package oracle.code.json.impl;
27
28 import oracle.code.json.*;
29
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.LinkedHashMap;
33
34 /**
35 * Parses a JSON Document char[] into a tree of JsonValues. JsonObject and JsonArray
36 * nodes create their data structures which maintain the connection to children.
37 * JsonNumber and JsonString contain only a start and end offset, which
38 * are used to lazily procure their underlying value/string on demand. Singletons
39 * are used for JsonBoolean and JsonNull.
40 */
41 public final class JsonParser {
42
43 // Access to the underlying JSON contents
44 private final char[] doc;
45 // Current offset during parsing
46 private int offset;
47 // For exception message on failure
48 private int line;
49 private int lineStart;
50 private StringBuilder builder;
51
52 public JsonParser(char[] doc) {
53 this.doc = doc;
54 }
55
56 // Parses the lone JsonValue root
57 public JsonValue parseRoot() {
58 JsonValue root = parseValue();
59 if (hasInput()) {
60 throw failure("Unexpected character(s)");
61 }
62 return root;
63 }
64
65 /*
66 * Parse any one of the JSON value types: object, array, number, string,
67 * true, false, or null.
68 * JSON-text = ws value ws
69 * See https://datatracker.ietf.org/doc/html/rfc8259#section-3
70 */
71 private JsonValue parseValue() {
72 skipWhitespaces();
73 if (!hasInput()) {
74 throw failure("Missing JSON value");
75 }
76 var val = switch (doc[offset]) {
77 case '{' -> parseObject();
78 case '[' -> parseArray();
79 case '"' -> parseString();
80 case 't' -> parseTrue();
81 case 'f' -> parseFalse();
82 case 'n' -> parseNull();
83 // While JSON Number does not support leading '+', '.', or 'e'
84 // we still accept, so that we can provide a better error message
85 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '+', 'e', '.'
86 -> parseNumber();
87 default -> throw failure("Unexpected character(s)");
88 };
89 skipWhitespaces();
90 return val;
91 }
92
93 /*
94 * The parsed JsonObject contains a map which holds all lazy member mappings.
95 * No offsets are required as member values hold their own offsets.
96 * See https://datatracker.ietf.org/doc/html/rfc8259#section-4
97 */
98 private JsonObject parseObject() {
99 // @@@ Do not preserve encounter order, requires adjustment to the API
100 // var members = new LinkedHashMap<String, JsonValue>();
101 var members = new HashMap<String, JsonValue>();
102 offset++; // Walk past the '{'
103 skipWhitespaces();
104 // Check for empty case
105 if (currCharEquals('}')) {
106 offset++;
107 return new JsonObjectImpl(members);
108 }
109 while (hasInput()) {
110 // Get the member name, which should be unescaped
111 // Why not parse the name as a JsonString and then return its value()?
112 // Would requires 2 passes; we should build the String as we parse.
113 var name = parseName();
114
115 if (members.containsKey(name)) {
116 throw failure("The duplicate member name: '%s' was already parsed".formatted(name));
117 }
118
119 // Move from name to ':'
120 skipWhitespaces();
121 if (!currCharEquals(':')) {
122 throw failure(
123 "Expected ':' after the member name");
124 }
125
126 // Move from ':' to JsonValue
127 offset++;
128 members.put(name, parseValue());
129 // Ensure current char is either ',' or '}'
130 if (currCharEquals('}')) {
131 offset++;
132 return new JsonObjectImpl(members);
133 } else if (currCharEquals(',')) {
134 // Add the comma, and move to the next key
135 offset++;
136 skipWhitespaces();
137 } else {
138 // Neither ',' nor '}' so fail
139 break;
140 }
141 }
142 throw failure("Object was not closed with '}'");
143 }
144
145 /*
146 * Member name equality and storage in the map should be done with the
147 * unescaped String value.
148 * See https://datatracker.ietf.org/doc/html/rfc8259#section-8.3
149 */
150 private String parseName() {
151 if (!currCharEquals('"')) {
152 throw failure("Invalid member name");
153 }
154 offset++; // Move past the starting quote
155 var escape = false;
156 boolean useBldr = false;
157 var start = offset;
158 for (; hasInput(); offset++) {
159 var c = doc[offset];
160 if (escape) {
161 var escapeLength = 0;
162 switch (c) {
163 // Allowed JSON escapes
164 case '"', '\\', '/' -> {}
165 case 'b' -> c = '\b';
166 case 'f' -> c = '\f';
167 case 'n' -> c = '\n';
168 case 'r' -> c = '\r';
169 case 't' -> c = '\t';
170 case 'u' -> {
171 if (offset + 4 < doc.length) {
172 escapeLength = 4;
173 offset++; // Move to first char in sequence
174 c = codeUnit();
175 // Move to the last hex digit, since outer loop will increment offset
176 offset += 3;
177 } else {
178 throw failure("Invalid Unicode escape sequence");
179 }
180 }
181 default -> throw failure("Illegal escape");
182 }
183 if (!useBldr) {
184 initBuilder();
185 // Append everything up to the first escape sequence
186 builder.append(doc, start, offset - escapeLength - 1 - start);
187 useBldr = true;
188 }
189 escape = false;
190 } else if (c == '\\') {
191 escape = true;
192 continue;
193 } else if (c == '\"') {
194 offset++;
195 if (useBldr) {
196 var name = builder.toString();
197 builder.setLength(0);
198 return name;
199 } else {
200 return new String(doc, start, offset - start - 1);
201 }
202 } else if (c < ' ') {
203 throw failure("Unescaped control code");
204 }
205 if (useBldr) {
206 builder.append(c);
207 }
208 }
209 throw failure("Closing quote missing");
210 }
211
212 /*
213 * The parsed JsonArray contains a List which holds all lazy children
214 * elements. No offsets are required as children values hold their own offsets.
215 * See https://datatracker.ietf.org/doc/html/rfc8259#section-5
216 */
217 private JsonArray parseArray() {
218 var list = new ArrayList<JsonValue>();
219 offset++; // Walk past the '['
220 skipWhitespaces();
221 // Check for empty case
222 if (currCharEquals(']')) {
223 offset++;
224 return new JsonArrayImpl(list);
225 }
226 for (; hasInput(); offset++) {
227 // Get the JsonValue
228 list.add(parseValue());
229 // Ensure current char is either ']' or ','
230 if (currCharEquals(']')) {
231 offset++;
232 return new JsonArrayImpl(list);
233 } else if (!currCharEquals(',')) {
234 break;
235 }
236 }
237 throw failure("Array was not closed with ']'");
238 }
239
240 /*
241 * The parsed JsonString will contain offsets correlating to the beginning
242 * and ending quotation marks. All Unicode characters are allowed except the
243 * following that require escaping: quotation mark, reverse solidus, and the
244 * control characters (U+0000 through U+001F). Any character may be escaped
245 * either through a Unicode escape sequence or two-char sequence.
246 * See https://datatracker.ietf.org/doc/html/rfc8259#section-7
247 */
248 private JsonString parseString() {
249 int start = offset;
250 offset++; // Move past the starting quote
251 var escape = false;
252 for (; hasInput(); offset++) {
253 var c = doc[offset];
254 if (escape) {
255 switch (c) {
256 // Allowed JSON escapes
257 case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {}
258 case 'u' -> {
259 if (offset + 4 < doc.length) {
260 offset++; // Move to first char in sequence
261 checkEscapeSequence();
262 offset += 3; // Move to the last hex digit, outer loop increments
263 } else {
264 throw failure("Invalid Unicode escape sequence");
265 }
266 }
267 default -> throw failure("Illegal escape");
268 }
269 escape = false;
270 } else if (c == '\\') {
271 escape = true;
272 } else if (c == '\"') {
273 return new JsonStringImpl(doc, start, offset += 1);
274 } else if (c < ' ') {
275 throw failure("Unescaped control code");
276 }
277 }
278 throw failure("Closing quote missing");
279 }
280
281 /*
282 * Parsing true, false, and null return singletons. These JsonValues
283 * do not require offsets to lazily compute their values.
284 */
285 private JsonBooleanImpl parseTrue() {
286 if (charsEqual("rue", offset + 1)) {
287 offset += 4;
288 return JsonBooleanImpl.TRUE;
289 }
290 throw failure("Expected true");
291 }
292
293 private JsonBooleanImpl parseFalse() {
294 if (charsEqual( "alse", offset + 1)) {
295 offset += 5;
296 return JsonBooleanImpl.FALSE;
297 }
298 throw failure("Expected false");
299 }
300
301 private JsonNullImpl parseNull() {
302 if (charsEqual("ull", offset + 1)) {
303 offset += 4;
304 return JsonNullImpl.NULL;
305 }
306 throw failure("Expected null");
307 }
308
309 /*
310 * The parsed JsonNumber contains offsets correlating to the first and last
311 * allowed chars permitted in the JSON numeric grammar:
312 * number = [ minus ] int [ frac ] [ exp ]
313 * See https://datatracker.ietf.org/doc/html/rfc8259#section-6
314 */
315 private JsonNumberImpl parseNumber() {
316 boolean sawDecimal = false;
317 boolean sawExponent = false;
318 boolean sawZero = false;
319 boolean sawWhitespace = false;
320 boolean havePart = false;
321 boolean sawInvalid = false;
322 boolean sawSign = false;
323 var start = offset;
324 for (; hasInput() && !sawWhitespace && !sawInvalid; offset++) {
325 switch (doc[offset]) {
326 case '-' -> {
327 if (offset != start && !sawExponent || sawSign) {
328 throw failure("Invalid '-' position");
329 }
330 sawSign = true;
331 }
332 case '+' -> {
333 if (!sawExponent || havePart || sawSign) {
334 throw failure("Invalid '+' position");
335 }
336 sawSign = true;
337 }
338 case '0' -> {
339 if (!havePart) {
340 sawZero = true;
341 }
342 havePart = true;
343 }
344 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
345 if (!sawDecimal && !sawExponent && sawZero) {
346 throw failure("Invalid '0' position");
347 }
348 havePart = true;
349 }
350 case '.' -> {
351 if (sawDecimal) {
352 throw failure("Invalid '.' position");
353 } else {
354 if (!havePart) {
355 throw failure("Invalid '.' position");
356 }
357 sawDecimal = true;
358 havePart = false;
359 }
360 }
361 case 'e', 'E' -> {
362 if (sawExponent) {
363 throw failure("Invalid '[e|E]' position");
364 } else {
365 if (!havePart) {
366 throw failure("Invalid '[e|E]' position");
367 }
368 sawExponent = true;
369 havePart = false;
370 sawSign = false;
371 }
372 }
373 case ' ', '\t', '\r', '\n' -> {
374 sawWhitespace = true;
375 offset --;
376 }
377 default -> {
378 offset--;
379 sawInvalid = true;
380 }
381 }
382 }
383 if (!havePart) {
384 throw failure("Input expected after '[.|e|E]'");
385 }
386 return new JsonNumberImpl(doc, start, offset);
387 }
388
389 // Utility functions
390
391 // Called when a SB is required to un-escape a member name
392 private void initBuilder() {
393 if (builder == null) {
394 builder = new StringBuilder();
395 }
396 }
397
398 // Validate unicode escape sequence
399 // This method does not increment offset
400 private void checkEscapeSequence() {
401 for (int index = 0; index < 4; index++) {
402 char c = doc[offset + index];
403 if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) {
404 throw failure("Invalid Unicode escape sequence");
405 }
406 }
407 }
408
409 // Unescapes the Unicode escape sequence and produces a char
410 private char codeUnit() {
411 try {
412 return Utils.codeUnit(doc, offset);
413 } catch (IllegalArgumentException _) {
414 // Catch and re-throw as JPE with correct row/col
415 throw failure("Invalid Unicode escape sequence");
416 }
417 }
418
419 // Returns true if the parser has not yet reached the end of the Document
420 private boolean hasInput() {
421 return offset < doc.length;
422 }
423
424 // Walk to the next non-white space char from the current offset
425 private void skipWhitespaces() {
426 while (hasInput()) {
427 if (notWhitespace()) {
428 break;
429 }
430 offset++;
431 }
432 }
433
434 // see https://datatracker.ietf.org/doc/html/rfc8259#section-2
435 private boolean notWhitespace() {
436 return switch (doc[offset]) {
437 case ' ', '\t','\r' -> false;
438 case '\n' -> {
439 // Increments the row and col
440 line += 1;
441 lineStart = offset + 1;
442 yield false;
443 }
444 default -> true;
445 };
446 }
447
448 private JsonParseException failure(String message) {
449 var errMsg = composeParseExceptionMessage(
450 message, line, lineStart, offset);
451 return new JsonParseException(errMsg, line, offset - lineStart);
452 }
453
454 // returns true if the char at the specified offset equals the input char
455 // and is within bounds of the char[]
456 private boolean currCharEquals(char c) {
457 return hasInput() && c == doc[offset];
458 }
459
460 // Returns true if the substring starting at the given offset equals the
461 // input String and is within bounds of the JSON document
462 private boolean charsEqual(String str, int o) {
463 if (o + str.length() - 1 < doc.length) {
464 for (int index = 0; index < str.length(); index++) {
465 if (doc[o] != str.charAt(index)) {
466 return false; // char does not match
467 }
468 o++;
469 }
470 return true; // all chars match
471 }
472 return false; // not within bounds
473 }
474
475 // Utility method to compose parse exception message
476 private String composeParseExceptionMessage(String message, int line, int lineStart, int offset) {
477 return "%s: (%s) at Row %d, Col %d."
478 .formatted(message, new String(doc, offset, Math.min(offset + 8, doc.length) - offset),
479 line, offset - lineStart);
480 }
481 }