1 /*
2 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package hat.tools.json.impl;
27
28
29 import hat.tools.json.JsonArray;
30 import hat.tools.json.JsonObject;
31 import hat.tools.json.JsonParseException;
32 import hat.tools.json.JsonString;
33 import hat.tools.json.JsonValue;
34
35 import java.util.ArrayList;
36 import java.util.HashMap;
37
38 /**
39 * Parses a JSON Document char[] into a tree of JsonValues. JsonObject and JsonArray
40 * nodes create their data structures which maintain the connection to children.
41 * JsonNumber and JsonString contain only a start and end offset, which
42 * are used to lazily procure their underlying value/string on demand. Singletons
43 * are used for JsonBoolean and JsonNull.
44 */
45 public final class JsonParser {
46
47 // Access to the underlying JSON contents
48 private final char[] doc;
49 // Current offset during parsing
50 private int offset;
51 // For exception message on failure
52 private int line;
53 private int lineStart;
54 private StringBuilder builder;
55
56 public JsonParser(char[] doc) {
57 this.doc = doc;
58 }
59
60 // Parses the lone JsonValue root
61 public JsonValue parseRoot() {
62 JsonValue root = parseValue();
63 if (hasInput()) {
64 throw failure("Unexpected character(s)");
65 }
66 return root;
67 }
68
69 /*
70 * Parse any one of the JSON value types: object, array, number, string,
71 * true, false, or null.
72 * JSON-text = ws value ws
73 * See https://datatracker.ietf.org/doc/html/rfc8259#section-3
74 */
75 private JsonValue parseValue() {
76 skipWhitespaces();
77 if (!hasInput()) {
78 throw failure("Missing JSON value");
79 }
80 var val = switch (doc[offset]) {
81 case '{' -> parseObject();
82 case '[' -> parseArray();
83 case '"' -> parseString();
84 case 't' -> parseTrue();
85 case 'f' -> parseFalse();
86 case 'n' -> parseNull();
87 // While JSON Number does not support leading '+', '.', or 'e'
88 // we still accept, so that we can provide a better error message
89 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '+', 'e', '.'
90 -> parseNumber();
91 default -> throw failure("Unexpected character(s)");
92 };
93 skipWhitespaces();
94 return val;
95 }
96
97 /*
98 * The parsed JsonObject contains a map which holds all lazy member mappings.
99 * No offsets are required as member values hold their own offsets.
100 * See https://datatracker.ietf.org/doc/html/rfc8259#section-4
101 */
102 private JsonObject parseObject() {
103 // @@@ Do not preserve encounter order, requires adjustment to the API
104 // var members = new LinkedHashMap<String, JsonValue>();
105 var members = new HashMap<String, JsonValue>();
106 offset++; // Walk past the '{'
107 skipWhitespaces();
108 // Check for empty case
109 if (currCharEquals('}')) {
110 offset++;
111 return new JsonObjectImpl(members);
112 }
113 while (hasInput()) {
114 // Get the member name, which should be unescaped
115 // Why not parse the name as a JsonString and then return its value()?
116 // Would requires 2 passes; we should build the String as we parse.
117 var name = parseName();
118
119 if (members.containsKey(name)) {
120 throw failure("The duplicate member name: '%s' was already parsed".formatted(name));
121 }
122
123 // Move from name to ':'
124 skipWhitespaces();
125 if (!currCharEquals(':')) {
126 throw failure(
127 "Expected ':' after the member name");
128 }
129
130 // Move from ':' to JsonValue
131 offset++;
132 members.put(name, parseValue());
133 // Ensure current char is either ',' or '}'
134 if (currCharEquals('}')) {
135 offset++;
136 return new JsonObjectImpl(members);
137 } else if (currCharEquals(',')) {
138 // Add the comma, and move to the next key
139 offset++;
140 skipWhitespaces();
141 } else {
142 // Neither ',' nor '}' so fail
143 break;
144 }
145 }
146 throw failure("Object was not closed with '}'");
147 }
148
149 /*
150 * Member name equality and storage in the map should be done with the
151 * unescaped String value.
152 * See https://datatracker.ietf.org/doc/html/rfc8259#section-8.3
153 */
154 private String parseName() {
155 if (!currCharEquals('"')) {
156 throw failure("Invalid member name");
157 }
158 offset++; // Move past the starting quote
159 var escape = false;
160 boolean useBldr = false;
161 var start = offset;
162 for (; hasInput(); offset++) {
163 var c = doc[offset];
164 if (escape) {
165 var escapeLength = 0;
166 switch (c) {
167 // Allowed JSON escapes
168 case '"', '\\', '/' -> {}
169 case 'b' -> c = '\b';
170 case 'f' -> c = '\f';
171 case 'n' -> c = '\n';
172 case 'r' -> c = '\r';
173 case 't' -> c = '\t';
174 case 'u' -> {
175 if (offset + 4 < doc.length) {
176 escapeLength = 4;
177 offset++; // Move to first char in sequence
178 c = codeUnit();
179 // Move to the last hex digit, since outer loop will increment offset
180 offset += 3;
181 } else {
182 throw failure("Invalid Unicode escape sequence");
183 }
184 }
185 default -> throw failure("Illegal escape");
186 }
187 if (!useBldr) {
188 initBuilder();
189 // Append everything up to the first escape sequence
190 builder.append(doc, start, offset - escapeLength - 1 - start);
191 useBldr = true;
192 }
193 escape = false;
194 } else if (c == '\\') {
195 escape = true;
196 continue;
197 } else if (c == '\"') {
198 offset++;
199 if (useBldr) {
200 var name = builder.toString();
201 builder.setLength(0);
202 return name;
203 } else {
204 return new String(doc, start, offset - start - 1);
205 }
206 } else if (c < ' ') {
207 throw failure("Unescaped control code");
208 }
209 if (useBldr) {
210 builder.append(c);
211 }
212 }
213 throw failure("Closing quote missing");
214 }
215
216 /*
217 * The parsed JsonArray contains a List which holds all lazy children
218 * elements. No offsets are required as children values hold their own offsets.
219 * See https://datatracker.ietf.org/doc/html/rfc8259#section-5
220 */
221 private JsonArray parseArray() {
222 var list = new ArrayList<JsonValue>();
223 offset++; // Walk past the '['
224 skipWhitespaces();
225 // Check for empty case
226 if (currCharEquals(']')) {
227 offset++;
228 return new JsonArrayImpl(list);
229 }
230 for (; hasInput(); offset++) {
231 // Get the JsonValue
232 list.add(parseValue());
233 // Ensure current char is either ']' or ','
234 if (currCharEquals(']')) {
235 offset++;
236 return new JsonArrayImpl(list);
237 } else if (!currCharEquals(',')) {
238 break;
239 }
240 }
241 throw failure("Array was not closed with ']'");
242 }
243
244 /*
245 * The parsed JsonString will contain offsets correlating to the beginning
246 * and ending quotation marks. All Unicode characters are allowed except the
247 * following that require escaping: quotation mark, reverse solidus, and the
248 * control characters (U+0000 through U+001F). Any character may be escaped
249 * either through a Unicode escape sequence or two-char sequence.
250 * See https://datatracker.ietf.org/doc/html/rfc8259#section-7
251 */
252 private JsonString parseString() {
253 int start = offset;
254 offset++; // Move past the starting quote
255 var escape = false;
256 for (; hasInput(); offset++) {
257 var c = doc[offset];
258 if (escape) {
259 switch (c) {
260 // Allowed JSON escapes
261 case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {}
262 case 'u' -> {
263 if (offset + 4 < doc.length) {
264 offset++; // Move to first char in sequence
265 checkEscapeSequence();
266 offset += 3; // Move to the last hex digit, outer loop increments
267 } else {
268 throw failure("Invalid Unicode escape sequence");
269 }
270 }
271 default -> throw failure("Illegal escape");
272 }
273 escape = false;
274 } else if (c == '\\') {
275 escape = true;
276 } else if (c == '\"') {
277 return new JsonStringImpl(doc, start, offset += 1);
278 } else if (c < ' ') {
279 throw failure("Unescaped control code");
280 }
281 }
282 throw failure("Closing quote missing");
283 }
284
285 /*
286 * Parsing true, false, and null return singletons. These JsonValues
287 * do not require offsets to lazily compute their values.
288 */
289 private JsonBooleanImpl parseTrue() {
290 if (charsEqual("rue", offset + 1)) {
291 offset += 4;
292 return JsonBooleanImpl.TRUE;
293 }
294 throw failure("Expected true");
295 }
296
297 private JsonBooleanImpl parseFalse() {
298 if (charsEqual( "alse", offset + 1)) {
299 offset += 5;
300 return JsonBooleanImpl.FALSE;
301 }
302 throw failure("Expected false");
303 }
304
305 private JsonNullImpl parseNull() {
306 if (charsEqual("ull", offset + 1)) {
307 offset += 4;
308 return JsonNullImpl.NULL;
309 }
310 throw failure("Expected null");
311 }
312
313 /*
314 * The parsed JsonNumber contains offsets correlating to the first and last
315 * allowed chars permitted in the JSON numeric grammar:
316 * number = [ minus ] int [ frac ] [ exp ]
317 * See https://datatracker.ietf.org/doc/html/rfc8259#section-6
318 */
319 private JsonNumberImpl parseNumber() {
320 boolean sawDecimal = false;
321 boolean sawExponent = false;
322 boolean sawZero = false;
323 boolean sawWhitespace = false;
324 boolean havePart = false;
325 boolean sawInvalid = false;
326 boolean sawSign = false;
327 var start = offset;
328 for (; hasInput() && !sawWhitespace && !sawInvalid; offset++) {
329 switch (doc[offset]) {
330 case '-' -> {
331 if (offset != start && !sawExponent || sawSign) {
332 throw failure("Invalid '-' position");
333 }
334 sawSign = true;
335 }
336 case '+' -> {
337 if (!sawExponent || havePart || sawSign) {
338 throw failure("Invalid '+' position");
339 }
340 sawSign = true;
341 }
342 case '0' -> {
343 if (!havePart) {
344 sawZero = true;
345 }
346 havePart = true;
347 }
348 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
349 if (!sawDecimal && !sawExponent && sawZero) {
350 throw failure("Invalid '0' position");
351 }
352 havePart = true;
353 }
354 case '.' -> {
355 if (sawDecimal) {
356 throw failure("Invalid '.' position");
357 } else {
358 if (!havePart) {
359 throw failure("Invalid '.' position");
360 }
361 sawDecimal = true;
362 havePart = false;
363 }
364 }
365 case 'e', 'E' -> {
366 if (sawExponent) {
367 throw failure("Invalid '[e|E]' position");
368 } else {
369 if (!havePart) {
370 throw failure("Invalid '[e|E]' position");
371 }
372 sawExponent = true;
373 havePart = false;
374 sawSign = false;
375 }
376 }
377 case ' ', '\t', '\r', '\n' -> {
378 sawWhitespace = true;
379 offset --;
380 }
381 default -> {
382 offset--;
383 sawInvalid = true;
384 }
385 }
386 }
387 if (!havePart) {
388 throw failure("Input expected after '[.|e|E]'");
389 }
390 return new JsonNumberImpl(doc, start, offset);
391 }
392
393 // Utility functions
394
395 // Called when a SB is required to un-escape a member name
396 private void initBuilder() {
397 if (builder == null) {
398 builder = new StringBuilder();
399 }
400 }
401
402 // Validate unicode escape sequence
403 // This method does not increment offset
404 private void checkEscapeSequence() {
405 for (int index = 0; index < 4; index++) {
406 char c = doc[offset + index];
407 if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) {
408 throw failure("Invalid Unicode escape sequence");
409 }
410 }
411 }
412
413 // Unescapes the Unicode escape sequence and produces a char
414 private char codeUnit() {
415 try {
416 return Utils.codeUnit(doc, offset);
417 } catch (IllegalArgumentException _) {
418 // Catch and re-throw as JPE with correct row/col
419 throw failure("Invalid Unicode escape sequence");
420 }
421 }
422
423 // Returns true if the parser has not yet reached the end of the Document
424 private boolean hasInput() {
425 return offset < doc.length;
426 }
427
428 // Walk to the next non-white space char from the current offset
429 private void skipWhitespaces() {
430 while (hasInput()) {
431 if (notWhitespace()) {
432 break;
433 }
434 offset++;
435 }
436 }
437
438 // see https://datatracker.ietf.org/doc/html/rfc8259#section-2
439 private boolean notWhitespace() {
440 return switch (doc[offset]) {
441 case ' ', '\t','\r' -> false;
442 case '\n' -> {
443 // Increments the row and col
444 line += 1;
445 lineStart = offset + 1;
446 yield false;
447 }
448 default -> true;
449 };
450 }
451
452 private JsonParseException failure(String message) {
453 var errMsg = composeParseExceptionMessage(
454 message, line, lineStart, offset);
455 return new JsonParseException(errMsg, line, offset - lineStart);
456 }
457
458 // returns true if the char at the specified offset equals the input char
459 // and is within bounds of the char[]
460 private boolean currCharEquals(char c) {
461 return hasInput() && c == doc[offset];
462 }
463
464 // Returns true if the substring starting at the given offset equals the
465 // input String and is within bounds of the JSON document
466 private boolean charsEqual(String str, int o) {
467 if (o + str.length() - 1 < doc.length) {
468 for (int index = 0; index < str.length(); index++) {
469 if (doc[o] != str.charAt(index)) {
470 return false; // char does not match
471 }
472 o++;
473 }
474 return true; // all chars match
475 }
476 return false; // not within bounds
477 }
478
479 // Utility method to compose parse exception message
480 private String composeParseExceptionMessage(String message, int line, int lineStart, int offset) {
481 return "%s: (%s) at Row %d, Col %d."
482 .formatted(message, new String(doc, offset, Math.min(offset + 8, doc.length) - offset),
483 line, offset - lineStart);
484 }
485 }