1 /*
  2  * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 package hat.tools.json.impl;
 27 
 28 
 29 import hat.tools.json.JsonArray;
 30 import hat.tools.json.JsonObject;
 31 import hat.tools.json.JsonParseException;
 32 import hat.tools.json.JsonString;
 33 import hat.tools.json.JsonValue;
 34 
 35 import java.util.ArrayList;
 36 import java.util.HashMap;
 37 
 38 /**
 39  * Parses a JSON Document char[] into a tree of JsonValues. JsonObject and JsonArray
 40  * nodes create their data structures which maintain the connection to children.
 41  * JsonNumber and JsonString contain only a start and end offset, which
 42  * are used to lazily procure their underlying value/string on demand. Singletons
 43  * are used for JsonBoolean and JsonNull.
 44  */
 45 public final class JsonParser {
 46 
 47     // Access to the underlying JSON contents
 48     private final char[] doc;
 49     // Current offset during parsing
 50     private int offset;
 51     // For exception message on failure
 52     private int line;
 53     private int lineStart;
 54     private StringBuilder builder;
 55 
 56     public JsonParser(char[] doc) {
 57         this.doc = doc;
 58     }
 59 
 60     // Parses the lone JsonValue root
 61     public JsonValue parseRoot() {
 62         JsonValue root = parseValue();
 63         if (hasInput()) {
 64             throw failure("Unexpected character(s)");
 65         }
 66         return root;
 67     }
 68 
 69     /*
 70      * Parse any one of the JSON value types: object, array, number, string,
 71      * true, false, or null.
 72      *      JSON-text = ws value ws
 73      * See https://datatracker.ietf.org/doc/html/rfc8259#section-3
 74      */
 75     private JsonValue parseValue() {
 76         skipWhitespaces();
 77         if (!hasInput()) {
 78             throw failure("Missing JSON value");
 79         }
 80         var val = switch (doc[offset]) {
 81             case '{' -> parseObject();
 82             case '[' -> parseArray();
 83             case '"' -> parseString();
 84             case 't' -> parseTrue();
 85             case 'f' -> parseFalse();
 86             case 'n' -> parseNull();
 87             // While JSON Number does not support leading '+', '.', or 'e'
 88             // we still accept, so that we can provide a better error message
 89             case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '+', 'e', '.'
 90                     -> parseNumber();
 91             default -> throw failure("Unexpected character(s)");
 92         };
 93         skipWhitespaces();
 94         return val;
 95     }
 96 
 97     /*
 98      * The parsed JsonObject contains a map which holds all lazy member mappings.
 99      * No offsets are required as member values hold their own offsets.
100      * See https://datatracker.ietf.org/doc/html/rfc8259#section-4
101      */
102     private JsonObject parseObject() {
103         // @@@ Do not preserve encounter order, requires adjustment to the API
104 //        var members = new LinkedHashMap<String, JsonValue>();
105         var members = new HashMap<String, JsonValue>();
106         offset++; // Walk past the '{'
107         skipWhitespaces();
108         // Check for empty case
109         if (currCharEquals('}')) {
110             offset++;
111             return new JsonObjectImpl(members);
112         }
113         while (hasInput()) {
114             // Get the member name, which should be unescaped
115             // Why not parse the name as a JsonString and then return its value()?
116             // Would requires 2 passes; we should build the String as we parse.
117             var name = parseName();
118 
119             if (members.containsKey(name)) {
120                 throw failure("The duplicate member name: '%s' was already parsed".formatted(name));
121             }
122 
123             // Move from name to ':'
124             skipWhitespaces();
125             if (!currCharEquals(':')) {
126                 throw failure(
127                         "Expected ':' after the member name");
128             }
129 
130             // Move from ':' to JsonValue
131             offset++;
132             members.put(name, parseValue());
133             // Ensure current char is either ',' or '}'
134             if (currCharEquals('}')) {
135                 offset++;
136                 return new JsonObjectImpl(members);
137             } else if (currCharEquals(',')) {
138                 // Add the comma, and move to the next key
139                 offset++;
140                 skipWhitespaces();
141             } else {
142                 // Neither ',' nor '}' so fail
143                 break;
144             }
145         }
146         throw failure("Object was not closed with '}'");
147     }
148 
149     /*
150      * Member name equality and storage in the map should be done with the
151      * unescaped String value.
152      * See https://datatracker.ietf.org/doc/html/rfc8259#section-8.3
153      */
154     private String parseName() {
155         if (!currCharEquals('"')) {
156             throw failure("Invalid member name");
157         }
158         offset++; // Move past the starting quote
159         var escape = false;
160         boolean useBldr = false;
161         var start = offset;
162         for (; hasInput(); offset++) {
163             var c = doc[offset];
164             if (escape) {
165                 var escapeLength = 0;
166                 switch (c) {
167                     // Allowed JSON escapes
168                     case '"', '\\', '/' -> {}
169                     case 'b' -> c = '\b';
170                     case 'f' -> c = '\f';
171                     case 'n' -> c = '\n';
172                     case 'r' -> c = '\r';
173                     case 't' -> c = '\t';
174                     case 'u' -> {
175                         if (offset + 4 < doc.length) {
176                             escapeLength = 4;
177                             offset++; // Move to first char in sequence
178                             c = codeUnit();
179                             // Move to the last hex digit, since outer loop will increment offset
180                             offset += 3;
181                         } else {
182                             throw failure("Invalid Unicode escape sequence");
183                         }
184                     }
185                     default -> throw failure("Illegal escape");
186                 }
187                 if (!useBldr) {
188                     initBuilder();
189                     // Append everything up to the first escape sequence
190                     builder.append(doc, start, offset - escapeLength - 1 - start);
191                     useBldr = true;
192                 }
193                 escape = false;
194             } else if (c == '\\') {
195                 escape = true;
196                 continue;
197             } else if (c == '\"') {
198                 offset++;
199                 if (useBldr) {
200                     var name = builder.toString();
201                     builder.setLength(0);
202                     return name;
203                 } else {
204                     return new String(doc, start, offset - start - 1);
205                 }
206             } else if (c < ' ') {
207                 throw failure("Unescaped control code");
208             }
209             if (useBldr) {
210                 builder.append(c);
211             }
212         }
213         throw failure("Closing quote missing");
214     }
215 
216     /*
217      * The parsed JsonArray contains a List which holds all lazy children
218      * elements. No offsets are required as children values hold their own offsets.
219      * See https://datatracker.ietf.org/doc/html/rfc8259#section-5
220      */
221     private JsonArray parseArray() {
222         var list = new ArrayList<JsonValue>();
223         offset++; // Walk past the '['
224         skipWhitespaces();
225         // Check for empty case
226         if (currCharEquals(']')) {
227             offset++;
228             return new JsonArrayImpl(list);
229         }
230         for (; hasInput(); offset++) {
231             // Get the JsonValue
232             list.add(parseValue());
233             // Ensure current char is either ']' or ','
234             if (currCharEquals(']')) {
235                 offset++;
236                 return new JsonArrayImpl(list);
237             } else if (!currCharEquals(',')) {
238                 break;
239             }
240         }
241         throw failure("Array was not closed with ']'");
242     }
243 
244     /*
245      * The parsed JsonString will contain offsets correlating to the beginning
246      * and ending quotation marks. All Unicode characters are allowed except the
247      * following that require escaping: quotation mark, reverse solidus, and the
248      * control characters (U+0000 through U+001F). Any character may be escaped
249      * either through a Unicode escape sequence or two-char sequence.
250      * See https://datatracker.ietf.org/doc/html/rfc8259#section-7
251      */
252     private JsonString parseString() {
253         int start = offset;
254         offset++; // Move past the starting quote
255         var escape = false;
256         for (; hasInput(); offset++) {
257             var c = doc[offset];
258             if (escape) {
259                 switch (c) {
260                     // Allowed JSON escapes
261                     case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {}
262                     case 'u' -> {
263                         if (offset + 4 < doc.length) {
264                             offset++; // Move to first char in sequence
265                             checkEscapeSequence();
266                             offset += 3; // Move to the last hex digit, outer loop increments
267                         } else {
268                             throw failure("Invalid Unicode escape sequence");
269                         }
270                     }
271                     default -> throw failure("Illegal escape");
272                 }
273                 escape = false;
274             } else if (c == '\\') {
275                 escape = true;
276             } else if (c == '\"') {
277                 return new JsonStringImpl(doc, start, offset += 1);
278             } else if (c < ' ') {
279                 throw failure("Unescaped control code");
280             }
281         }
282         throw failure("Closing quote missing");
283     }
284 
285     /*
286      * Parsing true, false, and null return singletons. These JsonValues
287      * do not require offsets to lazily compute their values.
288      */
289     private JsonBooleanImpl parseTrue() {
290         if (charsEqual("rue", offset + 1)) {
291             offset += 4;
292             return JsonBooleanImpl.TRUE;
293         }
294         throw failure("Expected true");
295     }
296 
297     private JsonBooleanImpl parseFalse() {
298         if (charsEqual( "alse", offset + 1)) {
299             offset += 5;
300             return JsonBooleanImpl.FALSE;
301         }
302         throw failure("Expected false");
303     }
304 
305     private JsonNullImpl parseNull() {
306         if (charsEqual("ull", offset + 1)) {
307             offset += 4;
308             return JsonNullImpl.NULL;
309         }
310         throw failure("Expected null");
311     }
312 
313     /*
314      * The parsed JsonNumber contains offsets correlating to the first and last
315      * allowed chars permitted in the JSON numeric grammar:
316      *      number = [ minus ] int [ frac ] [ exp ]
317      * See https://datatracker.ietf.org/doc/html/rfc8259#section-6
318      */
319     private JsonNumberImpl parseNumber() {
320         boolean sawDecimal = false;
321         boolean sawExponent = false;
322         boolean sawZero = false;
323         boolean sawWhitespace = false;
324         boolean havePart = false;
325         boolean sawInvalid = false;
326         boolean sawSign = false;
327         var start = offset;
328         for (; hasInput() && !sawWhitespace && !sawInvalid; offset++) {
329             switch (doc[offset]) {
330                 case '-' -> {
331                     if (offset != start && !sawExponent || sawSign) {
332                         throw failure("Invalid '-' position");
333                     }
334                     sawSign = true;
335                 }
336                 case '+' -> {
337                     if (!sawExponent || havePart || sawSign) {
338                         throw failure("Invalid '+' position");
339                     }
340                     sawSign = true;
341                 }
342                 case '0' -> {
343                     if (!havePart) {
344                         sawZero = true;
345                     }
346                     havePart = true;
347                 }
348                 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
349                     if (!sawDecimal && !sawExponent && sawZero) {
350                         throw failure("Invalid '0' position");
351                     }
352                     havePart = true;
353                 }
354                 case '.' -> {
355                     if (sawDecimal) {
356                         throw failure("Invalid '.' position");
357                     } else {
358                         if (!havePart) {
359                             throw failure("Invalid '.' position");
360                         }
361                         sawDecimal = true;
362                         havePart = false;
363                     }
364                 }
365                 case 'e', 'E' -> {
366                     if (sawExponent) {
367                         throw failure("Invalid '[e|E]' position");
368                     } else {
369                         if (!havePart) {
370                             throw failure("Invalid '[e|E]' position");
371                         }
372                         sawExponent = true;
373                         havePart = false;
374                         sawSign = false;
375                     }
376                 }
377                 case ' ', '\t', '\r', '\n' -> {
378                     sawWhitespace = true;
379                     offset --;
380                 }
381                 default -> {
382                     offset--;
383                     sawInvalid = true;
384                 }
385             }
386         }
387         if (!havePart) {
388             throw failure("Input expected after '[.|e|E]'");
389         }
390         return new JsonNumberImpl(doc, start, offset);
391     }
392 
393     // Utility functions
394 
395     // Called when a SB is required to un-escape a member name
396     private void initBuilder() {
397         if (builder == null) {
398             builder = new StringBuilder();
399         }
400     }
401 
402     // Validate unicode escape sequence
403     // This method does not increment offset
404     private void checkEscapeSequence() {
405         for (int index = 0; index < 4; index++) {
406             char c = doc[offset + index];
407             if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) {
408                 throw failure("Invalid Unicode escape sequence");
409             }
410         }
411     }
412 
413     // Unescapes the Unicode escape sequence and produces a char
414     private char codeUnit() {
415         try {
416             return Utils.codeUnit(doc, offset);
417         } catch (IllegalArgumentException _) {
418             // Catch and re-throw as JPE with correct row/col
419             throw failure("Invalid Unicode escape sequence");
420         }
421     }
422 
423     // Returns true if the parser has not yet reached the end of the Document
424     private boolean hasInput() {
425         return offset < doc.length;
426     }
427 
428     // Walk to the next non-white space char from the current offset
429     private void skipWhitespaces() {
430         while (hasInput()) {
431             if (notWhitespace()) {
432                 break;
433             }
434             offset++;
435         }
436     }
437 
438     // see https://datatracker.ietf.org/doc/html/rfc8259#section-2
439     private boolean notWhitespace() {
440         return switch (doc[offset]) {
441             case ' ', '\t','\r' -> false;
442             case '\n' -> {
443                 // Increments the row and col
444                 line += 1;
445                 lineStart = offset + 1;
446                 yield false;
447             }
448             default -> true;
449         };
450     }
451 
452     private JsonParseException failure(String message) {
453         var errMsg = composeParseExceptionMessage(
454                 message, line, lineStart, offset);
455         return new JsonParseException(errMsg, line, offset - lineStart);
456     }
457 
458     // returns true if the char at the specified offset equals the input char
459     // and is within bounds of the char[]
460     private boolean currCharEquals(char c) {
461         return hasInput() && c == doc[offset];
462     }
463 
464     // Returns true if the substring starting at the given offset equals the
465     // input String and is within bounds of the JSON document
466     private boolean charsEqual(String str, int o) {
467         if (o + str.length() - 1 < doc.length) {
468             for (int index = 0; index < str.length(); index++) {
469                 if (doc[o] != str.charAt(index)) {
470                     return false; // char does not match
471                 }
472                 o++;
473             }
474             return true; // all chars match
475         }
476         return false; // not within bounds
477     }
478 
479     // Utility method to compose parse exception message
480     private String composeParseExceptionMessage(String message, int line, int lineStart, int offset) {
481         return "%s: (%s) at Row %d, Col %d."
482             .formatted(message, new String(doc, offset, Math.min(offset + 8, doc.length) - offset),
483                 line, offset - lineStart);
484     }
485 }