New cr-examples/onnx/opgen/src/main/java/oracle/code/json/impl/JsonParser.java

  1 /*
  2  * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 package oracle.code.json.impl;
 27 
 28 import oracle.code.json.*;
 29 
 30 import java.util.ArrayList;
 31 import java.util.HashMap;
 32 import java.util.LinkedHashMap;
 33 
 34 /**
 35  * Parses a JSON Document char[] into a tree of JsonValues. JsonObject and JsonArray
 36  * nodes create their data structures which maintain the connection to children.
 37  * JsonNumber and JsonString contain only a start and end offset, which
 38  * are used to lazily procure their underlying value/string on demand. Singletons
 39  * are used for JsonBoolean and JsonNull.
 40  */
 41 public final class JsonParser {
 42 
 43     // Access to the underlying JSON contents
 44     private final char[] doc;
 45     // Current offset during parsing
 46     private int offset;
 47     // For exception message on failure
 48     private int line;
 49     private int lineStart;
 50     private StringBuilder builder;
 51 
 52     public JsonParser(char[] doc) {
 53         this.doc = doc;
 54     }
 55 
 56     // Parses the lone JsonValue root
 57     public JsonValue parseRoot() {
 58         JsonValue root = parseValue();
 59         if (hasInput()) {
 60             throw failure("Unexpected character(s)");
 61         }
 62         return root;
 63     }
 64 
 65     /*
 66      * Parse any one of the JSON value types: object, array, number, string,
 67      * true, false, or null.
 68      *      JSON-text = ws value ws
 69      * See https://datatracker.ietf.org/doc/html/rfc8259#section-3
 70      */
 71     private JsonValue parseValue() {
 72         skipWhitespaces();
 73         if (!hasInput()) {
 74             throw failure("Missing JSON value");
 75         }
 76         var val = switch (doc[offset]) {
 77             case '{' -> parseObject();
 78             case '[' -> parseArray();
 79             case '"' -> parseString();
 80             case 't' -> parseTrue();
 81             case 'f' -> parseFalse();
 82             case 'n' -> parseNull();
 83             // While JSON Number does not support leading '+', '.', or 'e'
 84             // we still accept, so that we can provide a better error message
 85             case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '+', 'e', '.'
 86                     -> parseNumber();
 87             default -> throw failure("Unexpected character(s)");
 88         };
 89         skipWhitespaces();
 90         return val;
 91     }
 92 
 93     /*
 94      * The parsed JsonObject contains a map which holds all lazy member mappings.
 95      * No offsets are required as member values hold their own offsets.
 96      * See https://datatracker.ietf.org/doc/html/rfc8259#section-4
 97      */
 98     private JsonObject parseObject() {
 99         // @@@ Do not preserve encounter order, requires adjustment to the API
100 //        var members = new LinkedHashMap<String, JsonValue>();
101         var members = new HashMap<String, JsonValue>();
102         offset++; // Walk past the '{'
103         skipWhitespaces();
104         // Check for empty case
105         if (currCharEquals('}')) {
106             offset++;
107             return new JsonObjectImpl(members);
108         }
109         while (hasInput()) {
110             // Get the member name, which should be unescaped
111             // Why not parse the name as a JsonString and then return its value()?
112             // Would requires 2 passes; we should build the String as we parse.
113             var name = parseName();
114 
115             if (members.containsKey(name)) {
116                 throw failure("The duplicate member name: '%s' was already parsed".formatted(name));
117             }
118 
119             // Move from name to ':'
120             skipWhitespaces();
121             if (!currCharEquals(':')) {
122                 throw failure(
123                         "Expected ':' after the member name");
124             }
125 
126             // Move from ':' to JsonValue
127             offset++;
128             members.put(name, parseValue());
129             // Ensure current char is either ',' or '}'
130             if (currCharEquals('}')) {
131                 offset++;
132                 return new JsonObjectImpl(members);
133             } else if (currCharEquals(',')) {
134                 // Add the comma, and move to the next key
135                 offset++;
136                 skipWhitespaces();
137             } else {
138                 // Neither ',' nor '}' so fail
139                 break;
140             }
141         }
142         throw failure("Object was not closed with '}'");
143     }
144 
145     /*
146      * Member name equality and storage in the map should be done with the
147      * unescaped String value.
148      * See https://datatracker.ietf.org/doc/html/rfc8259#section-8.3
149      */
150     private String parseName() {
151         if (!currCharEquals('"')) {
152             throw failure("Invalid member name");
153         }
154         offset++; // Move past the starting quote
155         var escape = false;
156         boolean useBldr = false;
157         var start = offset;
158         for (; hasInput(); offset++) {
159             var c = doc[offset];
160             if (escape) {
161                 var escapeLength = 0;
162                 switch (c) {
163                     // Allowed JSON escapes
164                     case '"', '\\', '/' -> {}
165                     case 'b' -> c = '\b';
166                     case 'f' -> c = '\f';
167                     case 'n' -> c = '\n';
168                     case 'r' -> c = '\r';
169                     case 't' -> c = '\t';
170                     case 'u' -> {
171                         if (offset + 4 < doc.length) {
172                             escapeLength = 4;
173                             offset++; // Move to first char in sequence
174                             c = codeUnit();
175                             // Move to the last hex digit, since outer loop will increment offset
176                             offset += 3;
177                         } else {
178                             throw failure("Invalid Unicode escape sequence");
179                         }
180                     }
181                     default -> throw failure("Illegal escape");
182                 }
183                 if (!useBldr) {
184                     initBuilder();
185                     // Append everything up to the first escape sequence
186                     builder.append(doc, start, offset - escapeLength - 1 - start);
187                     useBldr = true;
188                 }
189                 escape = false;
190             } else if (c == '\\') {
191                 escape = true;
192                 continue;
193             } else if (c == '\"') {
194                 offset++;
195                 if (useBldr) {
196                     var name = builder.toString();
197                     builder.setLength(0);
198                     return name;
199                 } else {
200                     return new String(doc, start, offset - start - 1);
201                 }
202             } else if (c < ' ') {
203                 throw failure("Unescaped control code");
204             }
205             if (useBldr) {
206                 builder.append(c);
207             }
208         }
209         throw failure("Closing quote missing");
210     }
211 
212     /*
213      * The parsed JsonArray contains a List which holds all lazy children
214      * elements. No offsets are required as children values hold their own offsets.
215      * See https://datatracker.ietf.org/doc/html/rfc8259#section-5
216      */
217     private JsonArray parseArray() {
218         var list = new ArrayList<JsonValue>();
219         offset++; // Walk past the '['
220         skipWhitespaces();
221         // Check for empty case
222         if (currCharEquals(']')) {
223             offset++;
224             return new JsonArrayImpl(list);
225         }
226         for (; hasInput(); offset++) {
227             // Get the JsonValue
228             list.add(parseValue());
229             // Ensure current char is either ']' or ','
230             if (currCharEquals(']')) {
231                 offset++;
232                 return new JsonArrayImpl(list);
233             } else if (!currCharEquals(',')) {
234                 break;
235             }
236         }
237         throw failure("Array was not closed with ']'");
238     }
239 
240     /*
241      * The parsed JsonString will contain offsets correlating to the beginning
242      * and ending quotation marks. All Unicode characters are allowed except the
243      * following that require escaping: quotation mark, reverse solidus, and the
244      * control characters (U+0000 through U+001F). Any character may be escaped
245      * either through a Unicode escape sequence or two-char sequence.
246      * See https://datatracker.ietf.org/doc/html/rfc8259#section-7
247      */
248     private JsonString parseString() {
249         int start = offset;
250         offset++; // Move past the starting quote
251         var escape = false;
252         for (; hasInput(); offset++) {
253             var c = doc[offset];
254             if (escape) {
255                 switch (c) {
256                     // Allowed JSON escapes
257                     case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {}
258                     case 'u' -> {
259                         if (offset + 4 < doc.length) {
260                             offset++; // Move to first char in sequence
261                             checkEscapeSequence();
262                             offset += 3; // Move to the last hex digit, outer loop increments
263                         } else {
264                             throw failure("Invalid Unicode escape sequence");
265                         }
266                     }
267                     default -> throw failure("Illegal escape");
268                 }
269                 escape = false;
270             } else if (c == '\\') {
271                 escape = true;
272             } else if (c == '\"') {
273                 return new JsonStringImpl(doc, start, offset += 1);
274             } else if (c < ' ') {
275                 throw failure("Unescaped control code");
276             }
277         }
278         throw failure("Closing quote missing");
279     }
280 
281     /*
282      * Parsing true, false, and null return singletons. These JsonValues
283      * do not require offsets to lazily compute their values.
284      */
285     private JsonBooleanImpl parseTrue() {
286         if (charsEqual("rue", offset + 1)) {
287             offset += 4;
288             return JsonBooleanImpl.TRUE;
289         }
290         throw failure("Expected true");
291     }
292 
293     private JsonBooleanImpl parseFalse() {
294         if (charsEqual( "alse", offset + 1)) {
295             offset += 5;
296             return JsonBooleanImpl.FALSE;
297         }
298         throw failure("Expected false");
299     }
300 
301     private JsonNullImpl parseNull() {
302         if (charsEqual("ull", offset + 1)) {
303             offset += 4;
304             return JsonNullImpl.NULL;
305         }
306         throw failure("Expected null");
307     }
308 
309     /*
310      * The parsed JsonNumber contains offsets correlating to the first and last
311      * allowed chars permitted in the JSON numeric grammar:
312      *      number = [ minus ] int [ frac ] [ exp ]
313      * See https://datatracker.ietf.org/doc/html/rfc8259#section-6
314      */
315     private JsonNumberImpl parseNumber() {
316         boolean sawDecimal = false;
317         boolean sawExponent = false;
318         boolean sawZero = false;
319         boolean sawWhitespace = false;
320         boolean havePart = false;
321         boolean sawInvalid = false;
322         boolean sawSign = false;
323         var start = offset;
324         for (; hasInput() && !sawWhitespace && !sawInvalid; offset++) {
325             switch (doc[offset]) {
326                 case '-' -> {
327                     if (offset != start && !sawExponent || sawSign) {
328                         throw failure("Invalid '-' position");
329                     }
330                     sawSign = true;
331                 }
332                 case '+' -> {
333                     if (!sawExponent || havePart || sawSign) {
334                         throw failure("Invalid '+' position");
335                     }
336                     sawSign = true;
337                 }
338                 case '0' -> {
339                     if (!havePart) {
340                         sawZero = true;
341                     }
342                     havePart = true;
343                 }
344                 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
345                     if (!sawDecimal && !sawExponent && sawZero) {
346                         throw failure("Invalid '0' position");
347                     }
348                     havePart = true;
349                 }
350                 case '.' -> {
351                     if (sawDecimal) {
352                         throw failure("Invalid '.' position");
353                     } else {
354                         if (!havePart) {
355                             throw failure("Invalid '.' position");
356                         }
357                         sawDecimal = true;
358                         havePart = false;
359                     }
360                 }
361                 case 'e', 'E' -> {
362                     if (sawExponent) {
363                         throw failure("Invalid '[e|E]' position");
364                     } else {
365                         if (!havePart) {
366                             throw failure("Invalid '[e|E]' position");
367                         }
368                         sawExponent = true;
369                         havePart = false;
370                         sawSign = false;
371                     }
372                 }
373                 case ' ', '\t', '\r', '\n' -> {
374                     sawWhitespace = true;
375                     offset --;
376                 }
377                 default -> {
378                     offset--;
379                     sawInvalid = true;
380                 }
381             }
382         }
383         if (!havePart) {
384             throw failure("Input expected after '[.|e|E]'");
385         }
386         return new JsonNumberImpl(doc, start, offset);
387     }
388 
389     // Utility functions
390 
391     // Called when a SB is required to un-escape a member name
392     private void initBuilder() {
393         if (builder == null) {
394             builder = new StringBuilder();
395         }
396     }
397 
398     // Validate unicode escape sequence
399     // This method does not increment offset
400     private void checkEscapeSequence() {
401         for (int index = 0; index < 4; index++) {
402             char c = doc[offset + index];
403             if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) {
404                 throw failure("Invalid Unicode escape sequence");
405             }
406         }
407     }
408 
409     // Unescapes the Unicode escape sequence and produces a char
410     private char codeUnit() {
411         try {
412             return Utils.codeUnit(doc, offset);
413         } catch (IllegalArgumentException _) {
414             // Catch and re-throw as JPE with correct row/col
415             throw failure("Invalid Unicode escape sequence");
416         }
417     }
418 
419     // Returns true if the parser has not yet reached the end of the Document
420     private boolean hasInput() {
421         return offset < doc.length;
422     }
423 
424     // Walk to the next non-white space char from the current offset
425     private void skipWhitespaces() {
426         while (hasInput()) {
427             if (notWhitespace()) {
428                 break;
429             }
430             offset++;
431         }
432     }
433 
434     // see https://datatracker.ietf.org/doc/html/rfc8259#section-2
435     private boolean notWhitespace() {
436         return switch (doc[offset]) {
437             case ' ', '\t','\r' -> false;
438             case '\n' -> {
439                 // Increments the row and col
440                 line += 1;
441                 lineStart = offset + 1;
442                 yield false;
443             }
444             default -> true;
445         };
446     }
447 
448     private JsonParseException failure(String message) {
449         var errMsg = composeParseExceptionMessage(
450                 message, line, lineStart, offset);
451         return new JsonParseException(errMsg, line, offset - lineStart);
452     }
453 
454     // returns true if the char at the specified offset equals the input char
455     // and is within bounds of the char[]
456     private boolean currCharEquals(char c) {
457         return hasInput() && c == doc[offset];
458     }
459 
460     // Returns true if the substring starting at the given offset equals the
461     // input String and is within bounds of the JSON document
462     private boolean charsEqual(String str, int o) {
463         if (o + str.length() - 1 < doc.length) {
464             for (int index = 0; index < str.length(); index++) {
465                 if (doc[o] != str.charAt(index)) {
466                     return false; // char does not match
467                 }
468                 o++;
469             }
470             return true; // all chars match
471         }
472         return false; // not within bounds
473     }
474 
475     // Utility method to compose parse exception message
476     private String composeParseExceptionMessage(String message, int line, int lineStart, int offset) {
477         return "%s: (%s) at Row %d, Col %d."
478             .formatted(message, new String(doc, offset, Math.min(offset + 8, doc.length) - offset),
479                 line, offset - lineStart);
480     }
481 }