1 /*
  2  * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 package oracle.code.json;
 27 
 28 import java.util.HashSet;
 29 
// Parses and validates a JSON document, building the tokens array in
// JsonDocumentInfo that is later used for lazy inflation
 32 final class JsonParser { ;
 33 
 34     // Parse the JSON and return the built DocumentInfo w/ tokens array
 35     static JsonDocumentInfo parseRoot(JsonDocumentInfo docInfo) {
 36         int end = parseValue(docInfo, 0, 0);
 37         if (!checkWhitespaces(docInfo, end, docInfo.getEndOffset())) {
 38             throw failure(docInfo,"Unexpected character(s)", end);
 39         }
 40         return docInfo;
 41     }
 42 
 43     static int parseValue(JsonDocumentInfo docInfo, int offset, int depth) {
 44         offset = skipWhitespaces(docInfo, offset);
 45 
 46         return switch (docInfo.charAt(offset)) {
 47             case '{' -> parseObject(docInfo, offset, depth + 1);
 48             case '[' -> parseArray(docInfo, offset, depth + 1);
 49             case '"' -> parseString(docInfo, offset);
 50             case 't', 'f' -> parseBoolean(docInfo, offset);
 51             case 'n' -> parseNull(docInfo, offset);
 52             case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-' -> parseNumber(docInfo, offset);
 53             default -> throw failure(docInfo, "Unexpected character(s)", offset);
 54         };
 55     }
 56 
 57     static int parseObject(JsonDocumentInfo docInfo, int offset, int depth) {
 58         checkDepth(docInfo, offset, depth);
 59         var keys = new HashSet<String>();
 60         docInfo.tokens[docInfo.index++] = offset;
 61         // Walk past the '{'
 62         offset = JsonParser.skipWhitespaces(docInfo, offset + 1);
 63         // Check for empty case
 64         if (docInfo.charAt(offset) == '}') {
 65             docInfo.tokens[docInfo.index++] = offset;
 66             return ++offset;
 67         }
 68         while (offset < docInfo.getEndOffset()) {
 69             // Get the key
 70             if (docInfo.charAt(offset) != '"') {
 71                 throw failure(docInfo, "Invalid key", offset);
 72             }
 73             // Member equality done via unescaped String
 74             // see https://datatracker.ietf.org/doc/html/rfc8259#section-8.3
 75             docInfo.tokens[docInfo.index++] = offset++; // Move past the starting quote
 76             var escape = false;
 77             boolean useBldr = false;
 78             var start = offset;
 79             StringBuilder sb = null; // only init if we need to use for escapes
 80             boolean foundClosing = false;
 81             for (; offset < docInfo.getEndOffset(); offset++) {
 82                 var c = docInfo.charAt(offset);
 83                 if (escape) {
 84                     var length = 0;
 85                     switch (c) {
 86                         // Allowed JSON escapes
 87                         case '"', '\\', '/' -> {}
 88                         case 'b' -> c = '\b';
 89                         case 'f' -> c = '\f';
 90                         case 'n' -> c = '\n';
 91                         case 'r' -> c = '\r';
 92                         case 't' -> c = '\t';
 93                         case 'u' -> {
 94                             if (offset + 4 < docInfo.getEndOffset()) {
 95                                 c = codeUnit(docInfo, offset + 1);
 96                                 length = 4;
 97                             } else {
 98                                 throw failure(docInfo,
 99                                         "Illegal Unicode escape sequence", offset);
100                             }
101                         }
102                         default -> throw failure(docInfo,
103                                 "Illegal escape", offset);
104                     }
105                     if (!useBldr) {
106                         useBldr = true;
107                         sb = new StringBuilder(docInfo.substring(start, offset - 1));
108                     }
109                     offset+=length;
110                     escape = false;
111                 } else if (c == '\\') {
112                     escape = true;
113                     continue;
114                 } else if (c == '\"') {
115                     docInfo.tokens[docInfo.index++] = offset++;
116                     foundClosing = true;
117                     break;
118                 } else if (c < ' ') {
119                     throw failure(docInfo,
120                             "Unescaped control code", offset);
121                 }
122                 if (useBldr) {
123                     sb.append(c);
124                 }
125             }
126             if (!foundClosing) {
127                 throw failure(docInfo, "Closing quote missing", offset);
128             }
129             var keyStr = useBldr ? sb.toString() :
130                     docInfo.substring(start, offset - 1);
131 
132             // Check for duplicates
133             if (keys.contains(keyStr)) {
134                 throw failure(docInfo,
135                         "The duplicate key: '%s' was already parsed".formatted(keyStr), offset);
136             }
137             keys.add(keyStr);
138 
139             // Move from key to ':'
140             offset = JsonParser.skipWhitespaces(docInfo, offset);
141             docInfo.tokens[docInfo.index++] = offset;
142             if (docInfo.charAt(offset) != ':') {
143                 throw failure(docInfo,
144                         "Unexpected character(s) found after key", offset);
145             }
146 
147             // Move from ':' to JsonValue
148             offset = JsonParser.skipWhitespaces(docInfo, offset + 1);
149             offset = JsonParser.parseValue(docInfo, offset, depth);
150 
151             // Walk to either ',' or '}'
152             offset = JsonParser.skipWhitespaces(docInfo, offset);
153             var c = docInfo.charAt(offset);
154             if (c == '}') {
155                 docInfo.tokens[docInfo.index++] = offset;
156                 return ++offset;
157             } else if (docInfo.charAt(offset) != ',') {
158                 break;
159             }
160 
161             // Add the comma, and move to the next key
162             docInfo.tokens[docInfo.index++] = offset;
163             offset = JsonParser.skipWhitespaces(docInfo, offset + 1);
164         }
165         throw failure(docInfo,
166                 "Unexpected character(s) found after value", offset);
167     }
168 
169     static int parseArray(JsonDocumentInfo docInfo, int offset, int depth) {
170         checkDepth(docInfo, offset, depth);
171         docInfo.tokens[docInfo.index++] = offset;
172         // Walk past the '['
173         offset = JsonParser.skipWhitespaces(docInfo, offset + 1);
174         // Check for empty case
175         if (docInfo.charAt(offset) == ']') {
176             docInfo.tokens[docInfo.index++] = offset;
177             return ++offset;
178         }
179 
180         while (offset < docInfo.getEndOffset()) {
181             // Get the JsonValue
182             offset = JsonParser.parseValue(docInfo, offset, depth);
183             // Walk to either ',' or ']'
184             offset = JsonParser.skipWhitespaces(docInfo, offset);
185             var c = docInfo.charAt(offset);
186             if (c == ']') {
187                 docInfo.tokens[docInfo.index++] = offset;
188                 return ++offset;
189             } else if (c != ',') {
190                 break;
191             }
192 
193             // Add the comma, and move to the next value
194             docInfo.tokens[docInfo.index++] = offset;
195             offset = JsonParser.skipWhitespaces(docInfo, offset + 1);
196         }
197         throw failure(docInfo,
198                 "Unexpected character(s) found after value", offset);
199     }
200 
201     static int parseString(JsonDocumentInfo docInfo, int offset) {
202         docInfo.tokens[docInfo.index++] = offset++; // Move past the starting quote
203         var escape = false;
204 
205         for (; offset < docInfo.getEndOffset(); offset++) {
206             var c = docInfo.charAt(offset);
207             if (escape) {
208                 switch (c) {
209                     // Allowed JSON escapes
210                     case '"', '\\', '/', 'b', 'f', 'n', 'r', 't' -> {}
211                     case 'u' -> {
212                         if (offset + 4 < docInfo.getEndOffset()) {
213                             checkEscapeSequence(docInfo, offset + 1);
214                             offset += 4;
215                         } else {
216                             throw failure(docInfo,
217                                     "Illegal Unicode escape sequence", offset);
218                         }
219                     }
220                     default -> throw failure(docInfo,
221                             "Illegal escape", offset);
222                 }
223                 escape = false;
224             } else if (c == '\\') {
225                 escape = true;
226             } else if (c == '\"') {
227                 docInfo.tokens[docInfo.index++] = offset;
228                 return ++offset;
229             } else if (c < ' ') {
230                 throw failure(docInfo,
231                         "Unescaped control code", offset);
232             }
233         }
234         throw failure(docInfo, "Closing quote missing", offset);
235     }
236 
237     // Validate unicode escape sequence
238     static void checkEscapeSequence(JsonDocumentInfo docInfo, int offset) {
239         for (int index = 0; index < 4; index++) {
240             char c = docInfo.charAt(offset + index);
241             if ((c < 'a' || c > 'f') && (c < 'A' || c > 'F') && (c < '0' || c > '9')) {
242                 throw failure(docInfo, "Invalid Unicode escape", offset);
243             }
244         }
245     }
246 
247     // Validate and construct corresponding value of unicode escape sequence
248     static char codeUnit(JsonDocumentInfo docInfo, int offset) {
249         char val = 0;
250         for (int index = 0; index < 4; index ++) {
251             char c = docInfo.charAt(offset + index);
252             val <<= 4;
253             val += (char) (
254                     switch (c) {
255                         case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> c - '0';
256                         case 'a', 'b', 'c', 'd', 'e', 'f' -> c - 'a' + 10;
257                         case 'A', 'B', 'C', 'D', 'E', 'F' -> c - 'A' + 10;
258                         default -> throw new InternalError();
259                     });
260         }
261         return val;
262     }
263 
264     static int parseBoolean(JsonDocumentInfo docInfo, int offset) {
265         var start = docInfo.charAt(offset);
266         if (start == 't') {
267             if (offset + 3 >= docInfo.getEndOffset() || !docInfo.substring(offset + 1, offset + 4).equals("rue")) {
268                 throw failure(docInfo, "Unexpected character(s)", offset);
269             }
270             return offset + 4;
271         } else {
272             if (offset + 4 >= docInfo.getEndOffset() || !docInfo.substring(offset + 1, offset + 5).equals("alse")) {
273                 throw failure(docInfo, "Unexpected character(s)", offset);
274             }
275             return offset + 5;
276         }
277     }
278 
279     static int parseNull(JsonDocumentInfo docInfo, int offset) {
280         if (offset + 3 >= docInfo.getEndOffset() || !docInfo.substring(offset + 1, offset + 4).equals("ull")) {
281             throw failure(docInfo, "Unexpected character(s)", offset);
282         }
283         return offset + 4;
284     }
285 
286     static int parseNumber(JsonDocumentInfo docInfo, int offset) {
287         boolean sawDecimal = false;
288         boolean sawExponent = false;
289         boolean sawZero = false;
290         boolean sawWhitespace = false;
291         boolean havePart = false;
292         boolean sawInvalid = false;
293         boolean sawSign = false;
294         var start = offset;
295         for (; offset < docInfo.getEndOffset() && !sawWhitespace && !sawInvalid; offset++) {
296             switch (docInfo.charAt(offset)) {
297                 case '-' -> {
298                     if (offset != start && !sawExponent || sawSign) {
299                         throw failure(docInfo,
300                                 "Invalid '-' position", offset);
301                     }
302                     sawSign = true;
303                 }
304                 case '+' -> {
305                     if (!sawExponent || havePart || sawSign) {
306                         throw failure(docInfo,
307                                 "Invalid '+' position", offset);
308                     }
309                     sawSign = true;
310                 }
311                 case '0' -> {
312                     if (!havePart) {
313                         sawZero = true;
314                     }
315                     havePart = true;
316                 }
317                 case '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
318                     if (!sawDecimal && !sawExponent && sawZero) {
319                         throw failure(docInfo,
320                                 "Invalid '0' position", offset);
321                     }
322                     havePart = true;
323                 }
324                 case '.' -> {
325                     if (sawDecimal) {
326                         throw failure(docInfo,
327                                 "Invalid '.' position", offset);
328                     } else {
329                         if (!havePart) {
330                             throw failure(docInfo,
331                                     "Invalid '.' position", offset);
332                         }
333                         sawDecimal = true;
334                         havePart = false;
335                     }
336                 }
337                 case 'e', 'E' -> {
338                     if (sawExponent) {
339                         throw failure(docInfo,
340                                 "Invalid '[e|E]' position", offset);
341                     } else {
342                         if (!havePart) {
343                             throw failure(docInfo,
344                                     "Invalid '[e|E]' position", offset);
345                         }
346                         sawExponent = true;
347                         havePart = false;
348                         sawSign = false;
349                     }
350                 }
351                 case ' ', '\t', '\r', '\n' -> {
352                     sawWhitespace = true;
353                     offset --;
354                 }
355                 default -> {
356                     offset--;
357                     sawInvalid = true;
358                 }
359             }
360         }
361         if (!havePart) {
362             throw failure(docInfo,
363                     "Input expected after '[.|e|E]'", offset);
364         }
365         return offset;
366     }
367 
368     // Utility functions
369     static int skipWhitespaces(JsonDocumentInfo docInfo, int offset) {
370         while (offset < docInfo.getEndOffset()) {
371             if (notWhitespace(docInfo, offset)) {
372                 break;
373             }
374             offset ++;
375         }
376         return offset;
377     }
378 
379     static boolean checkWhitespaces(JsonDocumentInfo docInfo, int offset, int endOffset) {
380         int end = Math.min(endOffset, docInfo.getEndOffset());
381         while (offset < end) {
382             if (notWhitespace(docInfo, offset)) {
383                 return false;
384             }
385             offset ++;
386         }
387         return true;
388     }
389 
390     static boolean notWhitespace(JsonDocumentInfo docInfo, int offset) {
391         return !isWhitespace(docInfo, offset);
392     }
393 
394     static boolean isWhitespace(JsonDocumentInfo docInfo, int offset) {
395         return switch (docInfo.charAt(offset)) {
396             case ' ', '\t','\r' -> true;
397             case '\n' -> {
398                 docInfo.line+=1;
399                 docInfo.lineStart = offset + 1;
400                 yield true;
401             }
402             default -> false;
403         };
404     }
405 
406     static JsonParseException failure(JsonDocumentInfo docInfo, String message, int offset) {
407         var errMsg = docInfo.composeParseExceptionMessage(
408                 message, docInfo.line, docInfo.lineStart, offset);
409         return new JsonParseException(errMsg, docInfo.line, offset - docInfo.lineStart);
410     }
411 
412     private static void checkDepth(JsonDocumentInfo docInfo, int offset, int depth) {
413         if (depth > Json.MAX_DEPTH) {
414             throw failure(docInfo, "Max depth exceeded", offset);
415         }
416     }
417 
418     // no instantiation of this parser
419     private JsonParser(){}
420 }