| /* Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <ctype.h> |
| #include <stdio.h> |
| |
| #include "Lucy/Util/ToolSet.h" |
| |
| #include "Lucy/Util/Json.h" |
| #include "Lucy/Object/Host.h" |
| #include "Lucy/Store/Folder.h" |
| #include "Lucy/Store/InStream.h" |
| #include "Lucy/Store/OutStream.h" |
| #include "Lucy/Util/Memory.h" |
| #include "Lucy/Util/Json/JsonParser.h" |
| |
| /* Routines generated by Lemon. */ |
| void* |
| LucyParseJsonAlloc(void * (*allocate)(size_t)); |
| void |
| LucyParseJson(void *json_parser, int token_type, lucy_Obj *value, |
| lucy_JsonParserState *state); |
| void |
| LucyParseJsonFree(void *json_parser, void(*freemem)(void*)); |
| void |
| LucyParseJsonTrace(FILE *trace, char *line_prefix); |
| |
| // Encode JSON for supplied "dump". On failure, sets Err_error and returns |
| // false. |
| static bool_t |
| S_to_json(Obj *dump, CharBuf *json, int32_t depth); |
| |
| // Parse JSON from raw UTF-8 in memory. |
| static Obj* |
| S_parse_json(char *text, size_t size); |
| static Obj* |
| S_do_parse_json(void *json_parser, char *json, size_t len); |
| |
| // Parse a JSON number. Advance the text buffer just past the number. |
| static Float64* |
| S_parse_number(char **json_ptr, char *const limit); |
| |
| // Parse a JSON string. Advance the text buffer from pointing at the opening |
| // double quote to pointing just after the closing double quote. |
| static CharBuf* |
| S_parse_string(char **json_ptr, char *const limit); |
| |
| // Unescape JSON string text. Expects pointers bookending the text data (i.e. |
| // pointing just after the opening double quote and directly at the closing |
| // double quote), and assumes that escapes have already been sanity checked |
| // for length. |
| static CharBuf* |
| S_unescape_text(char *const top, char *const end); |
| |
| // Check that the supplied text begins with the specified keyword, which must |
| // then end on a word boundary (i.e. match "null" but not the first four |
| // letters of "nullify"). |
| static INLINE bool_t |
| SI_check_keyword(char *json, char* end, const char *keyword, size_t len); |
| |
| // Make it possible to be loosen constraints during testing. |
| static bool_t tolerant = false; |
| |
// Indentation: two spaces per level.  INDENTATION_LEN is the length of the
// indentation string without its terminating NUL.
static const char indentation[] = "  ";
static const size_t INDENTATION_LEN = sizeof(indentation) - 1;
| |
| // Append indentation spaces x depth. |
| static void |
| S_cat_whitespace(CharBuf *json, int32_t depth); |
| |
| // Set Err_error, appending escaped JSON in the vicinity of the error. |
| static void |
| S_set_error(CharBuf *mess, char *json, char *limit, int line, |
| const char *func); |
| #define SET_ERROR(_mess, _json, _end) \ |
| S_set_error(_mess, _json, _end, __LINE__, CFISH_ERR_FUNC_MACRO) |
| |
| Obj* |
| Json_from_json(CharBuf *json) { |
| Obj *dump = S_parse_json((char*)CB_Get_Ptr8(json), CB_Get_Size(json)); |
| if (!dump) { |
| ERR_ADD_FRAME(Err_get_error()); |
| } |
| return dump; |
| } |
| |
| Obj* |
| Json_slurp_json(Folder *folder, const CharBuf *path) { |
| InStream *instream = Folder_Open_In(folder, path); |
| if (!instream) { |
| ERR_ADD_FRAME(Err_get_error()); |
| return NULL; |
| } |
| size_t len = (size_t)InStream_Length(instream); |
| char *buf = InStream_Buf(instream, len); |
| Obj *dump = S_parse_json(buf, len); |
| InStream_Close(instream); |
| DECREF(instream); |
| if (!dump) { |
| ERR_ADD_FRAME(Err_get_error()); |
| } |
| return dump; |
| } |
| |
| bool_t |
| Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) { |
| CharBuf *json = Json_to_json(dump); |
| if (!json) { |
| ERR_ADD_FRAME(Err_get_error()); |
| return false; |
| } |
| OutStream *outstream = Folder_Open_Out(folder, path); |
| if (!outstream) { |
| ERR_ADD_FRAME(Err_get_error()); |
| DECREF(json); |
| return false; |
| } |
| size_t size = CB_Get_Size(json); |
| OutStream_Write_Bytes(outstream, CB_Get_Ptr8(json), size); |
| OutStream_Close(outstream); |
| DECREF(outstream); |
| DECREF(json); |
| return true; |
| } |
| |
| CharBuf* |
| Json_to_json(Obj *dump) { |
| // Validate object type, only allowing hashes and arrays per JSON spec. |
| if (!dump || !(Obj_Is_A(dump, HASH) || Obj_Is_A(dump, VARRAY))) { |
| if (!tolerant) { |
| CharBuf *class_name = dump ? Obj_Get_Class_Name(dump) : NULL; |
| CharBuf *mess = MAKE_MESS("Illegal top-level object type: %o", |
| class_name); |
| Err_set_error(Err_new(mess)); |
| return NULL; |
| } |
| } |
| |
| // Encode. |
| CharBuf *json = CB_new(31); |
| if (!S_to_json(dump, json, 0)) { |
| DECREF(json); |
| ERR_ADD_FRAME(Err_get_error()); |
| json = NULL; |
| } |
| else { |
| // Append newline. |
| CB_Cat_Trusted_Str(json, "\n", 1); |
| } |
| |
| return json; |
| } |
| |
| void |
| Json_set_tolerant(bool_t tolerance) { |
| tolerant = tolerance; |
| } |
| |
// Deepest nesting level which S_to_json() will encode before it reports an
// error.
static const int32_t MAX_DEPTH = 200;
| |
| static void |
| S_append_json_string(Obj *dump, CharBuf *json) { |
| // Append opening quote. |
| CB_Cat_Trusted_Str(json, "\"", 1); |
| |
| // Process string data. |
| ZombieCharBuf *iterator = ZCB_WRAP((CharBuf*)dump); |
| while (ZCB_Get_Size(iterator)) { |
| uint32_t code_point = ZCB_Nip_One(iterator); |
| if (code_point > 127) { |
| // There is no need to escape any high characters, including those |
| // above the BMP, as we assume that the destination channel can |
| // handle arbitrary UTF-8 data. |
| CB_Cat_Char(json, code_point); |
| } |
| else { |
| char buffer[7]; |
| size_t len; |
| switch (code_point & 127) { |
| // Perform all mandatory escapes enumerated in the JSON spec. |
| // Note that the spec makes escaping forward slash optional; |
| // we choose not to. |
| case 0x00: case 0x01: case 0x02: case 0x03: |
| case 0x04: case 0x05: case 0x06: case 0x07: |
| case 0x0b: case 0x0e: case 0x0f: |
| case 0x10: case 0x11: case 0x12: case 0x13: |
| case 0x14: case 0x15: case 0x16: case 0x17: |
| case 0x18: case 0x19: case 0x1a: case 0x1b: |
| case 0x1c: case 0x1d: case 0x1e: case 0x1f: { |
| sprintf(buffer, "\\u%04x", (unsigned)code_point); |
| len = 6; |
| break; |
| } |
| case '\b': |
| memcpy(buffer, "\\b", 2); |
| len = 2; |
| break; |
| case '\t': |
| memcpy(buffer, "\\t", 2); |
| len = 2; |
| break; |
| case '\n': |
| memcpy(buffer, "\\n", 2); |
| len = 2; |
| break; |
| case '\f': |
| memcpy(buffer, "\\f", 2); |
| len = 2; |
| break; |
| case '\r': |
| memcpy(buffer, "\\r", 2); |
| len = 2; |
| break; |
| case '\\': |
| memcpy(buffer, "\\\\", 2); |
| len = 2; |
| break; |
| case '\"': |
| memcpy(buffer, "\\\"", 2); |
| len = 2; |
| break; |
| |
| // Ordinary printable ASCII. |
| default: |
| buffer[0] = (char)code_point; |
| len = 1; |
| } |
| CB_Cat_Trusted_Str(json, buffer, len); |
| } |
| } |
| |
| // Append closing quote. |
| CB_Cat_Trusted_Str(json, "\"", 1); |
| } |
| |
| static void |
| S_cat_whitespace(CharBuf *json, int32_t depth) { |
| while (depth--) { |
| CB_Cat_Trusted_Str(json, indentation, INDENTATION_LEN); |
| } |
| } |
| |
| static bool_t |
| S_to_json(Obj *dump, CharBuf *json, int32_t depth) { |
| // Guard against infinite recursion in self-referencing data structures. |
| if (depth > MAX_DEPTH) { |
| CharBuf *mess = MAKE_MESS("Exceeded max depth of %i32", MAX_DEPTH); |
| Err_set_error(Err_new(mess)); |
| return false; |
| } |
| |
| if (!dump) { |
| CB_Cat_Trusted_Str(json, "null", 4); |
| } |
| else if (dump == (Obj*)CFISH_TRUE) { |
| CB_Cat_Trusted_Str(json, "true", 4); |
| } |
| else if (dump == (Obj*)CFISH_FALSE) { |
| CB_Cat_Trusted_Str(json, "false", 5); |
| } |
| else if (Obj_Is_A(dump, CHARBUF)) { |
| S_append_json_string(dump, json); |
| } |
| else if (Obj_Is_A(dump, INTNUM)) { |
| CB_catf(json, "%i64", Obj_To_I64(dump)); |
| } |
| else if (Obj_Is_A(dump, FLOATNUM)) { |
| CB_catf(json, "%f64", Obj_To_F64(dump)); |
| } |
| else if (Obj_Is_A(dump, VARRAY)) { |
| VArray *array = (VArray*)dump; |
| size_t size = VA_Get_Size(array); |
| if (size == 0) { |
| // Put empty array on single line. |
| CB_Cat_Trusted_Str(json, "[]", 2); |
| return true; |
| } |
| else if (size == 1) { |
| Obj *elem = VA_Fetch(array, 0); |
| if (!(Obj_Is_A(elem, HASH) || Obj_Is_A(elem, VARRAY))) { |
| // Put array containing single scalar element on one line. |
| CB_Cat_Trusted_Str(json, "[", 1); |
| if (!S_to_json(elem, json, depth + 1)) { |
| return false; |
| } |
| CB_Cat_Trusted_Str(json, "]", 1); |
| return true; |
| } |
| } |
| // Fall back to spreading elements across multiple lines. |
| CB_Cat_Trusted_Str(json, "[", 1); |
| for (size_t i = 0; i < size; i++) { |
| CB_Cat_Trusted_Str(json, "\n", 1); |
| S_cat_whitespace(json, depth + 1); |
| if (!S_to_json(VA_Fetch(array, i), json, depth + 1)) { |
| return false; |
| } |
| if (i + 1 < size) { |
| CB_Cat_Trusted_Str(json, ",", 1); |
| } |
| } |
| CB_Cat_Trusted_Str(json, "\n", 1); |
| S_cat_whitespace(json, depth); |
| CB_Cat_Trusted_Str(json, "]", 1); |
| } |
| else if (Obj_Is_A(dump, HASH)) { |
| Hash *hash = (Hash*)dump; |
| size_t size = Hash_Get_Size(hash); |
| |
| // Put empty hash on single line. |
| if (size == 0) { |
| CB_Cat_Trusted_Str(json, "{}", 2); |
| return true; |
| } |
| |
| // Validate that all keys are strings, then sort. |
| VArray *keys = Hash_Keys(hash); |
| for (size_t i = 0; i < size; i++) { |
| Obj *key = VA_Fetch(keys, i); |
| if (!key || !Obj_Is_A(key, CHARBUF)) { |
| DECREF(keys); |
| CharBuf *key_class = key ? Obj_Get_Class_Name(key) : NULL; |
| CharBuf *mess = MAKE_MESS("Illegal key type: %o", key_class); |
| Err_set_error(Err_new(mess)); |
| return false; |
| } |
| } |
| VA_Sort(keys, NULL, NULL); |
| |
| // Spread pairs across multiple lines. |
| CB_Cat_Trusted_Str(json, "{", 1); |
| for (size_t i = 0; i < size; i++) { |
| Obj *key = VA_Fetch(keys, i); |
| CB_Cat_Trusted_Str(json, "\n", 1); |
| S_cat_whitespace(json, depth + 1); |
| S_append_json_string(key, json); |
| CB_Cat_Trusted_Str(json, ": ", 2); |
| if (!S_to_json(Hash_Fetch(hash, key), json, depth + 1)) { |
| DECREF(keys); |
| return false; |
| } |
| if (i + 1 < size) { |
| CB_Cat_Trusted_Str(json, ",", 1); |
| } |
| } |
| CB_Cat_Trusted_Str(json, "\n", 1); |
| S_cat_whitespace(json, depth); |
| CB_Cat_Trusted_Str(json, "}", 1); |
| |
| DECREF(keys); |
| } |
| |
| return true; |
| } |
| |
| static Obj* |
| S_parse_json(char *text, size_t size) { |
| void *json_parser = LucyParseJsonAlloc(lucy_Memory_wrapped_malloc); |
| if (json_parser == NULL) { |
| CharBuf *mess = MAKE_MESS("Failed to allocate JSON parser"); |
| Err_set_error(Err_new(mess)); |
| return NULL; |
| } |
| Obj *dump = S_do_parse_json(json_parser, text, size); |
| LucyParseJsonFree(json_parser, lucy_Memory_wrapped_free); |
| return dump; |
| } |
| |
| static Obj* |
| S_do_parse_json(void *json_parser, char *json, size_t len) { |
| lucy_JsonParserState state; |
| state.result = NULL; |
| state.errors = false; |
| |
| char *text = json; |
| char *const end = text + len; |
| while (text < end) { |
| int token_type = -1; |
| Obj *value = NULL; |
| char *const save = text; |
| switch (*text) { |
| case ' ': case '\n': case '\r': case '\t': |
| // Skip insignificant whitespace, which the JSON RFC defines |
| // as only four ASCII characters. |
| text++; |
| continue; |
| case '[': |
| token_type = LUCY_JSON_TOKENTYPE_LEFT_SQUARE_BRACKET; |
| text++; |
| break; |
| case ']': |
| token_type = LUCY_JSON_TOKENTYPE_RIGHT_SQUARE_BRACKET; |
| text++; |
| break; |
| case '{': |
| token_type = LUCY_JSON_TOKENTYPE_LEFT_CURLY_BRACKET; |
| text++; |
| break; |
| case '}': |
| token_type = LUCY_JSON_TOKENTYPE_RIGHT_CURLY_BRACKET; |
| text++; |
| break; |
| case ':': |
| token_type = LUCY_JSON_TOKENTYPE_COLON; |
| text++; |
| break; |
| case ',': |
| token_type = LUCY_JSON_TOKENTYPE_COMMA; |
| text++; |
| break; |
| case '"': |
| value = (Obj*)S_parse_string(&text, end); |
| if (value) { |
| token_type = LUCY_JSON_TOKENTYPE_STRING; |
| } |
| else { |
| // Clear out parser and return. |
| LucyParseJson(json_parser, 0, NULL, &state); |
| ERR_ADD_FRAME(Err_get_error()); |
| return NULL; |
| } |
| break; |
| case 'n': |
| if (SI_check_keyword(text, end, "null", 4)) { |
| token_type = LUCY_JSON_TOKENTYPE_NULL; |
| text += 4; |
| } |
| break; |
| case 't': |
| if (SI_check_keyword(text, end, "true", 4)) { |
| token_type = LUCY_JSON_TOKENTYPE_TRUE; |
| value = (Obj*)CFISH_TRUE; |
| text += 4; |
| } |
| break; |
| case 'f': |
| if (SI_check_keyword(text, end, "false", 5)) { |
| token_type = LUCY_JSON_TOKENTYPE_FALSE; |
| value = (Obj*)CFISH_FALSE; |
| text += 5; |
| } |
| break; |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| case '-': { // Note no '+', as JSON spec doesn't allow it. |
| value = (Obj*)S_parse_number(&text, end); |
| if (value) { |
| token_type = LUCY_JSON_TOKENTYPE_NUMBER; |
| } |
| else { |
| // Clear out parser and return. |
| LucyParseJson(json_parser, 0, NULL, &state); |
| ERR_ADD_FRAME(Err_get_error()); |
| return NULL; |
| } |
| } |
| break; |
| } |
| LucyParseJson(json_parser, token_type, value, &state); |
| if (state.errors) { |
| SET_ERROR(CB_newf("JSON syntax error"), save, end); |
| return NULL; |
| } |
| } |
| |
| // Finish up. |
| LucyParseJson(json_parser, 0, NULL, &state); |
| if (state.errors) { |
| SET_ERROR(CB_newf("JSON syntax error"), json, end); |
| return NULL; |
| } |
| return state.result; |
| } |
| |
| static Float64* |
| S_parse_number(char **json_ptr, char *const limit) { |
| char *top = *json_ptr; |
| char *end = top; |
| bool_t terminated = false; |
| |
| // We can't assume NULL termination for the JSON string, so we need to |
| // ensure that strtod() cannot overrun and access invalid memory. |
| for (; end < limit; end++) { |
| switch (*end) { |
| // Only these characters may legally follow a number in |
| // Javascript. If we don't find one before the end of the JSON, |
| // it's a parse error. |
| case ' ': case '\n': case '\r': case '\t': |
| case ']': |
| case '}': |
| case ':': |
| case ',': |
| terminated = true; |
| break; |
| } |
| } |
| |
| Float64 *result = NULL; |
| if (terminated) { |
| char *terminus; |
| double number = strtod(top, &terminus); |
| if (terminus != top) { |
| *json_ptr = terminus; |
| result = Float64_new(number); |
| } |
| } |
| if (!result) { |
| SET_ERROR(CB_newf("JSON syntax error"), top, limit); |
| } |
| return result; |
| } |
| |
| static CharBuf* |
| S_parse_string(char **json_ptr, char *const limit) { |
| // Find terminating double quote, determine whether there are any escapes. |
| char *top = *json_ptr + 1; |
| char *end = NULL; |
| bool_t saw_backslash = false; |
| for (char *text = top; text < limit; text++) { |
| if (*text == '"') { |
| end = text; |
| break; |
| } |
| else if (*text == '\\') { |
| saw_backslash = true; |
| if (text + 1 < limit && text[1] == 'u') { |
| text += 5; |
| } |
| else { |
| text += 1; |
| } |
| } |
| } |
| if (!end) { |
| SET_ERROR(CB_newf("Unterminated string"), *json_ptr, limit); |
| return NULL; |
| } |
| |
| // Advance the text buffer to just beyond the closing quote. |
| *json_ptr = end + 1; |
| |
| if (saw_backslash) { |
| return S_unescape_text(top, end); |
| } |
| else { |
| // Optimize common case where there are no escapes. |
| size_t len = end - top; |
| if (!StrHelp_utf8_valid(top, len)) { |
| CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON"); |
| Err_set_error(Err_new(mess)); |
| return NULL; |
| } |
| return CB_new_from_trusted_utf8(top, len); |
| } |
| } |
| |
| static CharBuf* |
| S_unescape_text(char *const top, char *const end) { |
| // The unescaped string will never be longer than the escaped string |
| // because only a \u escape can theoretically be too long and |
| // StrHelp_encode_utf8_char guards against sequences over 4 bytes. |
| // Therefore we can allocate once and not worry about reallocating. |
| size_t cap = end - top + 1; |
| char *target_buf = (char*)MALLOCATE(cap); |
| size_t target_size = 0; |
| for (char *text = top; text < end; text++) { |
| if (*text != '\\') { |
| target_buf[target_size++] = *text; |
| } |
| else { |
| // Process escape. |
| text++; |
| switch (*text) { |
| case '"': |
| target_buf[target_size++] = '"'; |
| break; |
| case '\\': |
| target_buf[target_size++] = '\\'; |
| break; |
| case '/': |
| target_buf[target_size++] = '/'; |
| break; |
| case 'b': |
| target_buf[target_size++] = '\b'; |
| break; |
| case 'f': |
| target_buf[target_size++] = '\f'; |
| break; |
| case 'n': |
| target_buf[target_size++] = '\n'; |
| break; |
| case 'r': |
| target_buf[target_size++] = '\r'; |
| break; |
| case 't': |
| target_buf[target_size++] = '\t'; |
| break; |
| case 'u': { |
| // Copy into a temp buffer because strtol will overrun |
| // into adjacent text data for e.g. "\uAAAA1". |
| char temp[5] = { 0, 0, 0, 0, 0 }; |
| memcpy(temp, text + 1, 4); |
| text += 4; |
| char *num_end; |
| long code_point = strtol(temp, &num_end, 16); |
| char *temp_ptr = temp; |
| if (num_end != temp_ptr + 4 || code_point < 0) { |
| FREEMEM(target_buf); |
| SET_ERROR(CB_newf("Invalid \\u escape"), text - 5, end); |
| return NULL; |
| } |
| if (code_point >= 0xD800 && code_point <= 0xDFFF) { |
| FREEMEM(target_buf); |
| SET_ERROR(CB_newf("Surrogate pairs not supported"), |
| text - 5, end); |
| return NULL; |
| } |
| target_size += StrHelp_encode_utf8_char((uint32_t)code_point, |
| target_buf + target_size); |
| } |
| break; |
| default: |
| FREEMEM(target_buf); |
| SET_ERROR(CB_newf("Illegal escape"), text - 1, end); |
| return NULL; |
| } |
| } |
| } |
| |
| // NULL-terminate, sanity check, then return the escaped string. |
| target_buf[target_size] = '\0'; |
| if (!StrHelp_utf8_valid(target_buf, target_size)) { |
| FREEMEM(target_buf); |
| CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON"); |
| Err_set_error(Err_new(mess)); |
| return NULL; |
| } |
| return CB_new_steal_from_trusted_str(target_buf, target_size, cap); |
| } |
| |
| static INLINE bool_t |
| SI_check_keyword(char *json, char* end, const char *keyword, size_t len) { |
| if (end - json > len |
| && strncmp(json, keyword, len) == 0 |
| && json[len] != '_' |
| && !isalnum(json[len]) |
| ) { |
| return true; |
| } |
| return false; |
| } |
| |
| static void |
| S_set_error(CharBuf *mess, char *json, char *limit, int line, |
| const char *func) { |
| if (func) { |
| CB_catf(mess, " at %s %s line %i32 near ", func, __FILE__, |
| (int32_t)line); |
| } |
| else { |
| CB_catf(mess, " at %s line %i32 near ", __FILE__, (int32_t)line); |
| } |
| |
| // Append escaped text. |
| int64_t len = limit - json; |
| if (len > 32) { |
| const char *end = StrHelp_back_utf8_char(json + 32, json); |
| len = end - json; |
| } |
| ZombieCharBuf *snippet = ZCB_WRAP_STR(json, len); |
| S_append_json_string((Obj*)snippet, mess); |
| |
| // Set Err_error. |
| Err_set_error(Err_new(mess)); |
| } |
| |