// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
// and modified by Doris

#include "vec/json/json_parser.h"

#include <assert.h>
#include <fmt/format.h>
#include <glog/logging.h>

#include <algorithm>
#include <string_view>

#include "common/cast_set.h"
#include "common/config.h"
#include "common/status.h"
#include "vec/json/path_in_data.h"
#include "vec/json/simd_json_parser.h"

namespace doris::vectorized {
#include "common/compile_check_begin.h"

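// Parse a JSON document and flatten it into a set of leaf paths with their
// corresponding values. Returns an empty optional if the document cannot be parsed.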
template <typename ParserImpl>
std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
                                                             const ParseConfig& config) {
    Element document;
    if (!parser.parse(begin, length, document)) {
        return {};
    }
    ParseContext context;
    context.enable_flatten_nested = config.enable_flatten_nested;
    context.is_top_array = document.isArray();
    traverse(document, context);
    ParseResult result;
    result.values = std::move(context.values);
    result.paths.reserve(context.paths.size());
    for (auto&& path : context.paths) {
        result.paths.emplace_back(std::move(path));
    }
    return result;
}

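// Recursively traverse an element: objects are descended into key by key, arrays are
// either flattened or serialized as a single JSONB value, and scalars are recorded as
// a (path, value) pair in the context.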
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
    // checkStackSize();
    if (element.isObject()) {
        traverseObject(element.getObject(), ctx);
    } else if (element.isArray()) {
        if (ctx.has_nested_in_flatten) {
            throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
                                   "Nesting an array inside a Nested array within variant "
                                   "subcolumns is currently not supported.");
        }
        has_nested = false;
        check_has_nested_object(element);
        ctx.has_nested_in_flatten = has_nested && ctx.enable_flatten_nested;
        if (has_nested && !ctx.enable_flatten_nested) {
            // Serialize the nested array into a single JsonbField instead of flattening it.
            JsonbWriter writer;
            traverseArrayAsJsonb(element.getArray(), writer);
            ctx.paths.push_back(ctx.builder.get_parts());
            ctx.values.push_back(Field::create_field<TYPE_JSONB>(
                    JsonbField(writer.getOutput()->getBuffer(), writer.getOutput()->getSize())));
        } else {
            traverseArray(element.getArray(), ctx);
        }
        // Reset has_nested_in_flatten once this array has been traversed; otherwise it would
        // incorrectly remain true for the next array.
        ctx.has_nested_in_flatten = false;
    } else {
        ctx.paths.push_back(ctx.builder.get_parts());
        ctx.values.push_back(getValueAsField(element));
    }
}
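
// Traverse all key/value pairs of an object, appending each key to the path builder
// before descending into the corresponding value.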
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
    ctx.paths.reserve(ctx.paths.size() + object.size());
    ctx.values.reserve(ctx.values.size() + object.size());
    for (auto it = object.begin(); it != object.end(); ++it) {
        const auto& [key, value] = *it;
        if (key.size() >= std::numeric_limits<uint8_t>::max()) {
            throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
                                   "Key length must be less than 255 bytes.");
        }
        ctx.builder.append(key, false);
        traverse(value, ctx);
        ctx.builder.pop_back();
    }
}

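// Recursively scan an array element and set `has_nested` to true if any object
// is found inside it, i.e. the array contains Nested data.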
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
    if (element.isArray()) {
        const JSONArray& array = element.getArray();
        for (auto it = array.begin(); it != array.end(); ++it) {
            check_has_nested_object(*it);
        }
    }
    if (element.isObject()) {
        has_nested = true;
    }
}

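// Serialize an arbitrary element (object, array or scalar) into JSONB through the given writer.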
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
    if (element.isObject()) {
        traverseObjectAsJsonb(element.getObject(), writer);
    } else if (element.isArray()) {
        traverseArrayAsJsonb(element.getArray(), writer);
    } else {
        writeValueAsJsonb(element, writer);
    }
}

template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
                                                       JsonbWriter& writer) {
    writer.writeStartObject();
    for (auto it = object.begin(); it != object.end(); ++it) {
        const auto& [key, value] = *it;
        if (key.size() >= std::numeric_limits<uint8_t>::max()) {
            throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
                                   "Key length must be less than 255 bytes.");
        }
        writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
        traverseAsJsonb(value, writer);
    }
    writer.writeEndObject();
}

template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
    writer.writeStartArray();
    for (auto it = array.begin(); it != array.end(); ++it) {
        traverseAsJsonb(*it, writer);
    }
    writer.writeEndArray();
}

// Check whether `prefix` is a strict prefix of `parts` in PathInData::Parts.
// For example, given [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], the path "a.c" is a prefix of "a.c.b".
static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
    if (prefix.size() >= parts.size()) {
        return false;
    }
    for (size_t i = 0; i < prefix.size(); ++i) {
        if (prefix[i].key != parts[i].key) {
            return false;
        }
    }
    return true;
}

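// Flatten an array: each distinct leaf path found in its elements becomes its own array of
// values; if no paths are found, an empty Array is recorded for the current path.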
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
    /// Traverse elements of array and collect an array of fields by each path.
    ParseArrayContext array_ctx;
    array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
    array_ctx.is_top_array = ctx.is_top_array;
    array_ctx.total_size = array.size();
    for (auto it = array.begin(); it != array.end(); ++it) {
        traverseArrayElement(*it, array_ctx);
        ++array_ctx.current_size;
    }
    auto&& arrays_by_path = array_ctx.arrays_by_path;
    if (arrays_by_path.empty()) {
        ctx.paths.push_back(ctx.builder.get_parts());
        ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array()));
    } else {
        ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
        ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
        for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) {
            auto&& [path, path_array] = it->second;
            /// Merge prefix path and path of array element.
            ctx.paths.push_back(ctx.builder.append(path, true).get_parts());
            ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array)));
            ctx.builder.pop_back(path.size());
        }
    }
}

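// Traverse a single array element and merge its (path, value) pairs into the per-path arrays
// collected so far, filling defaults for paths missing in this element when the association
// between Nested arrays has to be preserved.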
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
                                                      ParseArrayContext& ctx) {
    ParseContext element_ctx;
    element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
    element_ctx.is_top_array = ctx.is_top_array;
    traverse(element, element_ctx);
    auto& [_, paths, values, flatten_nested, __, is_top_array] = element_ctx;

    if (element_ctx.has_nested_in_flatten && is_top_array) {
        checkAmbiguousStructure(ctx, paths);
    }

    size_t size = paths.size();
    size_t keys_to_update = ctx.arrays_by_path.size();

    for (size_t i = 0; i < size; ++i) {
        if (values[i].is_null()) {
            continue;
        }

        UInt128 hash = PathInData::get_parts_hash(paths[i]);
        auto found = ctx.arrays_by_path.find(hash);

        if (found != ctx.arrays_by_path.end()) {
            handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
        } else {
            handleNewPath(hash, paths[i], values[i], ctx);
        }
    }

    if (keys_to_update && !(is_top_array && ctx.has_nested_in_flatten)) {
        fillMissedValuesInArrays(ctx);
    }
}

// Check whether the structure of a top-level array is ambiguous, e.g. in
// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] the path "a.b" is both a leaf and a prefix.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
        const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
    for (auto&& current_path : paths) {
        for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
            auto&& [p, _] = it->second;
            if (is_prefix(p, current_path) || is_prefix(current_path, p)) {
                throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
                                       "Ambiguous structure of top_array nested subcolumns: {}, {}",
                                       PathInData(p).to_jsonpath(),
                                       PathInData(current_path).to_jsonpath());
            }
        }
    }
}

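// Append a value to the array of a path that has already been seen, checking that the
// sizes of arrays belonging to the same Nested stay consistent.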
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
                                                    const PathInData::Parts& path, Field& value,
                                                    ParseArrayContext& ctx,
                                                    size_t& keys_to_update) {
    auto& path_array = path_data.second;
    // For a top-level array structure we do not need to check that the current array size
    // equals ctx.current_size, because we do not maintain the association between Nested
    // arrays in that case.
    if (!(ctx.is_top_array && ctx.has_nested_in_flatten)) {
        assert(path_array.size() == ctx.current_size);
    }
    // If the current element of the array is part of a Nested, collect its size,
    // or check it against the size already collected for that Nested.
    auto nested_key = getNameOfNested(path, value);
    if (!nested_key.empty()) {
        size_t array_size = get<const Array&>(value).size();
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
        if (current_nested_sizes.size() == ctx.current_size) {
            current_nested_sizes.push_back(array_size);
        } else if (array_size != current_nested_sizes.back()) {
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                   "Array sizes mismatched ({} and {})", array_size,
                                   current_nested_sizes.back());
        }
    }

    path_array.push_back(std::move(value));
    --keys_to_update;
}

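// Register a path seen for the first time: pad its array with defaults for the elements
// processed so far (unless this is a top-level array with flattened Nested), then append
// the current value.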
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
                                               Field& value, ParseArrayContext& ctx) {
    Array path_array;
    path_array.reserve(ctx.total_size);

    // For a top-level array structure we do not need to resize the array, because we do not
    // fill default values to maintain the association between Nested arrays in that case.
    if (!(ctx.is_top_array && ctx.has_nested_in_flatten)) {
        path_array.resize(ctx.current_size);
    }

    auto nested_key = getNameOfNested(path, value);
    if (!nested_key.empty()) {
        size_t array_size = get<const Array&>(value).size();
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
        if (current_nested_sizes.empty()) {
            current_nested_sizes.resize(ctx.current_size);
        } else {
            // If the newly added element is part of the Nested, resize its elements
            // to keep the sizes of the Nested arrays consistent.
            for (size_t j = 0; j < ctx.current_size; ++j) {
                path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
            }
        }
        if (current_nested_sizes.size() == ctx.current_size) {
            current_nested_sizes.push_back(array_size);
        } else if (array_size != current_nested_sizes.back()) {
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                   "Array sizes mismatched ({} and {})", array_size,
                                   current_nested_sizes.back());
        }
    }

    path_array.push_back(std::move(value));
    auto& elem = ctx.arrays_by_path[hash];
    elem.first = std::move(path);
    elem.second = std::move(path_array);
}

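// After processing one array element, append a default value to every path that did not
// appear in that element, so that all per-path arrays stay aligned.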
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
    for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
        auto& [path, path_array] = it->second;
        assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1);
        if (path_array.size() == ctx.current_size) {
            bool inserted = tryInsertDefaultFromNested(ctx, path, path_array);
            if (!inserted) {
                path_array.emplace_back();
            }
        }
    }
}

template <typename ParserImpl>
bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
                                                            const PathInData::Parts& path,
                                                            Array& array) {
    /// If a size has already been collected for the current Nested,
    /// insert an array of that size as the default value.
    if (path.empty() || array.empty()) {
        return false;
    }
    /// Last element is not Null, because otherwise this path wouldn't exist.
    auto nested_key = getNameOfNested(path, array.back());
    if (nested_key.empty()) {
        return false;
    }
    auto mapped = ctx.nested_sizes_by_key.find(nested_key);
    if (mapped == ctx.nested_sizes_by_key.end()) {
        return false;
    }
    auto& current_nested_sizes = mapped->second;
    assert(current_nested_sizes.size() == ctx.current_size ||
           current_nested_sizes.size() == ctx.current_size + 1);
    /// If all keys of Nested were missed then add a zero length.
    if (current_nested_sizes.size() == ctx.current_size) {
        current_nested_sizes.push_back(0);
    }
    size_t array_size = current_nested_sizes.back();
    array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size)));
    return true;
}

template <typename ParserImpl>
StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
                                                      const Field& value) {
    if (value.get_type() != PrimitiveType::TYPE_ARRAY || path.empty()) {
        return {};
    }
    /// Find the first key that is marked as nested, because we may have a tuple of Nested
    /// and there could be several arrays with the same prefix but with independent sizes.
    /// Consider an array element with type `k2 Tuple(k3 Nested(...), k5 Nested(...))`:
    /// the subcolumns `k2.k3` and `k2.k5` may have independent sizes, so we should extract
    /// the keys `k3` and `k5` instead of `k2`.
    for (const auto& part : path) {
        if (part.is_nested) {
            return {part.key.data(), part.key.size()};
        }
    }
    return {};
}

#include "common/compile_check_end.h"

template class JSONDataParser<SimdJSONParser>;
} // namespace doris::vectorized