blob: 897ba043b05e92aa8118cb5aacc5ec4e9b9053d3 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.h
// and modified by Doris
#pragma once
#include <parallel_hashmap/phmap.h>
#include <stddef.h>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "util/jsonb_writer.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/common/uint128.h"
#include "vec/core/field.h"
#include "vec/json/path_in_data.h"
#include "vec/json/simd_json_parser.h"
namespace doris::vectorized {
template <typename Element>
Field getValueAsField(const Element& element) {
// bool will convert to type FiledType::UInt64
if (element.isBool()) {
return Field::create_field<TYPE_BOOLEAN>(element.getBool());
}
if (element.isInt64()) {
return Field::create_field<TYPE_BIGINT>(element.getInt64());
}
// doris only support signed integers at present
if (element.isUInt64()) {
return Field::create_field<TYPE_BIGINT>(element.getInt64());
}
if (element.isDouble()) {
return Field::create_field<TYPE_DOUBLE>(element.getDouble());
}
if (element.isString()) {
return Field::create_field<TYPE_STRING>(String(element.getString()));
}
if (element.isNull()) {
return Field();
}
return Field();
}
template <typename Element>
void writeValueAsJsonb(const Element& element, JsonbWriter& writer) {
// bool will convert to type FiledType::UInt64
if (element.isBool()) {
writer.writeBool(element.getBool());
return;
}
if (element.isInt64()) {
writer.writeInt64(element.getInt64());
return;
}
// doris only support signed integers at present
if (element.isUInt64()) {
writer.writeInt64(element.getInt64());
return;
}
if (element.isDouble()) {
writer.writeDouble(element.getDouble());
return;
}
if (element.isString()) {
writer.writeStartString();
std::string_view str = element.getString();
writer.writeString(str.data(), str.size());
writer.writeEndString();
return;
}
if (element.isNull()) {
writer.writeNull();
return;
}
}
template <typename Element>
std::string castValueAsString(const Element& element) {
if (element.isBool()) {
return element.getBool() ? "1" : "0";
}
if (element.isInt64()) {
return std::to_string(element.getInt64());
}
if (element.isUInt64()) {
return std::to_string(element.getUInt64());
}
if (element.isDouble()) {
return std::to_string(element.getDouble());
}
if (element.isNull()) {
return "";
}
return "";
}
enum class ExtractType {
ToString = 0,
// ...
};
struct ParseConfig {
bool enable_flatten_nested = false;
};
template <typename ParserImpl>
class JSONDataParser {
public:
using Element = typename ParserImpl::Element;
using JSONObject = typename ParserImpl::Object;
using JSONArray = typename ParserImpl::Array;
std::optional<ParseResult> parse(const char* begin, size_t length, const ParseConfig& config);
private:
struct ParseContext {
PathInDataBuilder builder;
std::vector<PathInData::Parts> paths;
std::vector<Field> values;
bool enable_flatten_nested = false;
};
using PathPartsWithArray = std::pair<PathInData::Parts, Array>;
using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>;
using KeyToSizes = phmap::flat_hash_map<StringRef, std::vector<size_t>, StringRefHash>;
struct ParseArrayContext {
size_t current_size = 0;
size_t total_size = 0;
PathToArray arrays_by_path;
KeyToSizes nested_sizes_by_key;
};
void traverse(const Element& element, ParseContext& ctx);
void traverseObject(const JSONObject& object, ParseContext& ctx);
void traverseArray(const JSONArray& array, ParseContext& ctx);
void traverseArrayElement(const Element& element, ParseArrayContext& ctx);
static void fillMissedValuesInArrays(ParseArrayContext& ctx);
static bool tryInsertDefaultFromNested(ParseArrayContext& ctx, const PathInData::Parts& path,
Array& array);
static StringRef getNameOfNested(const PathInData::Parts& path, const Field& value);
bool has_nested = false;
void check_has_nested_object(const Element& element);
void traverseAsJsonb(const Element& element, JsonbWriter& writer);
void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer);
void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer);
ParserImpl parser;
};
} // namespace doris::vectorized