blob: ece5f5bc8b502ce5923c0a00471c117ab2fcfc2b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/SubcolumnsTree.h
// and modified by Doris
#pragma once
#include <memory>
#include <unordered_map>
#include "runtime/exec_env.h"
#include "runtime/thread_context.h"
#include "vec/columns/column.h"
#include "vec/common/arena.h"
#include "vec/common/string_ref.h"
#include "vec/json/path_in_data.h"
namespace doris::vectorized {
// Tree that represents paths in document with additional data in nodes.
// IsShared mean this object shared above multiple tasks, need swtich to subcolumns_tree_tracker
template <typename NodeData, bool IsShared>
class SubcolumnsTree {
public:
struct Node {
enum Kind { TUPLE, NESTED, SCALAR };
explicit Node(Kind kind_) : kind(kind_) {}
Node(Kind kind_, const NodeData& data_) : kind(kind_), data(data_) {}
Node(Kind kind_, const NodeData& data_, const PathInData& path_)
: kind(kind_), data(data_), path(path_) {}
Node(Kind kind_, NodeData&& data_) : kind(kind_), data(std::move(data_)) {}
Node(Kind kind_, NodeData&& data_, const PathInData& path_)
: kind(kind_), data(std::move(data_)), path(path_) {}
Kind kind = TUPLE;
const Node* parent = nullptr;
std::unordered_map<StringRef, std::shared_ptr<Node>, StringRefHash> children;
NodeData data;
PathInData path;
bool is_nested() const { return kind == NESTED; }
bool is_scalar() const { return kind == SCALAR; }
bool is_leaf_node() const { return kind == SCALAR && children.empty(); }
// Only modify data and kind
void modify(std::shared_ptr<Node>&& other) {
data = std::move(other->data);
kind = other->kind;
path = other->path;
}
// modify data and kind
void modify_to_scalar(NodeData&& other_data) {
data = std::move(other_data);
kind = Kind::SCALAR;
}
void add_child(std::string_view key, std::shared_ptr<Node> next_node, Arena& strings_pool) {
next_node->parent = this;
StringRef key_ref;
if constexpr (IsShared) {
SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(
ExecEnv::GetInstance()->subcolumns_tree_tracker());
key_ref = {strings_pool.insert(key.data(), key.length()), key.length()};
} else {
key_ref = {strings_pool.insert(key.data(), key.length()), key.length()};
}
children[key_ref] = std::move(next_node);
}
std::vector<StringRef> get_sorted_chilren_keys() const {
std::vector<StringRef> sorted_keys;
for (auto it = children.begin(); it != children.end(); ++it) {
sorted_keys.push_back(it->first);
}
std::sort(sorted_keys.begin(), sorted_keys.end());
return sorted_keys;
}
std::shared_ptr<const Node> get_child_node(StringRef key) const {
auto it = children.find(key);
if (it != children.end()) {
return it->second;
}
return nullptr;
}
};
using NodeKind = typename Node::Kind;
using NodePtr = std::shared_ptr<Node>;
/// Add a leaf without any data in other nodes.
bool add(const PathInData& path, const NodeData& leaf_data) {
return add(path, [&](NodeKind kind, bool exists) -> NodePtr {
if (exists) {
return nullptr;
}
if (kind == Node::SCALAR) {
return std::make_shared<Node>(kind, leaf_data, path);
}
return std::make_shared<Node>(kind);
});
}
bool add(const PathInData& path, NodeData&& leaf_data) {
return add(path, [&](NodeKind kind, bool exists) -> NodePtr {
if (exists) {
return nullptr;
}
if (kind == Node::SCALAR) {
return std::make_shared<Node>(kind, std::move(leaf_data), path);
}
return std::make_shared<Node>(kind);
});
}
/// Callback for creation of node. Receives kind of node and
/// flag, which is true if node already exists.
using NodeCreator = std::function<NodePtr(NodeKind, bool)>;
// create root as SCALAR node
void create_root(NodeData&& leaf_data) {
root = std::make_shared<Node>(Node::SCALAR, std::move(leaf_data));
leaves.push_back(root);
}
// create root as SCALAR node
void create_root(const NodeData& leaf_data) {
root = std::make_shared<Node>(Node::SCALAR, std::move(leaf_data));
leaves.push_back(root);
}
void add_leaf(const NodePtr& node) { leaves.push_back(node); }
bool add(const PathInData& path, const NodeCreator& node_creator) {
const auto& parts = path.get_parts();
if (parts.empty()) {
return false;
}
if (!root) {
root = std::make_shared<Node>(Node::TUPLE);
}
Node* current_node = root.get();
for (size_t i = 0; i < parts.size() - 1; ++i) {
auto it = current_node->children.find(
StringRef {parts[i].key.data(), parts[i].key.size()});
if (it != current_node->children.end()) {
current_node = it->second.get();
node_creator(current_node->kind, true);
} else {
auto next_kind = parts[i].is_nested ? Node::NESTED : Node::TUPLE;
auto next_node = node_creator(next_kind, false);
current_node->add_child(String(parts[i].key), next_node, *strings_pool);
current_node = next_node.get();
}
}
auto it = current_node->children.find(
StringRef {parts.back().key.data(), parts.back().key.size()});
if (it != current_node->children.end()) {
// Modify this node to Node::SCALAR
auto new_node = node_creator(Node::SCALAR, false);
it->second->modify(std::move(new_node));
leaves.push_back(it->second);
return true;
}
auto next_node = node_creator(Node::SCALAR, false);
current_node->add_child(String(parts.back().key), next_node, *strings_pool);
leaves.push_back(std::move(next_node));
return true;
}
/// Find node that matches the path the best.
const Node* find_best_match(const PathInData& path) const { return find_impl(path, false); }
using NodePredicate = std::function<bool(const Node&)>;
/// Finds leaf that satisfies the predicate.
const Node* find_leaf(const NodePredicate& predicate) {
return find_leaf(root.get(), predicate);
}
/// Find node that matches the path exactly.
const Node* find_exact(const PathInData& path) const { return find_impl(path, true); }
static const Node* find_leaf(const Node* node, const NodePredicate& predicate) {
if (!node) {
return nullptr;
}
if (node->is_scalar()) {
return predicate(*node) ? node : nullptr;
}
for (auto it = node->children.begin(); it != node->children.end(); ++it) {
auto child = it->second;
if (const auto* leaf = find_leaf(child.get(), predicate)) {
return leaf;
}
}
return nullptr;
}
/// Find leaf by path.
const Node* find_leaf(const PathInData& path) const {
const auto* candidate = find_exact(path);
if (!candidate || !candidate->is_scalar()) {
return nullptr;
}
return candidate;
}
const Node* get_leaf_of_the_same_nested(const PathInData& path,
const NodePredicate& pred) const {
if (!path.has_nested_part()) {
return nullptr;
}
const auto* current_node = find_leaf(path);
const Node* leaf = nullptr;
while (current_node) {
/// Try to find the first Nested up to the current node.
const auto* node_nested = find_parent(current_node, [](const auto& candidate) -> bool {
return candidate.is_nested();
});
if (!node_nested) {
break;
}
/// Find the leaf with subcolumn that contains values
/// for the last rows.
/// If there are no leaves, skip current node and find
/// the next node up to the current.
leaf = SubcolumnsTree<NodeData, IsShared>::find_leaf(node_nested, pred);
if (leaf) {
break;
}
current_node = node_nested->parent;
}
return leaf;
}
/// Find first parent node that satisfies the predicate.
static const Node* find_parent(const Node* node, const NodePredicate& predicate) {
while (node && !predicate(*node)) {
node = node->parent;
}
return node;
}
bool empty() const { return root == nullptr; }
size_t size() const { return leaves.size(); }
using Nodes = std::vector<NodePtr>;
const Nodes& get_leaves() const { return leaves; }
const Node* get_root() const { return root.get(); }
const NodePtr& get_root_ptr() const { return root; }
Node* get_mutable_root() { return root.get(); }
static void get_leaves_of_node(const Node* node, std::vector<const Node*>& nodes,
vectorized::PathsInData& paths) {
if (node->is_scalar()) {
nodes.push_back(node);
paths.push_back(node->path);
}
for (auto it = node->children.begin(); it != node->children.end(); ++it) {
auto child = it->second;
get_leaves_of_node(child.get(), nodes, paths);
}
}
using iterator = typename Nodes::iterator;
using const_iterator = typename Nodes::const_iterator;
iterator begin() { return leaves.begin(); }
iterator end() { return leaves.end(); }
const_iterator begin() const { return leaves.begin(); }
const_iterator end() const { return leaves.end(); }
~SubcolumnsTree() {
if constexpr (IsShared) {
SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(
ExecEnv::GetInstance()->subcolumns_tree_tracker());
strings_pool.reset();
} else {
strings_pool.reset();
}
}
SubcolumnsTree() {
if constexpr (IsShared) {
SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(
ExecEnv::GetInstance()->subcolumns_tree_tracker());
SCOPED_SKIP_MEMORY_CHECK();
strings_pool = std::make_shared<Arena>();
} else {
SCOPED_SKIP_MEMORY_CHECK();
strings_pool = std::make_shared<Arena>();
}
}
private:
const Node* find_impl(const PathInData& path, bool find_exact) const {
if (!root) {
return nullptr;
}
const auto& parts = path.get_parts();
const Node* current_node = root.get();
for (const auto& part : parts) {
auto it = current_node->children.find(StringRef {part.key.data(), part.key.size()});
if (it == current_node->children.end()) {
return find_exact ? nullptr : current_node;
}
current_node = it->second.get();
}
return current_node;
}
std::shared_ptr<Arena> strings_pool;
NodePtr root;
Nodes leaves;
};
} // namespace doris::vectorized