blob: 094150f3de03e6ff0ffd41f60e226a0d23247f43 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <gen_cpp/internal_service.pb.h>
#include "common/object_pool.h"
#include "exprs/filter_base.h"
#include "runtime/primitive_type.h"
#include "runtime_filter/utils.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/common/hash_table/phmap_fwd_decl.h"
namespace doris {
constexpr int FIXED_CONTAINER_MAX_SIZE = 8;
/**
* Fix Container can use simd to improve performance. 1 <= N <= 8 can be improved performance by test. FIXED_CONTAINER_MAX_SIZE = 8.
* @tparam T Element Type
* @tparam N Fixed Number
*/
template <typename T, size_t N>
class FixedContainer {
public:
using Self = FixedContainer;
using ElementType = T;
class Iterator;
FixedContainer() { static_assert(N >= 0 && N <= FIXED_CONTAINER_MAX_SIZE); }
~FixedContainer() = default;
void insert(const T& value) {
DCHECK(_size < N);
_data[_size++] = value;
}
void insert(Iterator begin, Iterator end) {
for (auto iter = begin; iter != end; ++iter) {
DCHECK(_size < N);
_data[_size++] = (*iter);
}
}
void check_size() {
if (N != _size) {
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
"invalid size of FixedContainer<{}>: {}", N, _size);
}
}
// Use '|' instead of '||' has better performance by test.
ALWAYS_INLINE bool find(const T& value) const {
DCHECK_EQ(N, _size);
if constexpr (N == 0) {
return false;
}
if constexpr (N == 1) {
return (value == _data[0]);
}
if constexpr (N == 2) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]);
}
if constexpr (N == 3) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]);
}
if constexpr (N == 4) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]);
}
if constexpr (N == 5) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]) |
(uint8_t)(value == _data[4]);
}
if constexpr (N == 6) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]) |
(uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5]);
}
if constexpr (N == 7) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]) |
(uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5]) |
(uint8_t)(value == _data[6]);
}
if constexpr (N == FIXED_CONTAINER_MAX_SIZE) {
return (uint8_t)(value == _data[0]) | (uint8_t)(value == _data[1]) |
(uint8_t)(value == _data[2]) | (uint8_t)(value == _data[3]) |
(uint8_t)(value == _data[4]) | (uint8_t)(value == _data[5]) |
(uint8_t)(value == _data[6]) | (uint8_t)(value == _data[7]);
}
CHECK(false) << "unreachable path";
return false;
}
size_t size() const { return _size; }
class Iterator {
public:
explicit Iterator(std::array<T, N>& data, size_t index) : _data(data), _index(index) {}
Iterator& operator++() {
++_index;
return *this;
}
Iterator operator++(int) {
Iterator ret_val = *this;
++(*this);
return ret_val;
}
bool operator==(Iterator other) const { return _index == other._index; }
bool operator!=(Iterator other) const { return !(*this == other); }
T& operator*() const { return _data[_index]; }
T* operator->() const { return &operator*(); }
// iterator traits
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = T;
using pointer = T*;
using reference = T&;
private:
std::array<T, N>& _data;
size_t _index;
};
Iterator begin() { return Iterator(_data, 0); }
Iterator end() { return Iterator(_data, _size); }
void clear() {
std::array<T, N> {}.swap(_data);
_size = 0;
}
private:
std::array<T, N> _data;
size_t _size {};
};
template <typename T>
struct IsFixedContainer : std::false_type {};
template <typename T, size_t N>
struct IsFixedContainer<FixedContainer<T, N>> : std::true_type {};
/**
* Dynamic Container uses phmap::flat_hash_set.
* @tparam T Element Type
*/
template <typename T>
class DynamicContainer {
public:
using Self = DynamicContainer;
using Iterator = typename vectorized::flat_hash_set<T>::iterator;
using ElementType = T;
DynamicContainer() = default;
~DynamicContainer() = default;
void insert(const T& value) { _set.insert(value); }
void insert(Iterator begin, Iterator end) { _set.insert(begin, end); }
bool find(const T& value) const { return _set.contains(value); }
void clear() { _set.clear(); }
Iterator begin() { return _set.begin(); }
Iterator end() { return _set.end(); }
size_t size() const { return _set.size(); }
private:
vectorized::flat_hash_set<T> _set;
};
// TODO Maybe change void* parameter to template parameter better.
class HybridSetBase : public FilterBase {
public:
HybridSetBase(bool null_aware) : FilterBase(null_aware) {}
virtual ~HybridSetBase() = default;
virtual void insert(const void* data) = 0;
// use in vectorize execute engine
virtual void insert(void* data, size_t) = 0;
virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0;
virtual void insert(HybridSetBase* set) {
HybridSetBase::IteratorBase* iter = set->begin();
while (iter->has_next()) {
const void* value = iter->get_value();
insert(value);
iter->next();
}
_contain_null |= set->_contain_null;
}
virtual void clear() = 0;
bool empty() { return !_contain_null && size() == 0; }
virtual int size() = 0;
virtual bool find(const void* data) const = 0;
// use in vectorize execute engine
virtual bool find(const void* data, size_t) const = 0;
virtual void find_batch(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) = 0;
virtual void find_batch_negative(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) = 0;
virtual void find_batch_nullable(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) = 0;
virtual void find_batch_nullable_negative(
const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) = 0;
virtual void to_pb(PInFilter* filter) = 0;
class IteratorBase {
public:
IteratorBase() = default;
virtual ~IteratorBase() = default;
virtual const void* get_value() = 0;
virtual bool has_next() const = 0;
virtual void next() = 0;
};
virtual IteratorBase* begin() = 0;
};
template <PrimitiveType T,
typename _ContainerType = DynamicContainer<typename PrimitiveTypeTraits<T>::CppType>,
typename _ColumnType = typename PrimitiveTypeTraits<T>::ColumnType>
class HybridSet : public HybridSetBase {
public:
using ContainerType = _ContainerType;
using ElementType = typename ContainerType::ElementType;
using ColumnType = _ColumnType;
HybridSet(bool null_aware) : HybridSetBase(null_aware) {}
~HybridSet() override = default;
void insert(const void* data) override {
if (data == nullptr) {
_contain_null = true;
return;
}
_set.insert(*reinterpret_cast<const ElementType*>(data));
}
void clear() override { _set.clear(); }
void insert(void* data, size_t /*unused*/) override { insert(data); }
void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override {
const auto size = column->size();
if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& col = nullable->get_nested_column();
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();
const ElementType* data = (ElementType*)col.get_raw_data().data;
for (size_t i = start; i < size; i++) {
if (!nullmap[i]) {
_set.insert(*(data + i));
} else {
_contain_null = true;
}
}
} else {
const ElementType* data = (ElementType*)column->get_raw_data().data;
for (size_t i = start; i < size; i++) {
_set.insert(*(data + i));
}
}
}
int size() override { return _set.size(); }
bool find(const void* data) const override {
return _set.find(*reinterpret_cast<const ElementType*>(data));
}
bool find(const void* data, size_t /*unused*/) const override { return find(data); }
void find_batch(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, false>(column, rows, nullptr, results);
}
void find_batch_negative(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, true>(column, rows, nullptr, results);
}
void find_batch_nullable(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, false>(column, rows, &null_map, results);
}
void find_batch_nullable_negative(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, true>(column, rows, &null_map, results);
}
template <bool is_nullable, bool is_negative>
void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap* null_map,
doris::vectorized::ColumnUInt8::Container& results) {
auto& col = assert_cast<const ColumnType&>(column);
const auto* __restrict data = (ElementType*)col.get_data().data();
const uint8_t* __restrict null_map_data;
if constexpr (is_nullable) {
null_map_data = null_map->data();
}
if constexpr (IsFixedContainer<ContainerType>::value) {
_set.check_size();
}
auto* __restrict result_data = results.data();
for (size_t i = 0; i < rows; ++i) {
if constexpr (!is_nullable && !is_negative) {
result_data[i] = _set.find(data[i]);
} else if constexpr (!is_nullable && is_negative) {
result_data[i] = !_set.find(data[i]);
} else if constexpr (is_nullable && !is_negative) {
result_data[i] = _set.find(data[i]) & (!null_map_data[i]);
} else { // (is_nullable && is_negative)
result_data[i] = !(_set.find(data[i]) & (!null_map_data[i]));
}
}
}
class Iterator : public IteratorBase {
public:
Iterator(typename ContainerType::Iterator begin, typename ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
const void* get_value() override { return _begin.operator->(); }
void next() override { ++_begin; }
private:
typename ContainerType::Iterator _begin;
typename ContainerType::Iterator _end;
};
IteratorBase* begin() override {
return _pool.add(new (std::nothrow) Iterator(_set.begin(), _set.end()));
}
void set_pb(PInFilter* filter, auto f) {
for (auto v : _set) {
f(filter->add_values(), v);
}
}
void to_pb(PInFilter* filter) override { set_pb(filter, get_convertor<ElementType>()); }
private:
ContainerType _set;
ObjectPool _pool;
};
template <typename _ContainerType = DynamicContainer<std::string>>
class StringSet : public HybridSetBase {
public:
using ContainerType = _ContainerType;
StringSet(bool null_aware) : HybridSetBase(null_aware) {}
~StringSet() override = default;
void clear() override { _set.clear(); }
void insert(const void* data) override {
if (data == nullptr) {
_contain_null = true;
return;
}
const auto* value = reinterpret_cast<const StringRef*>(data);
std::string str_value(value->data, value->size);
_set.insert(str_value);
}
void insert(void* data, size_t size) override {
if (data == nullptr) {
insert(nullptr);
} else {
std::string str_value(reinterpret_cast<char*>(data), size);
_set.insert(str_value);
}
}
void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start,
size_t end) {
for (size_t i = start; i < end; i++) {
if (nullmap == nullptr || !nullmap[i]) {
_set.insert(col.get_data_at(i).to_string());
} else {
_contain_null = true;
}
}
}
void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();
if (nullable->get_nested_column().is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(
nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
} else {
_insert_fixed_len_string(
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
}
} else {
if (column->is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, column->size());
} else {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString&>(*column),
nullptr, start, column->size());
}
}
}
int size() override { return _set.size(); }
bool find(const void* data) const override {
const auto* value = reinterpret_cast<const StringRef*>(data);
std::string str_value(const_cast<const char*>(value->data), value->size);
return _set.find(str_value);
}
bool find(const void* data, size_t size) const override {
std::string str_value(reinterpret_cast<const char*>(data), size);
return _set.find(str_value);
}
void find_batch(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, false>(column, rows, nullptr, results);
}
void find_batch_negative(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, true>(column, rows, nullptr, results);
}
void find_batch_nullable(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, false>(column, rows, &null_map, results);
}
void find_batch_nullable_negative(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, true>(column, rows, &null_map, results);
}
template <bool is_nullable, bool is_negative>
void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap* null_map,
doris::vectorized::ColumnUInt8::Container& results) {
const auto& col = assert_cast<const doris::vectorized::ColumnString&>(column);
const uint8_t* __restrict null_map_data;
if constexpr (is_nullable) {
null_map_data = null_map->data();
}
if constexpr (IsFixedContainer<ContainerType>::value) {
_set.check_size();
}
auto* __restrict result_data = results.data();
for (size_t i = 0; i < rows; ++i) {
const auto& string_data = col.get_data_at(i).to_string();
if constexpr (!is_nullable && !is_negative) {
result_data[i] = _set.find(string_data);
} else if constexpr (!is_nullable && is_negative) {
result_data[i] = !_set.find(string_data);
} else if constexpr (is_nullable && !is_negative) {
result_data[i] = _set.find(string_data) & (!null_map_data[i]);
} else { // (is_nullable && is_negative)
result_data[i] = !(_set.find(string_data) & (!null_map_data[i]));
}
}
}
class Iterator : public IteratorBase {
public:
Iterator(typename ContainerType::Iterator begin, typename ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
const void* get_value() override {
_value.data = const_cast<char*>(_begin->data());
_value.size = _begin->length();
return &_value;
}
void next() override { ++_begin; }
private:
typename ContainerType::Iterator _begin;
typename ContainerType::Iterator _end;
StringRef _value;
};
IteratorBase* begin() override {
return _pool.add(new (std::nothrow) Iterator(_set.begin(), _set.end()));
}
void set_pb(PInFilter* filter, auto f) {
for (const auto& v : _set) {
f(filter->add_values(), v);
}
}
void to_pb(PInFilter* filter) override { set_pb(filter, get_convertor<std::string>()); }
private:
ContainerType _set;
ObjectPool _pool;
};
// note: Two difference from StringSet
// 1 StringRef has better comparison performance than std::string
// 2 std::string keeps its own memory, bug StringRef just keeps ptr and len, so you the caller should manage memory of StringRef
template <typename _ContainerType = DynamicContainer<StringRef>>
class StringValueSet : public HybridSetBase {
public:
using ContainerType = _ContainerType;
StringValueSet(bool null_aware) : HybridSetBase(null_aware) {}
~StringValueSet() override = default;
void clear() override { _set.clear(); }
void insert(const void* data) override {
if (data == nullptr) {
_contain_null = true;
return;
}
const auto* value = reinterpret_cast<const StringRef*>(data);
StringRef sv(value->data, value->size);
_set.insert(sv);
}
void insert(void* data, size_t size) override {
if (data == nullptr) {
insert(nullptr);
} else {
StringRef sv(reinterpret_cast<char*>(data), size);
_set.insert(sv);
}
}
void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start,
size_t end) {
for (size_t i = start; i < end; i++) {
if (nullmap == nullptr || !nullmap[i]) {
_set.insert(col.get_data_at(i));
} else {
_contain_null = true;
}
}
}
void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();
if (nullable->get_nested_column().is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(
nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
} else {
_insert_fixed_len_string(
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
}
} else {
if (column->is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, column->size());
} else {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString&>(*column),
nullptr, start, column->size());
}
}
}
int size() override { return _set.size(); }
bool find(const void* data) const override {
const auto* value = reinterpret_cast<const StringRef*>(data);
return _set.find(*value);
}
bool find(const void* data, size_t size) const override {
StringRef sv(reinterpret_cast<const char*>(data), size);
return _set.find(sv);
}
void find_batch(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, false>(column, rows, nullptr, results);
}
void find_batch_negative(const doris::vectorized::IColumn& column, size_t rows,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<false, true>(column, rows, nullptr, results);
}
void find_batch_nullable(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, false>(column, rows, &null_map, results);
}
void find_batch_nullable_negative(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap& null_map,
doris::vectorized::ColumnUInt8::Container& results) override {
_find_batch<true, true>(column, rows, &null_map, results);
}
template <bool is_nullable, bool is_negative>
void _find_batch(const doris::vectorized::IColumn& column, size_t rows,
const doris::vectorized::NullMap* null_map,
doris::vectorized::ColumnUInt8::Container& results) {
const auto& col = assert_cast<const doris::vectorized::ColumnString&>(column);
const auto& offset = col.get_offsets();
const uint8_t* __restrict data = col.get_chars().data();
auto* __restrict cursor = const_cast<uint8_t*>(data);
const uint8_t* __restrict null_map_data;
if constexpr (is_nullable) {
null_map_data = null_map->data();
}
if constexpr (IsFixedContainer<ContainerType>::value) {
_set.check_size();
}
auto* __restrict result_data = results.data();
for (size_t i = 0; i < rows; ++i) {
uint32_t len = offset[i] - offset[i - 1];
if constexpr (!is_nullable && !is_negative) {
result_data[i] = _set.find(StringRef(cursor, len));
} else if constexpr (!is_nullable && is_negative) {
result_data[i] = !_set.find(StringRef(cursor, len));
} else if constexpr (is_nullable && !is_negative) {
result_data[i] = (!null_map_data[i]) & _set.find(StringRef(cursor, len));
} else { // (is_nullable && is_negative)
result_data[i] = !((!null_map_data[i]) & _set.find(StringRef(cursor, len)));
}
cursor += len;
}
}
class Iterator : public IteratorBase {
public:
Iterator(typename ContainerType::Iterator begin, typename ContainerType::Iterator end)
: _begin(begin), _end(end) {}
~Iterator() override = default;
bool has_next() const override { return !(_begin == _end); }
const void* get_value() override {
_value.data = const_cast<char*>(_begin->data);
_value.size = _begin->size;
return &_value;
}
void next() override { ++_begin; }
private:
typename ContainerType::Iterator _begin;
typename ContainerType::Iterator _end;
StringRef _value;
};
IteratorBase* begin() override {
return _pool.add(new (std::nothrow) Iterator(_set.begin(), _set.end()));
}
void to_pb(PInFilter* filter) override {
throw Exception(ErrorCode::INTERNAL_ERROR, "StringValueSet do not support to_pb");
}
private:
ContainerType _set;
ObjectPool _pool;
};
} // namespace doris