blob: 82e0542cb75211cf6600aa48a21806e0745e58a3 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/StringHashTable.h
// and modified by Doris
#pragma once
#include <new>
#include <variant>
#include "vec/common/hash_table/hash.h"
#include "vec/common/hash_table/hash_table.h"
#include "vec/common/memcpy_small.h"
using StringKey2 = doris::vectorized::UInt16;
using StringKey4 = doris::vectorized::UInt32;
using StringKey8 = doris::vectorized::UInt64;
using StringKey16 = doris::vectorized::UInt128;
struct StringHashMapSubKeys {
using T1 = StringKey2;
using T2 = StringKey4;
using T3 = StringKey8;
using T4 = StringKey16;
};
template <typename StringKey>
StringKey to_string_key(const doris::StringRef& key) {
DCHECK_LE(key.size, sizeof(StringKey));
StringKey string_key {};
memcpy_small<sizeof(StringKey)>((char*)&string_key, key.data, key.size);
return string_key;
}
template <typename T>
inline doris::StringRef ALWAYS_INLINE to_string_ref(const T& n) {
assert(n != 0);
return {reinterpret_cast<const char*>(&n), sizeof(T) - (__builtin_clzll(n) >> 3)};
}
inline doris::StringRef ALWAYS_INLINE to_string_ref(const StringKey16& n) {
assert(n.items[1] != 0);
return {reinterpret_cast<const char*>(&n), 16UL - (__builtin_clzll(n.items[1]) >> 3)};
}
struct StringHashTableHash {
#if defined(__SSE4_2__) || defined(__aarch64__)
template <typename T>
size_t ALWAYS_INLINE operator()(T key) const {
size_t res = -1ULL;
res = _mm_crc32_u64(res, key);
return res;
}
size_t ALWAYS_INLINE operator()(StringKey16 key) const {
size_t res = -1ULL;
res = _mm_crc32_u64(res, key.low());
res = _mm_crc32_u64(res, key.high());
return res;
}
#else
template <typename T>
size_t ALWAYS_INLINE operator()(T key) const {
return util_hash::CityHash64(reinterpret_cast<const char*>(&key), sizeof(T));
}
#endif
size_t ALWAYS_INLINE operator()(doris::StringRef key) const {
if (key.size == 0) {
return 0;
} else if (key.size <= sizeof(StringHashMapSubKeys::T1)) {
return StringHashTableHash()(to_string_key<StringHashMapSubKeys::T1>(key));
} else if (key.size <= sizeof(StringHashMapSubKeys::T2)) {
return StringHashTableHash()(to_string_key<StringHashMapSubKeys::T2>(key));
} else if (key.size <= sizeof(StringHashMapSubKeys::T3)) {
return StringHashTableHash()(to_string_key<StringHashMapSubKeys::T3>(key));
} else if (key.size <= sizeof(StringHashMapSubKeys::T4)) {
return StringHashTableHash()(to_string_key<StringHashMapSubKeys::T4>(key));
}
return doris::StringRefHash()(key);
}
};
template <typename Cell>
struct StringHashTableEmpty {
using Self = StringHashTableEmpty;
bool _has_zero = false;
std::aligned_storage_t<sizeof(Cell), alignof(Cell)>
zero_value_storage; /// Storage of element with zero key.
public:
bool has_zero() const { return _has_zero; }
void set_has_zero(const typename Cell::key_type& key) {
_has_zero = true;
new (zero_value()) Cell();
zero_value()->value.first = key;
}
void set_has_zero(const Cell& other) {
_has_zero = true;
new (zero_value()) Cell(other);
}
void clear_has_zero() {
_has_zero = false;
if (!std::is_trivially_destructible_v<Cell>) {
zero_value()->~Cell();
}
}
Cell* zero_value() { return std::launder(reinterpret_cast<Cell*>(&zero_value_storage)); }
const Cell* zero_value() const {
return std::launder(reinterpret_cast<const Cell*>(&zero_value_storage));
}
using LookupResult = Cell*;
using ConstLookupResult = const Cell*;
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder&& key, LookupResult& it, bool& inserted, size_t = 0) {
if (!has_zero()) {
set_has_zero(key);
inserted = true;
} else {
inserted = false;
}
it = zero_value();
}
class Constructor {
public:
friend struct StringHashTableEmpty;
template <typename... Args>
void operator()(Args&&... args) const {
new (_cell) Cell(std::forward<Args>(args)...);
}
private:
Constructor(Cell* cell) : _cell(cell) {}
Cell* _cell = nullptr;
};
template <typename KeyHolder, typename Func, typename Origin>
void ALWAYS_INLINE lazy_emplace_with_origin(KeyHolder&& key, Origin&& origin, LookupResult& it,
size_t hash_value, Func&& f) {
if (!has_zero()) {
set_has_zero(key);
std::forward<Func>(f)(Constructor(zero_value()), key, origin);
}
it = zero_value();
}
template <typename Key>
LookupResult ALWAYS_INLINE find(const Key&, size_t = 0) {
return has_zero() ? zero_value() : nullptr;
}
template <typename Key>
ConstLookupResult ALWAYS_INLINE find(const Key&, size_t = 0) const {
return has_zero() ? zero_value() : nullptr;
}
size_t size() const { return has_zero() ? 1 : 0; }
bool empty() const { return !has_zero(); }
size_t get_buffer_size_in_bytes() const { return sizeof(Cell); }
};
template <size_t initial_size_degree = 8>
struct StringHashTableGrower : public HashTableGrowerWithPrecalculation<initial_size_degree> {
// Smooth growing for string maps
void increase_size() { this->increase_size_degree(1); }
};
template <typename Mapped>
struct StringHashTableLookupResult {
Mapped* mapped_ptr;
StringHashTableLookupResult() : mapped_ptr(nullptr) {}
StringHashTableLookupResult(Mapped* mapped_ptr_) : mapped_ptr(mapped_ptr_) {}
StringHashTableLookupResult(std::nullptr_t) {}
auto& get_mapped() { return *mapped_ptr; }
auto& operator*() { return *this; }
auto& operator*() const { return *this; }
auto* operator->() { return this; }
auto* operator->() const { return this; }
explicit operator bool() const { return mapped_ptr; }
friend bool operator==(const StringHashTableLookupResult& a, const std::nullptr_t&) {
return !a.mapped_ptr;
}
friend bool operator==(const std::nullptr_t&, const StringHashTableLookupResult& b) {
return !b.mapped_ptr;
}
friend bool operator!=(const StringHashTableLookupResult& a, const std::nullptr_t&) {
return a.mapped_ptr;
}
friend bool operator!=(const std::nullptr_t&, const StringHashTableLookupResult& b) {
return b.mapped_ptr;
}
};
template <typename Mapped>
ALWAYS_INLINE inline auto lookup_result_get_mapped(StringHashTableLookupResult<Mapped> cell) {
return &cell.get_mapped();
}
template <typename SubMaps>
class StringHashTable : private boost::noncopyable {
protected:
// Map for storing empty string
using T0 = typename SubMaps::T0;
// Short strings are stored as numbers
using T1 = typename SubMaps::T1;
using T2 = typename SubMaps::T2;
using T3 = typename SubMaps::T3;
using T4 = typename SubMaps::T4;
// Long strings are stored as doris::StringRef along with saved hash
using Ts = typename SubMaps::Ts;
using Self = StringHashTable;
T0 m0;
T1 m1;
T2 m2;
T3 m3;
T4 m4;
Ts ms;
using Cell = typename Ts::cell_type;
template <typename Derived, bool is_const>
class iterator_base {
using Container = std::conditional_t<is_const, const Self, Self>;
Container* container = nullptr;
int sub_table_index;
typename T1::iterator iterator1;
typename T2::iterator iterator2;
typename T3::iterator iterator3;
typename T4::iterator iterator4;
typename Ts::iterator iterator5;
typename Ts::cell_type cell;
friend class StringHashTable;
public:
iterator_base() = default;
iterator_base(Container* container_, bool end = false) : container(container_) {
if (end) {
sub_table_index = 5;
iterator5 = container->ms.end();
} else {
sub_table_index = 0;
if (container->m0.size() == 0) {
sub_table_index++;
} else {
return;
}
iterator1 = container->m1.begin();
if (iterator1 == container->m1.end()) {
sub_table_index++;
} else {
return;
}
iterator2 = container->m2.begin();
if (iterator2 == container->m2.end()) {
sub_table_index++;
} else {
return;
}
iterator3 = container->m3.begin();
if (iterator3 == container->m3.end()) {
sub_table_index++;
} else {
return;
}
iterator4 = container->m4.begin();
if (iterator4 == container->m4.end()) {
sub_table_index++;
} else {
return;
}
iterator5 = container->ms.begin();
}
}
bool operator==(const iterator_base& rhs) const {
if (sub_table_index != rhs.sub_table_index) {
return false;
}
switch (sub_table_index) {
case 0: {
return true;
}
case 1: {
return iterator1 == rhs.iterator1;
}
case 2: {
return iterator2 == rhs.iterator2;
}
case 3: {
return iterator3 == rhs.iterator3;
}
case 4: {
return iterator4 == rhs.iterator4;
}
case 5: {
return iterator5 == rhs.iterator5;
}
}
throw doris::Exception(doris::Status::FatalError("__builtin_unreachable"));
}
bool operator!=(const iterator_base& rhs) const { return !(*this == rhs); }
Derived& operator++() {
bool need_switch_to_next = false;
switch (sub_table_index) {
case 0: {
need_switch_to_next = true;
break;
}
case 1: {
++iterator1;
if (iterator1 == container->m1.end()) {
need_switch_to_next = true;
}
break;
}
case 2: {
++iterator2;
if (iterator2 == container->m2.end()) {
need_switch_to_next = true;
}
break;
}
case 3: {
++iterator3;
if (iterator3 == container->m3.end()) {
need_switch_to_next = true;
}
break;
}
case 4: {
++iterator4;
if (iterator4 == container->m4.end()) {
need_switch_to_next = true;
}
break;
}
case 5: {
++iterator5;
break;
}
}
while (need_switch_to_next) {
sub_table_index++;
need_switch_to_next = false;
switch (sub_table_index) {
case 1: {
iterator1 = container->m1.begin();
if (iterator1 == container->m1.end()) {
need_switch_to_next = true;
}
break;
}
case 2: {
iterator2 = container->m2.begin();
if (iterator2 == container->m2.end()) {
need_switch_to_next = true;
}
break;
}
case 3: {
iterator3 = container->m3.begin();
if (iterator3 == container->m3.end()) {
need_switch_to_next = true;
}
break;
}
case 4: {
iterator4 = container->m4.begin();
if (iterator4 == container->m4.end()) {
need_switch_to_next = true;
}
break;
}
case 5: {
iterator5 = container->ms.begin();
break;
}
}
}
return static_cast<Derived&>(*this);
}
auto& operator*() const {
switch (sub_table_index) {
case 0: {
this->cell = *(container->m0.zero_value());
break;
}
case 1: {
this->cell = *iterator1;
break;
}
case 2: {
this->cell = *iterator2;
break;
}
case 3: {
this->cell = *iterator3;
break;
}
case 4: {
this->cell = *iterator4;
break;
}
case 5: {
this->cell = *iterator5;
break;
}
}
return cell;
}
auto* operator->() const { return &(this->operator*()); }
auto get_ptr() const { return &(this->operator*()); }
size_t get_hash() const {
switch (sub_table_index) {
case 0: {
return container->m0.zero_value()->get_hash(container->m0);
}
case 1: {
return iterator1->get_hash(container->m1);
}
case 2: {
return iterator2->get_hash(container->m2);
}
case 3: {
return iterator3->get_hash(container->m3);
}
case 4: {
return iterator4->get_hash(container->m4);
}
case 5: {
return iterator5->get_hash(container->ms);
}
}
}
/**
* A hack for HashedDictionary.
*
* The problem: std-like find() returns an iterator, which has to be
* compared to end(). On the other hand, HashMap::find() returns
* LookupResult, which is compared to nullptr. HashedDictionary has to
* support both hash maps with the same code, hence the need for this
* hack.
*
* The proper way would be to remove iterator interface from our
* HashMap completely, change all its users to the existing internal
* iteration interface, and redefine end() to return LookupResult for
* compatibility with std find(). Unfortunately, now is not the time to
* do this.
*/
operator Cell*() const { return nullptr; }
};
public:
using Key = doris::StringRef;
using key_type = Key;
using mapped_type = typename Ts::mapped_type;
using value_type = typename Ts::value_type;
using cell_type = typename Ts::cell_type;
using LookupResult = StringHashTableLookupResult<typename cell_type::mapped_type>;
using ConstLookupResult = StringHashTableLookupResult<const typename cell_type::mapped_type>;
StringHashTable() = default;
explicit StringHashTable(size_t reserve_for_num_elements)
: m1 {reserve_for_num_elements / 5},
m2 {reserve_for_num_elements / 5},
m3 {reserve_for_num_elements / 5},
m4 {reserve_for_num_elements / 5},
ms {reserve_for_num_elements / 5} {}
~StringHashTable() = default;
size_t hash(const doris::StringRef& key) { return StringHashTableHash()(key); }
// Dispatch is written in a way that maximizes the performance:
// 1. Always memcpy 8 times bytes
// 2. Use switch case extension to generate fast dispatching table
// 3. Funcs are named callables that can be force_inlined
//
// NOTE: It relies on Little Endianness
//
// NOTE: It requires padded to 8 bytes keys (IOW you cannot pass
// std::string here, but you can pass i.e. ColumnString::getDataAt()),
// since it copies 8 bytes at a time.
template <typename Self, typename KeyHolder, typename Func>
static auto ALWAYS_INLINE dispatch(Self& self, KeyHolder&& key, size_t hash_value,
Func&& func) {
const size_t sz = key.size;
if (sz == 0) {
return func(self.m0, std::forward<KeyHolder>(key), key, 0);
}
if (key.data[sz - 1] == 0) {
// Strings with trailing zeros are not representable as fixed-size
// string keys. Put them to the generic table.
return func(self.ms, std::forward<KeyHolder>(key), key, hash_value);
}
if (sz <= sizeof(StringHashMapSubKeys::T1)) {
return func(self.m1, to_string_key<StringHashMapSubKeys::T1>(key), key, hash_value);
}
if (sz <= sizeof(StringHashMapSubKeys::T2)) {
return func(self.m2, to_string_key<StringHashMapSubKeys::T2>(key), key, hash_value);
}
if (sz <= sizeof(StringHashMapSubKeys::T3)) {
return func(self.m3, to_string_key<StringHashMapSubKeys::T3>(key), key, hash_value);
}
if (sz <= sizeof(StringHashMapSubKeys::T4)) {
return func(self.m4, to_string_key<StringHashMapSubKeys::T4>(key), key, hash_value);
}
return func(self.ms, std::forward<KeyHolder>(key), key, hash_value);
}
struct EmplaceCallable {
LookupResult& mapped;
bool& inserted;
EmplaceCallable(LookupResult& mapped_, bool& inserted_)
: mapped(mapped_), inserted(inserted_) {}
template <typename Map, typename KeyHolder, typename Origin>
void ALWAYS_INLINE operator()(Map& map, KeyHolder&& key_holder, Origin&& origin,
size_t hash) {
typename Map::LookupResult result;
map.emplace(key_holder, result, inserted, hash);
mapped = &result->get_second();
}
};
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool& inserted,
size_t hash_value) {
this->dispatch(*this, key_holder, hash_value, EmplaceCallable(it, inserted));
}
template <typename Func>
struct LazyEmplaceCallable {
LookupResult& mapped;
Func&& f;
LazyEmplaceCallable(LookupResult& mapped_, Func&& f_)
: mapped(mapped_), f(std::forward<Func>(f_)) {}
template <typename Map, typename Key, typename OriginalKey>
void ALWAYS_INLINE operator()(Map& map, Key&& key, OriginalKey&& origin, size_t hash) {
typename Map::LookupResult result;
map.lazy_emplace_with_origin(key, origin, result, hash, std::forward<Func>(f));
mapped = &result->get_second();
}
};
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value,
Func&& f) {
this->dispatch(*this, key_holder, hash_value,
LazyEmplaceCallable<Func>(it, std::forward<Func>(f)));
}
template <bool read>
void ALWAYS_INLINE prefetch(const doris::StringRef& key, size_t hash_value) {
if (!key.size) {
return;
}
if (key.size <= sizeof(StringHashMapSubKeys::T1)) {
m1.template prefetch<read>(hash_value);
} else if (key.size <= sizeof(StringHashMapSubKeys::T2)) {
m2.template prefetch<read>(hash_value);
} else if (key.size <= sizeof(StringHashMapSubKeys::T3)) {
m3.template prefetch<read>(hash_value);
} else if (key.size <= sizeof(StringHashMapSubKeys::T4)) {
m4.template prefetch<read>(hash_value);
} else {
ms.template prefetch<read>(hash_value);
}
}
struct FindCallable {
// find() doesn't need any key memory management, so we don't work with
// any key holders here, only with normal keys. The key type is still
// different for every subtable, this is why it is a template parameter.
template <typename Submap, typename SubmapKey, typename Origin>
auto ALWAYS_INLINE operator()(Submap& map, const SubmapKey& key, const Origin& origin,
size_t hash) {
auto it = map.find(key, hash);
if (!it) {
return decltype(&it->get_mapped()) {};
} else {
return &it->get_mapped();
}
}
};
LookupResult ALWAYS_INLINE find(const Key& x, size_t hash_value) {
return dispatch(*this, x, hash_value, FindCallable {});
}
ConstLookupResult ALWAYS_INLINE find(const Key& x, size_t hash_value) const {
return dispatch(*this, x, hash_value, FindCallable {});
}
size_t size() const {
return m0.size() + m1.size() + m2.size() + m3.size() + m4.size() + ms.size();
}
bool empty() const {
return m0.empty() && m1.empty() && m2.empty() && m3.empty() && m4.empty() && ms.empty();
}
size_t get_buffer_size_in_bytes() const {
return m0.get_buffer_size_in_bytes() + m1.get_buffer_size_in_bytes() +
m2.get_buffer_size_in_bytes() + m3.get_buffer_size_in_bytes() +
m4.get_buffer_size_in_bytes() + ms.get_buffer_size_in_bytes();
}
class iterator : public iterator_base<iterator, false> {
public:
using iterator_base<iterator, false>::iterator_base;
};
class const_iterator : public iterator_base<const_iterator, true> {
public:
using iterator_base<const_iterator, true>::iterator_base;
};
const_iterator begin() const { return const_iterator(this); }
const_iterator cbegin() const { return begin(); }
iterator begin() { return iterator(this); }
const_iterator end() const { return const_iterator(this, true); }
const_iterator cend() const { return end(); }
iterator end() { return iterator(this, true); }
bool add_elem_size_overflow(size_t add_size) const {
return m1.add_elem_size_overflow(add_size) || m2.add_elem_size_overflow(add_size) ||
m3.add_elem_size_overflow(add_size) || m4.add_elem_size_overflow(add_size) ||
ms.add_elem_size_overflow(add_size);
}
size_t estimate_memory(size_t num_elem) const {
size_t estimate_size = 0;
if (m1.add_elem_size_overflow(num_elem)) {
estimate_size = m1.estimate_memory(num_elem);
}
if (m2.add_elem_size_overflow(num_elem)) {
estimate_size = std::max(estimate_size, m2.estimate_memory(num_elem));
}
if (m3.add_elem_size_overflow(num_elem)) {
estimate_size = std::max(estimate_size, m3.estimate_memory(num_elem));
}
if (m4.add_elem_size_overflow(num_elem)) {
estimate_size = std::max(estimate_size, m4.estimate_memory(num_elem));
}
if (ms.add_elem_size_overflow(num_elem)) {
estimate_size = std::max(estimate_size, ms.estimate_memory(num_elem));
}
return estimate_size;
}
};