| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| // Private header, not to be exported |
| |
| #pragma once |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cmath> |
| #include <cstdint> |
| #include <cstring> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <type_traits> |
| #include <utility> |
| #include <vector> |
| |
| #include "arrow/array/builder_binary.h" |
| #include "arrow/buffer_builder.h" |
| #include "arrow/result.h" |
| #include "arrow/status.h" |
| #include "arrow/type_fwd.h" |
| #include "arrow/type_traits.h" |
| #include "arrow/util/bit_util.h" |
| #include "arrow/util/bitmap_builders.h" |
| #include "arrow/util/endian.h" |
| #include "arrow/util/logging.h" |
| #include "arrow/util/macros.h" |
| #include "arrow/util/ubsan.h" |
| |
| #define XXH_INLINE_ALL |
| |
| #include "arrow/vendored/xxhash.h" // IWYU pragma: keep |
| |
| namespace arrow { |
| namespace internal { |
| |
| // XXX would it help to have a 32-bit hash value on large datasets? |
| typedef uint64_t hash_t; |
| |
| // Notes about the choice of a hash function. |
| // - XXH3 is extremely fast on most data sizes, from small to huge; |
| // faster even than HW CRC-based hashing schemes |
| // - our custom hash function for tiny values (< 16 bytes) is still |
| // significantly faster (~30%), at least on this machine and compiler |
| |
| template <uint64_t AlgNum> |
| inline hash_t ComputeStringHash(const void* data, int64_t length); |
| |
| template <typename Scalar, uint64_t AlgNum> |
| struct ScalarHelperBase { |
| static bool CompareScalars(Scalar u, Scalar v) { return u == v; } |
| |
| static hash_t ComputeHash(const Scalar& value) { |
| // Generic hash computation for scalars. Simply apply the string hash |
| // to the bit representation of the value. |
| |
| // XXX in the case of FP values, we'd like equal values to have the same hash, |
| // even if they have different bit representations... |
| return ComputeStringHash<AlgNum>(&value, sizeof(value)); |
| } |
| }; |
| |
| template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void> |
| struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {}; |
| |
| template <typename Scalar, uint64_t AlgNum> |
| struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>> |
| : public ScalarHelperBase<Scalar, AlgNum> { |
| // ScalarHelper specialization for integers |
| |
| static hash_t ComputeHash(const Scalar& value) { |
| // Faster hash computation for integers. |
| |
| // Two of xxhash's prime multipliers (which are chosen for their |
| // bit dispersion properties) |
| static constexpr uint64_t multipliers[] = {11400714785074694791ULL, |
| 14029467366897019727ULL}; |
| |
| // Multiplying by the prime number mixes the low bits into the high bits, |
| // then byte-swapping (which is a single CPU instruction) allows the |
| // combined high and low bits to participate in the initial hash table index. |
| auto h = static_cast<hash_t>(value); |
| return BitUtil::ByteSwap(multipliers[AlgNum] * h); |
| } |
| }; |
| |
| template <typename Scalar, uint64_t AlgNum> |
| struct ScalarHelper<Scalar, AlgNum, |
| enable_if_t<std::is_same<util::string_view, Scalar>::value>> |
| : public ScalarHelperBase<Scalar, AlgNum> { |
| // ScalarHelper specialization for util::string_view |
| |
| static hash_t ComputeHash(const util::string_view& value) { |
| return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size())); |
| } |
| }; |
| |
| template <typename Scalar, uint64_t AlgNum> |
| struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>> |
| : public ScalarHelperBase<Scalar, AlgNum> { |
| // ScalarHelper specialization for reals |
| |
| static bool CompareScalars(Scalar u, Scalar v) { |
| if (std::isnan(u)) { |
| // XXX should we do a bit-precise comparison? |
| return std::isnan(v); |
| } |
| return u == v; |
| } |
| }; |
| |
| template <uint64_t AlgNum = 0> |
| hash_t ComputeStringHash(const void* data, int64_t length) { |
| if (ARROW_PREDICT_TRUE(length <= 16)) { |
| // Specialize for small hash strings, as they are quite common as |
| // hash table keys. Even XXH3 isn't quite as fast. |
| auto p = reinterpret_cast<const uint8_t*>(data); |
| auto n = static_cast<uint32_t>(length); |
| if (n <= 8) { |
| if (n <= 3) { |
| if (n == 0) { |
| return 1U; |
| } |
| uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1]; |
| return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x); |
| } |
| // 4 <= length <= 8 |
| // We can read the string as two overlapping 32-bit ints, apply |
| // different hash functions to each of them in parallel, then XOR |
| // the results |
| uint32_t x, y; |
| hash_t hx, hy; |
| x = util::SafeLoadAs<uint32_t>(p + n - 4); |
| y = util::SafeLoadAs<uint32_t>(p); |
| hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x); |
| hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y); |
| return n ^ hx ^ hy; |
| } |
| // 8 <= length <= 16 |
| // Apply the same principle as above |
| uint64_t x, y; |
| hash_t hx, hy; |
| x = util::SafeLoadAs<uint64_t>(p + n - 8); |
| y = util::SafeLoadAs<uint64_t>(p); |
| hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x); |
| hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y); |
| return n ^ hx ^ hy; |
| } |
| |
| #if XXH3_SECRET_SIZE_MIN != 136 |
| #error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets |
| #endif |
| |
| // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow. |
| // Instead, we use hard-coded random secrets. To maximize cache efficiency, |
| // they reuse the same memory area. |
| static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = { |
| 0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f, |
| 0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24, |
| 0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26, |
| 0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75, |
| 0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce, |
| 0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3, |
| 0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42, |
| 0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1, |
| 0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5, |
| 0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87}; |
| |
| static_assert(AlgNum < 2, "AlgNum too large"); |
| static constexpr auto secret = kXxh3Secrets + AlgNum; |
| return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret, |
| XXH3_SECRET_SIZE_MIN); |
| } |
| |
| // XXX add a HashEq<ArrowType> struct with both hash and compare functions? |
| |
| // ---------------------------------------------------------------------- |
| // An open-addressing insert-only hash table (no deletes) |
| |
| template <typename Payload> |
| class HashTable { |
| public: |
| static constexpr hash_t kSentinel = 0ULL; |
| static constexpr int64_t kLoadFactor = 2UL; |
| |
| struct Entry { |
| hash_t h; |
| Payload payload; |
| |
| // An entry is valid if the hash is different from the sentinel value |
| operator bool() const { return h != kSentinel; } |
| }; |
| |
| HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) { |
| DCHECK_NE(pool, nullptr); |
| // Minimum of 32 elements |
| capacity = std::max<uint64_t>(capacity, 32UL); |
| capacity_ = BitUtil::NextPower2(capacity); |
| capacity_mask_ = capacity_ - 1; |
| size_ = 0; |
| |
| DCHECK_OK(UpsizeBuffer(capacity_)); |
| } |
| |
| // Lookup with non-linear probing |
| // cmp_func should have signature bool(const Payload*). |
| // Return a (Entry*, found) pair. |
| template <typename CmpFunc> |
| std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) { |
| auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_, |
| std::forward<CmpFunc>(cmp_func)); |
| return {&entries_[p.first], p.second}; |
| } |
| |
| template <typename CmpFunc> |
| std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const { |
| auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_, |
| std::forward<CmpFunc>(cmp_func)); |
| return {&entries_[p.first], p.second}; |
| } |
| |
| Status Insert(Entry* entry, hash_t h, const Payload& payload) { |
| // Ensure entry is empty before inserting |
| assert(!*entry); |
| entry->h = FixHash(h); |
| entry->payload = payload; |
| ++size_; |
| |
| if (ARROW_PREDICT_FALSE(NeedUpsizing())) { |
| // Resize less frequently since it is expensive |
| return Upsize(capacity_ * kLoadFactor * 2); |
| } |
| return Status::OK(); |
| } |
| |
| uint64_t size() const { return size_; } |
| |
| // Visit all non-empty entries in the table |
| // The visit_func should have signature void(const Entry*) |
| template <typename VisitFunc> |
| void VisitEntries(VisitFunc&& visit_func) const { |
| for (uint64_t i = 0; i < capacity_; i++) { |
| const auto& entry = entries_[i]; |
| if (entry) { |
| visit_func(&entry); |
| } |
| } |
| } |
| |
| protected: |
| // NoCompare is for when the value is known not to exist in the table |
| enum CompareKind { DoCompare, NoCompare }; |
| |
| // The workhorse lookup function |
| template <CompareKind CKind, typename CmpFunc> |
| std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask, |
| CmpFunc&& cmp_func) const { |
| static constexpr uint8_t perturb_shift = 5; |
| |
| uint64_t index, perturb; |
| const Entry* entry; |
| |
| h = FixHash(h); |
| index = h & size_mask; |
| perturb = (h >> perturb_shift) + 1U; |
| |
| while (true) { |
| entry = &entries[index]; |
| if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) { |
| // Found |
| return {index, true}; |
| } |
| if (entry->h == kSentinel) { |
| // Empty slot |
| return {index, false}; |
| } |
| |
| // Perturbation logic inspired from CPython's set / dict object. |
| // The goal is that all 64 bits of the unmasked hash value eventually |
| // participate in the probing sequence, to minimize clustering. |
| index = (index + perturb) & size_mask; |
| perturb = (perturb >> perturb_shift) + 1U; |
| } |
| } |
| |
| template <CompareKind CKind, typename CmpFunc> |
| bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const { |
| if (CKind == NoCompare) { |
| return false; |
| } else { |
| return entry->h == h && cmp_func(&entry->payload); |
| } |
| } |
| |
| bool NeedUpsizing() const { |
| // Keep the load factor <= 1/2 |
| return size_ * kLoadFactor >= capacity_; |
| } |
| |
| Status UpsizeBuffer(uint64_t capacity) { |
| RETURN_NOT_OK(entries_builder_.Resize(capacity)); |
| entries_ = entries_builder_.mutable_data(); |
| memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry)); |
| |
| return Status::OK(); |
| } |
| |
| Status Upsize(uint64_t new_capacity) { |
| assert(new_capacity > capacity_); |
| uint64_t new_mask = new_capacity - 1; |
| assert((new_capacity & new_mask) == 0); // it's a power of two |
| |
| // Stash old entries and seal builder, effectively resetting the Buffer |
| const Entry* old_entries = entries_; |
| std::shared_ptr<Buffer> previous; |
| RETURN_NOT_OK(entries_builder_.Finish(&previous)); |
| // Allocate new buffer |
| RETURN_NOT_OK(UpsizeBuffer(new_capacity)); |
| |
| for (uint64_t i = 0; i < capacity_; i++) { |
| const auto& entry = old_entries[i]; |
| if (entry) { |
| // Dummy compare function will not be called |
| auto p = Lookup<NoCompare>(entry.h, entries_, new_mask, |
| [](const Payload*) { return false; }); |
| // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an |
| // empty slots is always returned |
| assert(!p.second); |
| entries_[p.first] = entry; |
| } |
| } |
| capacity_ = new_capacity; |
| capacity_mask_ = new_mask; |
| |
| return Status::OK(); |
| } |
| |
| hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; } |
| |
| // The number of slots available in the hash table array. |
| uint64_t capacity_; |
| uint64_t capacity_mask_; |
| // The number of used slots in the hash table array. |
| uint64_t size_; |
| |
| Entry* entries_; |
| TypedBufferBuilder<Entry> entries_builder_; |
| }; |
| |
| // XXX typedef memo_index_t int32_t ? |
| |
| constexpr int32_t kKeyNotFound = -1; |
| |
| // ---------------------------------------------------------------------- |
| // A base class for memoization table. |
| |
| class MemoTable { |
| public: |
| virtual ~MemoTable() = default; |
| |
| virtual int32_t size() const = 0; |
| }; |
| |
| // ---------------------------------------------------------------------- |
| // A memoization table for memory-cheap scalar values. |
| |
| // The memoization table remembers and allows to look up the insertion |
| // index for each key. |
| |
| template <typename Scalar, template <class> class HashTableTemplateType = HashTable> |
| class ScalarMemoTable : public MemoTable { |
| public: |
| explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0) |
| : hash_table_(pool, static_cast<uint64_t>(entries)) {} |
| |
| int32_t Get(const Scalar& value) const { |
| auto cmp_func = [value](const Payload* payload) -> bool { |
| return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value); |
| }; |
| hash_t h = ComputeHash(value); |
| auto p = hash_table_.Lookup(h, cmp_func); |
| if (p.second) { |
| return p.first->payload.memo_index; |
| } else { |
| return kKeyNotFound; |
| } |
| } |
| |
| template <typename Func1, typename Func2> |
| Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found, |
| int32_t* out_memo_index) { |
| auto cmp_func = [value](const Payload* payload) -> bool { |
| return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value); |
| }; |
| hash_t h = ComputeHash(value); |
| auto p = hash_table_.Lookup(h, cmp_func); |
| int32_t memo_index; |
| if (p.second) { |
| memo_index = p.first->payload.memo_index; |
| on_found(memo_index); |
| } else { |
| memo_index = size(); |
| RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index})); |
| on_not_found(memo_index); |
| } |
| *out_memo_index = memo_index; |
| return Status::OK(); |
| } |
| |
| Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) { |
| return GetOrInsert( |
| value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index); |
| } |
| |
| int32_t GetNull() const { return null_index_; } |
| |
| template <typename Func1, typename Func2> |
| int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) { |
| int32_t memo_index = GetNull(); |
| if (memo_index != kKeyNotFound) { |
| on_found(memo_index); |
| } else { |
| null_index_ = memo_index = size(); |
| on_not_found(memo_index); |
| } |
| return memo_index; |
| } |
| |
| int32_t GetOrInsertNull() { |
| return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {}); |
| } |
| |
| // The number of entries in the memo table +1 if null was added. |
| // (which is also 1 + the largest memo index) |
| int32_t size() const override { |
| return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound); |
| } |
| |
| // Copy values starting from index `start` into `out_data` |
| void CopyValues(int32_t start, Scalar* out_data) const { |
| hash_table_.VisitEntries([=](const HashTableEntry* entry) { |
| int32_t index = entry->payload.memo_index - start; |
| if (index >= 0) { |
| out_data[index] = entry->payload.value; |
| } |
| }); |
| } |
| |
| void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } |
| |
| protected: |
| struct Payload { |
| Scalar value; |
| int32_t memo_index; |
| }; |
| |
| using HashTableType = HashTableTemplateType<Payload>; |
| using HashTableEntry = typename HashTableType::Entry; |
| HashTableType hash_table_; |
| int32_t null_index_ = kKeyNotFound; |
| |
| hash_t ComputeHash(const Scalar& value) const { |
| return ScalarHelper<Scalar, 0>::ComputeHash(value); |
| } |
| }; |
| |
| // ---------------------------------------------------------------------- |
| // A memoization table for small scalar values, using direct indexing |
| |
| template <typename Scalar, typename Enable = void> |
| struct SmallScalarTraits {}; |
| |
| template <> |
| struct SmallScalarTraits<bool> { |
| static constexpr int32_t cardinality = 2; |
| |
| static uint32_t AsIndex(bool value) { return value ? 1 : 0; } |
| }; |
| |
| template <typename Scalar> |
| struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> { |
| using Unsigned = typename std::make_unsigned<Scalar>::type; |
| |
| static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max(); |
| |
| static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); } |
| }; |
| |
| template <typename Scalar, template <class> class HashTableTemplateType = HashTable> |
| class SmallScalarMemoTable : public MemoTable { |
| public: |
| explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) { |
| std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound); |
| index_to_value_.reserve(cardinality); |
| } |
| |
| int32_t Get(const Scalar value) const { |
| auto value_index = AsIndex(value); |
| return value_to_index_[value_index]; |
| } |
| |
| template <typename Func1, typename Func2> |
| Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found, |
| int32_t* out_memo_index) { |
| auto value_index = AsIndex(value); |
| auto memo_index = value_to_index_[value_index]; |
| if (memo_index == kKeyNotFound) { |
| memo_index = static_cast<int32_t>(index_to_value_.size()); |
| index_to_value_.push_back(value); |
| value_to_index_[value_index] = memo_index; |
| DCHECK_LT(memo_index, cardinality + 1); |
| on_not_found(memo_index); |
| } else { |
| on_found(memo_index); |
| } |
| *out_memo_index = memo_index; |
| return Status::OK(); |
| } |
| |
| Status GetOrInsert(const Scalar value, int32_t* out_memo_index) { |
| return GetOrInsert( |
| value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index); |
| } |
| |
| int32_t GetNull() const { return value_to_index_[cardinality]; } |
| |
| template <typename Func1, typename Func2> |
| int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) { |
| auto memo_index = GetNull(); |
| if (memo_index == kKeyNotFound) { |
| memo_index = value_to_index_[cardinality] = size(); |
| index_to_value_.push_back(0); |
| on_not_found(memo_index); |
| } else { |
| on_found(memo_index); |
| } |
| return memo_index; |
| } |
| |
| int32_t GetOrInsertNull() { |
| return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {}); |
| } |
| |
| // The number of entries in the memo table |
| // (which is also 1 + the largest memo index) |
| int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); } |
| |
| // Copy values starting from index `start` into `out_data` |
| void CopyValues(int32_t start, Scalar* out_data) const { |
| DCHECK_GE(start, 0); |
| DCHECK_LE(static_cast<size_t>(start), index_to_value_.size()); |
| int64_t offset = start * static_cast<int32_t>(sizeof(Scalar)); |
| memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar)); |
| } |
| |
| void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } |
| |
| const std::vector<Scalar>& values() const { return index_to_value_; } |
| |
| protected: |
| static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality; |
| static_assert(cardinality <= 256, "cardinality too large for direct-addressed table"); |
| |
| uint32_t AsIndex(Scalar value) const { |
| return SmallScalarTraits<Scalar>::AsIndex(value); |
| } |
| |
| // The last index is reserved for the null element. |
| int32_t value_to_index_[cardinality + 1]; |
| std::vector<Scalar> index_to_value_; |
| }; |
| |
| // ---------------------------------------------------------------------- |
| // A memoization table for variable-sized binary data. |
| |
| template <typename BinaryBuilderT> |
| class BinaryMemoTable : public MemoTable { |
| public: |
| using builder_offset_type = typename BinaryBuilderT::offset_type; |
| explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0, |
| int64_t values_size = -1) |
| : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) { |
| const int64_t data_size = (values_size < 0) ? entries * 4 : values_size; |
| DCHECK_OK(binary_builder_.Resize(entries)); |
| DCHECK_OK(binary_builder_.ReserveData(data_size)); |
| } |
| |
| int32_t Get(const void* data, builder_offset_type length) const { |
| hash_t h = ComputeStringHash<0>(data, length); |
| auto p = Lookup(h, data, length); |
| if (p.second) { |
| return p.first->payload.memo_index; |
| } else { |
| return kKeyNotFound; |
| } |
| } |
| |
| int32_t Get(const util::string_view& value) const { |
| return Get(value.data(), static_cast<builder_offset_type>(value.length())); |
| } |
| |
| template <typename Func1, typename Func2> |
| Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found, |
| Func2&& on_not_found, int32_t* out_memo_index) { |
| hash_t h = ComputeStringHash<0>(data, length); |
| auto p = Lookup(h, data, length); |
| int32_t memo_index; |
| if (p.second) { |
| memo_index = p.first->payload.memo_index; |
| on_found(memo_index); |
| } else { |
| memo_index = size(); |
| // Insert string value |
| RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length)); |
| // Insert hash entry |
| RETURN_NOT_OK( |
| hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index})); |
| |
| on_not_found(memo_index); |
| } |
| *out_memo_index = memo_index; |
| return Status::OK(); |
| } |
| |
| template <typename Func1, typename Func2> |
| Status GetOrInsert(const util::string_view& value, Func1&& on_found, |
| Func2&& on_not_found, int32_t* out_memo_index) { |
| return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()), |
| std::forward<Func1>(on_found), std::forward<Func2>(on_not_found), |
| out_memo_index); |
| } |
| |
| Status GetOrInsert(const void* data, builder_offset_type length, |
| int32_t* out_memo_index) { |
| return GetOrInsert( |
| data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index); |
| } |
| |
| Status GetOrInsert(const util::string_view& value, int32_t* out_memo_index) { |
| return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()), |
| out_memo_index); |
| } |
| |
| int32_t GetNull() const { return null_index_; } |
| |
| template <typename Func1, typename Func2> |
| int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) { |
| int32_t memo_index = GetNull(); |
| if (memo_index == kKeyNotFound) { |
| memo_index = null_index_ = size(); |
| DCHECK_OK(binary_builder_.AppendNull()); |
| on_not_found(memo_index); |
| } else { |
| on_found(memo_index); |
| } |
| return memo_index; |
| } |
| |
| int32_t GetOrInsertNull() { |
| return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {}); |
| } |
| |
| // The number of entries in the memo table |
| // (which is also 1 + the largest memo index) |
| int32_t size() const override { |
| return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound)); |
| } |
| |
| int64_t values_size() const { return binary_builder_.value_data_length(); } |
| |
| // Copy (n + 1) offsets starting from index `start` into `out_data` |
| template <class Offset> |
| void CopyOffsets(int32_t start, Offset* out_data) const { |
| DCHECK_LE(start, size()); |
| |
| const builder_offset_type* offsets = binary_builder_.offsets_data(); |
| const builder_offset_type delta = |
| start < binary_builder_.length() ? offsets[start] : 0; |
| for (int32_t i = start; i < size(); ++i) { |
| const builder_offset_type adjusted_offset = offsets[i] - delta; |
| Offset cast_offset = static_cast<Offset>(adjusted_offset); |
| assert(static_cast<builder_offset_type>(cast_offset) == |
| adjusted_offset); // avoid truncation |
| *out_data++ = cast_offset; |
| } |
| |
| // Copy last value since BinaryBuilder only materializes it on in Finish() |
| *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta); |
| } |
| |
| template <class Offset> |
| void CopyOffsets(Offset* out_data) const { |
| CopyOffsets(0, out_data); |
| } |
| |
| // Copy values starting from index `start` into `out_data` |
| void CopyValues(int32_t start, uint8_t* out_data) const { |
| CopyValues(start, -1, out_data); |
| } |
| |
| // Same as above, but check output size in debug mode |
| void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const { |
| DCHECK_LE(start, size()); |
| |
| // The absolute byte offset of `start` value in the binary buffer. |
| const builder_offset_type offset = binary_builder_.offset(start); |
| const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset); |
| |
| if (out_size != -1) { |
| assert(static_cast<int64_t>(length) <= out_size); |
| } |
| |
| auto view = binary_builder_.GetView(start); |
| memcpy(out_data, view.data(), length); |
| } |
| |
| void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); } |
| |
| void CopyValues(int64_t out_size, uint8_t* out_data) const { |
| CopyValues(0, out_size, out_data); |
| } |
| |
| void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size, |
| uint8_t* out_data) const { |
| // This method exists to cope with the fact that the BinaryMemoTable does |
| // not know the fixed width when inserting the null value. The data |
| // buffer hold a zero length string for the null value (if found). |
| // |
| // Thus, the method will properly inject an empty value of the proper width |
| // in the output buffer. |
| // |
| if (start >= size()) { |
| return; |
| } |
| |
| int32_t null_index = GetNull(); |
| if (null_index < start) { |
| // Nothing to skip, proceed as usual. |
| CopyValues(start, out_size, out_data); |
| return; |
| } |
| |
| builder_offset_type left_offset = binary_builder_.offset(start); |
| |
| // Ensure that the data length is exactly missing width_size bytes to fit |
| // in the expected output (n_values * width_size). |
| #ifndef NDEBUG |
| int64_t data_length = values_size() - static_cast<size_t>(left_offset); |
| assert(data_length + width_size == out_size); |
| ARROW_UNUSED(data_length); |
| #endif |
| |
| auto in_data = binary_builder_.value_data() + left_offset; |
| // The null use 0-length in the data, slice the data in 2 and skip by |
| // width_size in out_data. [part_1][width_size][part_2] |
| auto null_data_offset = binary_builder_.offset(null_index); |
| auto left_size = null_data_offset - left_offset; |
| if (left_size > 0) { |
| memcpy(out_data, in_data + left_offset, left_size); |
| } |
| |
| auto right_size = values_size() - static_cast<size_t>(null_data_offset); |
| if (right_size > 0) { |
| // skip the null fixed size value. |
| auto out_offset = left_size + width_size; |
| assert(out_data + out_offset + right_size == out_data + out_size); |
| memcpy(out_data + out_offset, in_data + null_data_offset, right_size); |
| } |
| } |
| |
| // Visit the stored values in insertion order. |
| // The visitor function should have the signature `void(util::string_view)` |
| // or `void(const util::string_view&)`. |
| template <typename VisitFunc> |
| void VisitValues(int32_t start, VisitFunc&& visit) const { |
| for (int32_t i = start; i < size(); ++i) { |
| visit(binary_builder_.GetView(i)); |
| } |
| } |
| |
| protected: |
| struct Payload { |
| int32_t memo_index; |
| }; |
| |
| using HashTableType = HashTable<Payload>; |
| using HashTableEntry = typename HashTable<Payload>::Entry; |
| HashTableType hash_table_; |
| BinaryBuilderT binary_builder_; |
| |
| int32_t null_index_ = kKeyNotFound; |
| |
| std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data, |
| builder_offset_type length) const { |
| auto cmp_func = [=](const Payload* payload) { |
| util::string_view lhs = binary_builder_.GetView(payload->memo_index); |
| util::string_view rhs(static_cast<const char*>(data), length); |
| return lhs == rhs; |
| }; |
| return hash_table_.Lookup(h, cmp_func); |
| } |
| }; |
| |
| template <typename T, typename Enable = void> |
| struct HashTraits {}; |
| |
| template <> |
| struct HashTraits<BooleanType> { |
| using MemoTableType = SmallScalarMemoTable<bool>; |
| }; |
| |
| template <typename T> |
| struct HashTraits<T, enable_if_8bit_int<T>> { |
| using c_type = typename T::c_type; |
| using MemoTableType = SmallScalarMemoTable<typename T::c_type>; |
| }; |
| |
| template <typename T> |
| struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> { |
| using c_type = typename T::c_type; |
| using MemoTableType = ScalarMemoTable<c_type, HashTable>; |
| }; |
| |
| template <typename T> |
| struct HashTraits<T, enable_if_t<has_string_view<T>::value && |
| !std::is_base_of<LargeBinaryType, T>::value>> { |
| using MemoTableType = BinaryMemoTable<BinaryBuilder>; |
| }; |
| |
| template <typename T> |
| struct HashTraits<T, enable_if_decimal<T>> { |
| using MemoTableType = BinaryMemoTable<BinaryBuilder>; |
| }; |
| |
| template <typename T> |
| struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> { |
| using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>; |
| }; |
| |
| template <typename MemoTableType> |
| static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table, |
| int64_t start_offset, int64_t* null_count, |
| std::shared_ptr<Buffer>* null_bitmap) { |
| int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset; |
| int64_t null_index = memo_table.GetNull(); |
| |
| *null_count = 0; |
| *null_bitmap = nullptr; |
| |
| if (null_index != kKeyNotFound && null_index >= start_offset) { |
| null_index -= start_offset; |
| *null_count = 1; |
| ARROW_ASSIGN_OR_RAISE(*null_bitmap, |
| internal::BitmapAllButOne(pool, dict_length, null_index)); |
| } |
| |
| return Status::OK(); |
| } |
| |
| } // namespace internal |
| } // namespace arrow |