| // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
| // This source code is licensed under both the GPLv2 (found in the |
| // COPYING file in the root directory) and Apache 2.0 License |
| // (found in the LICENSE.Apache file in the root directory). |
| |
| #include "table/block_prefix_index.h" |
| |
| #include <vector> |
| |
| #include "rocksdb/comparator.h" |
| #include "rocksdb/slice.h" |
| #include "rocksdb/slice_transform.h" |
| #include "util/arena.h" |
| #include "util/coding.h" |
| #include "util/hash.h" |
| |
| namespace rocksdb { |
| |
| inline uint32_t Hash(const Slice& s) { |
| return rocksdb::Hash(s.data(), s.size(), 0); |
| } |
| |
| inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { |
| return Hash(prefix) % num_buckets; |
| } |
| |
| // The prefix block index is simply a bucket array, with each entry pointing to |
| // the blocks that span the prefixes hashed to this bucket. |
| // |
| // To reduce memory footprint, if there is only one block per bucket, the entry |
| // stores the block id directly. If there are more than one blocks per bucket, |
| // because of hash collision or a single prefix spanning multiple blocks, |
| // the entry points to an array of block ids. The block array is an array of |
| // uint32_t's. The first uint32_t indicates the total number of blocks, followed |
| // by the block ids. |
| // |
| // To differentiate the two cases, the high order bit of the entry indicates |
| // whether it is a 'pointer' into a separate block array. |
| // 0x7FFFFFFF is reserved for empty bucket. |
| |
| const uint32_t kNoneBlock = 0x7FFFFFFF; |
| const uint32_t kBlockArrayMask = 0x80000000; |
| |
| inline bool IsNone(uint32_t block_id) { |
| return block_id == kNoneBlock; |
| } |
| |
| inline bool IsBlockId(uint32_t block_id) { |
| return (block_id & kBlockArrayMask) == 0; |
| } |
| |
| inline uint32_t DecodeIndex(uint32_t block_id) { |
| uint32_t index = block_id ^ kBlockArrayMask; |
| assert(index < kBlockArrayMask); |
| return index; |
| } |
| |
| inline uint32_t EncodeIndex(uint32_t index) { |
| assert(index < kBlockArrayMask); |
| return index | kBlockArrayMask; |
| } |
| |
| // temporary storage for prefix information during index building |
| struct PrefixRecord { |
| Slice prefix; |
| uint32_t start_block; |
| uint32_t end_block; |
| uint32_t num_blocks; |
| PrefixRecord* next; |
| }; |
| |
| class BlockPrefixIndex::Builder { |
| public: |
| explicit Builder(const SliceTransform* internal_prefix_extractor) |
| : internal_prefix_extractor_(internal_prefix_extractor) {} |
| |
| void Add(const Slice& key_prefix, uint32_t start_block, |
| uint32_t num_blocks) { |
| PrefixRecord* record = reinterpret_cast<PrefixRecord*>( |
| arena_.AllocateAligned(sizeof(PrefixRecord))); |
| record->prefix = key_prefix; |
| record->start_block = start_block; |
| record->end_block = start_block + num_blocks - 1; |
| record->num_blocks = num_blocks; |
| prefixes_.push_back(record); |
| } |
| |
| BlockPrefixIndex* Finish() { |
| // For now, use roughly 1:1 prefix to bucket ratio. |
| uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1; |
| |
| // Collect prefix records that hash to the same bucket, into a single |
| // linklist. |
| std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr); |
| std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0); |
| for (PrefixRecord* current : prefixes_) { |
| uint32_t bucket = PrefixToBucket(current->prefix, num_buckets); |
| // merge the prefix block span if the first block of this prefix is |
| // connected to the last block of the previous prefix. |
| PrefixRecord* prev = prefixes_per_bucket[bucket]; |
| if (prev) { |
| assert(current->start_block >= prev->end_block); |
| auto distance = current->start_block - prev->end_block; |
| if (distance <= 1) { |
| prev->end_block = current->end_block; |
| prev->num_blocks = prev->end_block - prev->start_block + 1; |
| num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1); |
| continue; |
| } |
| } |
| current->next = prev; |
| prefixes_per_bucket[bucket] = current; |
| num_blocks_per_bucket[bucket] += current->num_blocks; |
| } |
| |
| // Calculate the block array buffer size |
| uint32_t total_block_array_entries = 0; |
| for (uint32_t i = 0; i < num_buckets; i++) { |
| uint32_t num_blocks = num_blocks_per_bucket[i]; |
| if (num_blocks > 1) { |
| total_block_array_entries += (num_blocks + 1); |
| } |
| } |
| |
| // Populate the final prefix block index |
| uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; |
| uint32_t* buckets = new uint32_t[num_buckets]; |
| uint32_t offset = 0; |
| for (uint32_t i = 0; i < num_buckets; i++) { |
| uint32_t num_blocks = num_blocks_per_bucket[i]; |
| if (num_blocks == 0) { |
| assert(prefixes_per_bucket[i] == nullptr); |
| buckets[i] = kNoneBlock; |
| } else if (num_blocks == 1) { |
| assert(prefixes_per_bucket[i] != nullptr); |
| assert(prefixes_per_bucket[i]->next == nullptr); |
| buckets[i] = prefixes_per_bucket[i]->start_block; |
| } else { |
| assert(total_block_array_entries > 0); |
| assert(prefixes_per_bucket[i] != nullptr); |
| buckets[i] = EncodeIndex(offset); |
| block_array_buffer[offset] = num_blocks; |
| uint32_t* last_block = &block_array_buffer[offset + num_blocks]; |
| auto current = prefixes_per_bucket[i]; |
| // populate block ids from largest to smallest |
| while (current != nullptr) { |
| for (uint32_t iter = 0; iter < current->num_blocks; iter++) { |
| *last_block = current->end_block - iter; |
| last_block--; |
| } |
| current = current->next; |
| } |
| assert(last_block == &block_array_buffer[offset]); |
| offset += (num_blocks + 1); |
| } |
| } |
| |
| assert(offset == total_block_array_entries); |
| |
| return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, |
| buckets, total_block_array_entries, |
| block_array_buffer); |
| } |
| |
| private: |
| const SliceTransform* internal_prefix_extractor_; |
| |
| std::vector<PrefixRecord*> prefixes_; |
| Arena arena_; |
| }; |
| |
| |
| Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, |
| const Slice& prefixes, const Slice& prefix_meta, |
| BlockPrefixIndex** prefix_index) { |
| uint64_t pos = 0; |
| auto meta_pos = prefix_meta; |
| Status s; |
| Builder builder(internal_prefix_extractor); |
| |
| while (!meta_pos.empty()) { |
| uint32_t prefix_size = 0; |
| uint32_t entry_index = 0; |
| uint32_t num_blocks = 0; |
| if (!GetVarint32(&meta_pos, &prefix_size) || |
| !GetVarint32(&meta_pos, &entry_index) || |
| !GetVarint32(&meta_pos, &num_blocks)) { |
| s = Status::Corruption( |
| "Corrupted prefix meta block: unable to read from it."); |
| break; |
| } |
| if (pos + prefix_size > prefixes.size()) { |
| s = Status::Corruption( |
| "Corrupted prefix meta block: size inconsistency."); |
| break; |
| } |
| Slice prefix(prefixes.data() + pos, prefix_size); |
| builder.Add(prefix, entry_index, num_blocks); |
| |
| pos += prefix_size; |
| } |
| |
| if (s.ok() && pos != prefixes.size()) { |
| s = Status::Corruption("Corrupted prefix meta block"); |
| } |
| |
| if (s.ok()) { |
| *prefix_index = builder.Finish(); |
| } |
| |
| return s; |
| } |
| |
| uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, |
| uint32_t** blocks) { |
| Slice prefix = internal_prefix_extractor_->Transform(key); |
| |
| uint32_t bucket = PrefixToBucket(prefix, num_buckets_); |
| uint32_t block_id = buckets_[bucket]; |
| |
| if (IsNone(block_id)) { |
| return 0; |
| } else if (IsBlockId(block_id)) { |
| *blocks = &buckets_[bucket]; |
| return 1; |
| } else { |
| uint32_t index = DecodeIndex(block_id); |
| assert(index < num_block_array_buffer_entries_); |
| *blocks = &block_array_buffer_[index+1]; |
| uint32_t num_blocks = block_array_buffer_[index]; |
| assert(num_blocks > 1); |
| assert(index + num_blocks < num_block_array_buffer_entries_); |
| return num_blocks; |
| } |
| } |
| |
| } // namespace rocksdb |