| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // Dictionary encoding for strings. There is only one dictionary block |
| // for all the data blocks within a cfile. |
| // layout for dictionary encoded block: |
| // Either header + embedded codeword block, which can be encoded with any |
| // int blockbuilder, when mode_ = kCodeWordMode. |
| // Or header + embedded BinaryPlainBlock, when mode_ = kPlainBinaryMode. |
| // Data blocks start with mode_ = kCodeWordMode. When the size of the dictionary |
| // block goes beyond options_->block_size, the subsequent data blocks switch |
| // to plain binary blocks automatically. |
| |
| // You can embed any int block builder encoding formats, such as group-varint, |
| // bitshuffle. Currently, we use bitshuffle builder for codewords. |
| // |
| // To use another block builder/decoder, just make sure that the BlockDecoder has |
| // the interface CopyNextValuesToArray(size_t*, uint8_t*). Once it does, replacing |
| // BShufBuilder/Decoder is all that is needed. |
| #pragma once |
| |
| #include <sys/types.h> |
| |
| #include <cstddef> |
| #include <cstdint> |
| #include <memory> |
| #include <vector> |
| |
| #include <sparsehash/dense_hash_map> |
| |
| #include "kudu/cfile/binary_plain_block.h" |
| #include "kudu/cfile/block_encodings.h" |
| #include "kudu/cfile/block_handle.h" |
| #include "kudu/common/rowid.h" |
| #include "kudu/gutil/casts.h" |
| #include "kudu/gutil/macros.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/gutil/ref_counted.h" |
| #include "kudu/gutil/strings/stringpiece.h" |
| #include "kudu/util/faststring.h" |
| #include "kudu/util/memory/arena.h" |
| #include "kudu/util/slice.h" |
| #include "kudu/util/status.h" |
| |
| template <class X> |
| struct GoodFastHash; |
| |
| namespace kudu { |
| |
| class ColumnDataView; |
| class ColumnMaterializationContext; |
| class SelectionVectorView; |
| |
| namespace cfile { |
| |
| class CFileFooterPB; |
| class CFileWriter; |
| struct WriterOptions; |
| |
| // Mode type carried in the header of a dictionary-encoded block; the _min/_max entries bound the valid values (see MAKE_ENUM_LIMITS at the end of this file). |
| enum DictEncodingMode { |
| DictEncodingMode_min = 1, |
| kCodeWordMode = 1, |
| kPlainBinaryMode = 2, |
| DictEncodingMode_max = 2 |
| }; |
| |
| // Block builder for dictionary-encoded binary columns. |
| // |
| // Implements the BlockBuilder interface. As described in the file-level comment, |
| // data blocks are written as header + embedded data block, and the dictionary |
| // itself is kept in a single dictionary block shared by the whole cfile. |
| class BinaryDictBlockBuilder final : public BlockBuilder { |
| public: |
| // 'options' is stored as a raw pointer (options_) and is not owned by the builder. |
| explicit BinaryDictBlockBuilder(const WriterOptions* options); |
| |
| bool IsBlockFull() const override; |
| |
| // Append the dictionary block for the current cfile to the end of the cfile and set the footer |
| // accordingly. |
| Status AppendExtraInfo(CFileWriter* c_writer, CFileFooterPB* footer) override; |
| |
| // Adds up to 'count' values starting at 'vals'. |
| // NOTE(review): presumably returns the number of values actually consumed -- |
| // confirm against the BlockBuilder contract. |
| int Add(const uint8_t* vals, size_t count) override; |
| |
| void Finish(rowid_t ordinal_pos, std::vector<Slice>* slices) override; |
| |
| // Resets the per-data-block state. The dictionary-related members below are |
| // documented as surviving Reset(), since the dictionary is shared per cfile. |
| void Reset() override; |
| |
| size_t Count() const override; |
| |
| Status GetFirstKey(void* key) const override; |
| |
| Status GetLastKey(void* key) const override; |
| |
| // Upper bound on the encoded header size: a single uint32_t. |
| static const size_t kMaxHeaderSize = sizeof(uint32_t) * 1; |
| |
| private: |
| // Helper for adding values as codewords. |
| // NOTE(review): presumably the kCodeWordMode path of Add() -- confirm in the .cc file. |
| int AddCodeWords(const uint8_t* vals, size_t count); |
| |
| // Adds 'val' to the dictionary and outputs its codeword in '*codeword'. |
| // Marked cold: not expected to be on the hot path. |
| // NOTE(review): the meaning of the bool result is not visible here -- confirm. |
| ATTRIBUTE_COLD |
| bool AddToDict(Slice val, uint32_t* codeword); |
| |
| // Buffer used in Finish() for holding the encoded header. |
| faststring header_buffer_; |
| bool finished_; |
| // Writer options passed to the constructor; not owned. |
| const WriterOptions* options_; |
| |
| // Builder for the data block embedded after the header. |
| std::unique_ptr<BlockBuilder> data_builder_; |
| |
| // dict_block_, dictionary_, dictionary_strings_arena_ |
| // are related to the dictionary block (one per cfile). |
| // They should NOT be cleared in the Reset() method. |
| BinaryPlainBlockBuilder dict_block_; |
| |
| // Maps a dictionary string to its codeword. |
| google::dense_hash_map<StringPiece, uint32_t, GoodFastHash<StringPiece> > dictionary_; |
| // Memory to hold the actual content for strings in the dictionary_. |
| // |
| // The size of it should be bigger than the size limit for dictionary block |
| // (e.g. options_->block_size). |
| // |
| // Currently, it can hold at most 64MB content. |
| Arena dictionary_strings_arena_; |
| |
| // Current encoding mode of the data blocks being built. |
| DictEncodingMode mode_; |
| |
| // First key when mode_ = kCodeWordMode |
| faststring first_key_; |
| }; |
| |
| class CFileIterator; |
| |
| // Block decoder for dictionary-encoded binary blocks. |
| // |
| // Implements the BlockDecoder interface. Position and count queries are |
| // forwarded to the embedded data_decoder_. |
| class BinaryDictBlockDecoder final : public BlockDecoder { |
| public: |
| // 'block' is the handle of the block to decode; 'iter' is the parent |
| // CFileIterator (stored as a raw, non-owning pointer). |
| explicit BinaryDictBlockDecoder(scoped_refptr<BlockHandle> block, CFileIterator* iter); |
| |
| Status ParseHeader() override; |
| void SeekToPositionInBlock(uint pos) override; |
| Status SeekAtOrAfterValue(const void* value, bool* exact_match) override; |
| Status CopyNextValues(size_t* n, ColumnDataView* dst) override; |
| Status CopyNextAndEval(size_t* n, |
| ColumnMaterializationContext* ctx, |
| SelectionVectorView* sel, |
| ColumnDataView* dst) override; |
| |
| // The accessors below delegate directly to data_decoder_. |
| // NOTE(review): data_decoder_ is dereferenced unconditionally; presumably |
| // ParseHeader() must have succeeded before these are called -- confirm. |
| bool HasNext() const override { |
| return data_decoder_->HasNext(); |
| } |
| |
| size_t Count() const override { |
| return data_decoder_->Count(); |
| } |
| |
| size_t GetCurrentIndex() const override { |
| return data_decoder_->GetCurrentIndex(); |
| } |
| |
| rowid_t GetFirstRowId() const override { |
| return data_decoder_->GetFirstRowId(); |
| } |
| |
| // Lower bound on the encoded header size: a single uint32_t. |
| static const size_t kMinHeaderSize = sizeof(uint32_t) * 1; |
| |
| private: |
| // Helper that copies the next values by decoding codewords into strings; |
| // uses codeword_buf_ as scratch space. |
| Status CopyNextDecodeStrings(size_t* n, ColumnDataView* dst); |
| |
| scoped_refptr<BlockHandle> block_; |
| Slice data_; |
| bool parsed_; |
| |
| // Dictionary block decoder. |
| // NOTE(review): raw pointer, presumably not owned by this decoder -- confirm. |
| BinaryPlainBlockDecoder* dict_decoder_; |
| |
| // Decoder for the data block embedded after the header. |
| std::unique_ptr<BlockDecoder> data_decoder_; |
| |
| // Parent CFileIterator, each dictionary decoder in the same CFile will share |
| // the same vocabulary, and thus, the same set of matching codewords. |
| CFileIterator* parent_cfile_iter_; |
| |
| DictEncodingMode mode_; |
| |
| // buffer to hold the codewords, needed by CopyNextDecodeStrings() |
| faststring codeword_buf_; |
| }; |
| |
| } // namespace cfile |
| } // namespace kudu |
| |
| // Declares the min/max limits of DictEncodingMode for tight_enum_test_cast<> -- has to be defined outside of any namespace. |
| MAKE_ENUM_LIMITS(kudu::cfile::DictEncodingMode, |
| kudu::cfile::DictEncodingMode_min, |
| kudu::cfile::DictEncodingMode_max); |