blob: fa07fc5e974b3970e70e308d09ade7632605ec87 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Dictionary encoding for strings. There is only one dictionary block
// for all the data blocks within a cfile.
// Layout of a dictionary-encoded data block:
// Either header + embedded codeword block, which can be encoded with any
// int block builder, when mode_ = kCodeWordMode.
// Or header + embedded BinaryPlainBlock, when mode_ = kPlainBinaryMode.
// Data blocks start with mode_ = kCodeWordMode; when the size of the dictionary
// block goes beyond options_->block_size, the subsequent data blocks will switch
// to plain binary blocks automatically.
// You can embed any int block builder encoding format, such as group-varint or
// bitshuffle. Currently, we use the bitshuffle builder for codewords.
//
// To use another block builder/decoder, just make sure that the BlockDecoder has
// the interface CopyNextValuesToArray(size_t*, uint8_t*); then simply replace
// BShufBuilder/Decoder with it.
#pragma once
#include <sys/types.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>
#include <sparsehash/dense_hash_map>
#include "kudu/cfile/binary_plain_block.h"
#include "kudu/cfile/block_encodings.h"
#include "kudu/cfile/block_handle.h"
#include "kudu/common/rowid.h"
#include "kudu/gutil/casts.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/strings/stringpiece.h"
#include "kudu/util/faststring.h"
#include "kudu/util/memory/arena.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"
// Forward declaration of the hash functor template. It is named below as the
// hasher of the dense_hash_map member BinaryDictBlockBuilder::dictionary_.
template <class X>
struct GoodFastHash;
namespace kudu {
// Forward declarations: these types appear in this header only as pointer
// parameters of the decoder's CopyNextValues()/CopyNextAndEval() methods.
class ColumnDataView;
class ColumnMaterializationContext;
class SelectionVectorView;
namespace cfile {
// Forward declarations: these types are referenced in this header only through
// pointers (BinaryDictBlockBuilder's constructor and AppendExtraInfo()).
class CFileFooterPB;
class CFileWriter;
struct WriterOptions;
// Encoding mode of a dictionary-encoded data block (the "header mode" type).
//
// The two real modes are listed first; DictEncodingMode_min and
// DictEncodingMode_max alias the smallest and largest of them and are the
// bounds handed to MAKE_ENUM_LIMITS at the end of this file.
enum DictEncodingMode {
  // Header + embedded codeword block.
  kCodeWordMode = 1,
  // Header + embedded plain binary block.
  kPlainBinaryMode = 2,

  DictEncodingMode_min = kCodeWordMode,
  DictEncodingMode_max = kPlainBinaryMode
};
class BinaryDictBlockBuilder final : public BlockBuilder {
public:
explicit BinaryDictBlockBuilder(const WriterOptions* options);
bool IsBlockFull() const override;
// Append the dictionary block for the current cfile to the end of the cfile and set the footer
// accordingly.
Status AppendExtraInfo(CFileWriter* c_writer, CFileFooterPB* footer) OVERRIDE;
int Add(const uint8_t* vals, size_t count) OVERRIDE;
void Finish(rowid_t ordinal_pos, std::vector<Slice>* slices) OVERRIDE;
void Reset() OVERRIDE;
size_t Count() const OVERRIDE;
Status GetFirstKey(void* key) const OVERRIDE;
Status GetLastKey(void* key) const OVERRIDE;
static const size_t kMaxHeaderSize = sizeof(uint32_t) * 1;
private:
int AddCodeWords(const uint8_t* vals, size_t count);
ATTRIBUTE_COLD
bool AddToDict(Slice val, uint32_t* codeword);
// Buffer used in Finish() for holding the encoded header.
faststring header_buffer_;
bool finished_;
const WriterOptions* options_;
std::unique_ptr<BlockBuilder> data_builder_;
// dict_block_, dictionary_, dictionary_strings_arena_
// is related to the dictionary block (one per cfile).
// They should NOT be cleared in the Reset() method.
BinaryPlainBlockBuilder dict_block_;
google::dense_hash_map<StringPiece, uint32_t, GoodFastHash<StringPiece> > dictionary_;
// Memory to hold the actual content for strings in the dictionary_.
//
// The size of it should be bigger than the size limit for dictionary block
// (e.g option_->block_size).
//
// Currently, it can hold at most 64MB content.
Arena dictionary_strings_arena_;
DictEncodingMode mode_;
// First key when mode_ = kCodeWordMode
faststring first_key_;
};
// Forward declaration: BinaryDictBlockDecoder only holds and receives a
// CFileIterator pointer, so the full definition is not needed in this header.
class CFileIterator;
class BinaryDictBlockDecoder final : public BlockDecoder {
public:
explicit BinaryDictBlockDecoder(scoped_refptr<BlockHandle> block, CFileIterator* iter);
virtual Status ParseHeader() OVERRIDE;
virtual void SeekToPositionInBlock(uint pos) OVERRIDE;
virtual Status SeekAtOrAfterValue(const void* value, bool* exact_match) OVERRIDE;
Status CopyNextValues(size_t* n, ColumnDataView* dst) OVERRIDE;
Status CopyNextAndEval(size_t* n,
ColumnMaterializationContext* ctx,
SelectionVectorView* sel,
ColumnDataView* dst) override;
virtual bool HasNext() const OVERRIDE {
return data_decoder_->HasNext();
}
virtual size_t Count() const OVERRIDE {
return data_decoder_->Count();
}
virtual size_t GetCurrentIndex() const OVERRIDE {
return data_decoder_->GetCurrentIndex();
}
virtual rowid_t GetFirstRowId() const OVERRIDE {
return data_decoder_->GetFirstRowId();
}
static const size_t kMinHeaderSize = sizeof(uint32_t) * 1;
private:
Status CopyNextDecodeStrings(size_t* n, ColumnDataView* dst);
scoped_refptr<BlockHandle> block_;
Slice data_;
bool parsed_;
// Dictionary block decoder.
BinaryPlainBlockDecoder* dict_decoder_;
std::unique_ptr<BlockDecoder> data_decoder_;
// Parent CFileIterator, each dictionary decoder in the same CFile will share
// the same vocabulary, and thus, the same set of matching codewords.
CFileIterator* parent_cfile_iter_;
DictEncodingMode mode_;
// buffer to hold the codewords, needed by CopyNextDecodeStrings()
faststring codeword_buf_;
};
} // namespace cfile
} // namespace kudu
// Declares the limits of kudu::cfile::DictEncodingMode, passing
// DictEncodingMode_min and DictEncodingMode_max as the bounds.
// Defined for tight_enum_test_cast<> -- has to be defined outside of any namespace.
MAKE_ENUM_LIMITS(kudu::cfile::DictEncodingMode,
kudu::cfile::DictEncodingMode_min,
kudu::cfile::DictEncodingMode_max);