blob: 25cfc84179a5328f38510145a332a2715fa0b9aa [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Simplistic block encoding for strings.
//
// The block consists of:
// Header:
// ordinal_pos (32-bit fixed)
// num_elems (32-bit fixed)
// offsets_pos (32-bit fixed): position of the first offset, relative to block start
// Strings:
// raw strings that were written
// Offsets: [pointed to by offsets_pos]
// gvint-encoded offsets pointing to the beginning of each string.
#ifndef KUDU_CFILE_BINARY_PLAIN_BLOCK_H
#define KUDU_CFILE_BINARY_PLAIN_BLOCK_H
#include <sys/types.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include <glog/logging.h>
#include "kudu/cfile/block_encodings.h"
#include "kudu/common/rowid.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/faststring.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"
namespace kudu {
class ColumnDataView;
class ColumnMaterializationContext;
class SelectionVectorView;
namespace cfile {
class BlockHandle;
struct WriterOptions;
class BinaryPlainBlockBuilder final : public BlockBuilder {
public:
explicit BinaryPlainBlockBuilder(const WriterOptions *options);
virtual ~BinaryPlainBlockBuilder();
bool IsBlockFull() const override;
int Add(const uint8_t *vals, size_t count) OVERRIDE;
void Finish(rowid_t ordinal_pos, std::vector<Slice>* slices) override;
void Reset() OVERRIDE;
size_t Count() const OVERRIDE;
// Return the key at index idx.
// key should be a Slice*
Status GetKeyAtIdx(void* key_void, int idx) const;
// Return the first added key.
// key should be a Slice*
Status GetFirstKey(void* key) const OVERRIDE;
// Return the last added key.
// key should be a Slice*
Status GetLastKey(void* key) const OVERRIDE;
// Length of a header.
static constexpr size_t kHeaderSize = sizeof(uint32_t) * 3;
private:
faststring buffer_;
size_t end_of_data_offset_;
size_t size_estimate_;
// Offsets of each entry, relative to the start of the block.
std::vector<uint32_t> offsets_;
bool finished_;
const WriterOptions *options_;
};
class BinaryPlainBlockDecoder final : public BlockDecoder {
public:
explicit BinaryPlainBlockDecoder(scoped_refptr<BlockHandle> block);
virtual ~BinaryPlainBlockDecoder();
virtual Status ParseHeader() OVERRIDE;
virtual void SeekToPositionInBlock(uint pos) OVERRIDE;
virtual Status SeekAtOrAfterValue(const void *value,
bool *exact_match) OVERRIDE;
Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE;
Status CopyNextAndEval(size_t* n,
ColumnMaterializationContext* ctx,
SelectionVectorView* sel,
ColumnDataView* dst) override;
virtual bool HasNext() const OVERRIDE {
DCHECK(parsed_);
return cur_idx_ < num_elems_;
}
virtual size_t Count() const OVERRIDE {
DCHECK(parsed_);
return num_elems_;
}
virtual size_t GetCurrentIndex() const OVERRIDE {
DCHECK(parsed_);
return cur_idx_;
}
virtual rowid_t GetFirstRowId() const OVERRIDE {
return ordinal_pos_base_;
}
Slice string_at_index(size_t idx) const {
const uint32_t str_offset = offset(idx);
uint32_t len = offset(idx + 1) - str_offset;
return Slice(&data_[str_offset], len);
}
const scoped_refptr<BlockHandle>& block_handle() {
return block_;
}
// Minimum length of a header.
static const size_t kMinHeaderSize = sizeof(uint32_t) * 3;
private:
// Helper template for handling batches of rows. CellHandler is a lambda that
// gets called on every cell. When decoder evaluation is enabled, it
// evaluates whether or not the string should be copied and sets a
// SelectionVectorView bit at the appropriate location. When decoder
// evaluation is disabled, it copies the cell's string to dst.
template <typename CellHandler>
Status HandleBatch(size_t* n, ColumnDataView* dst, CellHandler c);
// Return the offset within 'data_' where the string value with index 'idx'
// can be found.
uint32_t offset(int idx) const {
const uint8_t* p = &offsets_buf_[idx * sizeof(uint32_t)];
uint32_t ret;
memcpy(&ret, p, sizeof(uint32_t));
return ret;
}
scoped_refptr<BlockHandle> block_;
Slice data_;
bool parsed_;
// A buffer for an array of 32-bit integers for the offsets of the underlying
// strings in 'data_'.
//
// This array also contains one extra offset at the end, pointing
// _after_ the last entry. This makes the code much simpler.
//
// The array is stored inside a 'faststring' instead of a vector<uint32_t> to
// avoid the overhead of calling vector::push_back -- one would think it would
// be fully inlined away, but it's actually a perf win to do this.
faststring offsets_buf_;
uint32_t num_elems_;
rowid_t ordinal_pos_base_;
// Index of the currently seeked element in the block.
uint32_t cur_idx_;
};
} // namespace cfile
} // namespace kudu
#endif // KUDU_CFILE_BINARY_PLAIN_BLOCK_H