// blob: 0f9d5ec23e1fb7ceb292022318c877edcb6d7f1c [file] [log] [blame]
/// Licensed to the Apache Software Foundation (ASF) under one
/// or more contributor license agreements. See the NOTICE file
/// distributed with this work for additional information
/// regarding copyright ownership. The ASF licenses this file
/// to you under the Apache License, Version 2.0 (the
/// "License"); you may not use this file except in compliance
/// with the License. You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing,
/// software distributed under the License is distributed on an
/// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
/// KIND, either express or implied. See the License for the
/// specific language governing permissions and limitations
/// under the License.
#pragma once
#include "common/status.h"
namespace impala {
// This class can be used to encode values with byte stream split encoding
// (https://github.com/apache/parquet-format/blob/master/Encodings.md#byte-stream-split-byte_stream_split--9)
// and write them to a buffer. To encode values to a page:
// 1. pass the number of bytes a value takes up via the constructor or the template
// parameter (`T_SIZE`). If `T_SIZE` is set to 0, the size of the type must be passed
// as an argument to the constructor.
// 2. call `NewPage()` with a pointer to the first byte of an empty or prepopulated buffer
// 3. use the `Put()` or `PutBytes()` function to add a value to be encoded
// 4. call `FinalizePage()` with a pointer to the output buffer. Encoding happens upon
// this call, and it also resets the encoder.
//
// Passing the byte size via the constructor is only recommended if the byte size is
// not 4 or 8. Using the template parameter allows for better optimization.
template <size_t T_SIZE>
class ParquetByteStreamSplitEncoder {
public:
// This constructor should be used when the byte size comes from the template parameter
// as a compile-time constant. This way it can be better optimized.
template <size_t SIZE = T_SIZE, std::enable_if_t<SIZE != 0, bool> = true>
ParquetByteStreamSplitEncoder() : size_in_bytes_(T_SIZE) {
static_assert(SIZE == T_SIZE);
}
// This constructor should be used when the byte size does not come from the template
// parameter. The byte size must be passed as an argument. This would usually be used
// for fixed_len_byte_array types.
template <size_t SIZE = T_SIZE, std::enable_if_t<SIZE == 0, bool> = true>
ParquetByteStreamSplitEncoder(int byte_count) : size_in_bytes_(byte_count) {
static_assert(SIZE == T_SIZE);
}
// The function sets the pointer and the length of the input buffer.
// `input_buffer` should point to the start of a buffer where the encoder can start
// gathering values to be encoded. If `prepopulated` > 0, the first `prepopulated`
// values in the buffer are treated as if they have already been added to the encoder.
// The buffer length must be non-negative. The pointer must not be a nullpointer.
void NewPage(uint8_t* input_buffer, int buffer_len, int prepopulated = 0);
// Adds `value` to the list of values to be encoded.
// The type (T) must be the same size as `size_in_bytes`.
// Only valid to call when `NewPage()` has already been called.
// Returns whether or not adding the value was successful.
template <typename T>
WARN_UNUSED_RESULT
bool Put(T value) {
DCHECK(sizeof(T) == getByteSize());
DCHECK(input_buffer_ != nullptr); // NewPage() must be called first
if (value_count_ * getByteSize() + sizeof(T) > input_buffer_len_) return false;
memcpy(input_buffer_ + value_count_ * getByteSize(), &value, getByteSize());
value_count_++;
return true;
}
// Adds `value` to the list of values to be encoded.
// It is the caller's responsibility to ensure that the value is the correct size.
// Only valid to call when `NewPage()` has already been called.
// Returns whether or not adding the value was successful.
bool PutBytes(const uint8_t* value) WARN_UNUSED_RESULT;
// Writes the encoded values to the `output_buffer`, and resets the encoder.
// Returns the number of values encoded, or -1 if an error occured.
// The `Put()` function can't be called until calling `NewPage()` again.
int FinalizePage(uint8_t* output_buffer, int output_buffer_len) WARN_UNUSED_RESULT;
private:
// Number of bytes that T consists of.
const int size_in_bytes_;
// Points to the first byte of the input buffer.
// This is where we store the values to be encoded.
uint8_t* input_buffer_ = nullptr;
// Length of the buffer in bytes.
int input_buffer_len_ = 0;
// The number of values to encode when `FinalizePage()` is called.
// Note that this contains the number of values, not bytes.
int value_count_ = 0;
void Reset();
// This is the helper function for FinalizePage() that does the actual encoding.
static void Encode(const uint8_t* to_encode, uint8_t* encoded,
int value_count, size_t byte_size);
ALWAYS_INLINE size_t getByteSize() const {
return T_SIZE == 0 ? size_in_bytes_ : T_SIZE;
}
};
} // namespace impala