| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // This file is copied from |
| // https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/bit-stream-utils.h |
| // and modified by Doris |
| |
| #pragma once |
| |
| #include "util/bit_packing.h" |
| #include "util/bit_util.h" |
| #include "util/faststring.h" |
| |
| using doris::BitUtil; |
| #include "common/compile_check_begin.h" |
| namespace doris { |
| |
| // Utility class to write bit/byte streams. This class can write data to either be |
| // bit packed or byte aligned (and a single stream that has a mix of both). |
| class BitWriter { |
| public: |
| // buffer: buffer to write bits to. |
| explicit BitWriter(faststring* buffer) : buffer_(buffer) { Clear(); } |
| |
| void Clear() { |
| buffered_values_ = 0; |
| byte_offset_ = 0; |
| bit_offset_ = 0; |
| buffer_->clear(); |
| } |
| |
| // Returns a pointer to the underlying buffer |
| faststring* buffer() const { return buffer_; } |
| |
| // The number of current bytes written, including the current byte (i.e. may include a |
| // fraction of a byte). Includes buffered values. |
| int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } |
| |
| // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit |
| // packed. |
| void PutValue(uint64_t v, int num_bits); |
| |
| // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the |
| // extra high-order bits will be ignored. |
| template <typename T> |
| void PutAligned(T v, int num_bits); |
| |
| // Write a Vlq encoded int to the buffer. The value is written byte aligned. |
| // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity |
| void PutVlqInt(int32_t v); |
| |
| // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. |
| size_t GetByteIndexAndAdvance(int num_bytes) { |
| uint8_t* ptr = GetNextBytePtr(num_bytes); |
| return ptr - buffer_->data(); |
| } |
| |
| // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. |
| uint8_t* GetNextBytePtr(int num_bytes); |
| |
| // Flushes all buffered values to the buffer. Call this when done writing to the buffer. |
| // If 'align' is true, buffered_values_ is reset and any future writes will be written |
| // to the next byte boundary. |
| void Flush(bool align = false); |
| |
| private: |
| // Bit-packed values are initially written to this variable before being memcpy'd to |
| // buffer_. This is faster than writing values byte by byte directly to buffer_. |
| uint64_t buffered_values_; |
| |
| faststring* buffer_ = nullptr; |
| int byte_offset_; // Offset in buffer_ |
| int bit_offset_; // Offset in buffered_values_ |
| }; |
| |
| // Utility class to read bit/byte stream. This class can read bits or bytes |
| // that are either byte aligned or not. It also has utilities to read multiple |
| // bytes in one read (e.g. encoded int). |
| class BitReader { |
| public: |
| // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. |
| BitReader(const uint8_t* buffer, int buffer_len); |
| |
| BitReader() : buffer_(nullptr), max_bytes_(0) {} |
| |
| // Gets the next value from the buffer. Returns true if 'v' could be read or false if |
| // there are not enough bytes left. num_bits must be <= 32. |
| template <typename T> |
| bool GetValue(int num_bits, T* v); |
| |
| // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a |
| // little-endian native type and big enough to store 'num_bytes'. The value is assumed |
| // to be byte-aligned so the stream will be advanced to the start of the next byte |
| // before 'v' is read. Returns false if there are not enough bytes left. |
| template <typename T> |
| bool GetAligned(int num_bytes, T* v); |
| |
| // Reads a vlq encoded int from the stream. The encoded int must start at the |
| // beginning of a byte. Return false if there were not enough bytes in the buffer. |
| bool GetVlqInt(uint32_t* v); |
| // Reads a zigzag encoded int `into` v. |
| bool GetZigZagVlqInt(int32_t* v); |
| |
| // Reads a vlq encoded int from the stream. The encoded int must start at the |
| // beginning of a byte. Return false if there were not enough bytes in the buffer. |
| bool GetVlqInt(uint64_t* v); |
| // Reads a zigzag encoded int `into` v. |
| bool GetZigZagVlqInt(int64_t* v); |
| |
| // Returns the number of bytes left in the stream, not including the current byte (i.e., |
| // there may be an additional fraction of a byte). |
| int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } |
| |
| // Current position in the stream, by bit. |
| int position() const { return byte_offset_ * 8 + bit_offset_; } |
| |
| // Rewind the stream by 'num_bits' bits |
| void Rewind(int num_bits); |
| |
| // Advance the stream by 'num_bits' bits |
| bool Advance(int64_t num_bits); |
| |
| // Seek to a specific bit in the buffer |
| void SeekToBit(unsigned int stream_position); |
| |
| // Maximum byte length of a vlq encoded int |
| static const int MAX_VLQ_BYTE_LEN = 5; |
| |
| // Maximum byte length of a vlq encoded int64 |
| static const int MAX_VLQ_BYTE_LEN_FOR_INT64 = 10; |
| |
| bool is_initialized() const { return buffer_ != nullptr; } |
| |
| const uint8_t* buffer() const { return buffer_; } |
| |
| int max_bytes() const { return max_bytes_; } |
| |
| private: |
| // Used by SeekToBit() and GetValue() to fetch the |
| // the next word into buffer_. |
| void BufferValues(); |
| |
| const uint8_t* buffer_ = nullptr; |
| int max_bytes_; |
| |
| // Bytes are memcpy'd from buffer_ and values are read from this variable. This is |
| // faster than reading values byte by byte directly from buffer_. |
| uint64_t buffered_values_; |
| |
| int byte_offset_; // Offset in buffer_ |
| int bit_offset_; // Offset in buffered_values_ |
| }; |
| |
| /// Utility class to read bit/byte stream. This class can read bits or bytes that are |
| /// either byte aligned or not. It also has utilities to read multiple bytes in one |
| /// read (e.g. encoded int). Exposes a batch-oriented interface to allow efficient |
| /// processing of multiple values at a time. |
| class BatchedBitReader { |
| public: |
| /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. |
| /// Does not take ownership of the buffer. |
| BatchedBitReader(const uint8_t* buffer, int64_t buffer_len) { Reset(buffer, buffer_len); } |
| |
| BatchedBitReader() {} |
| |
| // The implicit copy constructor is left defined. If a BatchedBitReader is copied, the |
| // two copies do not share any state. Invoking functions on either copy continues |
| // reading from the current read position without modifying the state of the other |
| // copy. |
| |
| /// Resets the read to start reading from the start of 'buffer'. The buffer's |
| /// length is 'buffer_len'. Does not take ownership of the buffer. |
| void Reset(const uint8_t* buffer, int64_t buffer_len) { |
| DCHECK(buffer != nullptr); |
| DCHECK_GE(buffer_len, 0); |
| buffer_pos_ = buffer; |
| buffer_end_ = buffer + buffer_len; |
| } |
| |
| /// Gets up to 'num_values' bit-packed values, starting from the current byte in the |
| /// buffer and advance the read position. 'bit_width' must be <= 64. |
| /// If 'bit_width' * 'num_values' is not a multiple of 8, the trailing bytes are |
| /// skipped and the next UnpackBatch() call will start reading from the next byte. |
| /// |
| /// If the caller does not want to drop trailing bits, 'num_values' must be exactly the |
| /// total number of values the caller wants to read from a run of bit-packed values, or |
| /// 'bit_width' * 'num_values' must be a multiple of 8. This condition is always |
| /// satisfied if 'num_values' is a multiple of 32. |
| /// |
| /// The output type 'T' must be an unsigned integer. |
| /// |
| /// Returns the number of values read. |
| template <typename T> |
| int UnpackBatch(int bit_width, int num_values, T* v); |
| |
| /// Skip 'num_values_to_skip' bit-packed values. |
| /// 'num_values_to_skip * bit_width' is either divisible by 8, or |
| /// 'num_values_to_skip' equals to the count of the remaining bit-packed values. |
| bool SkipBatch(int bit_width, int num_values_to_skip); |
| |
| /// Unpack bit-packed values in the same way as UnpackBatch() and decode them using the |
| /// dictionary 'dict' with 'dict_len' entries. Return -1 if a decoding error is |
| /// encountered, i.e. if the bit-packed values are not valid indices in 'dict'. |
| /// Otherwise returns the number of values decoded. The values are written to 'v' with |
| /// a stride of 'stride' bytes. |
| template <typename T> |
| int UnpackAndDecodeBatch(int bit_width, T* dict, int64_t dict_len, int num_values, T* v, |
| int64_t stride); |
| |
| /// Reads an unpacked 'num_bytes'-sized value from the buffer and stores it in 'v'. T |
| /// needs to be a little-endian native type and big enough to store 'num_bytes'. |
| /// Returns false if there are not enough bytes left. |
| template <typename T> |
| bool GetBytes(int num_bytes, T* v); |
| |
| /// Read an unsigned ULEB-128 encoded int from the stream. The encoded int must start |
| /// at the beginning of a byte. Return false if there were not enough bytes in the |
| /// buffer or the int is invalid. For more details on ULEB-128: |
| /// https://en.wikipedia.org/wiki/LEB128 |
| /// UINT_T must be an unsigned integer type. |
| template <typename UINT_T> |
| bool GetUleb128(UINT_T* v); |
| |
| /// Returns the number of bytes left in the stream. |
| int bytes_left() { return static_cast<int>(buffer_end_ - buffer_pos_); } |
| |
| /// Maximum byte length of a vlq encoded integer of type T. |
| template <typename T> |
| static constexpr int max_vlq_byte_len() { |
| return BitUtil::Ceil(sizeof(T) * 8, 7); |
| } |
| |
| /// Maximum supported bitwidth for reader. |
| static const int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH; |
| |
| private: |
| /// Current read position in the buffer. |
| const uint8_t* buffer_pos_ = nullptr; |
| |
| /// Pointer to the byte after the end of the buffer. |
| const uint8_t* buffer_end_ = nullptr; |
| }; |
| #include "common/compile_check_end.h" |
| } // namespace doris |