| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <cstring> |
| |
| #include "arrow/buffer.h" |
| #include "arrow/util/bit_util.h" |
| #include "arrow/util/endian.h" |
| #include "arrow/util/macros.h" |
| |
| namespace arrow { |
| namespace internal { |
| |
| class BitmapReader { |
| public: |
| BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) |
| : bitmap_(bitmap), position_(0), length_(length) { |
| current_byte_ = 0; |
| byte_offset_ = start_offset / 8; |
| bit_offset_ = start_offset % 8; |
| if (length > 0) { |
| current_byte_ = bitmap[byte_offset_]; |
| } |
| } |
| |
| bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; } |
| |
| bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; } |
| |
| void Next() { |
| ++bit_offset_; |
| ++position_; |
| if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) { |
| bit_offset_ = 0; |
| ++byte_offset_; |
| if (ARROW_PREDICT_TRUE(position_ < length_)) { |
| current_byte_ = bitmap_[byte_offset_]; |
| } |
| } |
| } |
| |
| int64_t position() const { return position_; } |
| |
| int64_t length() const { return length_; } |
| |
| private: |
| const uint8_t* bitmap_; |
| int64_t position_; |
| int64_t length_; |
| |
| uint8_t current_byte_; |
| int64_t byte_offset_; |
| int64_t bit_offset_; |
| }; |
| |
| // XXX Cannot name it BitmapWordReader because the name is already used |
| // in bitmap_ops.cc |
| |
| class BitmapUInt64Reader { |
| public: |
| BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length) |
| : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8), |
| num_carry_bits_(8 - start_offset % 8), |
| length_(length), |
| remaining_length_(length_) { |
| if (length_ > 0) { |
| // Load carry bits from the first byte's MSBs |
| if (length_ >= num_carry_bits_) { |
| carry_bits_ = |
| LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_); |
| } else { |
| carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_); |
| } |
| } |
| } |
| |
| uint64_t NextWord() { |
| if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) { |
| // We can load a full word |
| uint64_t next_word = LoadFullWord(); |
| // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word |
| uint64_t word = carry_bits_ | (next_word << num_carry_bits_); |
| carry_bits_ = next_word >> (64 - num_carry_bits_); |
| remaining_length_ -= 64; |
| return word; |
| } else if (remaining_length_ > num_carry_bits_) { |
| // We can load a partial word |
| uint64_t next_word = |
| LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_); |
| uint64_t word = carry_bits_ | (next_word << num_carry_bits_); |
| carry_bits_ = next_word >> (64 - num_carry_bits_); |
| remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0); |
| return word; |
| } else { |
| remaining_length_ = 0; |
| return carry_bits_; |
| } |
| } |
| |
| int64_t position() const { return length_ - remaining_length_; } |
| |
| int64_t length() const { return length_; } |
| |
| private: |
| uint64_t LoadFullWord() { |
| uint64_t word; |
| memcpy(&word, bitmap_, 8); |
| bitmap_ += 8; |
| return BitUtil::ToLittleEndian(word); |
| } |
| |
| uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) { |
| uint64_t word = 0; |
| const int64_t num_bytes = BitUtil::BytesForBits(num_bits); |
| memcpy(&word, bitmap_, num_bytes); |
| bitmap_ += num_bytes; |
| return (BitUtil::ToLittleEndian(word) >> bit_offset) & |
| BitUtil::LeastSignificantBitMask(num_bits); |
| } |
| |
| const uint8_t* bitmap_; |
| const int64_t num_carry_bits_; // in [1, 8] |
| const int64_t length_; |
| int64_t remaining_length_; |
| uint64_t carry_bits_; |
| }; |
| |
| // BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h) |
| // on sufficiently large inputs. However, it has a larger prolog / epilog overhead |
| // and should probably not be used for small bitmaps. |
| |
| template <typename Word, bool may_have_byte_offset = true> |
| class BitmapWordReader { |
| public: |
| BitmapWordReader() = default; |
| BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) |
| : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)), |
| bitmap_(bitmap + offset / 8), |
| bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) { |
| // decrement word count by one as we may touch two adjacent words in one iteration |
| nwords_ = length / (sizeof(Word) * 8) - 1; |
| if (nwords_ < 0) { |
| nwords_ = 0; |
| } |
| trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8); |
| trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_)); |
| |
| if (nwords_ > 0) { |
| current_data.word_ = load<Word>(bitmap_); |
| } else if (length > 0) { |
| current_data.epi.byte_ = load<uint8_t>(bitmap_); |
| } |
| } |
| |
| Word NextWord() { |
| bitmap_ += sizeof(Word); |
| const Word next_word = load<Word>(bitmap_); |
| Word word = current_data.word_; |
| if (may_have_byte_offset && offset_) { |
| // combine two adjacent words into one word |
| // |<------ next ----->|<---- current ---->| |
| // +-------------+-----+-------------+-----+ |
| // | --- | A | B | --- | |
| // +-------------+-----+-------------+-----+ |
| // | | offset |
| // v v |
| // +-----+-------------+ |
| // | A | B | |
| // +-----+-------------+ |
| // |<------ word ----->| |
| word >>= offset_; |
| word |= next_word << (sizeof(Word) * 8 - offset_); |
| } |
| current_data.word_ = next_word; |
| return word; |
| } |
| |
| uint8_t NextTrailingByte(int& valid_bits) { |
| uint8_t byte; |
| assert(trailing_bits_ > 0); |
| |
| if (trailing_bits_ <= 8) { |
| // last byte |
| valid_bits = trailing_bits_; |
| trailing_bits_ = 0; |
| byte = 0; |
| internal::BitmapReader reader(bitmap_, offset_, valid_bits); |
| for (int i = 0; i < valid_bits; ++i) { |
| byte >>= 1; |
| if (reader.IsSet()) { |
| byte |= 0x80; |
| } |
| reader.Next(); |
| } |
| byte >>= (8 - valid_bits); |
| } else { |
| ++bitmap_; |
| const uint8_t next_byte = load<uint8_t>(bitmap_); |
| byte = current_data.epi.byte_; |
| if (may_have_byte_offset && offset_) { |
| byte >>= offset_; |
| byte |= next_byte << (8 - offset_); |
| } |
| current_data.epi.byte_ = next_byte; |
| trailing_bits_ -= 8; |
| trailing_bytes_--; |
| valid_bits = 8; |
| } |
| return byte; |
| } |
| |
| int64_t words() const { return nwords_; } |
| int trailing_bytes() const { return trailing_bytes_; } |
| |
| private: |
| int64_t offset_; |
| const uint8_t* bitmap_; |
| |
| const uint8_t* bitmap_end_; |
| int64_t nwords_; |
| int trailing_bits_; |
| int trailing_bytes_; |
| union { |
| Word word_; |
| struct { |
| #if ARROW_LITTLE_ENDIAN == 0 |
| uint8_t padding_bytes_[sizeof(Word) - 1]; |
| #endif |
| uint8_t byte_; |
| } epi; |
| } current_data; |
| |
| template <typename DType> |
| DType load(const uint8_t* bitmap) { |
| assert(bitmap + sizeof(DType) <= bitmap_end_); |
| return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap)); |
| } |
| }; |
| |
| /// \brief Index into a possibly non-existent bitmap |
| struct OptionalBitIndexer { |
| const uint8_t* bitmap; |
| const int64_t offset; |
| |
| explicit OptionalBitIndexer(const std::shared_ptr<Buffer>& buffer, int64_t offset = 0) |
| : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {} |
| |
| bool operator[](int64_t i) const { |
| return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i); |
| } |
| }; |
| |
| } // namespace internal |
| } // namespace arrow |