| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <cstring> |
| |
| #include "arrow/util/bit_util.h" |
| #include "arrow/util/endian.h" |
| #include "arrow/util/macros.h" |
| |
| namespace arrow { |
| namespace internal { |
| |
| class BitmapWriter { |
| // A sequential bitwise writer that preserves surrounding bit values. |
| |
| public: |
| BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length) |
| : bitmap_(bitmap), position_(0), length_(length) { |
| byte_offset_ = start_offset / 8; |
| bit_mask_ = BitUtil::kBitmask[start_offset % 8]; |
| if (length > 0) { |
| current_byte_ = bitmap[byte_offset_]; |
| } else { |
| current_byte_ = 0; |
| } |
| } |
| |
| void Set() { current_byte_ |= bit_mask_; } |
| |
| void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; } |
| |
| void Next() { |
| bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1); |
| ++position_; |
| if (bit_mask_ == 0) { |
| // Finished this byte, need advancing |
| bit_mask_ = 0x01; |
| bitmap_[byte_offset_++] = current_byte_; |
| if (ARROW_PREDICT_TRUE(position_ < length_)) { |
| current_byte_ = bitmap_[byte_offset_]; |
| } |
| } |
| } |
| |
| void Finish() { |
| // Store current byte if we didn't went past bitmap storage |
| if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { |
| bitmap_[byte_offset_] = current_byte_; |
| } |
| } |
| |
| int64_t position() const { return position_; } |
| |
| private: |
| uint8_t* bitmap_; |
| int64_t position_; |
| int64_t length_; |
| |
| uint8_t current_byte_; |
| uint8_t bit_mask_; |
| int64_t byte_offset_; |
| }; |
| |
| class FirstTimeBitmapWriter { |
| // Like BitmapWriter, but any bit values *following* the bits written |
| // might be clobbered. It is hence faster than BitmapWriter, and can |
| // also avoid false positives with Valgrind. |
| |
| public: |
| FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length) |
| : bitmap_(bitmap), position_(0), length_(length) { |
| current_byte_ = 0; |
| byte_offset_ = start_offset / 8; |
| bit_mask_ = BitUtil::kBitmask[start_offset % 8]; |
| if (length > 0) { |
| current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8]; |
| } else { |
| current_byte_ = 0; |
| } |
| } |
| |
| /// Appends number_of_bits from word to valid_bits and valid_bits_offset. |
| /// |
| /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed |
| /// to be unset (i.e. 0). |
| /// \param[in] number_of_bits The number of bits to append from word. |
| void AppendWord(uint64_t word, int64_t number_of_bits) { |
| if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { |
| return; |
| } |
| |
| // Location that the first byte needs to be written to. |
| uint8_t* append_position = bitmap_ + byte_offset_; |
| |
| // Update state variables except for current_byte_ here. |
| position_ += number_of_bits; |
| int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_)); |
| bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; |
| byte_offset_ += (bit_offset + number_of_bits) / 8; |
| |
| if (bit_offset != 0) { |
| // We are in the middle of the byte. This code updates the byte and shifts |
| // bits appropriately within word so it can be memcpy'd below. |
| int64_t bits_to_carry = 8 - bit_offset; |
| // Carry over bits from word to current_byte_. We assume any extra bits in word |
| // unset so no additional accounting is needed for when number_of_bits < |
| // bits_to_carry. |
| current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset; |
| // Check if everything is transfered into current_byte_. |
| if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { |
| return; |
| } |
| *append_position = current_byte_; |
| append_position++; |
| // Move the carry bits off of word. |
| word = word >> bits_to_carry; |
| number_of_bits -= bits_to_carry; |
| } |
| word = BitUtil::ToLittleEndian(word); |
| int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); |
| std::memcpy(append_position, &word, bytes_for_word); |
| // At this point, the previous current_byte_ has been written to bitmap_. |
| // The new current_byte_ is either the last relevant byte in 'word' |
| // or cleared if the new position is byte aligned (i.e. a fresh byte). |
| if (bit_mask_ == 0x1) { |
| current_byte_ = 0; |
| } else { |
| current_byte_ = *(append_position + bytes_for_word - 1); |
| } |
| } |
| |
| void Set() { current_byte_ |= bit_mask_; } |
| |
| void Clear() {} |
| |
| void Next() { |
| bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1); |
| ++position_; |
| if (bit_mask_ == 0) { |
| // Finished this byte, need advancing |
| bit_mask_ = 0x01; |
| bitmap_[byte_offset_++] = current_byte_; |
| current_byte_ = 0; |
| } |
| } |
| |
| void Finish() { |
| // Store current byte if we didn't went go bitmap storage |
| if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { |
| bitmap_[byte_offset_] = current_byte_; |
| } |
| } |
| |
| int64_t position() const { return position_; } |
| |
| private: |
| uint8_t* bitmap_; |
| int64_t position_; |
| int64_t length_; |
| |
| uint8_t current_byte_; |
| uint8_t bit_mask_; |
| int64_t byte_offset_; |
| }; |
| |
| template <typename Word, bool may_have_byte_offset = true> |
| class BitmapWordWriter { |
| public: |
| BitmapWordWriter() = default; |
| BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) |
| : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)), |
| bitmap_(bitmap + offset / 8), |
| bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)), |
| mask_((1U << offset_) - 1) { |
| if (offset_) { |
| if (length >= static_cast<int>(sizeof(Word) * 8)) { |
| current_data.word_ = load<Word>(bitmap_); |
| } else if (length > 0) { |
| current_data.epi.byte_ = load<uint8_t>(bitmap_); |
| } |
| } |
| } |
| |
| void PutNextWord(Word word) { |
| if (may_have_byte_offset && offset_) { |
| // split one word into two adjacent words, don't touch unused bits |
| // |<------ word ----->| |
| // +-----+-------------+ |
| // | A | B | |
| // +-----+-------------+ |
| // | | |
| // v v offset |
| // +-------------+-----+-------------+-----+ |
| // | --- | A | B | --- | |
| // +-------------+-----+-------------+-----+ |
| // |<------ next ----->|<---- current ---->| |
| word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_)); |
| Word next_word = load<Word>(bitmap_ + sizeof(Word)); |
| current_data.word_ = (current_data.word_ & mask_) | (word & ~mask_); |
| next_word = (next_word & ~mask_) | (word & mask_); |
| store<Word>(bitmap_, current_data.word_); |
| store<Word>(bitmap_ + sizeof(Word), next_word); |
| current_data.word_ = next_word; |
| } else { |
| store<Word>(bitmap_, word); |
| } |
| bitmap_ += sizeof(Word); |
| } |
| |
| void PutNextTrailingByte(uint8_t byte, int valid_bits) { |
| if (valid_bits == 8) { |
| if (may_have_byte_offset && offset_) { |
| byte = (byte << offset_) | (byte >> (8 - offset_)); |
| uint8_t next_byte = load<uint8_t>(bitmap_ + 1); |
| current_data.epi.byte_ = (current_data.epi.byte_ & mask_) | (byte & ~mask_); |
| next_byte = (next_byte & ~mask_) | (byte & mask_); |
| store<uint8_t>(bitmap_, current_data.epi.byte_); |
| store<uint8_t>(bitmap_ + 1, next_byte); |
| current_data.epi.byte_ = next_byte; |
| } else { |
| store<uint8_t>(bitmap_, byte); |
| } |
| ++bitmap_; |
| } else { |
| assert(valid_bits > 0); |
| assert(valid_bits < 8); |
| assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_); |
| internal::BitmapWriter writer(bitmap_, offset_, valid_bits); |
| for (int i = 0; i < valid_bits; ++i) { |
| (byte & 0x01) ? writer.Set() : writer.Clear(); |
| writer.Next(); |
| byte >>= 1; |
| } |
| writer.Finish(); |
| } |
| } |
| |
| private: |
| int64_t offset_; |
| uint8_t* bitmap_; |
| |
| const uint8_t* bitmap_end_; |
| uint64_t mask_; |
| union { |
| Word word_; |
| struct { |
| #if ARROW_LITTLE_ENDIAN == 0 |
| uint8_t padding_bytes_[sizeof(Word) - 1]; |
| #endif |
| uint8_t byte_; |
| } epi; |
| } current_data; |
| |
| template <typename DType> |
| DType load(const uint8_t* bitmap) { |
| assert(bitmap + sizeof(DType) <= bitmap_end_); |
| return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap)); |
| } |
| |
| template <typename DType> |
| void store(uint8_t* bitmap, DType data) { |
| assert(bitmap + sizeof(DType) <= bitmap_end_); |
| util::SafeStore(bitmap, BitUtil::FromLittleEndian(data)); |
| } |
| }; |
| |
| } // namespace internal |
| } // namespace arrow |