| // |
| // Created by 李烁麟 on 25-8-4. |
| // |
| |
| #ifndef SIMD_TS_2DIFF_H |
| #define SIMD_TS_2DIFF_H |
| |
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include "simde/x86/avx2.h"
#include "utils.h"
| |
// Number of values per encoded block.
// `inline constexpr` (C++17) gives a single immutable definition across all
// translation units; the previous `static int` was mutable and duplicated a
// writable copy into every TU that included this header.
inline constexpr int N = 128;
| |
// Comparison kinds supported by `predictor`.
enum class predict_type {
    EQ,  // value == value_
    GT,  // value >  value_
    GE,  // value >= value_
    LE,  // value <= value_
    LT,  // value <  value_
    BT,  // value_ <= value <= rvalue_ (inclusive "between")
};

// Decision for a whole encoded block, given bounds on its value range:
// keep every value (All), skip the block (None), or test each value (NeedScan).
enum class block_action { All, None, NeedScan };

// Push-down predicate over int32 values. For BT, `value_` is the inclusive
// lower bound and `rvalue_` the inclusive upper bound; all other types use
// only `value_`.
struct predictor {
    predict_type type_;
    int32_t value_;
    int32_t rvalue_;

    // Returns true when `value` satisfies the predicate.
    // An out-of-range `type_` conservatively rejects everything.
    bool satisfy(int32_t value) const {
        switch (type_) {
            case predict_type::EQ:
                return value == value_;
            case predict_type::GT:
                return value > value_;
            case predict_type::GE:
                return value >= value_;
            case predict_type::LT:
                return value < value_;
            case predict_type::LE:
                return value <= value_;
            case predict_type::BT:
                return value >= value_ && value <= rvalue_;
            default:
                return false;
        }
    }

    // Classifies a block whose values all lie within [lvalue, rvalue]:
    //   All      -> every value in the range satisfies the predicate,
    //   None     -> no value in the range can satisfy it (block skippable),
    //   NeedScan -> the range straddles the threshold; scan value-by-value.
    // The bounds need only be conservative (lvalue <= true min,
    // rvalue >= true max) for None/All conclusions to stay sound.
    block_action block_check(int32_t lvalue, int32_t rvalue) const {
        switch (type_) {
            case predict_type::LT: {
                if (lvalue >= value_) return block_action::None;
                if (rvalue < value_) return block_action::All;
                return block_action::NeedScan;
            }
            case predict_type::LE: {
                if (lvalue > value_) return block_action::None;
                if (rvalue <= value_) return block_action::All;
                return block_action::NeedScan;
            }
            case predict_type::GT: {
                if (rvalue <= value_) return block_action::None;
                if (lvalue > value_) return block_action::All;
                return block_action::NeedScan;
            }
            case predict_type::GE: {
                if (rvalue < value_) return block_action::None;
                if (lvalue >= value_) return block_action::All;
                return block_action::NeedScan;
            }
            case predict_type::EQ: {
                // Range entirely misses the target -> skip the block.
                if (rvalue < value_ || lvalue > value_)
                    return block_action::None;
                // Degenerate range [v, v]: every value equals the target.
                // (Previously this arm had two branches that both returned
                // NeedScan, so the All case was never reported.)
                if (lvalue == value_ && rvalue == value_)
                    return block_action::All;
                return block_action::NeedScan;
            }
            case predict_type::BT: {
                // Bug fix: BT previously had no case here and always fell
                // through to NeedScan, so blocks wholly outside (or wholly
                // inside) [value_, rvalue_] were still scanned per value.
                if (rvalue < value_ || lvalue > rvalue_)
                    return block_action::None;
                if (lvalue >= value_ && rvalue <= rvalue_)
                    return block_action::All;
                return block_action::NeedScan;
            }
        }
        return block_action::NeedScan;
    }
};
| |
// Streaming decoder for TS_2DIFF-encoded int32 data with optional predicate
// push-down and an AVX2 (via SIMDe) fast path unpacking 8 deltas at a time.
//
// Per-block layout, as read by handle_one_block:
//   uint32 write_index  -- number of packed deltas in the block
//   uint32 bit_width    -- bits per packed delta
//   uint32 delta_min    -- minimum delta, added back to every raw delta
//   uint32 first_value  -- the block's first absolute value
//   ...then `write_index` deltas packed at `bit_width` bits each.
// Values are rebuilt as a running prefix sum:
//   v[i] = v[i-1] + raw_delta[i] + delta_min.
// Instances hold a bit cursor and running value, so they are stateful and
// not thread-safe.
class simd_ts_2diff_decoder {
public:
    // data_array/data_size: the encoded buffer (borrowed, not owned).
    // predict: optional value filter, also used to skip whole blocks;
    // nullptr means "keep everything". Borrowed, must outlive the decoder.
    simd_ts_2diff_decoder(uint8_t *data_array, int32_t data_size,
                          predictor *predict = nullptr)
        : data_array_(data_array), data_size_(data_size), predict_(predict) {
        bit_width_ = 0;
        write_index_ = 0;
        delta_min_ = 0;
        previous_value_ = 0;
        position_ = 0;
        bits_left_ = 0;
    };

    // Decodes the next block (if any) into out/offset_map, advancing `size`
    // by the number of values kept. Returns false once the entire input
    // buffer has been consumed.
    bool decode(int32_t *out, int32_t *offset_map, int &size) {
        if (position_ < data_size_) {
            position_ += handle_one_block(data_array_ + position_, out,
                                          offset_map, size);
            return true;
        } else {
            return false;
        }
    }

    // Appends the lanes of one 8-value batch selected by `mask` to the
    // values/offsets vectors; emitted offset is position + lane + 1.
    // NOTE(review): this helper is not called anywhere in this header --
    // presumably an alternative drain path for vector-based callers; confirm
    // before removing.
    static inline void drain_batch_compact(const int32_t out_u32[8],
                                           uint8_t mask, uint32_t position,
                                           std::vector<int32_t> &values,
                                           std::vector<int32_t> &offsets) {
        if (!mask) return;
        const int k = __builtin_popcount(mask);
        values.reserve(values.size() + k);
        offsets.reserve(offsets.size() + k);
        int32_t tmp_vals[8];
        int32_t tmp_offs[8];
        int n = 0;
        uint32_t m = mask;
        while (m) {
            // Lowest set bit = next selected lane; clear it and emit.
            int j = __builtin_ctz(m);
            m &= (m - 1);
            tmp_vals[n] = out_u32[j];
            tmp_offs[n] = position + (uint32_t)j + 1;
            ++n;
        }
        values.insert(values.end(), tmp_vals, tmp_vals + n);
        offsets.insert(offsets.end(), tmp_offs, tmp_offs + n);
    }

    // Decodes one encoded block starting at `data_array`, appending kept
    // values to `value` and their within-block positions to `offset_map`
    // while bumping `size`. Returns the number of input bytes the block
    // occupies (16-byte header + bit-packed payload), whether or not the
    // block was skipped by the predicate.
    size_t handle_one_block(uint8_t *data_array, int32_t *value,
                            int32_t *offset_map, int &size) {
        uint8_t *in = data_array;
        // Block header: count, bit width, minimum delta, first value.
        write_index_ = read_ui32(in);
        bit_width_ = read_ui32(in);
        delta_min_ = read_ui32(in);
        previous_value_ = read_ui32(in);
        // Conservative bounds on the block's min/max value, used only for
        // predicate-based block skipping.
        // NOTE(review): these bound formulas look approximate (the max bound
        // mixes delta_min_ with delta_upper_bound and gates on max_delta's
        // sign) -- verify they stay conservative for negative deltas and for
        // bit_width_ near 31 (1 << bit_width_ can overflow int32).
        int32_t block_min_lower =
            previous_value_ + (delta_min_ >= 0 ? 0 : N) * delta_min_;
        int32_t delta_upper_bound = (1 << bit_width_) - 1;
        int32_t max_delta = delta_min_ + delta_upper_bound;
        int32_t block_max_upper = previous_value_ +
                                  delta_min_ * (max_delta > 0 ? 1 : 0) +
                                  (max_delta > 0 ? N : 0) * delta_upper_bound;
        block_action action =
            predict_ == nullptr
                ? block_action::All
                : predict_->block_check(block_min_lower, block_max_upper);
        if (action == block_action::None) {
            // Predicate proves the block empty: skip header + payload.
            return sizeof(write_index_) + sizeof(bit_width_) +
                   sizeof(delta_min_) + sizeof(previous_value_) +
                   (bit_width_ * write_index_ + 7) / 8;
        }

        // Emit the block's first (absolute) value at offset 0.
        // NOTE(review): the offset goes to offset_map[0], not
        // offset_map[size]; this only lines up with `value[size++]` when
        // size == 0 on entry -- confirm callers reset `size` per decode call.
        if (predict_ != nullptr) {
            if (predict_->satisfy(previous_value_)) {
                value[size++] = previous_value_;
                offset_map[0] = 0;
            }
        } else {
            value[size++] = previous_value_;
            offset_map[0] = 0;
        }

        for (int i = 0; i < write_index_;) {
            if (!can_simd_decode8(i, in, data_array + data_size_)) {
                // Fewer than 8 values left, or too close to the buffer end
                // for the over-reading gathers: finish the whole remainder
                // with the scalar bit reader.
                // NOTE(review): the SIMD branch below never advances `in`
                // or the bit cursor, so if SIMD batches ran first this
                // scalar tail re-reads from the payload start -- confirm
                // blocks are either fully SIMD-decoded or fully scalar.
                const int remain_data = write_index_ - i;
                previous_value_ =
                    handle_normal_data(in, previous_value_, remain_data, i,
                                       value, offset_map, size);
                i += remain_data;
            } else {
                // SIMD path: unpack 8 deltas, prefix-sum them into absolute
                // values, and obtain a per-lane predicate mask.
                int32_t out_u32[8];
                uint8_t mask;
                previous_value_ =
                    handle_batch_data(in, bit_width_, delta_min_, i,
                                      previous_value_, out_u32, mask);
                if (mask != 0) {
                    if(mask == 0xFF) {
                        // All 8 lanes kept: bulk copy values, then offsets.
                        memcpy(value +size, out_u32, 8 * sizeof(int32_t));
                        for (int k = 0; k < 8; ++k) offset_map[size + k] = i + k + 1;
                        size += 8;
                    } else {
                        // Compact only the lanes whose mask bit is set.
                        uint8_t m = mask;
                        while(m) {
                            int tz = __builtin_ctz((unsigned )m);
                            value[size] = out_u32[tz];
                            offset_map[size] = tz + i + 1;
                            ++ size;
                            m &= m -1;
                        }
                    }
                }
                i += 8;
            }
        }
        // Bytes consumed: 4 header fields + payload rounded up to bytes.
        return sizeof(write_index_) + sizeof(bit_width_) + sizeof(delta_min_) +
               sizeof(previous_value_) + (bit_width_ * write_index_ + 7) / 8;
    }

    // Scalar fallback: decodes `cap` packed deltas starting at logical
    // value index `position`, applying the predicate per value. Returns the
    // last reconstructed value (the new running base).
    int32_t handle_normal_data(uint8_t *&in, int32_t base, int cap,
                               int position, int32_t *out, int32_t *offset,
                               int &size) {
        int32_t basement = base;
        for (int i = 0; i < cap; ++i) {
            int32_t value = read_long(bit_width_, in);
            // Undo the delta encoding: raw + previous + min delta.
            value += (basement + delta_min_);
            basement = value;  // running prefix sum
            if (predict_ != nullptr) {
                if (predict_->satisfy(value)) {
                    out[size] = value;
                    offset[size++] = position + i + 1;
                }
            } else {
                out[size] = value;
                offset[size++] = position + i + 1;
            }
        }
        return basement;
    }

    // Reads the next `bits` bits from the stream, MSB-first within each
    // byte, refilling the one-byte buffer as needed.
    int32_t read_long(int bits, uint8_t *&in) {
        int32_t value = 0;
        while (bits > 0) {
            read_byte_if_empty(in);
            if (bits > bits_left_ || bits == 8) {
                // Need at least a full buffer's worth: consume all
                // remaining buffered bits and loop for more.
                auto d = (uint8_t)(buffer_ & ((1 << bits_left_) - 1));
                value = (value << bits_left_) + (d & 0xFF);
                bits -= bits_left_;
                bits_left_ = 0;
            } else {
                // Request fits in the buffer: take the top `bits` of the
                // buffered bits and keep the rest for the next call.
                auto d = (uint8_t)((((uint8_t)buffer_) >> (bits_left_ - bits)) &
                                   ((1 << bits) - 1));
                value = (value << bits) + (d & 0xFF);
                bits_left_ -= bits;
                bits = 0;
            }
            if (bits <= 0) {
                break;  // redundant with the loop condition; kept as-is
            }
        }
        return value;
    }

    // Refills the single-byte bit buffer from `in` when it is empty,
    // advancing the input pointer.
    void read_byte_if_empty(uint8_t *&in) {
        if (bits_left_ == 0) {
            memcpy(&buffer_, in, 1);
            bits_left_ = 8;
            in += 1;
        }
    }

    // True when at least 8 values remain in the block AND enough input
    // bytes remain that the over-reading 32/64-bit gathers stay in bounds.
    // NOTE(review): the caller passes end = data_array + data_size_, where
    // data_array is the *block* start but data_size_ is the whole buffer
    // size -- for any block after the first this points past the real end;
    // verify the intended bound is data_array_ + data_size_.
    bool can_simd_decode8(int pos, const uint8_t* cur, const uint8_t * end) {
        if (write_index_ - pos < 8) {
            return false;
        }
        // Each lane gathers a whole 32- or 64-bit word starting at its
        // field's first byte, so the last lane may read past its own field.
        const int32_t overhead_bits = (bit_width_ >= 16) ? 64 : 32;
        const int32_t rest_data = 8;
        const int32_t need_bits = (rest_data - 1) * bit_width_ + overhead_bits;
        const size_t remain_bits = size_t(end - cur) * 8;
        return need_bits <= remain_bits;
    }

    // Unpacks the 8 consecutive `w`-bit deltas beginning at value index
    // `ind` (bit offsets computed relative to `in`), adds `min_delta`,
    // prefix-sums them on top of `base`, stores the 8 absolute values to
    // out_u32, and sets one mask bit per value passing the predicate (all
    // 8 bits when predict_ is null). Returns the last value (new base).
    int32_t handle_batch_data(const uint8_t *in, int32_t w, int32_t min_delta,
                              int32_t ind, int32_t base, int32_t out_u32[8],
                              uint8_t &mask) {
        // This function decodes 8 values from the input data array,
        // producing the final results that satisfy the prediction.

        // Set byte-reversal method to convert little-endian data to big-endian
        // format.
        static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
            7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
            2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

        static const simde__m128i SHUF_REV4 = simde_mm_setr_epi8(
            3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);

        const simde__m128i VMIN4 = simde_mm_set1_epi32(min_delta);

        int32_t basement = base;
        mask = 0;

        // Decode using two loops (two groups of 4 lanes = 8 values).
        for (int32_t grp = 0; grp < 2; ++grp) {
            // Absolute bit position of each lane's packed field.
            int32_t pos0 = grp * 4u * w + ind * w;
            int32_t pos[4] = {pos0 + 0 * w, pos0 + 1 * w, pos0 + 2 * w,
                              pos0 + 3 * w};
            // The byte items start
            int32_t bidx_s[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3,
                                 pos[3] >> 3};
            // The offset items start
            int32_t off_s[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};

            simde__m128i IDX128 =
                simde_mm_setr_epi32(bidx_s[0], bidx_s[1], bidx_s[2], bidx_s[3]);
            simde__m128i OFF128 =
                simde_mm_setr_epi32(off_s[0], off_s[1], off_s[2], off_s[3]);

            simde__m128i V4;

            if (w <= 16) {
                // Field + up-to-7-bit offset fits in 32 bits: gather one
                // 32-bit word per lane (scale 1 = byte granularity),
                // byte-swap, shift the field to the top, then shift down.
                const int rshift = 32 - w;
                simde__m128i w32_le =
                    simde_mm_i32gather_epi32((const int *)in, IDX128, 1);
                simde__m128i w32_be = simde_mm_shuffle_epi8(w32_le, SHUF_REV4);
                simde__m128i U32 = simde_mm_sllv_epi32(w32_be, OFF128);
                simde__m128i RS32 = simde_mm_set1_epi32(rshift);
                simde__m128i V32 = simde_mm_srlv_epi32(U32, RS32);
                V4 = V32;
            } else {
                // Wide fields: same extraction on 64-bit lanes, then pack
                // the low 32 bits of each 64-bit lane into one 128-bit reg.
                const int rshift = 64 - w;
                simde__m256i w64_le = simde_mm256_i32gather_epi64(
                    (const long long *)in, IDX128, 1);
                simde__m256i w64_be =
                    simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
                simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(OFF128);
                simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
                simde__m256i V64 =
                    simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
                simde__m256i V32_8 = V64;
                simde__m256i perm =
                    simde_mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
                simde__m256i comp =
                    simde_mm256_permutevar8x32_epi32(V32_8, perm);
                V4 = simde_mm256_castsi256_si128(comp);
            }
            // Add back the minimum delta, then build an in-register
            // inclusive prefix sum via two shift-and-add steps.
            V4 = simde_mm_add_epi32(V4, VMIN4);
            simde__m128i t;
            t = simde_mm_slli_si128(V4, 4);
            V4 = simde_mm_add_epi32(V4, t);
            t = simde_mm_slli_si128(V4, 8);
            V4 = simde_mm_add_epi32(V4, t);
            // Rebase onto the running value from the previous lane group.
            simde__m128i C4 = simde_mm_set1_epi32((int)basement);
            V4 = simde_mm_add_epi32(V4, C4);
            if (predict_ != nullptr) {
                // Vectorized predicate; GE/LE/BT are built as NOT(GT) via
                // XOR with all-ones since SSE only has eq/gt/lt compares.
                // NOTE(review): m4 is left uninitialized if type_ is out of
                // range (default: falls through) -- undefined mask bits.
                simde__m128i m4;
                simde__m128i a = simde_mm_set1_epi32(predict_->value_);
                switch (predict_->type_) {
                    case predict_type::EQ: {
                        m4 = simde_mm_cmpeq_epi32(V4, a);
                        break;
                    }
                    case predict_type::GT: {
                        m4 = simde_mm_cmpgt_epi32(V4, a);
                        break;
                    }
                    case predict_type::GE: {
                        m4 = simde_mm_xor_si128(simde_mm_cmpgt_epi32(a, V4),
                                                simde_mm_set1_epi32(-1));
                        break;
                    }
                    case predict_type::LT: {
                        m4 = simde_mm_cmplt_epi32(V4, a);
                        break;
                    }
                    case predict_type::LE: {
                        m4 = simde_mm_xor_si128(simde_mm_cmpgt_epi32(V4, a),
                                                simde_mm_set1_epi32(-1));
                        break;
                    }
                    case predict_type::BT: {
                        // lo <= V4 AND V4 <= hi, each via NOT(GT).
                        int32_t lo = predict_->value_;
                        int32_t hi = predict_->rvalue_;
                        simde__m128i A = simde_mm_set1_epi32(lo);
                        simde__m128i B = simde_mm_set1_epi32(hi);
                        simde__m128i left =
                            simde_mm_xor_si128(simde_mm_cmpgt_epi32(A, V4),
                                               simde_mm_set1_epi32(-1));
                        simde__m128i right =
                            simde_mm_xor_si128(simde_mm_cmpgt_epi32(V4, B),
                                               simde_mm_set1_epi32(-1));
                        m4 = simde_mm_and_si128(left, right);
                        break;
                    }
                    default:
                        break;
                }
                // Collapse lane sign bits to a 4-bit mask; place the second
                // group's bits in the high nibble.
                uint8_t four_bits = 0;
                four_bits =
                    simde_mm_movemask_ps(simde_mm_castsi128_ps(m4)) & 0xf;
                mask |= (uint8_t)(four_bits << (grp * 4));
            } else {
                mask = 0xff;  // no predicate: keep all 8 values
            }

            simde_mm_storeu_si128((simde__m128i *)(out_u32 + grp * 4), V4);
            // Last lane of this group becomes the base of the next group.
            basement = out_u32[grp * 4 + 3];
        }
        return basement;
    }

private:
    uint8_t *data_array_;     // encoded input buffer (borrowed)
    int bit_width_;           // bits per packed delta in the current block
    int write_index_;         // number of packed deltas in the current block
    int32_t data_size_;       // total byte length of data_array_
    int32_t position_;        // byte read cursor into data_array_
    int32_t delta_min_;       // minimum delta of the current block
    int32_t previous_value_;  // running value for the prefix sum
    predictor *predict_;      // optional filter (borrowed, may be nullptr)

    uint8_t buffer_;          // one-byte bit buffer for the scalar reader
    int bits_left_;           // unread bits remaining in buffer_
};
| |
| #endif // SIMD_TS_2DIFF_H |