blob: 740e3393a4553e5a8c0955ab6ce09fc6cadedc5a [file]
//
// Created by 李烁麟 on 25-8-4.
//
#ifndef SIMD_TS_2DIFF_H
#define SIMD_TS_2DIFF_H
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include "simde/x86/avx2.h"
#include "utils.h"
// Number of values per encoded block (the encoder's block size).
// `constexpr` instead of a mutable `static int`: a header-defined
// mutable static gives every translation unit its own modifiable
// copy; this value is a constant and is only ever read.
static constexpr int N = 128;
// Comparison operator of a `predictor`. `value_` is the operand; for
// BT it is the inclusive lower bound and `rvalue_` the inclusive
// upper bound.
enum class predict_type {
    EQ,  // value == value_
    GT,  // value >  value_
    GE,  // value >= value_
    LE,  // value <= value_
    LT,  // value <  value_
    BT,  // value_ <= value <= rvalue_ ("between", inclusive)
};

// Outcome of a block-level predicate check against a block's
// conservative [min, max] bounds.
enum class block_action {
    All,       // every value in the block satisfies the predicate
    None,      // no value in the block can satisfy it (skip the block)
    NeedScan,  // inconclusive: values must be tested individually
};

// Filter predicate applied to decoded int32 values.
struct predictor {
    predict_type type_;
    int32_t value_;   // comparison operand (BT: inclusive lower bound)
    int32_t rvalue_;  // BT only: inclusive upper bound

    // Returns true when `value` satisfies the predicate.
    [[nodiscard]] bool satisfy(int32_t value) const {
        switch (type_) {
            case predict_type::EQ:
                return value == value_;
            case predict_type::GT:
                return value > value_;
            case predict_type::GE:
                return value >= value_;
            case predict_type::LT:
                return value < value_;
            case predict_type::LE:
                return value <= value_;
            case predict_type::BT:
                return value >= value_ && value <= rvalue_;
            default:
                return false;
        }
    }

    // Coarse block-level check. `lvalue` is a lower bound on the
    // block's minimum value and `rvalue` an upper bound on its
    // maximum, so every value in the block lies in [lvalue, rvalue].
    //   None     -> the predicate fails everywhere in [lvalue, rvalue]
    //   All      -> the predicate holds everywhere in [lvalue, rvalue]
    //   NeedScan -> inconclusive
    [[nodiscard]] block_action block_check(int32_t lvalue,
                                           int32_t rvalue) const {
        switch (type_) {
            case predict_type::LT:
                if (lvalue >= value_) return block_action::None;
                if (rvalue < value_) return block_action::All;
                return block_action::NeedScan;
            case predict_type::LE:
                if (lvalue > value_) return block_action::None;
                if (rvalue <= value_) return block_action::All;
                return block_action::NeedScan;
            case predict_type::GT:
                if (rvalue <= value_) return block_action::None;
                if (lvalue > value_) return block_action::All;
                return block_action::NeedScan;
            case predict_type::GE:
                if (rvalue < value_) return block_action::None;
                if (lvalue >= value_) return block_action::All;
                return block_action::NeedScan;
            case predict_type::EQ:
                // The constant lies outside the block's range: skip.
                // (The original also had a second branch here whose
                // true and false paths both returned NeedScan; the
                // redundant test is removed.)
                if (rvalue < value_ || lvalue > value_)
                    return block_action::None;
                return block_action::NeedScan;
            case predict_type::BT:
                // Previously BT always fell through to NeedScan; it
                // can prune/accept like the other operators:
                // [lvalue, rvalue] disjoint from [value_, rvalue_]
                // -> None; fully contained -> All.
                if (rvalue < value_ || lvalue > rvalue_)
                    return block_action::None;
                if (lvalue >= value_ && rvalue <= rvalue_)
                    return block_action::All;
                return block_action::NeedScan;
        }
        return block_action::NeedScan;
    }
};
// Decoder for TS_2DIFF-style delta encoding of int32 values,
// accelerated with (simde-emulated) AVX2. Each block packs deltas at
// a fixed bit width; values are reconstructed as
//   value[i] = value[i-1] + delta_min + packed_delta[i].
// An optional `predictor` filters which decoded values (and their
// in-block offsets) are emitted, and coarse per-block min/max bounds
// allow whole blocks to be skipped. The decoder borrows
// `data_array` and `predict`; it owns neither.
class simd_ts_2diff_decoder {
public:
    // data_array: encoded byte stream; data_size: its length in
    // bytes; predict: optional value filter (nullptr keeps all).
    simd_ts_2diff_decoder(uint8_t *data_array, int32_t data_size,
                          predictor *predict = nullptr)
        : data_array_(data_array), data_size_(data_size), predict_(predict) {
        bit_width_ = 0;
        write_index_ = 0;
        delta_min_ = 0;
        previous_value_ = 0;
        position_ = 0;
        bits_left_ = 0;
    };

    // Decodes the next block into out/offset_map, growing `size` by
    // the number of values kept; returns false once the whole input
    // has been consumed. `size` accumulates across calls, so the
    // caller must size both arrays for the worst case.
    bool decode(int32_t *out, int32_t *offset_map, int &size) {
        if (position_ < data_size_) {
            position_ += handle_one_block(data_array_ + position_, out,
                                          offset_map, size);
            return true;
        } else {
            return false;
        }
    }

    // Appends the lanes of out_u32 selected by `mask` (bit j -> lane
    // j) to `values`, and their 1-based offsets relative to
    // `position` to `offsets`.
    // NOTE(review): not called from the decode path in this file —
    // presumably a helper for external callers; confirm before
    // assuming it is dead code.
    static inline void drain_batch_compact(const int32_t out_u32[8],
                                           uint8_t mask, uint32_t position,
                                           std::vector<int32_t> &values,
                                           std::vector<int32_t> &offsets) {
        if (!mask) return;
        const int k = __builtin_popcount(mask);
        values.reserve(values.size() + k);
        offsets.reserve(offsets.size() + k);
        int32_t tmp_vals[8];
        int32_t tmp_offs[8];
        int n = 0;
        uint32_t m = mask;
        // Iterate set bits lowest-first; m &= (m - 1) clears the
        // lowest set bit each round.
        while (m) {
            int j = __builtin_ctz(m);
            m &= (m - 1);
            tmp_vals[n] = out_u32[j];
            tmp_offs[n] = position + (uint32_t)j + 1;
            ++n;
        }
        values.insert(values.end(), tmp_vals, tmp_vals + n);
        offsets.insert(offsets.end(), tmp_offs, tmp_offs + n);
    }

    // Decodes one block starting at `data_array`. Header, as read
    // below: four ui32 fields — delta count (write_index_), packed
    // bit width, minimum delta, and the block's first value —
    // followed by write_index_ deltas of bit_width_ bits each.
    // Returns the number of bytes the block occupies in the stream.
    size_t handle_one_block(uint8_t *data_array, int32_t *value,
                            int32_t *offset_map, int &size) {
        uint8_t *in = data_array;
        write_index_ = read_ui32(in);
        bit_width_ = read_ui32(in);
        delta_min_ = read_ui32(in);
        previous_value_ = read_ui32(in);
        // Lower bound on the block minimum: if the minimum delta is
        // non-negative, values never decrease and the first value is
        // the minimum; otherwise assume all N deltas take the
        // (negative) minimum delta.
        int32_t block_min_lower =
            previous_value_ + (delta_min_ >= 0 ? 0 : N) * delta_min_;
        // Each stored delta is delta_min_ plus an unsigned
        // bit_width_-bit payload, so the payload's maximum is:
        int32_t delta_upper_bound = (1 << bit_width_) - 1;
        int32_t max_delta = delta_min_ + delta_upper_bound;
        // Upper bound on the block maximum.
        // NOTE(review): when max_delta > 0 this adds delta_min_ only
        // once rather than N times; for delta_min_ > 0 that looks too
        // small to be a true upper bound — verify against the
        // encoder's guarantees.
        int32_t block_max_upper = previous_value_ +
                                  delta_min_ * (max_delta > 0 ? 1 : 0) +
                                  (max_delta > 0 ? N : 0) * delta_upper_bound;
        // Block-level pruning: skip the payload entirely when no
        // value in the block can satisfy the predicate.
        block_action action =
            predict_ == nullptr
                ? block_action::All
                : predict_->block_check(block_min_lower, block_max_upper);
        if (action == block_action::None) {
            // Bytes consumed: 16-byte header plus the bit-packed
            // payload rounded up to whole bytes.
            return sizeof(write_index_) + sizeof(bit_width_) +
                   sizeof(delta_min_) + sizeof(previous_value_) +
                   (bit_width_ * write_index_ + 7) / 8;
        }
        // Emit the block's first (header-carried) value, subject to
        // the predicate.
        // NOTE(review): offset_map[0] is written unconditionally even
        // when `size` was non-zero on entry (multi-block decode into
        // the same arrays), clobbering an earlier offset — should
        // this be offset_map[size - 1]? Confirm with callers.
        if (predict_ != nullptr) {
            if (predict_->satisfy(previous_value_)) {
                value[size++] = previous_value_;
                offset_map[0] = 0;
            }
        } else {
            value[size++] = previous_value_;
            offset_map[0] = 0;
        }
        // NOTE(review): `data_array + data_size_` is block start +
        // TOTAL stream size; when position_ > 0 this lies past the
        // real buffer end, so the gather-safety check below may be
        // too permissive — presumably should be the true end pointer.
        for (int i = 0; i < write_index_;) {
            if (!can_simd_decode8(i, in, data_array + data_size_)) {
                // Scalar tail path for the last (<8 or
                // unsafe-to-gather) values.
                // NOTE(review): handle_batch_data indexes bits
                // absolutely from the payload start and never
                // advances `in`, so if SIMD batches ran first,
                // read_long here appears to re-read from bit 0 rather
                // than from bit i * bit_width_ — TODO confirm / fix.
                const int remain_data = write_index_ - i;
                previous_value_ =
                    handle_normal_data(in, previous_value_, remain_data, i,
                                       value, offset_map, size);
                i += remain_data;
            } else {
                int32_t out_u32[8];
                uint8_t mask;  // bit k set -> lane k passed the predicate
                previous_value_ =
                    handle_batch_data(in, bit_width_, delta_min_, i,
                                      previous_value_, out_u32, mask);
                if (mask != 0) {
                    if (mask == 0xFF) {
                        // All 8 lanes survived: bulk copy.
                        memcpy(value + size, out_u32, 8 * sizeof(int32_t));
                        for (int k = 0; k < 8; ++k) offset_map[size + k] = i + k + 1;
                        size += 8;
                    } else {
                        // Compact surviving lanes, lowest bit first.
                        uint8_t m = mask;
                        while (m) {
                            int tz = __builtin_ctz((unsigned)m);
                            value[size] = out_u32[tz];
                            offset_map[size] = tz + i + 1;
                            ++size;
                            m &= m - 1;  // clear lowest set bit
                        }
                    }
                }
                i += 8;
            }
        }
        return sizeof(write_index_) + sizeof(bit_width_) + sizeof(delta_min_) +
               sizeof(previous_value_) + (bit_width_ * write_index_ + 7) / 8;
    }

    // Scalar fallback: decodes `cap` values via the bit reader,
    // starting at in-block index `position`, filtering through the
    // predicate. `base` is the value preceding the run; returns the
    // last reconstructed value (the new running base).
    int32_t handle_normal_data(uint8_t *&in, int32_t base, int cap,
                               int position, int32_t *out, int32_t *offset,
                               int &size) {
        int32_t basement = base;
        for (int i = 0; i < cap; ++i) {
            int32_t value = read_long(bit_width_, in);
            value += (basement + delta_min_);
            basement = value;
            if (predict_ != nullptr) {
                if (predict_->satisfy(value)) {
                    out[size] = value;
                    offset[size++] = position + i + 1;
                }
            } else {
                out[size] = value;
                offset[size++] = position + i + 1;
            }
        }
        return basement;
    }

    // Reads the next `bits` bits from the stream as an unsigned,
    // MSB-first integer, buffering one byte at a time in
    // buffer_/bits_left_. (The `bits == 8` clause routes byte-sized
    // reads through the drain-the-buffer branch; when a full byte is
    // buffered both branches produce the same result.)
    int32_t read_long(int bits, uint8_t *&in) {
        int32_t value = 0;
        while (bits > 0) {
            read_byte_if_empty(in);
            if (bits > bits_left_ || bits == 8) {
                // Consume everything left in the buffer.
                auto d = (uint8_t)(buffer_ & ((1 << bits_left_) - 1));
                value = (value << bits_left_) + (d & 0xFF);
                bits -= bits_left_;
                bits_left_ = 0;
            } else {
                // Take only the top `bits` of the remaining buffered
                // bits.
                auto d = (uint8_t)((((uint8_t)buffer_) >> (bits_left_ - bits)) &
                                   ((1 << bits) - 1));
                value = (value << bits) + (d & 0xFF);
                bits_left_ -= bits;
                bits = 0;
            }
            if (bits <= 0) {
                break;
            }
        }
        return value;
    }

    // Refills the one-byte bit buffer from the stream when drained.
    void read_byte_if_empty(uint8_t *&in) {
        if (bits_left_ == 0) {
            memcpy(&buffer_, in, 1);
            bits_left_ = 8;
            in += 1;
        }
    }

    // True when at least 8 values remain after `pos` AND enough input
    // bytes remain before `end` that the vector gathers (which load 4
    // or 8 bytes per lane, hence the 32/64-bit overhead terms) cannot
    // read past the end of the buffer.
    bool can_simd_decode8(int pos, const uint8_t *cur, const uint8_t *end) {
        if (write_index_ - pos < 8) {
            return false;
        }
        const int32_t overhead_bits = (bit_width_ >= 16) ? 64 : 32;
        const int32_t rest_data = 8;
        const int32_t need_bits = (rest_data - 1) * bit_width_ + overhead_bits;
        const size_t remain_bits = size_t(end - cur) * 8;
        return need_bits <= remain_bits;
    }

    // Decodes 8 consecutive w-bit deltas starting at in-block index
    // `ind` (bit offsets computed from the start of `in`), adds
    // `min_delta`, prefix-sums them onto `base`, stores the 8
    // reconstructed values in out_u32, sets per-lane predicate bits
    // in `mask` (0xff when there is no predicate), and returns the
    // last value as the new running base. Note: `in` is not advanced.
    int32_t handle_batch_data(const uint8_t *in, int32_t w, int32_t min_delta,
                              int32_t ind, int32_t base, int32_t out_u32[8],
                              uint8_t &mask) {
        // This function decodes 8 values from the input data array,
        // producing the final results that satisfy the prediction.
        // Set byte-reversal method to convert little-endian data to
        // big-endian format (the packed stream is MSB-first).
        static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
            7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
            2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
        static const simde__m128i SHUF_REV4 = simde_mm_setr_epi8(
            3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
        const simde__m128i VMIN4 = simde_mm_set1_epi32(min_delta);
        int32_t basement = base;
        mask = 0;
        // Decode using two loops: 4 lanes per group, 2 groups of 4.
        for (int32_t grp = 0; grp < 2; ++grp) {
            // Absolute bit position of each lane's field.
            int32_t pos0 = grp * 4u * w + ind * w;
            int32_t pos[4] = {pos0 + 0 * w, pos0 + 1 * w, pos0 + 2 * w,
                              pos0 + 3 * w};
            // The byte items start
            int32_t bidx_s[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3,
                                 pos[3] >> 3};
            // The offset items start (bit offset within the first byte)
            int32_t off_s[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};
            simde__m128i IDX128 =
                simde_mm_setr_epi32(bidx_s[0], bidx_s[1], bidx_s[2], bidx_s[3]);
            simde__m128i OFF128 =
                simde_mm_setr_epi32(off_s[0], off_s[1], off_s[2], off_s[3]);
            simde__m128i V4;
            if (w <= 16) {
                // Narrow fields fit in a 4-byte gather per lane:
                // shift left to drop the leading `off` bits, then
                // right to keep exactly w bits.
                const int rshift = 32 - w;
                simde__m128i w32_le =
                    simde_mm_i32gather_epi32((const int *)in, IDX128, 1);
                simde__m128i w32_be = simde_mm_shuffle_epi8(w32_le, SHUF_REV4);
                simde__m128i U32 = simde_mm_sllv_epi32(w32_be, OFF128);
                simde__m128i RS32 = simde_mm_set1_epi32(rshift);
                simde__m128i V32 = simde_mm_srlv_epi32(U32, RS32);
                V4 = V32;
            } else {
                // Wide fields need an 8-byte gather per lane; after
                // extraction, compact the low 32 bits of each 64-bit
                // lane back into one 128-bit vector.
                const int rshift = 64 - w;
                simde__m256i w64_le = simde_mm256_i32gather_epi64(
                    (const long long *)in, IDX128, 1);
                simde__m256i w64_be =
                    simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
                simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(OFF128);
                simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
                simde__m256i V64 =
                    simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
                simde__m256i V32_8 = V64;
                simde__m256i perm =
                    simde_mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
                simde__m256i comp =
                    simde_mm256_permutevar8x32_epi32(V32_8, perm);
                V4 = simde_mm256_castsi256_si128(comp);
            }
            V4 = simde_mm_add_epi32(V4, VMIN4);
            // Inclusive prefix sum of the 4 deltas via log-step
            // shift-and-add, then add the running base to every lane.
            simde__m128i t;
            t = simde_mm_slli_si128(V4, 4);
            V4 = simde_mm_add_epi32(V4, t);
            t = simde_mm_slli_si128(V4, 8);
            V4 = simde_mm_add_epi32(V4, t);
            simde__m128i C4 = simde_mm_set1_epi32((int)basement);
            V4 = simde_mm_add_epi32(V4, C4);
            if (predict_ != nullptr) {
                // Build a per-lane comparison mask. GE/LE/BT are
                // expressed as NOT(cmpgt) since SSE lacks cmpge/cmple.
                simde__m128i m4;
                simde__m128i a = simde_mm_set1_epi32(predict_->value_);
                switch (predict_->type_) {
                    case predict_type::EQ: {
                        m4 = simde_mm_cmpeq_epi32(V4, a);
                        break;
                    }
                    case predict_type::GT: {
                        m4 = simde_mm_cmpgt_epi32(V4, a);
                        break;
                    }
                    case predict_type::GE: {
                        m4 = simde_mm_xor_si128(simde_mm_cmpgt_epi32(a, V4),
                                                simde_mm_set1_epi32(-1));
                        break;
                    }
                    case predict_type::LT: {
                        m4 = simde_mm_cmplt_epi32(V4, a);
                        break;
                    }
                    case predict_type::LE: {
                        m4 = simde_mm_xor_si128(simde_mm_cmpgt_epi32(V4, a),
                                                simde_mm_set1_epi32(-1));
                        break;
                    }
                    case predict_type::BT: {
                        // lo <= V4 <= hi, both ends inclusive.
                        int32_t lo = predict_->value_;
                        int32_t hi = predict_->rvalue_;
                        simde__m128i A = simde_mm_set1_epi32(lo);
                        simde__m128i B = simde_mm_set1_epi32(hi);
                        simde__m128i left =
                            simde_mm_xor_si128(simde_mm_cmpgt_epi32(A, V4),
                                               simde_mm_set1_epi32(-1));
                        simde__m128i right =
                            simde_mm_xor_si128(simde_mm_cmpgt_epi32(V4, B),
                                               simde_mm_set1_epi32(-1));
                        m4 = simde_mm_and_si128(left, right);
                        break;
                    }
                    default:
                        break;
                }
                // Compress the 4 lane masks into 4 bits and place
                // them in this group's nibble of `mask`.
                uint8_t four_bits = 0;
                four_bits =
                    simde_mm_movemask_ps(simde_mm_castsi128_ps(m4)) & 0xf;
                mask |= (uint8_t)(four_bits << (grp * 4));
            } else {
                // No predicate: keep all 8 lanes.
                mask = 0xff;
            }
            simde_mm_storeu_si128((simde__m128i *)(out_u32 + grp * 4), V4);
            // Carry the last reconstructed value into the next group.
            basement = out_u32[grp * 4 + 3];
        }
        return basement;
    }

private:
    uint8_t *data_array_;     // borrowed encoded stream
    int bit_width_;           // packed bits per delta in current block
    int write_index_;         // delta count in current block
    int32_t data_size_;       // total bytes in data_array_
    int32_t position_;        // bytes consumed so far
    int32_t delta_min_;       // minimum delta of current block
    int32_t previous_value_;  // running reconstructed value
    predictor *predict_;      // optional filter (borrowed, may be null)
    uint8_t buffer_;          // scalar bit reader: current byte
    int bits_left_;           // scalar bit reader: unread bits in buffer_
};
#endif // SIMD_TS_2DIFF_H