blob: 0f385fc6e28a86faf5551a6e561b0f0fa906ab00 [file]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <vector>
#include "vec/core/wide_integer.h"
namespace doris {
template <typename T>
inline void bit_pack_32(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
wide::UInt256 s = 0;
uint8_t output_mask = 255;
int tail_count = in_num & 31;
int full_batch_size = (in_num >> 5) << 5;
int output_size = bit_width << 2;
for (int i = 0; i < full_batch_size; i += 32) {
s |= static_cast<wide::UInt256>(input[i + 35]);
s |= static_cast<wide::UInt256>(input[i + 34]) << (1 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 33]) << (2 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 32]) << (3 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 31]) << (4 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 30]) << (5 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 29]) << (6 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 28]) << (7 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 27]) << (8 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 26]) << (9 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 25]) << (10 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 24]) << (11 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 23]) << (12 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 22]) << (13 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 21]) << (14 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 20]) << (15 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 19]) << (16 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 18]) << (17 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 17]) << (18 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 16]) << (19 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 15]) << (20 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 14]) << (21 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 13]) << (22 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 12]) << (23 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 11]) << (24 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 10]) << (25 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 9]) << (26 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 8]) << (27 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 7]) << (28 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 6]) << (29 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 5]) << (30 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 4]) << (31 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 3]) << (32 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 2]) << (33 * bit_width);
s |= static_cast<wide::UInt256>(input[i + 1]) << (34 * bit_width);
s |= static_cast<wide::UInt256>(input[i]) << (35 * bit_width);
for (int j = 0; j < output_size; j++) {
output[j] = (s >> ((output_size - j - 1) << 3)) & output_mask;
}
output += output_size;
s = 0;
}
// remainder
int byte = tail_count * bit_width;
int bytes = (byte + 7) >> 3;
for (int i = 0; i < tail_count; i++) {
s |= (static_cast<__int128_t>(input[i + full_batch_size]))
<< ((tail_count - i - 1) * bit_width);
}
s <<= (bytes << 3) - byte;
for (int i = 0; i < bytes; i++) {
output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
}
}
template <typename T, typename U>
inline void bit_pack_16(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
U s = 0;
uint8_t output_mask = 255;
int tail_count = in_num & 15;
int full_batch_size = (in_num >> 4) << 4;
int output_size = bit_width << 1;
for (int i = 0; i < full_batch_size; i += 16) {
s |= static_cast<U>(input[i + 15]);
s |= static_cast<U>(input[i + 14]) << (bit_width);
s |= static_cast<U>(input[i + 13]) << (2 * bit_width);
s |= static_cast<U>(input[i + 12]) << (3 * bit_width);
s |= static_cast<U>(input[i + 11]) << (4 * bit_width);
s |= static_cast<U>(input[i + 10]) << (5 * bit_width);
s |= static_cast<U>(input[i + 9]) << (6 * bit_width);
s |= static_cast<U>(input[i + 8]) << (7 * bit_width);
s |= static_cast<U>(input[i + 7]) << (8 * bit_width);
s |= (static_cast<U>(input[i + 6])) << (9 * bit_width);
s |= (static_cast<U>(input[i + 5])) << (10 * bit_width);
s |= (static_cast<U>(input[i + 4])) << (11 * bit_width);
s |= (static_cast<U>(input[i + 3])) << (12 * bit_width);
s |= (static_cast<U>(input[i + 2])) << (13 * bit_width);
s |= (static_cast<U>(input[i + 1])) << (14 * bit_width);
s |= (static_cast<U>(input[i])) << (15 * bit_width);
for (int j = 0; j < output_size; j++) {
output[j] = (s >> ((output_size - j - 1) << 3)) & output_mask;
}
output += output_size;
s = 0;
}
// remainder
int byte = tail_count * bit_width;
int bytes = (byte + 7) >> 3;
for (int i = 0; i < tail_count; i++) {
s |= (static_cast<__int128_t>(input[i + full_batch_size]))
<< ((tail_count - i - 1) * bit_width);
}
s <<= (bytes << 3) - byte;
for (int i = 0; i < bytes; i++) {
output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
}
}
template <typename T, typename U>
inline void bit_pack_8(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
U s = 0;
uint8_t output_mask = 255;
int tail_count = in_num & 7;
int full_batch_size = (in_num >> 3) << 3;
for (int i = 0; i < full_batch_size; i += 8) {
s |= static_cast<U>(input[i + 7]);
s |= (static_cast<U>(input[i + 6])) << bit_width;
s |= (static_cast<U>(input[i + 5])) << (2 * bit_width);
s |= (static_cast<U>(input[i + 4])) << (3 * bit_width);
s |= (static_cast<U>(input[i + 3])) << (4 * bit_width);
s |= (static_cast<U>(input[i + 2])) << (5 * bit_width);
s |= (static_cast<U>(input[i + 1])) << (6 * bit_width);
s |= (static_cast<U>(input[i])) << (7 * bit_width);
for (int j = 0; j < bit_width; j++) {
output[j] = (s >> ((bit_width - j - 1) << 3)) & output_mask;
}
output += bit_width;
s = 0;
}
// remainder
int byte = tail_count * bit_width;
int bytes = (byte + 7) >> 3;
for (int i = 0; i < tail_count; i++) {
s |= (static_cast<U>(input[i + full_batch_size])) << ((tail_count - i - 1) * bit_width);
}
s <<= (bytes << 3) - byte;
for (int i = 0; i < bytes; i++) {
output[i] = (s >> ((bytes - i - 1) << 3)) & output_mask;
}
}
template <typename T, typename U>
inline void bit_pack_4(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
U s = 0;
uint8_t output_mask = 255;
int tail_count = in_num & 3;
int full_batch_size = (in_num >> 2) << 2;
int output_size = 0; // How many outputs can be processed at a time
int bit_width_remainder =
(bit_width << 2) & 7; // How many bits will be left after processing 4 numbers at a time
int extra_bit = 0; // Extra bits after each process
for (int i = 0; i < full_batch_size; i += 4) {
s <<= bit_width;
s |= (static_cast<U>(input[i]));
s <<= bit_width;
s |= (static_cast<U>(input[i + 1]));
s <<= bit_width;
s |= (static_cast<U>(input[i + 2]));
s <<= bit_width;
s |= (static_cast<U>(input[i + 3]));
output_size = (bit_width * 4 + extra_bit) >> 3;
extra_bit = (extra_bit + bit_width_remainder) & 7;
for (int j = 0; j < output_size; j++) {
output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
}
output += output_size;
s &= (1 << extra_bit) - 1;
}
// remainder
int byte = tail_count * bit_width;
if (extra_bit != 0) byte += extra_bit;
int bytes = (byte + 7) >> 3;
for (int i = 0; i < tail_count; i++) {
s <<= bit_width;
s |= (input[i + full_batch_size]);
}
s <<= (bytes << 3) - byte;
for (int i = 0; i < bytes; i++) {
output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
}
}
template <typename T, typename U>
inline void bit_pack_2(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
U s = 0;
uint8_t output_mask = 255;
int tail_count = in_num & 1;
int full_batch_size = (in_num >> 1) << 1;
int output_size = 0; // How many outputs can be processed at a time
int bit_width_remainder =
(bit_width << 1) & 7; // How many bits will be left after processing 4 numbers at a time
int extra_bit = 0; // Extra bits after each process
for (int i = 0; i < full_batch_size; i += 2) {
s <<= bit_width;
s |= (static_cast<U>(input[i]));
s <<= bit_width;
s |= (static_cast<U>(input[i + 1]));
output_size = (bit_width + bit_width + extra_bit) >> 3;
extra_bit = (extra_bit + bit_width_remainder) & 7;
for (int j = 0; j < output_size; j++) {
output[j] = (s >> (((output_size - j - 1) << 3) + extra_bit)) & output_mask;
}
output += output_size;
s &= (1 << extra_bit) - 1;
}
// remainder
int byte = tail_count * bit_width;
if (extra_bit != 0) byte += extra_bit;
int bytes = (byte + 7) >> 3;
for (int i = 0; i < tail_count; i++) {
s <<= bit_width;
s |= (input[i + full_batch_size]);
}
s <<= (bytes << 3) - byte;
for (int i = 0; i < bytes; i++) {
output[i] = (s >> (((bytes - i - 1) << 3))) & output_mask;
}
}
template <typename T>
void bit_pack_1(const T* input, uint8_t in_num, int bit_width, uint8_t* output) {
int output_mask = 255;
int need_bit = 0; // still need
for (int i = 0; i < in_num; i++) {
T x = input[i];
int width = bit_width;
if (need_bit) {
*output |= x >> (width - need_bit);
output++;
width -= need_bit;
}
int num = width >> 3; // How many outputs can be processed at a time
int remainder = width & 7; // How many bits are left to store
for (int j = 0; j < num; j++) {
*output = (x >> (((num - j - 1) << 3) + remainder)) & output_mask;
output++;
}
if (remainder) {
*output = (x & ((1 << remainder) - 1)) << (8 - remainder);
need_bit = 8 - remainder;
} else {
need_bit = 0;
}
}
}
void get_testdata(__int128_t* test_data, int n, int w) {
__int128_t in_mask = ((__int128_t(1)) << w) - 1;
for (int i = 0; i < n; i++) {
test_data[i] = (i & in_mask);
}
}
static void BM_BitPack_w8_8_int64(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_8<__int128_t, int64_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w8_8_int64)->DenseRange(1, 8)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w8_16_int128(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_16<__int128_t, __int128_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w8_16_int128)->DenseRange(1, 8)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w8_32_int256(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_32(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w8_32_int256)->DenseRange(1, 8)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w16_4_int64(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_16<__int128_t, int64_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w16_4_int64)->DenseRange(9, 16)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w16_8_int128(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_8<__int128_t, __int128_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w16_8_int128)->DenseRange(9, 16)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w16_16_int256(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_16<__int128_t, wide::UInt256>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w16_16_int256)->DenseRange(9, 16)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w32_2_int128(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_2<__int128_t, __int128_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w32_2_int128)->DenseRange(17, 32)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w32_4_int128(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_4<__int128_t, __int128_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w32_4_int128)->DenseRange(17, 32)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w32_8_int256(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_8<__int128_t, wide::UInt256>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w32_8_int256)->DenseRange(17, 32)->Unit(benchmark::kNanosecond);
static void BM_BitPack_w64_4_int256(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_4<__int128_t, wide::UInt256>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_w64_4_int256)->DenseRange(33, 64)->Unit(benchmark::kNanosecond);
static void BM_BitPack_1(benchmark::State& state) {
int w = state.range(0);
int n = 255;
std::vector<__int128_t> test_data(n);
get_testdata(test_data.data(), n, w);
int size = (n * w + 7) / 8;
std::vector<uint8_t> output(size);
for (auto _ : state) {
benchmark::DoNotOptimize(test_data.data());
benchmark::DoNotOptimize(output.data());
bit_pack_1<__int128_t>(test_data.data(), n, w, output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * size);
}
BENCHMARK(BM_BitPack_1)->DenseRange(1, 32)->Unit(benchmark::kNanosecond);
} // namespace doris