blob: ce6f1b42ca0ab9bf0b677514baabd67a392f2f23 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined(__x86_64__)
#include <cstddef>
#include <cstring>
#include <Columns/ColumnsCommon.h>
#include <Columns/IColumn.h>
#include <Core/Block.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/IDataType.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsRound.h>
#include <Functions/SparkFunctionFloor.h>
#include <Parser/SerializedPlanParser.h>
#include <base/types.h>
#include <benchmark/benchmark.h>
#include <Common/QueryContext.h>
#include <Common/TargetSpecific.h>
#include <DataTypes/DataTypeNullable.h>
#if USE_MULTITARGET_CODE
#include <immintrin.h>
#endif
using namespace DB;
/// Builds `rows` non-decreasing offsets: each entry is the previous one
/// plus a pseudo-random increment in [0, 9].
static IColumn::Offsets createOffsets(size_t rows)
{
    IColumn::Offsets offsets(rows, 0);
    /// Running total stands in for the "previous offset" look-back; it starts at zero.
    size_t running_total = 0;
    for (auto & offset : offsets)
    {
        running_total += rand() % 10;
        offset = running_total;
    }
    return offsets;
}
/// Creates a column of the given type for benchmark input.
///
/// Array types: a nested column is built recursively, random cumulative
/// offsets (increments in [0, 9]) are generated, and the nested data is
/// replicated according to those offsets.
///
/// Other types: `rows` values are inserted. Rows whose index is not a
/// multiple of 100 receive the type's default value; the remaining rows
/// receive a value derived from the row index, chosen by the non-nullable
/// type category (int, float, decimal, string), or the default otherwise.
static ColumnPtr createColumn(const DataTypePtr & type, size_t rows)
{
    if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get()))
    {
        auto nested = createColumn(array_type->getNestedType(), rows);
        auto offsets_holder = ColumnArray::ColumnOffsets::create(rows, 0);
        auto & offsets = offsets_holder->getData();
        /// NOTE(review): at pos == 0 this reads element -1; presumably this relies on the
        /// padded array exposing a zero element before the start - confirm.
        for (size_t pos = 0; pos < nested->size(); ++pos)
            offsets[pos] = offsets[pos - 1] + (rand() % 10);
        auto replicated = nested->replicate(offsets);
        return ColumnArray::create(std::move(replicated), std::move(offsets_holder));
    }

    auto value_type = removeNullable(type);
    auto result = type->createColumn();
    for (size_t row = 0; row < rows; ++row)
    {
        if (row % 100 != 0)
        {
            result->insertDefault();
            continue;
        }

        if (isInt(value_type))
        {
            result->insert(row);
        }
        else if (isFloat(value_type))
        {
            double as_double = row * 1.0;
            result->insert(as_double);
        }
        else if (isDecimal(value_type))
        {
            Decimal128 squared = Decimal128(row * row);
            result->insert(squared);
        }
        else if (isString(value_type))
        {
            String text = "helloworld";
            result->insert(text);
        }
        else
        {
            result->insertDefault();
        }
    }
    return std::move(result);
}
/// Creates a single-column block for benchmarks.
///
/// @param type_str  Type name resolved through DataTypeFactory (e.g. "Nullable(Int64)").
/// @param rows      Number of rows generated by createColumn().
/// @return          A block holding one column named "d" of the requested type.
static Block createBlock(const String & type_str, size_t rows)
{
    auto type = DataTypeFactory::instance().get(type_str);
    auto column = createColumn(type, rows);
    Block block;
    block.insert(ColumnWithTypeAndName(std::move(column), type, "d"));
    /// Return the local by name: wrapping it in std::move would inhibit copy elision.
    return block;
}
/// Times the "floor" function on a 65536-row Nullable(Int64) block.
static void BM_CHFloorFunction_For_Int64(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("floor", local_engine::QueryContext::globalContext());
    Block input = createBlock("Nullable(Int64)", row_count);
    auto floor_fn = resolver->build(input.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = floor_fn->execute(input.getColumnsWithTypeAndName(), floor_fn->getResultType(), input.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Times the "floor" function on a 65536-row Nullable(Float64) block.
static void BM_CHFloorFunction_For_Float64(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("floor", local_engine::QueryContext::globalContext());
    Block input = createBlock("Nullable(Float64)", row_count);
    auto floor_fn = resolver->build(input.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = floor_fn->execute(input.getColumnsWithTypeAndName(), floor_fn->getResultType(), input.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Times the "sparkFloor" function on a 65536-row Nullable(Int64) block.
static void BM_SparkFloorFunction_For_Int64(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkFloor", local_engine::QueryContext::globalContext());
    Block input = createBlock("Nullable(Int64)", row_count);
    auto floor_fn = resolver->build(input.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = floor_fn->execute(input.getColumnsWithTypeAndName(), floor_fn->getResultType(), input.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Times the "sparkFloor" function on a 65536-row Nullable(Float64) block.
static void BM_SparkFloorFunction_For_Float64(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkFloor", local_engine::QueryContext::globalContext());
    Block input = createBlock("Nullable(Float64)", row_count);
    auto floor_fn = resolver->build(input.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = floor_fn->execute(input.getColumnsWithTypeAndName(), floor_fn->getResultType(), input.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Register the "floor" and "sparkFloor" benchmarks for Int64 and Float64 inputs.
BENCHMARK(BM_CHFloorFunction_For_Int64);
BENCHMARK(BM_CHFloorFunction_For_Float64);
BENCHMARK(BM_SparkFloorFunction_For_Int64);
BENCHMARK(BM_SparkFloorFunction_For_Float64);
/// Times "sparkDivide" with two full 65536-row Nullable(Float64) columns as arguments.
static void BM_OptSparkDivide_VectorVector(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkDivide", local_engine::QueryContext::globalContext());
    auto arg_type = DataTypeFactory::instance().get("Nullable(Float64)");
    auto dividend = createColumn(arg_type, row_count);
    auto divisor = createColumn(arg_type, row_count);
    auto args = Block({ColumnWithTypeAndName(dividend, arg_type, "left"), ColumnWithTypeAndName(divisor, arg_type, "right")});
    auto divide_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = divide_fn->execute(args.getColumnsWithTypeAndName(), divide_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Times "sparkDivide" with a full 65536-row left column and a constant right column
/// (a one-row column wrapped in ColumnConst and expanded to 65536 rows).
static void BM_OptSparkDivide_VectorConstant(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkDivide", local_engine::QueryContext::globalContext());
    auto arg_type = DataTypeFactory::instance().get("Nullable(Float64)");
    auto dividend = createColumn(arg_type, row_count);
    auto divisor_value = createColumn(arg_type, 1);
    auto divisor = ColumnConst::create(std::move(divisor_value), row_count);
    auto args = Block({ColumnWithTypeAndName(dividend, arg_type, "left"), ColumnWithTypeAndName(std::move(divisor), arg_type, "right")});
    auto divide_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = divide_fn->execute(args.getColumnsWithTypeAndName(), divide_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Times "sparkDivide" with a constant left column (a one-row column wrapped in
/// ColumnConst and expanded to 65536 rows) and a full 65536-row right column.
static void BM_OptSparkDivide_ConstantVector(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkDivide", local_engine::QueryContext::globalContext());
    auto arg_type = DataTypeFactory::instance().get("Nullable(Float64)");
    auto dividend_value = createColumn(arg_type, 1);
    auto dividend = ColumnConst::create(std::move(dividend_value), row_count);
    auto divisor = createColumn(arg_type, row_count);
    auto args = Block({ColumnWithTypeAndName(std::move(dividend), arg_type, "left"), ColumnWithTypeAndName(std::move(divisor), arg_type, "right")});
    auto divide_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = divide_fn->execute(args.getColumnsWithTypeAndName(), divide_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Register the "sparkDivide" benchmarks (vector/vector, vector/constant, constant/vector).
BENCHMARK(BM_OptSparkDivide_VectorVector);
BENCHMARK(BM_OptSparkDivide_VectorConstant);
BENCHMARK(BM_OptSparkDivide_ConstantVector);
/// Times "sparkCastFloatToInt32" on a 65536-row Nullable(Float64) column.
static void BM_OptSparkCastFloatToInt(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("sparkCastFloatToInt32", local_engine::QueryContext::globalContext());
    auto arg_type = DataTypeFactory::instance().get("Nullable(Float64)");
    auto values = createColumn(arg_type, row_count);
    auto args = Block({ColumnWithTypeAndName(std::move(values), arg_type, "input")});
    auto cast_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = cast_fn->execute(args.getColumnsWithTypeAndName(), cast_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
BENCHMARK(BM_OptSparkCastFloatToInt);
/// Decimal to decimal: Nullable(Decimal128(10)) input checked against precision 38, scale 5.
/// NOTE(review): previously labelled "scale up" although the target scale (5) is below the
/// input scale (10) - confirm the intended wording.
static void BM_OptCheckDecimalOverflowSparkFromDecimal1(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
    auto input_type = DataTypeFactory::instance().get("Nullable(Decimal128(10))");
    auto values = createColumn(input_type, row_count);
    auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), row_count);
    auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 5), row_count);
    auto args = Block(
        {ColumnWithTypeAndName(std::move(values), input_type, "input"),
         ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
         ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
    auto check_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = check_fn->execute(args.getColumnsWithTypeAndName(), check_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Decimal to decimal: Nullable(Decimal128(10)) input checked against precision 38, scale 15.
/// NOTE(review): previously labelled "scale down" although the target scale (15) is above the
/// input scale (10) - confirm the intended wording.
static void BM_OptCheckDecimalOverflowSparkFromDecimal2(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
    auto input_type = DataTypeFactory::instance().get("Nullable(Decimal128(10))");
    auto values = createColumn(input_type, row_count);
    auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), row_count);
    auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 15), row_count);
    auto args = Block(
        {ColumnWithTypeAndName(std::move(values), input_type, "input"),
         ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
         ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
    auto check_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = check_fn->execute(args.getColumnsWithTypeAndName(), check_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Decimal to decimal with an unchanged scale: Nullable(Decimal(38, 10)) input checked
/// against precision 38, scale 10.
static void BM_OptCheckDecimalOverflowSparkFromDecimal3(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
    auto input_type = DataTypeFactory::instance().get("Nullable(Decimal(38, 10))");
    auto values = createColumn(input_type, row_count);
    auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), row_count);
    auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), row_count);
    auto args = Block(
        {ColumnWithTypeAndName(std::move(values), input_type, "input"),
         ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
         ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
    auto check_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = check_fn->execute(args.getColumnsWithTypeAndName(), check_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Int to decimal: Nullable(Int64) input checked against precision 38, scale 10.
static void BM_OptCheckDecimalOverflowSparkFromInt(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
    auto input_type = DataTypeFactory::instance().get("Nullable(Int64)");
    auto values = createColumn(input_type, row_count);
    auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), row_count);
    auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), row_count);
    auto args = Block(
        {ColumnWithTypeAndName(std::move(values), input_type, "input"),
         ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
         ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
    auto check_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = check_fn->execute(args.getColumnsWithTypeAndName(), check_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Float to decimal: Nullable(Float64) input checked against precision 38, scale 10.
static void BM_OptCheckDecimalOverflowSparkFromFloat(benchmark::State & state)
{
    constexpr size_t row_count = 65536;
    auto resolver = FunctionFactory::instance().get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
    auto input_type = DataTypeFactory::instance().get("Nullable(Float64)");
    auto values = createColumn(input_type, row_count);
    auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), row_count);
    auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), row_count);
    auto args = Block(
        {ColumnWithTypeAndName(std::move(values), input_type, "input"),
         ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
         ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
    auto check_fn = resolver->build(args.getColumnsWithTypeAndName());

    for (auto _ : state)
    {
        auto output = check_fn->execute(args.getColumnsWithTypeAndName(), check_fn->getResultType(), args.rows(), false);
        benchmark::DoNotOptimize(output);
    }
}
/// Register the "checkDecimalOverflowSparkOrNull" benchmarks for decimal, int and float inputs.
BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal1);
BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal2);
BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal3);
BENCHMARK(BM_OptCheckDecimalOverflowSparkFromInt);
BENCHMARK(BM_OptCheckDecimalOverflowSparkFromFloat);
/// Branch-free NaN/Inf-to-null conversion.
///
/// For every element, null_map[i] is set to 1 when data[i] is NaN or +/-infinity
/// and to 0 otherwise; elements flagged as null are overwritten with +0.0f.
///
/// @param data      Values to inspect; modified in place.
/// @param null_map  Output flags, one per element.
/// @param size      Number of elements in both arrays.
static void nanInfToNullAutoOpt(float * data, uint8_t * null_map, size_t size)
{
    constexpr uint32_t abs_mask = 0x7FFFFFFFU; /// everything except the sign bit
    constexpr uint32_t inf_bits = 0x7F800000U; /// exponent all ones, mantissa zero
    for (size_t i = 0; i < size; ++i)
    {
        /// Read the bit pattern with memcpy instead of casting the pointer, which would
        /// access a float object through an unrelated integer type.
        uint32_t bits;
        std::memcpy(&bits, &data[i], sizeof(bits));

        const uint8_t is_nan = (data[i] != data[i]);
        const uint8_t is_inf = ((bits & abs_mask) == inf_bits);
        const uint8_t null_flag = is_nan | is_inf;
        null_map[i] = null_flag;

        /// null_flag == 1 -> mask is 0 (clears the value); null_flag == 0 -> mask is all ones.
        bits &= ~(-static_cast<uint32_t>(null_flag));
        std::memcpy(&data[i], &bits, sizeof(bits));
    }
}
/// Times nanInfToNullAutoOpt() on 8192 floats, each initialised as the quotient of
/// two rand() results. The same buffers are reused across iterations.
static void BMNanInfToNullAutoOpt(benchmark::State & state)
{
    constexpr size_t count = 8192;
    float values[count];
    uint8_t nulls[count] = {0};
    for (float & value : values)
        value = static_cast<float>(rand()) / rand();

    for (auto _ : state)
    {
        nanInfToNullAutoOpt(values, nulls, count);
        benchmark::DoNotOptimize(nulls);
    }
}
BENCHMARK(BMNanInfToNullAutoOpt);
/// AVX2 NaN/Inf-to-null conversion.
///
/// Sets null_map[i] to 1 when data[i] is NaN or +/-infinity and to 0 otherwise;
/// flagged elements are replaced with 0.0f. Eight floats are handled per vector
/// iteration, and the remaining (size % 8) elements are handled by a scalar loop.
DECLARE_AVX2_SPECIFIC_CODE(
    void nanInfToNullSIMD(float * data, uint8_t * null_map, size_t size) {
        const __m256 inf = _mm256_set1_ps(INFINITY);
        const __m256 neg_inf = _mm256_set1_ps(-INFINITY);
        const __m256 zero = _mm256_set1_ps(0.0f);
        size_t i = 0;
        for (; i + 7 < size; i += 8)
        {
            __m256 values = _mm256_loadu_ps(&data[i]);
            __m256 is_inf = _mm256_cmp_ps(values, inf, _CMP_EQ_OQ);
            __m256 is_neg_inf = _mm256_cmp_ps(values, neg_inf, _CMP_EQ_OQ);
            /// A value compares unequal to itself only when it is NaN.
            __m256 is_nan = _mm256_cmp_ps(values, values, _CMP_NEQ_UQ);
            __m256 is_null = _mm256_or_ps(_mm256_or_ps(is_inf, is_neg_inf), is_nan);
            __m256 new_values = _mm256_blendv_ps(values, zero, is_null);
            _mm256_storeu_ps(&data[i], new_values);
            /// One mask bit per lane; expand the bits into one byte per element.
            UInt32 mask = static_cast<UInt32>(_mm256_movemask_ps(is_null));
            for (size_t j = 0; j < 8; ++j)
            {
                UInt8 null_flag = (mask & 1U);
                null_map[i + j] = null_flag;
                mask >>= 1;
            }
        }
        /// Scalar tail: elements that do not fill a whole 8-lane vector.
        for (; i < size; ++i)
        {
            const float value = data[i];
            const bool is_null = (value != value) || (value == INFINITY) || (value == -INFINITY);
            null_map[i] = is_null;
            if (is_null)
                data[i] = 0.0f;
        }
    })
/// Times the AVX2 nanInfToNullSIMD() on 8192 floats, each initialised as the quotient
/// of two rand() results. The same buffers are reused across iterations.
static void BMNanInfToNullAVX2(benchmark::State & state)
{
    constexpr size_t count = 8192;
    float values[count];
    uint8_t nulls[count] = {0};
    for (float & value : values)
        value = static_cast<float>(rand()) / rand();

    for (auto _ : state)
    {
        ::TargetSpecific::AVX2::nanInfToNullSIMD(values, nulls, count);
        benchmark::DoNotOptimize(nulls);
    }
}
BENCHMARK(BMNanInfToNullAVX2);
/// Branching NaN/Inf-to-null conversion.
///
/// Sets null_map[i] to 1 when data[i] is NaN or +/-infinity and to 0 otherwise;
/// flagged elements are overwritten with 0.
///
/// @param data      Values to inspect; modified in place.
/// @param null_map  Output flags, one per element.
/// @param size      Number of elements in both arrays.
static void nanInfToNull(float * data, uint8_t * null_map, size_t size)
{
    constexpr uint32_t abs_mask = 0b01111111111111111111111111111111; /// everything except the sign bit
    constexpr uint32_t inf_bits = 0b01111111100000000000000000000000; /// exponent all ones, mantissa zero
    for (size_t i = 0; i < size; ++i)
    {
        /// Read the bit pattern with memcpy instead of casting the pointer, which would
        /// access a float object through an unrelated integer type.
        uint32_t bits;
        std::memcpy(&bits, &data[i], sizeof(bits));

        if (data[i] != data[i])
            null_map[i] = 1;
        else if ((bits & abs_mask) == inf_bits)
            null_map[i] = 1;
        else
            null_map[i] = 0;

        if (null_map[i])
            data[i] = 0.0;
    }
}
/// Times the branching nanInfToNull() on 8192 floats, each initialised as the quotient
/// of two rand() results. The same buffers are reused across iterations.
static void BMNanInfToNull(benchmark::State & state)
{
    constexpr size_t count = 8192;
    float values[count];
    uint8_t nulls[count] = {0};
    for (float & value : values)
        value = static_cast<float>(rand()) / rand();

    for (auto _ : state)
    {
        nanInfToNull(values, nulls, count);
        benchmark::DoNotOptimize(nulls);
    }
}
BENCHMARK(BMNanInfToNull);
/*
/// To run on https://quick-bench.com/q/h-2qGgqxM8ksp57VD0w7JdKKN-I
using UInt8 = unsigned char;
using UInt64 = unsigned long long;
using Int64 = signed long long;
template<typename T>
using PaddedPODArray = std::vector<T>;
*/
/*
Test performance of fillConstantConstant*
Benchmark when BranchType is Int64
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
BM_fillConstantConstant1 31360 ns 31359 ns 22249
BM_fillConstantConstant2 31369 ns 31368 ns 22288
BM_fillConstantConstant3 31583 ns 31581 ns 22254
*/
/*
Test performance of fillVectorVector*
Benchmark when BranchType is Float64
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
BM_fillVectorVector1 414177 ns 414161 ns 1687
BM_fillVectorVector2 96669 ns 96665 ns 7432
BM_fillVectorVector3 78439 ns 78436 ns 8812
Benchmark when BranchType is Int64
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
BM_fillVectorVector1 80645 ns 80643 ns 8101
BM_fillVectorVector2 73841 ns 73838 ns 9484
BM_fillVectorVector3 73883 ns 73881 ns 9485
Benchmark when BranchType is Decimal64
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
BM_fillVectorVector1 82413 ns 82408 ns 8635
BM_fillVectorVector2 76289 ns 76287 ns 9213
BM_fillVectorVector3 76262 ns 76260 ns 9244
Benchmark when BranchType is Int256
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
BM_fillVectorVector1 307741 ns 307726 ns 2263
BM_fillVectorVector2 2184999 ns 2184903 ns 321
BM_fillVectorVector3 318616 ns 318605 ns 2209
Benchmark when BranchType is Decimal256
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
BM_fillVectorVector1 303179 ns 303164 ns 2311
BM_fillVectorVector3 305023 ns 305010 ns 2266
*/
/*
Some commands that would be helpful
# run benchmark
./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="BM_fillVectorVector*"
# get full symbol name
objdump -t ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine | c++filt | grep "fillVectorVector1"
# get assembly code mixed with source code by symbol name
gdb -batch -ex "disassemble/rs 'void fillVectorVector3<double, double>(DB::PODArray<char8_t, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul>&)'" ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine | c++filt > 3.S
*/
/// Result element type of the fillVectorVector* kernels; also the default branch type
/// of those kernels and the default T of the BM_fillConstantConstant* benchmarks.
using ResultType = Float64;
/// Baseline constant/constant select: for every row writes `a` into res[i] when
/// cond[i] is non-zero and `b` otherwise, using a plain ternary.
/// The caller must size `res` to at least cond.size() elements; it is indexed, not resized.
template <typename T>
static NO_INLINE void fillConstantConstant1(const PaddedPODArray<UInt8> & cond, T a, T b, PaddedPODArray<T> & res)
{
size_t rows = cond.size();
for (size_t i = 0; i < rows; ++i)
{
res[i] = cond[i] ? static_cast<T>(a) : static_cast<T>(b);
}
}
/// Constant/constant select that picks a strategy per element type at compile time:
/// either the plain ternary or a lookup into the two-element table `ab`, indexed by
/// `!cond[i]` (index 0 holds `a`, index 1 holds `b`).
/// The caller must size `res` to at least cond.size() elements; it is indexed, not resized.
template <typename T>
static NO_INLINE void
fillConstantConstant3(const PaddedPODArray<UInt8> & cond, T a, T b, PaddedPODArray<T> & res)
{
size_t rows = cond.size();
T new_a = static_cast<T>(a);
T new_b = static_cast<T>(b);
/// Lookup table for the table-based branches below.
alignas(64) const T ab[2] = {new_a, new_b};
for (size_t i = 0; i < rows; ++i)
{
/// Only one-byte integral types take this branch; wider integral types fall
/// through to the final `else`.
if constexpr (std::is_integral_v<T> && sizeof(T) == 1)
{
/// auto opt: cmove and simd is used for integral types
// res[i] = cond[i] ? new_a : new_b;
res[i] = ab[!cond[i]];
}
else if constexpr (std::is_floating_point_v<T>)
{
/// auto opt: cmove not used but simd is used for floating point types
res[i] = cond[i] ? new_a : new_b;
}
else if constexpr (is_decimal<T> && sizeof(T) <= 8)
{
/// auto opt: simd is used for decimal types
res[i] = cond[i] ? new_a : new_b;
}
else if constexpr (is_decimal<T> && sizeof(T) == 32)
{
/// avoid branch mispredict
res[i] = ab[!cond[i]];
}
else if constexpr (is_decimal<T> && sizeof(T) == 16)
{
/// auto opt: cmove and loop unrolling
// res[i] = cond[i] ? static_cast<T>(a) : static_cast<T>(b);
res[i] = ab[!cond[i]];
}
else if constexpr (is_big_int_v<T> && sizeof(T) == 32)
{
res[i] = ab[!cond[i]];
}
else if constexpr (is_big_int_v<T> && sizeof(T) == 16)
{
// res[i] = cond[i] ? static_cast<T>(a) : static_cast<T>(b);
res[i] = ab[!cond[i]];
}
else
{
res[i] = cond[i] ? static_cast<T>(a) : static_cast<T>(b);
}
}
}
/// Baseline vector/vector select: res[i] receives a[i] when cond[i] is non-zero and
/// b[i] otherwise, each converted to ResultType, using a plain ternary.
/// `a`, `b` and `res` are indexed up to cond.size(); the caller must size them accordingly.
template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
static NO_INLINE void fillVectorVector1(
const PaddedPODArray<UInt8> & cond,
const PaddedPODArray<Branch1Type> & a,
const PaddedPODArray<Branch2Type> & b,
PaddedPODArray<ResultType> & res)
{
size_t rows = cond.size();
for (size_t i = 0; i < rows; ++i)
{
res[i] = cond[i] ? static_cast<ResultType>(a[i]) : static_cast<ResultType>(b[i]);
}
}
/// Arithmetic vector/vector select: multiplies each branch value by a 0/1 factor
/// derived from cond[i] and adds the two products, so no branch is needed.
///
/// The statement had been commented out, which left this kernel (and therefore
/// BM_fillVectorVector2) as an empty loop that never wrote to `res`.
///
/// NOTE(review): this form requires ResultType to support multiplication by bool and
/// addition; presumably it was disabled while benchmarking a type that does not - confirm.
/// For floating-point inputs, multiplying an infinity or NaN by zero yields NaN, so the
/// result can differ from the ternary form for such values.
///
/// `a`, `b` and `res` are indexed up to cond.size(); the caller must size them accordingly.
template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
static NO_INLINE void fillVectorVector2(
    const PaddedPODArray<UInt8> & cond,
    const PaddedPODArray<Branch1Type> & a,
    const PaddedPODArray<Branch2Type> & b,
    PaddedPODArray<ResultType> & res)
{
    size_t rows = cond.size();
    for (size_t i = 0; i < rows; ++i)
    {
        res[i] = (!!cond[i]) * static_cast<ResultType>(a[i]) + (!cond[i]) * static_cast<ResultType>(b[i]);
    }
}
/// Vector/vector select that chooses a strategy from ResultType at compile time.
///
/// - Floating-point ResultType: branch-free bitwise select on the raw bit patterns.
/// - Any other ResultType not covered by the first branch: plain ternary.
/// - Integral / small-decimal ResultType: the arithmetic select is currently disabled.
///
/// `a`, `b` and `res` are indexed up to cond.size(); the caller must size them accordingly.
template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
static NO_INLINE void fillVectorVector3(
    const PaddedPODArray<UInt8> & cond,
    const PaddedPODArray<Branch1Type> & a,
    const PaddedPODArray<Branch2Type> & b,
    PaddedPODArray<ResultType> & res)
{
    size_t rows = cond.size();
    for (size_t i = 0; i < rows; ++i)
    {
        if constexpr (std::is_integral_v<ResultType> || (is_decimal<ResultType> && sizeof(ResultType) <= 8))
        {
            /// NOTE(review): with this statement disabled, nothing is written to `res` for
            /// integral / small-decimal ResultType - confirm whether it should be re-enabled.
            // res[i] = (!!cond[i]) * static_cast<ResultType>(a[i]) + (!cond[i]) * static_cast<ResultType>(b[i]);
        }
        else if constexpr (std::is_floating_point_v<ResultType>)
        {
            using UIntType = std::conditional_t<sizeof(ResultType) == 8, UInt64, UInt32>;
            using IntType = std::conditional_t<sizeof(ResultType) == 8, Int64, Int32>;
            /// cond == 1 -> mask is 0 (selects a); cond == 0 -> mask is all ones (selects b).
            /// Assumes cond[i] is exactly 0 or 1, as produced by initCondition().
            auto mask = static_cast<UIntType>(static_cast<IntType>(cond[i]) - 1);
            auto new_a = static_cast<ResultType>(a[i]);
            auto new_b = static_cast<ResultType>(b[i]);
            UIntType uint_a;
            std::memcpy(&uint_a, &new_a, sizeof(UIntType));
            UIntType uint_b;
            std::memcpy(&uint_b, &new_b, sizeof(UIntType));
            UIntType tmp = (~mask & uint_a) | (mask & uint_b);
            /// Convert the selected bits back with memcpy, matching the reads above, rather
            /// than reading an integer object through a floating-point pointer.
            ResultType selected;
            std::memcpy(&selected, &tmp, sizeof(ResultType));
            res[i] = selected;
        }
        else
        {
            res[i] = cond[i] ? static_cast<ResultType>(a[i]) : static_cast<ResultType>(b[i]);
        }
    }
}
/// Number of rows in the condition, branch and result arrays used by the benchmarks below.
static constexpr size_t ROWS = 65536;
/// Resizes `cond` to ROWS entries and fills each with a pseudo-random 0 or 1.
static void initCondition(PaddedPODArray<UInt8> & cond)
{
    cond.resize(ROWS);
    for (auto & flag : cond)
        flag = std::rand() % 2;
}
/// Resizes `branch` to ROWS entries and fills each with std::rand() converted to T.
template <typename T>
static void initBranch(PaddedPODArray<T> & branch)
{
    branch.resize(ROWS);
    for (auto & value : branch)
        value = static_cast<T>(std::rand());
}
/// Times fillConstantConstant1<T>() with two random constants and ROWS random conditions.
template <typename T = ResultType>
static void BM_fillConstantConstant1(benchmark::State & state)
{
    PaddedPODArray<UInt8> selector;
    T when_true(std::rand());
    T when_false(std::rand());
    PaddedPODArray<T> output(ROWS);
    initCondition(selector);

    for (auto _ : state)
    {
        fillConstantConstant1(selector, when_true, when_false, output);
        benchmark::DoNotOptimize(output);
    }
}
/// Times fillConstantConstant3<T>() with two random constants and ROWS random conditions.
template <typename T = ResultType>
static void BM_fillConstantConstant3(benchmark::State & state)
{
    PaddedPODArray<UInt8> selector;
    T when_true(std::rand());
    T when_false(std::rand());
    PaddedPODArray<T> output(ROWS);
    initCondition(selector);

    for (auto _ : state)
    {
        fillConstantConstant3(selector, when_true, when_false, output);
        benchmark::DoNotOptimize(output);
    }
}
/// Times fillVectorVector1() on ROWS random conditions and two random branch arrays.
static void BM_fillVectorVector1(benchmark::State & state)
{
    PaddedPODArray<UInt8> selector;
    PaddedPODArray<ResultType> when_true;
    PaddedPODArray<ResultType> when_false;
    PaddedPODArray<ResultType> output(ROWS);
    initCondition(selector);
    initBranch(when_true);
    initBranch(when_false);

    for (auto _ : state)
    {
        fillVectorVector1(selector, when_true, when_false, output);
        benchmark::DoNotOptimize(output);
    }
}
/// Times fillVectorVector2() on ROWS random conditions and two random branch arrays.
static void BM_fillVectorVector2(benchmark::State & state)
{
    PaddedPODArray<UInt8> selector;
    PaddedPODArray<ResultType> when_true;
    PaddedPODArray<ResultType> when_false;
    PaddedPODArray<ResultType> output(ROWS);
    initCondition(selector);
    initBranch(when_true);
    initBranch(when_false);

    for (auto _ : state)
    {
        fillVectorVector2(selector, when_true, when_false, output);
        benchmark::DoNotOptimize(output);
    }
}
/// Times fillVectorVector3() on ROWS random conditions and two random branch arrays.
static void BM_fillVectorVector3(benchmark::State & state)
{
    PaddedPODArray<UInt8> selector;
    PaddedPODArray<ResultType> when_true;
    PaddedPODArray<ResultType> when_false;
    PaddedPODArray<ResultType> output(ROWS);
    initCondition(selector);
    initBranch(when_true);
    initBranch(when_false);

    for (auto _ : state)
    {
        fillVectorVector3(selector, when_true, when_false, output);
        benchmark::DoNotOptimize(output);
    }
}
/*
-------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------
BM_fillConstantConstant1<Int8> 492635 ns 492619 ns 1415
BM_fillConstantConstant3<Int8> 80339 ns 80336 ns 8803
BM_fillConstantConstant1<Int16> 7899 ns 7899 ns 88745
BM_fillConstantConstant3<Int16> 7903 ns 7903 ns 88738
BM_fillConstantConstant1<Int32> 15704 ns 15703 ns 44615
BM_fillConstantConstant3<Int32> 15849 ns 15848 ns 44592
BM_fillConstantConstant1<Int64> 31443 ns 31442 ns 22226
BM_fillConstantConstant3<Int64> 31407 ns 31406 ns 22304
BM_fillConstantConstant1<Int128> 95711 ns 95709 ns 7317
BM_fillConstantConstant3<Int128> 91466 ns 91463 ns 7657
BM_fillConstantConstant1<Int256> 565219 ns 565201 ns 1233
BM_fillConstantConstant3<Int256> 131145 ns 131140 ns 5350
BM_fillConstantConstant1<Float32> 15768 ns 15768 ns 44554
BM_fillConstantConstant3<Float32> 15685 ns 15684 ns 44597
BM_fillConstantConstant1<Float64> 31377 ns 31376 ns 22281
BM_fillConstantConstant3<Float64> 31367 ns 31366 ns 22307
BM_fillConstantConstant1<Decimal32> 65185 ns 65182 ns 10912
BM_fillConstantConstant3<Decimal32> 15703 ns 15702 ns 44490
BM_fillConstantConstant1<Decimal64> 64509 ns 64507 ns 10875
BM_fillConstantConstant3<Decimal64> 31839 ns 31838 ns 22305
BM_fillConstantConstant1<Decimal128> 95602 ns 95600 ns 7325
BM_fillConstantConstant3<Decimal128> 91615 ns 91612 ns 7646
BM_fillConstantConstant1<Decimal256> 572220 ns 572208 ns 1234
BM_fillConstantConstant3<Decimal256> 130326 ns 130323 ns 5375
BM_fillConstantConstant1<DateTime64> 64597 ns 64596 ns 10844
BM_fillConstantConstant3<DateTime64> 64964 ns 64963 ns 10885
*/
/// Register the constant/constant select benchmarks (variants 1 and 3) for every element
/// type of interest, followed by the three vector/vector select benchmarks.
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt8);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt8);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int8);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int8);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt16);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt16);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int16);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int16);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Float32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Float32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Float64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Float64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal32);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal128);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal256);
BENCHMARK_TEMPLATE(BM_fillConstantConstant1, DateTime64);
BENCHMARK_TEMPLATE(BM_fillConstantConstant3, DateTime64);
BENCHMARK(BM_fillVectorVector1);
BENCHMARK(BM_fillVectorVector2);
BENCHMARK(BM_fillVectorVector3);
/// Minimal holder of a raw pointer to a T buffer. The buffer is allocated by
/// initSlice() and released by finalizeSlice(); the struct itself manages nothing.
template <typename T>
struct slice
{
/// Start of the buffer; not initialised by the struct.
T * data;
};
/// ORs the constant 0xaa (converted to T) into each of the first 65536 elements of d.data.
/// The buffer is deliberately accessed through `d.data` on every iteration.
/// NOTE(review): presumably this compares how the compiler treats stores through
/// char8_t, int8_t and uint8_t element types - confirm before restructuring the loop.
template <typename T>
NO_INLINE auto BitOrProcess(slice<T> & d)
{
for (auto i = 0u; i < 65536; ++i)
d.data[i] |= T(0xaa);
}
/// Allocates a 65536-element buffer with new[], fills it with std::rand() values
/// converted to T, and stores it in d.data. Release it with finalizeSlice().
template <typename T>
void initSlice(slice<T> & d)
{
    constexpr auto element_count = 65536u;
    T * buffer = new T[element_count];
    for (T * cursor = buffer; cursor != buffer + element_count; ++cursor)
        *cursor = T(std::rand());
    d.data = buffer;
}
/// Releases the buffer allocated by initSlice() and clears the pointer, so that a
/// repeated call is a harmless delete[] of nullptr instead of a double free.
template <typename T>
void finalizeSlice(slice<T> & d)
{
    delete[] d.data;
    d.data = nullptr;
}
/// Times BitOrProcess<T>() on a buffer created by initSlice().
template <typename T>
void BM_BitOrProcess(benchmark::State & state)
{
    slice<T> d;
    initSlice(d);
    for (auto _ : state)
    {
        BitOrProcess(d);
        benchmark::DoNotOptimize(d);
    }
    /// Release the buffer allocated by initSlice(); it was previously leaked.
    finalizeSlice(d);
}
/// Register BM_BitOrProcess for the three one-byte element types being compared.
BENCHMARK_TEMPLATE(BM_BitOrProcess, char8_t);
BENCHMARK_TEMPLATE(BM_BitOrProcess, int8_t);
BENCHMARK_TEMPLATE(BM_BitOrProcess, uint8_t);
/// Multi-target kernel `isNotNull`: writes the logical negation of null_map[i] into res[i]
/// for the first 65536 elements. Both arrays are indexed, not resized, so the caller must
/// provide at least 65536 elements in each.
MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), isNotNull, MULTITARGET_FUNCTION_BODY((const PaddedPODArray<UInt8> & null_map, PaddedPODArray<UInt8> & res) /// NOLINT
{
for (size_t i = 0; i < 65536; ++i)
res[i] = !null_map[i];
}))
/// Defines and registers BM_isNotNull<TARGET>, which times isNotNull<TARGET>() on a
/// random null map of ROWS entries. An empty TARGET selects the function without a suffix.
/// Note that the ROWS-sized result array is constructed inside the timed loop.
#define BENCHMARK_ISNOTNULL_TEMPLATE(TARGET) \
static void BM_isNotNull##TARGET(benchmark::State & state) \
{ \
PaddedPODArray<UInt8> null_map; \
initCondition(null_map); \
for (auto _ : state) \
{ \
PaddedPODArray<UInt8> res(ROWS); \
isNotNull##TARGET(null_map, res); \
benchmark::DoNotOptimize(res); \
} \
} \
BENCHMARK(BM_isNotNull##TARGET);
/// One benchmark per target variant.
BENCHMARK_ISNOTNULL_TEMPLATE()
BENCHMARK_ISNOTNULL_TEMPLATE(SSE42)
BENCHMARK_ISNOTNULL_TEMPLATE(AVX2)
BENCHMARK_ISNOTNULL_TEMPLATE(AVX512F)
BENCHMARK_ISNOTNULL_TEMPLATE(AVX512BW)
/// Multi-target kernel `isNotNullTest`: same computation as isNotNull, but the result is
/// built by reserving ROWS elements and appending each negated flag with push_back.
/// null_map must hold at least ROWS elements.
MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), isNotNullTest, MULTITARGET_FUNCTION_BODY((const PaddedPODArray<UInt8> & null_map, PaddedPODArray<UInt8> & res) /// NOLINT
{
res.reserve(ROWS);
for (size_t i = 0; i < ROWS; ++i)
res.push_back(!null_map[i]);
}))
/// Defines and registers BM_isNotNullTest<TARGET>, which times isNotNullTest<TARGET>() on a
/// random null map of ROWS entries. An empty TARGET selects the function without a suffix.
/// A fresh, empty result array is constructed inside the timed loop on every iteration.
#define BENCHMARK_ISNOTNULLTEST_TEMPLATE(TARGET) \
static void BM_isNotNullTest##TARGET(benchmark::State & state) \
{ \
PaddedPODArray<UInt8> null_map; \
initCondition(null_map); \
for (auto _ : state) \
{ \
PaddedPODArray<UInt8> res; \
isNotNullTest##TARGET(null_map, res); \
benchmark::DoNotOptimize(res); \
} \
} \
BENCHMARK(BM_isNotNullTest##TARGET);
/// One benchmark per target variant.
BENCHMARK_ISNOTNULLTEST_TEMPLATE()
BENCHMARK_ISNOTNULLTEST_TEMPLATE(SSE42)
BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX2)
BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX512F)
BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX512BW)
/*
Run on (32 X 2100 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x16)
L1 Instruction 32 KiB (x16)
L2 Unified 1024 KiB (x16)
L3 Unified 11264 KiB (x2)
Load Average: 4.47, 4.43, 4.74
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
BM_isNotNull 3854 ns 3854 ns 181463
BM_isNotNullSSE42 3859 ns 3859 ns 181243
BM_isNotNullAVX2 3037 ns 3037 ns 232898
BM_isNotNullAVX512F 2859 ns 2859 ns 245235
BM_isNotNullAVX512BW 2880 ns 2880 ns 244221
BM_isNotNullTest 95141 ns 95139 ns 7342
BM_isNotNullTestSSE42 95201 ns 95199 ns 7322
BM_isNotNullTestAVX2 95107 ns 95105 ns 7362
BM_isNotNullTestAVX512F 95151 ns 95147 ns 7370
BM_isNotNullTestAVX512BW 95150 ns 95148 ns 7348
*/
/// Non-inlined wrapper used as the measured operation of BM_insertManyFrom.
/// Calls dst.insertManyFrom(src, position, length) with position = src.size() / 2
/// and length = src.size().
/// NOTE(review): presumably this appends `length` copies of the middle row of `src`
/// to `dst` — confirm against IColumn::insertManyFrom.
static NO_INLINE void insertManyFrom(IColumn & dst, const IColumn & src)
{
    const size_t source_rows = src.size();
    const size_t middle_row = source_rows / 2;
    dst.insertManyFrom(src, middle_row, source_rows);
}
/*
static NO_INLINE void insertManyFromV1(IColumn & dst, const IColumn & src)
{
size_t size = src.size();
ColumnNullable * dst_nullable = typeid_cast<ColumnNullable *>(&dst);
ColumnVector<Int64> * dst_nested = typeid_cast<ColumnVector<Int64> *>(&dst_nullable->getNestedColumn());
auto & dst_data = dst_nested->getData();
auto & dst_null_map = dst_nullable->getNullMapData();
auto src_field = src[size/2];
if (src_field.isNull())
{
dst_data.resize_fill(size, 0);
dst_null_map.resize_fill(size, 1);
}
else
{
auto src_value = src_field.get<Int64>();
dst_data.resize_fill(size, src_value);
dst_null_map.resize_fill(size, 0);
}
}
*/
/// Benchmarks insertManyFrom() for the column type whose name is `str_type`.
/// A source column of ROWS rows is built once. For every iteration a new destination
/// column is created and reserved while the timer is paused, so only the
/// insertManyFrom() call itself is timed.
template <const std::string & str_type>
static void BM_insertManyFrom(benchmark::State & state)
{
    const auto data_type = DataTypeFactory::instance().get(str_type);
    const auto source_column = createColumn(data_type, ROWS);

    for (auto _ : state)
    {
        /// Setup of the destination column is excluded from the measurement.
        state.PauseTiming();
        auto destination = data_type->createColumn();
        destination->reserve(ROWS);
        state.ResumeTiming();

        insertManyFrom(*destination, *source_column);
        benchmark::DoNotOptimize(destination);
    }
}
/// Data type names passed to DataTypeFactory by the benchmarks in this file.
/// They are namespace-scope objects because the benchmark templates take the name
/// as a `const std::string &` non-type template parameter.
static const String type_int64 = "Int64";
static const String type_nullable_int64 = "Nullable(Int64)";
static const String type_string = "String";
static const String type_nullable_string = "Nullable(String)";
static const String type_decimal = "Decimal128(3)";
static const String type_nullable_decimal = "Nullable(Decimal128(3))";
static const String type_array_int64 = "Array(Int64)";
static const String type_array_nullable_int64 = "Array(Nullable(Int64))";
static const String type_array_string = "Array(String)";
static const String type_array_nullable_string = "Array(Nullable(String))";
/// Register BM_insertManyFrom once for every type name above.
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_int64);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_int64);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_string);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_string);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_decimal);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_decimal);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_int64);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_nullable_int64);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_string);
BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_nullable_string);
/// Benchmark result: https://github.com/ClickHouse/ClickHouse/pull/60846
/// Measures IColumn::replicate() for the column type whose name is `str_type`.
/// The column (ROWS rows) and the offsets from createOffsets(ROWS) are built once,
/// outside the timed loop; each iteration performs one replicate() call.
template <const std::string & str_type>
static void BM_replicate(benchmark::State & state)
{
    const auto data_type = DataTypeFactory::instance().get(str_type);
    const auto column = createColumn(data_type, ROWS);
    const auto replicate_offsets = createOffsets(ROWS);

    for (auto _ : state)
    {
        auto replicated = column->replicate(replicate_offsets);
        benchmark::DoNotOptimize(replicated);
    }
}
BENCHMARK_TEMPLATE(BM_replicate, type_string);
BENCHMARK_TEMPLATE(BM_replicate, type_nullable_string);
/// Builds a pseudo-random filter with `rows` entries for the benchmarks below.
/// An entry is 1 when rand() returns an even value and 0 when it returns an odd one.
/// rand() is not seeded here, so the sequence depends on the process-wide rand() state.
IColumn::Filter mockFilter(size_t rows)
{
    IColumn::Filter result(rows);
    for (size_t i = 0; i < rows; ++i)
        result[i] = rand() % 2 ? 0 : 1;
    /// Return the local by name: wrapping it in std::move() would inhibit copy elision (NRVO).
    return result;
}
/// Builds a UInt64 column holding, in ascending order, the positions i in [0, rows)
/// for which mockFilter(rows)[i] is non-zero.
ColumnPtr mockIndexes(size_t rows)
{
    auto filter = mockFilter(rows);
    auto result = ColumnUInt64::create();
    auto & data = result->getData();
    for (size_t i = 0; i < rows; ++i)
    {
        if (filter[i])
            data.push_back(i);
    }
    /// A returned local is already treated as an rvalue for the mutable -> immutable
    /// pointer conversion, so an explicit std::move() here is redundant.
    return result;
}
/// Measures IColumn::filter() for the column type whose name is `str_type`.
/// The source column, the filter from mockFilter(ROWS) and the size hint (the value
/// returned by countBytesInFilter for that filter) are computed once, outside the timed loop.
template <const std::string & str_type>
static void BM_filter(benchmark::State & state)
{
    const auto data_type = DataTypeFactory::instance().get(str_type);
    const auto source_column = createColumn(data_type, ROWS);
    const auto row_filter = mockFilter(ROWS);
    const auto result_size_hint = countBytesInFilter(row_filter);

    for (auto _ : state)
    {
        auto filtered = source_column->filter(row_filter, result_size_hint);
        benchmark::DoNotOptimize(filtered);
    }
}
/// Measures IColumn::index() for the column type whose name is `str_type`.
/// The source column and the index column from mockIndexes(ROWS) are built once,
/// outside the timed loop.
/// NOTE(review): the second argument 0 is presumably the "no limit" value — confirm against IColumn::index.
template <const std::string & str_type>
static void BM_index(benchmark::State & state)
{
    const auto data_type = DataTypeFactory::instance().get(str_type);
    const auto source_column = createColumn(data_type, ROWS);
    const auto selected_rows = mockIndexes(ROWS);

    for (auto _ : state)
    {
        auto gathered = source_column->index(*selected_rows, 0);
        benchmark::DoNotOptimize(gathered);
    }
}
/*
template <const std::string & str_type>
static void BM_filterInPlace(benchmark::State & state)
{
auto type = DataTypeFactory::instance().get(str_type);
auto indexes_col = mockIndexes(ROWS);
const auto & indexes = assert_cast<const ColumnUInt64 &>(*indexes_col).getData();
for (auto _ : state)
{
state.PauseTiming();
auto src = createColumn(type, ROWS);
auto mutable_src = src->assumeMutable();
state.ResumeTiming();
mutable_src->filterInPlace(indexes, 0);
benchmark::DoNotOptimize(src);
}
}
*/
/*
10%
---------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------
BM_filter<type_int64> 28485 ns 28484 ns 24242
BM_index<type_int64> 7925 ns 7925 ns 88317
BM_filterInPlace<type_int64> 8553 ns 8520 ns 84726
BM_filter<type_nullable_int64> 64513 ns 64512 ns 10885
BM_index<type_nullable_int64> 14209 ns 14209 ns 49289
BM_filterInPlace<type_nullable_int64> 17854 ns 17706 ns 41712
BM_filter<type_string> 82993 ns 82990 ns 8313
BM_index<type_string> 35828 ns 35827 ns 19943
BM_filterInPlace<type_string> 28748 ns 28621 ns 23922
BM_filter<type_nullable_string> 115106 ns 115099 ns 6057
BM_index<type_nullable_string> 42771 ns 42767 ns 17238
BM_filterInPlace<type_nullable_string> 35743 ns 35625 ns
99%
---------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------
BM_filter<type_int64> 81142 ns 81139 ns 8908
BM_index<type_int64> 75867 ns 75865 ns 9197
BM_filterInPlace<type_int64> 57672 ns 57635 ns 12098
BM_filter<type_nullable_int64> 150746 ns 150740 ns 4716
BM_index<type_nullable_int64> 129113 ns 129108 ns 5440
BM_filterInPlace<type_nullable_int64> 111840 ns 111763 ns 6190
BM_filter<type_string> 367415 ns 367092 ns 1917
BM_index<type_string> 261185 ns 261174 ns 2682
BM_filterInPlace<type_string> 215903 ns 215809 ns 3180
BM_filter<type_nullable_string> 435441 ns 435426 ns 1577
BM_index<type_nullable_string> 313108 ns 313097 ns 2229
BM_filterInPlace<type_nullable_string> 271534 ns 271424 ns
*/
/// Register BM_filter and BM_index for Int64 and String, plain and Nullable.
/// The BM_filterInPlace registrations stay disabled because BM_filterInPlace itself is commented out above.
BENCHMARK_TEMPLATE(BM_filter, type_int64);
BENCHMARK_TEMPLATE(BM_index, type_int64);
// BENCHMARK_TEMPLATE(BM_filterInPlace, type_int64);
BENCHMARK_TEMPLATE(BM_filter, type_nullable_int64);
BENCHMARK_TEMPLATE(BM_index, type_nullable_int64);
// BENCHMARK_TEMPLATE(BM_filterInPlace, type_nullable_int64);
BENCHMARK_TEMPLATE(BM_filter, type_string);
BENCHMARK_TEMPLATE(BM_index, type_string);
// BENCHMARK_TEMPLATE(BM_filterInPlace, type_string);
BENCHMARK_TEMPLATE(BM_filter, type_nullable_string);
BENCHMARK_TEMPLATE(BM_index, type_nullable_string);
// BENCHMARK_TEMPLATE(BM_filterInPlace, type_nullable_string);
/// Classifies a 64-bit row mask in which row 0 is the least significant bit.
/// If the set bits form one contiguous run starting at bit 0 (binary pattern [0]*[1]*),
/// the length of that run is returned: 0 for an empty mask, 64 for a full one.
/// For every other mask the special value 0xFF is returned.
static inline uint8_t prefixToCopy(UInt64 mask)
{
    /// __builtin_clzll is undefined for 0, so the empty mask is answered up front.
    if (mask == 0)
        return 0;

    /// A value of the form 2^k - 1 shares no bit with its successor. For the all-ones
    /// mask the successor wraps around to 0, which passes the same check.
    const bool is_low_run_of_ones = (mask & (mask + 1)) == 0;
    if (!is_low_run_of_ones)
        return 0xFF;

    /// The run length is 64 minus the number of leading zero bits.
    return static_cast<uint8_t>(64 - __builtin_clzll(mask));
}
/// Counterpart of prefixToCopy() for runs of set bits at the most significant end.
/// It evaluates prefixToCopy(~mask): when that is below 64, i.e. the cleared bits of `mask`
/// form a run of that length starting at bit 0, the number of remaining (set, upper) bits is returned.
/// A result of 64 or 0xFF from prefixToCopy(~mask) is passed through unchanged, so
/// mask == 0 yields 64 and a mask that is not of the form [1]*[0]* yields 0xFF.
static inline uint8_t suffixToCopy(UInt64 mask)
{
    const auto cleared_low_bits = prefixToCopy(~mask);
    if (cleared_low_bits >= 64)
        return cleared_low_bits;
    return 64 - cleared_low_bits;
}
/// Returns `mask` with its lowest set bit cleared (0 stays 0 in the portable branch).
/// When the compiler defines __BMI__ the _blsr_u64 intrinsic is used; otherwise the
/// equivalent expression mask & (mask - 1) is evaluated.
static inline UInt64 blsr(UInt64 mask)
{
#ifdef __BMI__
return _blsr_u64(mask);
#else
return mask & (mask-1);
#endif
}
/// Scalar implementation of myFilterToIndices, placed in the namespace created by DECLARE_DEFAULT_CODE
/// (it is called as ::TargetSpecific::Default::myFilterToIndices in this file).
/// Writes the positions in [start, end) whose filter byte is selected into `indices`, starting at indices[0].
/// `indices` is written by index and never resized here, so the caller must size it beforehand.
/// NOTE(review): bytes64MaskToBits64Mask is assumed to pack 64 filter bytes into a 64-bit mask
/// whose bit i corresponds to filt[start + i] — confirm against its definition.
DECLARE_DEFAULT_CODE(
template <typename T>
void myFilterToIndices(const UInt8 * filt, size_t start, size_t end, PaddedPODArray<T> & indices)
{
/// Next write position in `indices`.
size_t pos = 0;
/// Process the filter in groups of 64 rows.
for (; start + 64 <= end; start += 64)
{
UInt64 mask = bytes64MaskToBits64Mask(filt + start);
const uint8_t prefix_to_copy = prefixToCopy(mask);
if (0xFF != prefix_to_copy)
{
/// The selected rows are exactly the first prefix_to_copy rows of the group (possibly none).
for (size_t i = 0; i < prefix_to_copy; ++i)
indices[pos++] = start + i;
}
else
{
const uint8_t suffix_to_copy = suffixToCopy(mask);
if (0xFF != suffix_to_copy)
{
/// The selected rows are exactly the last suffix_to_copy rows of the group.
for (size_t i = 64 - suffix_to_copy; i < 64; ++i)
indices[pos++] = start + i;
}
else
{
/// General case: emit one index per set bit, lowest bit first, clearing each bit after use.
while (mask)
{
size_t index = std::countr_zero(mask);
indices[pos++] = start + index;
mask = blsr(mask);
}
}
}
}
/// Tail of fewer than 64 rows: test each filter byte individually.
for (; start != end; ++start)
if (filt[start])
indices[pos++] = start;
}
)
/// AVX-512 implementation of myFilterToIndices, placed in the namespace created by
/// DECLARE_AVX512F_SPECIFIC_CODE (it is called as ::TargetSpecific::AVX512F::myFilterToIndices in this file).
/// Writes the positions in [start, end) whose filter byte is selected into `indices`, starting at indices[0].
/// Every vector store writes a full 64 bytes at the current output position, so `indices` must have
/// room for 64 / sizeof(T) - 1 elements beyond the number of selected rows; the array is not resized here.
/// NOTE(review): bytes64MaskToBits64Mask is assumed to pack 64 filter bytes into a 64-bit mask
/// whose bit i corresponds to filt[start + i] — confirm against its definition.
DECLARE_AVX512F_SPECIFIC_CODE(
template <typename T>
void myFilterToIndices(const UInt8 * filt, size_t start, size_t end, PaddedPODArray<T> & indices)
{
    /// Only these two index widths are implemented below; for any other T the vectors
    /// index_vec / increment_vec would be read without ever being initialized.
    static_assert(
        std::is_same_v<T, UInt64> || std::is_same_v<T, UInt32>,
        "AVX512F myFilterToIndices supports only UInt64 and UInt32 indices");

    /// A 64-row mask is consumed in sizeof(T) steps; each step covers as many rows
    /// as there are T-sized lanes in one 512-bit vector.
    static constexpr size_t LOOPS_PER_MASK = sizeof(T);
    static constexpr size_t MASK_BITS_PER_LOOP = 64 / LOOPS_PER_MASK;
    static constexpr UInt64 MASK_IN_LOOP = (1ULL << MASK_BITS_PER_LOOP) - 1;

    /// index_vec holds the row numbers covered by the current step, lane 0 = lowest row.
    /// increment_vec advances every lane by the number of rows per step.
    __m512i index_vec;
    __m512i increment_vec;
    if constexpr (std::is_same_v<T, UInt64>)
    {
        index_vec
            = _mm512_set_epi64(start + 7, start + 6, start + 5, start + 4, start + 3, start + 2, start + 1, start); // Initial index vector
        increment_vec = _mm512_set1_epi64(8); // Increment vector
    }
    else if constexpr (std::is_same_v<T, UInt32>)
    {
        index_vec = _mm512_set_epi32(
            start + 15,
            start + 14,
            start + 13,
            start + 12,
            start + 11,
            start + 10,
            start + 9,
            start + 8,
            start + 7,
            start + 6,
            start + 5,
            start + 4,
            start + 3,
            start + 2,
            start + 1,
            start); // Initial index vector
        /// The increment is added with _mm512_add_epi32, so it has to be broadcast into
        /// every 32-bit lane. A 64-bit broadcast would leave the odd lanes at zero.
        increment_vec = _mm512_set1_epi32(16); // Increment vector
    }
    /*
    else if constexpr (std::is_same_v<T, UInt16>)
    {
        index_vec = _mm512_set_epi16(
            start + 31, start + 30, start + 29, start + 28, start + 27, start + 26, start + 25, start + 24,
            start + 23, start + 22, start + 21, start + 20, start + 19, start + 18, start + 17, start + 16,
            start + 15, start + 14, start + 13, start + 12, start + 11, start + 10, start + 9, start + 8,
            start + 7, start + 6, start + 5, start + 4, start + 3, start + 2, start + 1, start); // Initial index vector
        increment_vec = _mm512_set1_epi16(32); // Increment vector (16-bit lanes, matching _mm512_add_epi16)
    }
    */

    /// Next write position in `indices`.
    size_t pos = 0;
    for (; start + 64 <= end; start += 64)
    {
        UInt64 mask64 = bytes64MaskToBits64Mask(filt + start);
        for (size_t i = 0; i < LOOPS_PER_MASK; ++i)
        {
            /// Number of selected rows in this step.
            auto offset = std::popcount(mask64 & MASK_IN_LOOP);
            if (offset)
            {
                if constexpr (std::is_same_v<T, UInt64>)
                {
                    __m512i compressed_indices = _mm512_maskz_compress_epi64(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                    _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                }
                else if constexpr (std::is_same_v<T, UInt32>)
                {
                    __m512i compressed_indices = _mm512_maskz_compress_epi32(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                    _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                }
                /*
                else if constexpr (std::is_same_v<T, UInt16>)
                {
                    __m512i compressed_indices = _mm512_maskz_compress_epi16(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                    _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                }
                */
                /// Only the first `offset` stored lanes are kept; the rest is overwritten by later stores.
                pos += offset;
            }
            if constexpr (std::is_same_v<T, UInt64>)
                index_vec = _mm512_add_epi64(index_vec, increment_vec); // Increment the index vector
            else if constexpr (std::is_same_v<T, UInt32>)
                index_vec = _mm512_add_epi32(index_vec, increment_vec); // Increment the index vector
            /*
            else if constexpr (std::is_same_v<T, UInt16>)
                index_vec = _mm512_add_epi16(index_vec, increment_vec); // Increment the index vector
            */
            mask64 >>= MASK_BITS_PER_LOOP;
        }
    }

    /// Tail of fewer than 64 rows: test each filter byte individually.
    for (; start != end; ++start)
    {
        if (filt[start])
            indices[pos++] = start;
    }
})
/// Non-inlined driver for the AVX-512 myFilterToIndices, used as the measured operation
/// of BM_myFilterToIndicesAVX512.
/// `indices` is first resized to the number of selected rows plus 64 / sizeof(T) - 1 spare
/// elements, because the kernel always stores a whole 64-byte vector, and is trimmed back
/// to the exact count afterwards. An empty filter leaves `indices` untouched.
/// The return value is the local `start`, which is passed to the kernel by value and
/// therefore is always 0.
template <typename T>
static NO_INLINE size_t myFilterToIndicesAVX512(const IColumn::Filter & filt, PaddedPODArray<T> & indices)
{
    /// Spare elements (not bytes) needed behind the last selected index.
    static constexpr size_t SPARE_ELEMENTS = 64 / sizeof(T) - 1;

    size_t start = 0;
    if (!filt.empty())
    {
        const size_t end = filt.size();
        const size_t selected_rows = countBytesInFilter(filt.data(), start, end);

        indices.resize_exact(selected_rows + SPARE_ELEMENTS);
        ::TargetSpecific::AVX512F::myFilterToIndices(filt.data(), start, end, indices);
        indices.resize_exact(selected_rows);
    }
    return start;
}
/// Non-inlined driver for the scalar myFilterToIndices, used as the measured operation
/// of BM_myFilterToIndicesDefault.
/// `indices` is resized to exactly the number of selected rows before the kernel fills it.
/// An empty filter leaves `indices` untouched.
/// The return value is the local `start`, which is passed to the kernel by value and
/// therefore is always 0.
template <typename T>
static NO_INLINE size_t myFilterToIndicesDefault(const IColumn::Filter & filt, PaddedPODArray<T> & indices)
{
    size_t start = 0;
    if (!filt.empty())
    {
        const size_t end = filt.size();
        const size_t selected_rows = countBytesInFilter(filt.data(), start, end);

        indices.resize_exact(selected_rows);
        ::TargetSpecific::Default::myFilterToIndices(filt.data(), start, end, indices);
    }
    return start;
}
/// Benchmarks myFilterToIndicesDefault<T> on a filter of ROWS entries.
/// A new filter is generated for every iteration while the timer is paused; the
/// construction of the output array and the conversion itself are timed.
template <typename T>
static void BM_myFilterToIndicesDefault(benchmark::State & state)
{
    for (auto _ : state)
    {
        /// Filter generation is excluded from the measurement.
        state.PauseTiming();
        auto row_filter = mockFilter(ROWS);
        state.ResumeTiming();

        PaddedPODArray<T> selected_indices;
        auto returned_start = myFilterToIndicesDefault(row_filter, selected_indices);
        benchmark::DoNotOptimize(returned_start);
        benchmark::DoNotOptimize(selected_indices);
    }
}
/// Benchmarks myFilterToIndicesAVX512<T> on a filter of ROWS entries.
/// A new filter is generated for every iteration while the timer is paused; the
/// construction of the output array and the conversion itself are timed.
template <typename T>
static void BM_myFilterToIndicesAVX512(benchmark::State & state)
{
    for (auto _ : state)
    {
        /// Filter generation is excluded from the measurement.
        state.PauseTiming();
        auto row_filter = mockFilter(ROWS);
        state.ResumeTiming();

        PaddedPODArray<T> selected_indices;
        auto returned_start = myFilterToIndicesAVX512(row_filter, selected_indices);
        benchmark::DoNotOptimize(returned_start);
        benchmark::DoNotOptimize(selected_indices);
    }
}
/// Only the scalar UInt16 variant is registered; the remaining instantiations are commented out below.
BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt16);
/*
BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt32);
BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt64);
// BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt16);
BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt32);
BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt64);
*/
#endif