cpp-ch/local-engine/tests/benchmark_spark_functions.cpp - gluten - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #if defined(__x86_64__)

 #include <cstddef>
 #include <cstring>
 #include <Columns/ColumnsCommon.h>
 #include <Columns/IColumn.h>
 #include <Core/Block.h>
 #include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypeFactory.h>
 #include <DataTypes/IDataType.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionsRound.h>
 #include <Functions/SparkFunctionFloor.h>
 #include <Parser/SerializedPlanParser.h>
 #include <base/types.h>
 #include <benchmark/benchmark.h>
 #include <Common/QueryContext.h>
 #include <Common/TargetSpecific.h>
 #include <DataTypes/DataTypeNullable.h>

 #if USE_MULTITARGET_CODE
 #include <immintrin.h>
 #endif

 using namespace DB;

 /// Returns `rows` cumulative offsets: every entry equals the previous entry plus a
 /// pseudo-random step in [0, 9] taken from rand().
 static IColumn::Offsets createOffsets(size_t rows)
 {
     IColumn::Offsets result(rows, 0);
     size_t row = 0;
     while (row < rows)
     {
         const auto step = rand() % 10;
         /// For row 0 the index `row - 1` wraps around and addresses the element in front of the array.
         /// NOTE(review): presumably this relies on the container's readable, zeroed left padding - confirm.
         result[row] = result[row - 1] + step;
         ++row;
     }
     return result;
 }

 /// Builds a column of `type` for benchmarking.
 ///
 /// Array types: a nested column is generated recursively with `rows` rows and replicated according to
 /// cumulative pseudo-random offsets (step in [0, 9] per row), then wrapped into a ColumnArray.
 ///
 /// Other types: `rows` values are inserted. Rows whose index is NOT a multiple of 100 receive the type's
 /// default value; the remaining rows receive a value derived from the row index (ints, floats, decimals),
 /// the string "helloworld", or the default value for any other type.
 /// NOTE(review): this makes roughly 99% of the rows defaults - confirm the ratio is not inverted.
 static ColumnPtr createColumn(const DataTypePtr & type, size_t rows)
 {
     if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get()))
     {
         auto nested = createColumn(array_type->getNestedType(), rows);
         auto offsets_column = ColumnArray::ColumnOffsets::create(rows, 0);
         auto & offsets = offsets_column->getData();
         /// For row 0 the index `row - 1` wraps around and addresses the element in front of the array.
         /// NOTE(review): presumably this relies on the container's readable, zeroed left padding - confirm.
         for (size_t row = 0; row < nested->size(); ++row)
             offsets[row] = offsets[row - 1] + (rand() % 10);
         auto replicated = nested->replicate(offsets);

         return ColumnArray::create(std::move(replicated), std::move(offsets_column));
     }

     /// The value kind depends only on the type, so classify it once instead of once per row.
     const auto plain_type = removeNullable(type);
     const bool as_int = isInt(plain_type);
     const bool as_float = isFloat(plain_type);
     const bool as_decimal = isDecimal(plain_type);
     const bool as_string = isString(plain_type);

     auto column = type->createColumn();
     for (size_t row = 0; row < rows; ++row)
     {
         if (row % 100 != 0)
         {
             column->insertDefault();
             continue;
         }

         if (as_int)
         {
             column->insert(row);
         }
         else if (as_float)
         {
             double value = row * 1.0;
             column->insert(value);
         }
         else if (as_decimal)
         {
             Decimal128 value = Decimal128(row * row);
             column->insert(value);
         }
         else if (as_string)
         {
             String value = "helloworld";
             column->insert(value);
         }
         else
         {
             column->insertDefault();
         }
     }
     return column;
 }

 /// Builds a block holding a single column named "d".
 ///
 /// @param type_str  Type name resolved through DataTypeFactory (e.g. "Nullable(Int64)").
 /// @param rows      Number of rows requested from createColumn.
 /// @return          The block, returned by value.
 static Block createBlock(const String & type_str, size_t rows)
 {
     auto type = DataTypeFactory::instance().get(type_str);
     auto column = createColumn(type, rows);

     Block block;
     block.insert(ColumnWithTypeAndName(std::move(column), type, "d"));
     /// Return the local directly: wrapping it in std::move would disable copy elision.
     return block;
 }

 /// Times the function registered as "floor" on a 65536-row Nullable(Int64) block from createBlock.
 static void BM_CHFloorFunction_For_Int64(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("floor", local_engine::QueryContext::globalContext());
     Block input = createBlock("Nullable(Int64)", 65536);
     auto prepared = resolver->build(input.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         /// The argument list is rebuilt on every iteration, so its cost is part of the measurement.
         auto output = prepared->execute(
             input.getColumnsWithTypeAndName(), prepared->getResultType(), input.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times the function registered as "floor" on a 65536-row Nullable(Float64) block from createBlock.
 static void BM_CHFloorFunction_For_Float64(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("floor", local_engine::QueryContext::globalContext());
     Block input = createBlock("Nullable(Float64)", 65536);
     auto prepared = resolver->build(input.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         /// The argument list is rebuilt on every iteration, so its cost is part of the measurement.
         auto output = prepared->execute(
             input.getColumnsWithTypeAndName(), prepared->getResultType(), input.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times the function registered as "sparkFloor" on a 65536-row Nullable(Int64) block from createBlock.
 static void BM_SparkFloorFunction_For_Int64(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkFloor", local_engine::QueryContext::globalContext());
     Block input = createBlock("Nullable(Int64)", 65536);
     auto prepared = resolver->build(input.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         /// The argument list is rebuilt on every iteration, so its cost is part of the measurement.
         auto output = prepared->execute(
             input.getColumnsWithTypeAndName(), prepared->getResultType(), input.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times the function registered as "sparkFloor" on a 65536-row Nullable(Float64) block from createBlock.
 static void BM_SparkFloorFunction_For_Float64(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkFloor", local_engine::QueryContext::globalContext());
     Block input = createBlock("Nullable(Float64)", 65536);
     auto prepared = resolver->build(input.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         /// The argument list is rebuilt on every iteration, so its cost is part of the measurement.
         auto output = prepared->execute(
             input.getColumnsWithTypeAndName(), prepared->getResultType(), input.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Register the floor benchmarks: "floor" and "sparkFloor", each on Int64 and Float64 input.
 BENCHMARK(BM_CHFloorFunction_For_Int64);
 BENCHMARK(BM_CHFloorFunction_For_Float64);
 BENCHMARK(BM_SparkFloorFunction_For_Int64);
 BENCHMARK(BM_SparkFloorFunction_For_Float64);

 /// Times "sparkDivide" with two full 65536-row Nullable(Float64) columns as operands.
 static void BM_OptSparkDivide_VectorVector(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkDivide", local_engine::QueryContext::globalContext());
     auto operand_type = DataTypeFactory::instance().get("Nullable(Float64)");
     auto dividend = createColumn(operand_type, 65536);
     auto divisor = createColumn(operand_type, 65536);
     /// Both columns are copied (not moved) into the block.
     Block operands({ColumnWithTypeAndName(dividend, operand_type, "left"), ColumnWithTypeAndName(divisor, operand_type, "right")});
     auto prepared = resolver->build(operands.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             operands.getColumnsWithTypeAndName(), prepared->getResultType(), operands.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times "sparkDivide" with a full 65536-row Nullable(Float64) left operand and a constant right operand.
 /// The constant wraps the one-row column produced by createColumn; for a float type that row (index 0)
 /// holds 0.0, so every division is by zero. NOTE(review): confirm that is the intended case.
 static void BM_OptSparkDivide_VectorConstant(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkDivide", local_engine::QueryContext::globalContext());
     auto operand_type = DataTypeFactory::instance().get("Nullable(Float64)");
     auto dividend = createColumn(operand_type, 65536);
     auto divisor_value = createColumn(operand_type, 1);
     auto divisor = ColumnConst::create(std::move(divisor_value), 65536);
     Block operands(
         {ColumnWithTypeAndName(dividend, operand_type, "left"),
          ColumnWithTypeAndName(std::move(divisor), operand_type, "right")});
     auto prepared = resolver->build(operands.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             operands.getColumnsWithTypeAndName(), prepared->getResultType(), operands.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times "sparkDivide" with a constant left operand and a full 65536-row Nullable(Float64) right operand.
 /// The constant wraps the one-row column produced by createColumn (row index 0, i.e. 0.0 for floats).
 static void BM_OptSparkDivide_ConstantVector(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkDivide", local_engine::QueryContext::globalContext());
     auto operand_type = DataTypeFactory::instance().get("Nullable(Float64)");
     auto dividend_value = createColumn(operand_type, 1);
     auto dividend = ColumnConst::create(std::move(dividend_value), 65536);
     auto divisor = createColumn(operand_type, 65536);
     Block operands(
         {ColumnWithTypeAndName(std::move(dividend), operand_type, "left"),
          ColumnWithTypeAndName(std::move(divisor), operand_type, "right")});
     auto prepared = resolver->build(operands.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             operands.getColumnsWithTypeAndName(), prepared->getResultType(), operands.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Register the sparkDivide benchmarks: vector/vector, vector/constant and constant/vector operands.
 BENCHMARK(BM_OptSparkDivide_VectorVector);
 BENCHMARK(BM_OptSparkDivide_VectorConstant);
 BENCHMARK(BM_OptSparkDivide_ConstantVector);

 /// Times "sparkCastFloatToInt32" on a 65536-row Nullable(Float64) column.
 static void BM_OptSparkCastFloatToInt(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("sparkCastFloatToInt32", local_engine::QueryContext::globalContext());
     auto source_type = DataTypeFactory::instance().get("Nullable(Float64)");
     auto source = createColumn(source_type, 65536);
     Block arguments({ColumnWithTypeAndName(std::move(source), source_type, "input")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 BENCHMARK(BM_OptSparkCastFloatToInt);

 /// decimal to decimal, labelled "scale up": Nullable(Decimal128(10)) input with constant
 /// precision 38 and constant scale 5.
 /// NOTE(review): the requested scale (5) looks lower than the input's (10) - confirm what the label means.
 static void BM_OptCheckDecimalOverflowSparkFromDecimal1(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
     auto value_type = DataTypeFactory::instance().get("Nullable(Decimal128(10))");

     auto values = createColumn(value_type, 65536);
     auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), 65536);
     auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 5), 65536);

     Block arguments(
         {ColumnWithTypeAndName(std::move(values), value_type, "input"),
          ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
          ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// decimal to decimal, labelled "scale down": Nullable(Decimal128(10)) input with constant
 /// precision 38 and constant scale 15.
 /// NOTE(review): the requested scale (15) looks higher than the input's (10) - confirm what the label means.
 static void BM_OptCheckDecimalOverflowSparkFromDecimal2(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
     auto value_type = DataTypeFactory::instance().get("Nullable(Decimal128(10))");

     auto values = createColumn(value_type, 65536);
     auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), 65536);
     auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 15), 65536);

     Block arguments(
         {ColumnWithTypeAndName(std::move(values), value_type, "input"),
          ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
          ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// decimal to decimal with an unchanged scale: Nullable(Decimal(38, 10)) input with constant
 /// precision 38 and constant scale 10.
 static void BM_OptCheckDecimalOverflowSparkFromDecimal3(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
     auto value_type = DataTypeFactory::instance().get("Nullable(Decimal(38, 10))");

     auto values = createColumn(value_type, 65536);
     auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), 65536);
     auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), 65536);

     Block arguments(
         {ColumnWithTypeAndName(std::move(values), value_type, "input"),
          ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
          ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// int to decimal: Nullable(Int64) input with constant precision 38 and constant scale 10.
 static void BM_OptCheckDecimalOverflowSparkFromInt(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
     auto value_type = DataTypeFactory::instance().get("Nullable(Int64)");

     auto values = createColumn(value_type, 65536);
     auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), 65536);
     auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), 65536);

     Block arguments(
         {ColumnWithTypeAndName(std::move(values), value_type, "input"),
          ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
          ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// float to decimal: Nullable(Float64) input with constant precision 38 and constant scale 10.
 static void BM_OptCheckDecimalOverflowSparkFromFloat(benchmark::State & state)
 {
     auto & function_factory = FunctionFactory::instance();
     auto resolver = function_factory.get("checkDecimalOverflowSparkOrNull", local_engine::QueryContext::globalContext());
     auto value_type = DataTypeFactory::instance().get("Nullable(Float64)");

     auto values = createColumn(value_type, 65536);
     auto target_precision = ColumnConst::create(ColumnUInt32::create(1, 38), 65536);
     auto target_scale = ColumnConst::create(ColumnUInt32::create(1, 10), 65536);

     Block arguments(
         {ColumnWithTypeAndName(std::move(values), value_type, "input"),
          ColumnWithTypeAndName(std::move(target_precision), std::make_shared<DataTypeUInt32>(), "precision"),
          ColumnWithTypeAndName(std::move(target_scale), std::make_shared<DataTypeUInt32>(), "scale")});
     auto prepared = resolver->build(arguments.getColumnsWithTypeAndName());
     for (auto _ : state)
     {
         auto output = prepared->execute(
             arguments.getColumnsWithTypeAndName(), prepared->getResultType(), arguments.rows(), false);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Register the checkDecimalOverflowSparkOrNull benchmarks (decimal, int and float inputs).
 BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal1);
 BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal2);
 BENCHMARK(BM_OptCheckDecimalOverflowSparkFromDecimal3);
 BENCHMARK(BM_OptCheckDecimalOverflowSparkFromInt);
 BENCHMARK(BM_OptCheckDecimalOverflowSparkFromFloat);

 /// Branch-free NaN/Inf scrubber.
 ///
 /// For each of the `size` floats in `data`: writes 1 to `null_map[i]` when the value is NaN, +Inf or -Inf
 /// and 0 otherwise, and overwrites every flagged value with +0.0f (all bits cleared).
 ///
 /// @param data      Values, modified in place.
 /// @param null_map  Output flags, one per value; every entry is overwritten.
 /// @param size      Number of elements in both arrays.
 static void nanInfToNullAutoOpt(float * data, uint8_t * null_map, size_t size)
 {
     static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");
     constexpr uint32_t abs_mask = 0x7FFFFFFFU; /// everything except the sign bit
     constexpr uint32_t inf_bits = 0x7F800000U; /// exponent all ones, mantissa zero

     for (size_t i = 0; i < size; ++i)
     {
         /// Read the bit pattern through memcpy: dereferencing a float as uint32_t via
         /// reinterpret_cast violates the strict-aliasing rule.
         uint32_t bits;
         std::memcpy(&bits, &data[i], sizeof(bits));

         const uint8_t is_nan = (data[i] != data[i]);
         const uint8_t is_inf = ((bits & abs_mask) == inf_bits);
         const uint8_t null_flag = is_nan | is_inf;
         null_map[i] = null_flag;

         /// null_flag == 1 -> mask 0 (value cleared); null_flag == 0 -> mask 0xFFFFFFFF (value kept).
         bits &= static_cast<uint32_t>(null_flag) - 1U;
         std::memcpy(&data[i], &bits, sizeof(bits));
     }
 }

 /// Times nanInfToNullAutoOpt on 8192 floats, each the quotient of two rand() results.
 /// The kernel rewrites the buffer in place, so only the first iteration sees the raw input.
 static void BMNanInfToNullAutoOpt(benchmark::State & state)
 {
     constexpr size_t count = 8192;
     float values[count];
     uint8_t nulls[count] = {};
     for (float & value : values)
         value = static_cast<float>(rand()) / rand();

     for (auto _ : state)
     {
         nanInfToNullAutoOpt(values, nulls, count);
         benchmark::DoNotOptimize(nulls);
     }
 }
 BENCHMARK(BMNanInfToNullAutoOpt);

 DECLARE_AVX2_SPECIFIC_CODE(

     void nanInfToNullSIMD(float * data, uint8_t * null_map, size_t size) {
         const __m256 inf = _mm256_set1_ps(INFINITY);
         const __m256 neg_inf = _mm256_set1_ps(-INFINITY);
         const __m256 zero = _mm256_set1_ps(0.0f);

         size_t i = 0;
         for (; i + 7 < size; i += 8)
         {
             __m256 values = _mm256_loadu_ps(&data[i]);

             __m256 is_inf = _mm256_cmp_ps(values, inf, _CMP_EQ_OQ);
             __m256 is_neg_inf = _mm256_cmp_ps(values, neg_inf, _CMP_EQ_OQ);
             __m256 is_nan = _mm256_cmp_ps(values, values, _CMP_NEQ_UQ);
             __m256 is_null = _mm256_or_ps(_mm256_or_ps(is_inf, is_neg_inf), is_nan);
             __m256 new_values = _mm256_blendv_ps(values, zero, is_null);

             _mm256_storeu_ps(&data[i], new_values);

             UInt32 mask = static_cast<UInt32>(_mm256_movemask_ps(is_null));
             for (size_t j = 0; j < 8; ++j)
             {
                 UInt8 null_flag = (mask & 1U);
                 null_map[i + j] = null_flag;
                 mask >>= 1;
             }
         }
     })

 /// Times the AVX2 kernel nanInfToNullSIMD on 8192 floats, each the quotient of two rand() results.
 /// The kernel rewrites the buffer in place, so only the first iteration sees the raw input.
 static void BMNanInfToNullAVX2(benchmark::State & state)
 {
     constexpr size_t count = 8192;
     float values[count];
     uint8_t nulls[count] = {};
     for (float & value : values)
         value = static_cast<float>(rand()) / rand();

     for (auto _ : state)
     {
         ::TargetSpecific::AVX2::nanInfToNullSIMD(values, nulls, count);
         benchmark::DoNotOptimize(nulls);
     }
 }
 BENCHMARK(BMNanInfToNullAVX2);

 /// Straightforward (branching) NaN/Inf scrubber, used as the baseline implementation.
 ///
 /// For each of the `size` floats in `data`: writes 1 to `null_map[i]` when the value is NaN, +Inf or -Inf
 /// and 0 otherwise, and replaces every flagged value with 0.0f.
 ///
 /// @param data      Values, modified in place.
 /// @param null_map  Output flags, one per value; every entry is overwritten.
 /// @param size      Number of elements in both arrays.
 static void nanInfToNull(float * data, uint8_t * null_map, size_t size)
 {
     static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");
     constexpr uint32_t abs_mask = 0x7FFFFFFFU; /// everything except the sign bit
     constexpr uint32_t inf_bits = 0x7F800000U; /// exponent all ones, mantissa zero

     for (size_t i = 0; i < size; ++i)
     {
         /// Read the bit pattern through memcpy: dereferencing a float as uint32_t via
         /// reinterpret_cast violates the strict-aliasing rule.
         uint32_t bits;
         std::memcpy(&bits, &data[i], sizeof(bits));

         if (data[i] != data[i])
             null_map[i] = 1;
         else if ((bits & abs_mask) == inf_bits)
             null_map[i] = 1;
         else
             null_map[i] = 0;

         if (null_map[i])
             data[i] = 0.0f;
     }
 }

 /// Times the baseline nanInfToNull on 8192 floats, each the quotient of two rand() results.
 /// The kernel rewrites the buffer in place, so only the first iteration sees the raw input.
 static void BMNanInfToNull(benchmark::State & state)
 {
     constexpr size_t count = 8192;
     float values[count];
     uint8_t nulls[count] = {};
     for (float & value : values)
         value = static_cast<float>(rand()) / rand();

     for (auto _ : state)
     {
         nanInfToNull(values, nulls, count);
         benchmark::DoNotOptimize(nulls);
     }
 }
 BENCHMARK(BMNanInfToNull);


 /*
 /// To run at https://quick-bench.com/q/h-2qGgqxM8ksp57VD0w7JdKKN-I, define these aliases first:
 using UInt8 = unsigned char;
 using UInt64 = unsigned long long;
 using Int64 = signed long long;
 template<typename T>
 using PaddedPODArray = std::vector<T>;
 */


 /*
 Test performance of fillConstantConstant*
 Benchmark when BranchType is Int64
 -------------------------------------------------------------------
 Benchmark                         Time             CPU   Iterations
 -------------------------------------------------------------------
 BM_fillConstantConstant1      31360 ns        31359 ns        22249
 BM_fillConstantConstant2      31369 ns        31368 ns        22288
 BM_fillConstantConstant3      31583 ns        31581 ns        22254
 */

 /*
 Test performance of fillVectorVector*
 Benchmark when BranchType is Float64
 ---------------------------------------------------------------
 Benchmark                     Time             CPU   Iterations
 ---------------------------------------------------------------
 BM_fillVectorVector1     414177 ns       414161 ns         1687
 BM_fillVectorVector2      96669 ns        96665 ns         7432
 BM_fillVectorVector3      78439 ns        78436 ns         8812

 Benchmark when BranchType is Int64
 ---------------------------------------------------------------
 Benchmark                     Time             CPU   Iterations
 ---------------------------------------------------------------
 BM_fillVectorVector1      80645 ns        80643 ns         8101
 BM_fillVectorVector2      73841 ns        73838 ns         9484
 BM_fillVectorVector3      73883 ns        73881 ns         9485

 Benchmark when BranchType is Decimal64
 ---------------------------------------------------------------
 Benchmark                     Time             CPU   Iterations
 ---------------------------------------------------------------
 BM_fillVectorVector1      82413 ns        82408 ns         8635
 BM_fillVectorVector2      76289 ns        76287 ns         9213
 BM_fillVectorVector3      76262 ns        76260 ns         9244

 Benchmark when BranchType is Int256
 ---------------------------------------------------------------
 Benchmark                     Time             CPU   Iterations
 ---------------------------------------------------------------
 BM_fillVectorVector1     307741 ns       307726 ns         2263
 BM_fillVectorVector2    2184999 ns      2184903 ns          321
 BM_fillVectorVector3     318616 ns       318605 ns         2209

 Benchmark when BranchType is Decimal256
 ---------------------------------------------------------------
 Benchmark                     Time             CPU   Iterations
 ---------------------------------------------------------------
 BM_fillVectorVector1     303179 ns       303164 ns         2311
 BM_fillVectorVector3     305023 ns       305010 ns         2266
 */

 /*
 Some commands that would be helpful

 # run benchmark
 ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="BM_fillVectorVector*"

 # get full symbol name
 objdump -t  ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine   |  c++filt   | grep "fillVectorVector1"

 # get assembly code mixed with source code by symbol name
 gdb -batch -ex "disassemble/rs 'void fillVectorVector3<double, double>(DB::PODArray<char8_t, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul> const&, DB::PODArray<double, 4096ul, Allocator<false, false>, 63ul, 64ul>&)'" ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine    | c++filt  > 3.S
 */

 /// Element type of the branch and result arrays in the fillVectorVector* kernels, and the default
 /// template argument of the BM_fillConstantConstant* benchmarks.
 /// NOTE(review): presumably edited by hand to benchmark the other types listed in the result tables.
 using ResultType = Float64;

 /// Baseline constant/constant select: res[i] becomes `a` where cond[i] is non-zero and `b` otherwise.
 /// `res` must already hold at least cond.size() elements.
 template <typename T>
 static NO_INLINE void fillConstantConstant1(const PaddedPODArray<UInt8> & cond, T a, T b, PaddedPODArray<T> & res)
 {
     for (size_t row = 0, total = cond.size(); row < total; ++row)
         res[row] = cond[row] ? static_cast<T>(a) : static_cast<T>(b);
 }

 /// Constant/constant select with a per-type strategy: res[i] becomes `a` where cond[i] is non-zero and
 /// `b` otherwise. `res` must already hold at least cond.size() elements.
 ///
 /// Strategy by element type:
 ///  - 1-byte integers, 16/32-byte decimals and 16/32-byte big integers index a two-entry table
 ///    {a, b} with `!cond[i]`, which avoids a data-dependent branch;
 ///  - floating point types and decimals of at most 8 bytes use a conditional expression on local copies
 ///    of the constants (noted in this file as being auto-vectorized);
 ///  - every other type uses the same conditional expression as fillConstantConstant1.
 template <typename T>
 static NO_INLINE void
 fillConstantConstant3(const PaddedPODArray<UInt8> & cond, T a, T b, PaddedPODArray<T> & res)
 {
     constexpr bool wide_element = sizeof(T) == 16 || sizeof(T) == 32;
     constexpr bool use_lookup
         = (std::is_integral_v<T> && sizeof(T) == 1) || (is_decimal<T> && wide_element) || (is_big_int_v<T> && wide_element);
     constexpr bool use_local_copies = std::is_floating_point_v<T> || (is_decimal<T> && sizeof(T) <= 8);

     T new_a = static_cast<T>(a);
     T new_b = static_cast<T>(b);
     /// Index 0 holds the "true" value because the table is indexed with the negated condition.
     alignas(64) const T ab[2] = {new_a, new_b};

     const size_t total = cond.size();
     for (size_t row = 0; row < total; ++row)
     {
         if constexpr (use_lookup)
             res[row] = ab[!cond[row]];
         else if constexpr (use_local_copies)
             res[row] = cond[row] ? new_a : new_b;
         else
             res[row] = cond[row] ? static_cast<T>(a) : static_cast<T>(b);
     }
 }

 /// Baseline vector/vector select: res[i] becomes a[i] where cond[i] is non-zero and b[i] otherwise,
 /// each converted to ResultType. All arrays must hold at least cond.size() elements.
 template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
 static NO_INLINE void fillVectorVector1(
     const PaddedPODArray<UInt8> & cond,
     const PaddedPODArray<Branch1Type> & a,
     const PaddedPODArray<Branch2Type> & b,
     PaddedPODArray<ResultType> & res)
 {
     for (size_t row = 0, total = cond.size(); row < total; ++row)
         res[row] = cond[row] ? static_cast<ResultType>(a[row]) : static_cast<ResultType>(b[row]);
 }

 /// Arithmetic vector/vector select: res[i] = (cond[i] != 0) * a[i] + (cond[i] == 0) * b[i], with every
 /// operand converted to ResultType first. All arrays must hold at least cond.size() elements.
 ///
 /// The selection is done by multiplying with 0 or 1 instead of branching. For floating point types this
 /// only equals a true select while the unselected operand is finite (0 * NaN and 0 * Inf are NaN).
 ///
 /// The loop body used to be commented out, which turned this kernel into a no-op; both selector factors
 /// are now cast to ResultType explicitly so the expression does not depend on a mixed bool * ResultType
 /// multiplication.
 template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
 static NO_INLINE void fillVectorVector2(
     const PaddedPODArray<UInt8> & cond,
     const PaddedPODArray<Branch1Type> & a,
     const PaddedPODArray<Branch2Type> & b,
     PaddedPODArray<ResultType> & res)
 {
     const size_t rows = cond.size();
     for (size_t i = 0; i < rows; ++i)
     {
         res[i] = static_cast<ResultType>(
             static_cast<ResultType>(!!cond[i]) * static_cast<ResultType>(a[i])
             + static_cast<ResultType>(!cond[i]) * static_cast<ResultType>(b[i]));
     }
 }

 /// Vector/vector select with a per-type strategy: res[i] becomes a[i] where cond[i] selects the first
 /// branch and b[i] otherwise, each converted to ResultType. All arrays must hold at least cond.size()
 /// elements.
 ///
 /// Strategy by ResultType:
 ///  - integers and decimals of at most 8 bytes: arithmetic select (multiply by 0 or 1 and add);
 ///  - floating point: bitwise blend of the two values' bit patterns with an all-ones / all-zeros mask.
 ///    The mask is `cond[i] - 1`, so this branch requires cond[i] to be exactly 0 or 1;
 ///  - everything else: plain conditional expression.
 template <typename Branch1Type = ResultType, typename Branch2Type = ResultType>
 static NO_INLINE void fillVectorVector3(
     const PaddedPODArray<UInt8> & cond,
     const PaddedPODArray<Branch1Type> & a,
     const PaddedPODArray<Branch2Type> & b,
     PaddedPODArray<ResultType> & res)
 {
     const size_t rows = cond.size();
     for (size_t i = 0; i < rows; ++i)
     {
         if constexpr (std::is_integral_v<ResultType> || (is_decimal<ResultType> && sizeof(ResultType) <= 8))
         {
             /// This statement used to be commented out, leaving `res` unwritten for these types.
             /// Both selector factors are cast to ResultType so no mixed bool * ResultType product is needed.
             res[i] = static_cast<ResultType>(
                 static_cast<ResultType>(!!cond[i]) * static_cast<ResultType>(a[i])
                 + static_cast<ResultType>(!cond[i]) * static_cast<ResultType>(b[i]));
         }
         else if constexpr (std::is_floating_point_v<ResultType>)
         {
             using UIntType = std::conditional_t<sizeof(ResultType) == 8, UInt64, UInt32>;
             using IntType = std::conditional_t<sizeof(ResultType) == 8, Int64, Int32>;
             /// cond == 1 -> mask 0 (take a); cond == 0 -> mask all ones (take b).
             auto mask = static_cast<UIntType>(static_cast<IntType>(cond[i]) - 1);
             auto new_a = static_cast<ResultType>(a[i]);
             auto new_b = static_cast<ResultType>(b[i]);
             UIntType uint_a;
             std::memcpy(&uint_a, &new_a, sizeof(UIntType));
             UIntType uint_b;
             std::memcpy(&uint_b, &new_b, sizeof(UIntType));
             UIntType blended = (~mask & uint_a) | (mask & uint_b);
             /// Write the bits back with memcpy as well; reading `blended` through a ResultType pointer
             /// would violate the strict-aliasing rule.
             std::memcpy(&res[i], &blended, sizeof(UIntType));
         }
         else
         {
             res[i] = cond[i] ? static_cast<ResultType>(a[i]) : static_cast<ResultType>(b[i]);
         }
     }
 }

 /// Number of rows in the condition, branch and result arrays of the fill* benchmarks.
 static constexpr size_t ROWS = 65536;
 /// Resizes `cond` to ROWS entries and fills each with a pseudo-random 0 or 1 from std::rand().
 static void initCondition(PaddedPODArray<UInt8> & cond)
 {
     cond.resize(ROWS);
     for (auto & flag : cond)
         flag = std::rand() % 2;
 }

 /// Resizes `branch` to ROWS entries and fills each with a std::rand() result converted to T.
 template <typename T>
 static void initBranch(PaddedPODArray<T> & branch)
 {
     branch.resize(ROWS);
     for (auto & value : branch)
         value = static_cast<T>(std::rand());
 }

 /// Times fillConstantConstant1<T> over ROWS rows with two pseudo-random constants and a random condition.
 template <typename T = ResultType>
 static void BM_fillConstantConstant1(benchmark::State & state)
 {
     /// rand() is consumed in this order: first constant, second constant, then the condition array.
     T when_true(std::rand());
     T when_false(std::rand());
     PaddedPODArray<UInt8> selector;
     initCondition(selector);
     PaddedPODArray<T> output(ROWS);

     for (auto _ : state)
     {
         fillConstantConstant1(selector, when_true, when_false, output);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times fillConstantConstant3<T> over ROWS rows with two pseudo-random constants and a random condition.
 template <typename T = ResultType>
 static void BM_fillConstantConstant3(benchmark::State & state)
 {
     /// rand() is consumed in this order: first constant, second constant, then the condition array.
     T when_true(std::rand());
     T when_false(std::rand());
     PaddedPODArray<UInt8> selector;
     initCondition(selector);
     PaddedPODArray<T> output(ROWS);

     for (auto _ : state)
     {
         fillConstantConstant3(selector, when_true, when_false, output);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times fillVectorVector1 over ROWS rows of pseudo-random condition and branch data.
 static void BM_fillVectorVector1(benchmark::State & state)
 {
     /// rand() is consumed in this order: condition, first branch, second branch.
     PaddedPODArray<UInt8> selector;
     initCondition(selector);
     PaddedPODArray<ResultType> first_branch;
     initBranch(first_branch);
     PaddedPODArray<ResultType> second_branch;
     initBranch(second_branch);
     PaddedPODArray<ResultType> output(ROWS);

     for (auto _ : state)
     {
         fillVectorVector1(selector, first_branch, second_branch, output);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times fillVectorVector2 over ROWS rows of pseudo-random condition and branch data.
 static void BM_fillVectorVector2(benchmark::State & state)
 {
     /// rand() is consumed in this order: condition, first branch, second branch.
     PaddedPODArray<UInt8> selector;
     initCondition(selector);
     PaddedPODArray<ResultType> first_branch;
     initBranch(first_branch);
     PaddedPODArray<ResultType> second_branch;
     initBranch(second_branch);
     PaddedPODArray<ResultType> output(ROWS);

     for (auto _ : state)
     {
         fillVectorVector2(selector, first_branch, second_branch, output);
         benchmark::DoNotOptimize(output);
     }
 }

 /// Times fillVectorVector3 over ROWS rows of pseudo-random condition and branch data.
 static void BM_fillVectorVector3(benchmark::State & state)
 {
     /// rand() is consumed in this order: condition, first branch, second branch.
     PaddedPODArray<UInt8> selector;
     initCondition(selector);
     PaddedPODArray<ResultType> first_branch;
     initBranch(first_branch);
     PaddedPODArray<ResultType> second_branch;
     initBranch(second_branch);
     PaddedPODArray<ResultType> output(ROWS);

     for (auto _ : state)
     {
         fillVectorVector3(selector, first_branch, second_branch, output);
         benchmark::DoNotOptimize(output);
     }
 }

 /*
 -------------------------------------------------------------------------------
 Benchmark                                     Time             CPU   Iterations
 -------------------------------------------------------------------------------
 BM_fillConstantConstant1<Int8>           492635 ns       492619 ns         1415
 BM_fillConstantConstant3<Int8>            80339 ns        80336 ns         8803
 BM_fillConstantConstant1<Int16>            7899 ns         7899 ns        88745
 BM_fillConstantConstant3<Int16>            7903 ns         7903 ns        88738
 BM_fillConstantConstant1<Int32>           15704 ns        15703 ns        44615
 BM_fillConstantConstant3<Int32>           15849 ns        15848 ns        44592
 BM_fillConstantConstant1<Int64>           31443 ns        31442 ns        22226
 BM_fillConstantConstant3<Int64>           31407 ns        31406 ns        22304
 BM_fillConstantConstant1<Int128>          95711 ns        95709 ns         7317
 BM_fillConstantConstant3<Int128>          91466 ns        91463 ns         7657
 BM_fillConstantConstant1<Int256>         565219 ns       565201 ns         1233
 BM_fillConstantConstant3<Int256>         131145 ns       131140 ns         5350
 BM_fillConstantConstant1<Float32>         15768 ns        15768 ns        44554
 BM_fillConstantConstant3<Float32>         15685 ns        15684 ns        44597
 BM_fillConstantConstant1<Float64>         31377 ns        31376 ns        22281
 BM_fillConstantConstant3<Float64>         31367 ns        31366 ns        22307
 BM_fillConstantConstant1<Decimal32>       65185 ns        65182 ns        10912
 BM_fillConstantConstant3<Decimal32>       15703 ns        15702 ns        44490
 BM_fillConstantConstant1<Decimal64>       64509 ns        64507 ns        10875
 BM_fillConstantConstant3<Decimal64>       31839 ns        31838 ns        22305
 BM_fillConstantConstant1<Decimal128>      95602 ns        95600 ns         7325
 BM_fillConstantConstant3<Decimal128>      91615 ns        91612 ns         7646
 BM_fillConstantConstant1<Decimal256>     572220 ns       572208 ns         1234
 BM_fillConstantConstant3<Decimal256>     130326 ns       130323 ns         5375
 BM_fillConstantConstant1<DateTime64>      64597 ns        64596 ns        10844
 BM_fillConstantConstant3<DateTime64>      64964 ns        64963 ns        10885
 */
 /// Register BM_fillConstantConstant1 and BM_fillConstantConstant3 once per element type.
 /// Each type gets the two variants back to back so their results are adjacent in the report.
 /// The result table above only lists a subset of these instantiations.
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt8);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt8);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int8);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int8);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt16);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt16);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int16);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int16);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, UInt256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, UInt256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Int256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Int256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Float32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Float32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Float64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Float64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal32);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal128);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, Decimal256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, Decimal256);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant1, DateTime64);
 BENCHMARK_TEMPLATE(BM_fillConstantConstant3, DateTime64);

 /// Register the three non-template BM_fillVectorVector variants.
 BENCHMARK(BM_fillVectorVector1);
 BENCHMARK(BM_fillVectorVector2);
 BENCHMARK(BM_fillVectorVector3);


 /// Thin holder for a raw heap array of T used by the BitOrProcess benchmark.
 /// The array is allocated by initSlice() and released by finalizeSlice().
 template <typename T>
 struct slice
 {
     /// Start of the array. Defaults to null so that a slice which was never
     /// initialized can still be passed to finalizeSlice() safely.
     T * data = nullptr;
 };

 /// ORs the constant 0xaa (converted to T) into each of the first 65536 elements of d.data, in place.
 /// The caller must supply a buffer of at least 65536 elements.
 /// The buffer pointer is read through the `slice` reference inside the loop and the function is NO_INLINE.
 /// NOTE(review): presumably this shape is deliberate, so the benchmark can compare the code generated
 /// for char8_t / int8_t / uint8_t element types - confirm before hoisting d.data into a local.
 template <typename T>
 NO_INLINE auto BitOrProcess(slice<T> & d)
 {
     for (auto i = 0u; i < 65536; ++i)
         d.data[i] |= T(0xaa);
 }

 /// Allocates a new 65536-element array for `d` and fills every element with a value from std::rand().
 /// Whatever `d.data` pointed to before is overwritten without being released.
 template <typename T>
 void initSlice(slice<T> & d)
 {
     constexpr size_t element_count = 65536;

     T * const buffer = new T[element_count];
     for (size_t idx = 0; idx != element_count; ++idx)
         buffer[idx] = T(std::rand());

     d.data = buffer;
 }

 /// Releases the array owned by `d` and clears the pointer.
 /// Clearing makes a repeated call a no-op instead of a double free.
 template <typename T>
 void finalizeSlice(slice<T> & d)
 {
     delete[] d.data;
     d.data = nullptr;
 }

 /// Benchmarks BitOrProcess<T> over a 65536-element buffer that is allocated and filled once
 /// before the timed loop.
 template <typename T>
 void BM_BitOrProcess(benchmark::State & state)
 {
     slice<T> d;
     initSlice(d);

     for (auto _ : state)
     {
         BitOrProcess(d);
         benchmark::DoNotOptimize(d);
     }

     /// Release the buffer allocated by initSlice(); it was previously leaked on every benchmark run.
     finalizeSlice(d);
 }

 /// Run BM_BitOrProcess for three one-byte element types so their timings can be compared.
 BENCHMARK_TEMPLATE(BM_BitOrProcess, char8_t);
 BENCHMARK_TEMPLATE(BM_BitOrProcess, int8_t);
 BENCHMARK_TEMPLATE(BM_BitOrProcess, uint8_t);

 /// Writes the logical negation of each null-map entry into `res`: res[i] = !null_map[i] for i in [0, ROWS).
 /// Both arrays must already hold at least ROWS elements; `res` is written by index, never grown.
 /// The wrapping macro provides the variants isNotNull, isNotNullSSE42, isNotNullAVX2, isNotNullAVX512F
 /// and isNotNullAVX512BW that the benchmarks below call.
 MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
 MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), isNotNull, MULTITARGET_FUNCTION_BODY((const PaddedPODArray<UInt8> & null_map, PaddedPODArray<UInt8> & res) /// NOLINT
 {
     /// Bound the loop by ROWS, the size the benchmark driver allocates `res` with and the bound
     /// isNotNullTest uses, rather than by a separate hard-coded literal.
     for (size_t i = 0; i < ROWS; ++i)
         res[i] = !null_map[i];
 }))


 /// Defines and registers the benchmark BM_isNotNull<TARGET> for the function isNotNull<TARGET>.
 /// The null map is filled once by initCondition() before the loop. On every iteration a result
 /// array of ROWS elements is constructed and passed to the function, so the construction of that
 /// array is part of the measured time.
 #define BENCHMARK_ISNOTNULL_TEMPLATE(TARGET) \
 static void BM_isNotNull##TARGET(benchmark::State & state) \
 { \
     PaddedPODArray<UInt8> null_map; \
     initCondition(null_map); \
     for (auto _ : state) \
     { \
         PaddedPODArray<UInt8> res(ROWS); \
         isNotNull##TARGET(null_map, res); \
         benchmark::DoNotOptimize(res); \
     } \
 } \
 BENCHMARK(BM_isNotNull##TARGET);

 /// An empty argument selects the unsuffixed function; the others select the per-target variants.
 BENCHMARK_ISNOTNULL_TEMPLATE()
 BENCHMARK_ISNOTNULL_TEMPLATE(SSE42)
 BENCHMARK_ISNOTNULL_TEMPLATE(AVX2)
 BENCHMARK_ISNOTNULL_TEMPLATE(AVX512F)
 BENCHMARK_ISNOTNULL_TEMPLATE(AVX512BW)

 /// Same negation as isNotNull, but `res` is built by reserving ROWS elements and appending with
 /// push_back instead of writing by index. `null_map` must hold at least ROWS elements.
 /// NOTE(review): presumably kept as the push_back counterpart of isNotNull for comparison
 /// (see the result table below) - confirm.
 MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
 MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), isNotNullTest, MULTITARGET_FUNCTION_BODY((const PaddedPODArray<UInt8> & null_map, PaddedPODArray<UInt8> & res) /// NOLINT
 {
     res.reserve(ROWS);
     for (size_t i = 0; i < ROWS; ++i)
         res.push_back(!null_map[i]);
 }))


 /// Defines and registers the benchmark BM_isNotNullTest<TARGET> for the function isNotNullTest<TARGET>.
 /// The null map is filled once by initCondition() before the loop. On every iteration an empty
 /// result array is constructed and handed to the function, which reserves and fills it.
 #define BENCHMARK_ISNOTNULLTEST_TEMPLATE(TARGET) \
 static void BM_isNotNullTest##TARGET(benchmark::State & state) \
 { \
     PaddedPODArray<UInt8> null_map; \
     initCondition(null_map); \
     for (auto _ : state) \
     { \
         PaddedPODArray<UInt8> res; \
         isNotNullTest##TARGET(null_map, res); \
         benchmark::DoNotOptimize(res); \
     } \
 } \
 BENCHMARK(BM_isNotNullTest##TARGET);

 /// An empty argument selects the unsuffixed function; the others select the per-target variants.
 BENCHMARK_ISNOTNULLTEST_TEMPLATE()
 BENCHMARK_ISNOTNULLTEST_TEMPLATE(SSE42)
 BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX2)
 BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX512F)
 BENCHMARK_ISNOTNULLTEST_TEMPLATE(AVX512BW)

 /*
 Run on (32 X 2100 MHz CPU s)
 CPU Caches:
   L1 Data 32 KiB (x16)
   L1 Instruction 32 KiB (x16)
   L2 Unified 1024 KiB (x16)
   L3 Unified 11264 KiB (x2)
 Load Average: 4.47, 4.43, 4.74
 -------------------------------------------------------------------
 Benchmark                         Time             CPU   Iterations
 -------------------------------------------------------------------
 BM_isNotNull                   3854 ns         3854 ns       181463
 BM_isNotNullSSE42              3859 ns         3859 ns       181243
 BM_isNotNullAVX2               3037 ns         3037 ns       232898
 BM_isNotNullAVX512F            2859 ns         2859 ns       245235
 BM_isNotNullAVX512BW           2880 ns         2880 ns       244221
 BM_isNotNullTest              95141 ns        95139 ns         7342
 BM_isNotNullTestSSE42         95201 ns        95199 ns         7322
 BM_isNotNullTestAVX2          95107 ns        95105 ns         7362
 BM_isNotNullTestAVX512F       95151 ns        95147 ns         7370
 BM_isNotNullTestAVX512BW      95150 ns        95148 ns         7348
 */


 /// Non-inlined wrapper around IColumn::insertManyFrom: inserts the row at the middle of `src`
 /// (position src.size() / 2) into `dst`, repeated src.size() times.
 static NO_INLINE void insertManyFrom(IColumn & dst, const IColumn & src)
 {
     const size_t row_count = src.size();
     const size_t middle_row = row_count / 2;
     dst.insertManyFrom(src, middle_row, row_count);
 }

 /*
 static NO_INLINE void insertManyFromV1(IColumn & dst, const IColumn & src)
 {
     size_t size = src.size();
     ColumnNullable * dst_nullable = typeid_cast<ColumnNullable *>(&dst);
     ColumnVector<Int64> * dst_nested = typeid_cast<ColumnVector<Int64> *>(&dst_nullable->getNestedColumn());
     auto & dst_data = dst_nested->getData();
     auto & dst_null_map = dst_nullable->getNullMapData();

     auto src_field = src[size/2];
     if (src_field.isNull())
     {
         dst_data.resize_fill(size, 0);
         dst_null_map.resize_fill(size, 1);
     }
     else
     {
         auto src_value = src_field.get<Int64>();
         dst_data.resize_fill(size, src_value);
         dst_null_map.resize_fill(size, 0);
     }
 }
 */

 /// Benchmarks insertManyFrom() for the data type whose name is `str_type`.
 /// The source column (ROWS rows) is created once. For each iteration an empty destination column
 /// is created and reserved while timing is paused, so only the insertManyFrom() call is measured.
 template <const std::string & str_type>
 static void BM_insertManyFrom(benchmark::State & state)
 {
     const auto data_type = DataTypeFactory::instance().get(str_type);
     const auto source_column = createColumn(data_type, ROWS);

     for (auto _ : state)
     {
         state.PauseTiming();
         auto destination = data_type->createColumn();
         destination->reserve(ROWS);
         state.ResumeTiming();

         insertManyFrom(*destination, *source_column);
         benchmark::DoNotOptimize(destination);
     }
 }

 /// Data type names used as `const std::string &` template arguments by the column benchmarks
 /// in this file; each benchmark resolves its name through DataTypeFactory at run time.
 static const String type_int64 = "Int64";
 static const String type_nullable_int64 = "Nullable(Int64)";
 static const String type_string = "String";
 static const String type_nullable_string = "Nullable(String)";
 static const String type_decimal = "Decimal128(3)";
 static const String type_nullable_decimal = "Nullable(Decimal128(3))";

 /// Array-of-X counterparts of the Int64 and String names above.
 static const String type_array_int64 = "Array(Int64)";
 static const String type_array_nullable_int64 = "Array(Nullable(Int64))";
 static const String type_array_string = "Array(String)";
 static const String type_array_nullable_string = "Array(Nullable(String))";

 /// Register BM_insertManyFrom for the scalar types and their Nullable wrappers.
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_int64);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_int64);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_string);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_string);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_decimal);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_nullable_decimal);

 /// Register BM_insertManyFrom for the Array types.
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_int64);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_nullable_int64);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_string);
 BENCHMARK_TEMPLATE(BM_insertManyFrom, type_array_nullable_string);

 /// Benchmark result: https://github.com/ClickHouse/ClickHouse/pull/60846

 /// Benchmarks IColumn::replicate() for the data type whose name is `str_type`.
 /// The column (ROWS rows) and the offsets from createOffsets(ROWS) are built once, outside the timed loop.
 template <const std::string & str_type>
 static void BM_replicate(benchmark::State & state)
 {
     const auto data_type = DataTypeFactory::instance().get(str_type);
     const auto source_column = createColumn(data_type, ROWS);
     const auto replicate_offsets = createOffsets(ROWS);

     for (auto _ : state)
     {
         auto replicated = source_column->replicate(replicate_offsets);
         benchmark::DoNotOptimize(replicated);
     }
 }

 /// Register BM_replicate for String and Nullable(String) columns.
 BENCHMARK_TEMPLATE(BM_replicate, type_string);
 BENCHMARK_TEMPLATE(BM_replicate, type_nullable_string);


 /// Builds a filter of `rows` entries. Each entry is decided by one rand() call:
 /// 0 when the value is odd, 1 when it is even.
 IColumn::Filter mockFilter(size_t rows)
 {
     IColumn::Filter result(rows);
     for (size_t i = 0; i < rows; ++i)
         result[i] = rand() % 2 ? 0 : 1;
     /// Return the local by name: wrapping it in std::move would prevent copy elision (NRVO).
     return result;
 }

 /// Builds a UInt64 column holding, in increasing order, every row number in [0, rows)
 /// whose entry in a freshly generated mockFilter(rows) is non-zero.
 ColumnPtr mockIndexes(size_t rows)
 {
     auto filter = mockFilter(rows);
     auto result = ColumnUInt64::create();
     auto & data = result->getData();
     for (size_t i = 0; i < rows; ++i)
     {
         if (filter[i])
             data.push_back(i);
     }
     /// A returned local is already treated as an rvalue for the conversion to ColumnPtr,
     /// so an explicit std::move is redundant here.
     return result;
 }

 /// Benchmarks IColumn::filter() for the data type whose name is `str_type`.
 /// The source column (ROWS rows), the filter from mockFilter(ROWS) and the result-size hint from
 /// countBytesInFilter() are all prepared once, outside the timed loop.
 template <const std::string & str_type>
 static void BM_filter(benchmark::State & state)
 {
     const auto data_type = DataTypeFactory::instance().get(str_type);
     const auto source_column = createColumn(data_type, ROWS);
     const auto row_filter = mockFilter(ROWS);
     const auto result_size_hint = countBytesInFilter(row_filter);

     for (auto _ : state)
     {
         auto filtered = source_column->filter(row_filter, result_size_hint);
         benchmark::DoNotOptimize(filtered);
     }
 }

 /// Benchmarks IColumn::index() for the data type whose name is `str_type`.
 /// The source column (ROWS rows) and the index column from mockIndexes(ROWS) are prepared once;
 /// index() is called with a limit argument of 0.
 template <const std::string & str_type>
 static void BM_index(benchmark::State & state)
 {
     const auto data_type = DataTypeFactory::instance().get(str_type);
     const auto source_column = createColumn(data_type, ROWS);
     const auto row_indexes = mockIndexes(ROWS);

     for (auto _ : state)
     {
         auto picked = source_column->index(*row_indexes, 0);
         benchmark::DoNotOptimize(picked);
     }
 }


 /*
 template <const std::string & str_type>
 static void BM_filterInPlace(benchmark::State & state)
 {
     auto type = DataTypeFactory::instance().get(str_type);
     auto indexes_col = mockIndexes(ROWS);
     const auto & indexes = assert_cast<const ColumnUInt64 &>(*indexes_col).getData();
     for (auto _ : state)
     {
         state.PauseTiming();
         auto src = createColumn(type, ROWS);
         auto mutable_src = src->assumeMutable();
         state.ResumeTiming();

         mutable_src->filterInPlace(indexes, 0);
         benchmark::DoNotOptimize(src);
     }
 }
 */


 /*
 10%
 ---------------------------------------------------------------------------------
 Benchmark                                       Time             CPU   Iterations
 ---------------------------------------------------------------------------------
 BM_filter<type_int64>                       28485 ns        28484 ns        24242
 BM_index<type_int64>                         7925 ns         7925 ns        88317
 BM_filterInPlace<type_int64>                 8553 ns         8520 ns        84726
 BM_filter<type_nullable_int64>              64513 ns        64512 ns        10885
 BM_index<type_nullable_int64>               14209 ns        14209 ns        49289
 BM_filterInPlace<type_nullable_int64>       17854 ns        17706 ns        41712
 BM_filter<type_string>                      82993 ns        82990 ns         8313
 BM_index<type_string>                       35828 ns        35827 ns        19943
 BM_filterInPlace<type_string>               28748 ns        28621 ns        23922
 BM_filter<type_nullable_string>            115106 ns       115099 ns         6057
 BM_index<type_nullable_string>              42771 ns        42767 ns        17238
 BM_filterInPlace<type_nullable_string>      35743 ns        35625 ns


 99%
 ---------------------------------------------------------------------------------
 Benchmark                                       Time             CPU   Iterations
 ---------------------------------------------------------------------------------
 BM_filter<type_int64>                       81142 ns        81139 ns         8908
 BM_index<type_int64>                        75867 ns        75865 ns         9197
 BM_filterInPlace<type_int64>                57672 ns        57635 ns        12098
 BM_filter<type_nullable_int64>             150746 ns       150740 ns         4716
 BM_index<type_nullable_int64>              129113 ns       129108 ns         5440
 BM_filterInPlace<type_nullable_int64>      111840 ns       111763 ns         6190
 BM_filter<type_string>                     367415 ns       367092 ns         1917
 BM_index<type_string>                      261185 ns       261174 ns         2682
 BM_filterInPlace<type_string>              215903 ns       215809 ns         3180
 BM_filter<type_nullable_string>            435441 ns       435426 ns         1577
 BM_index<type_nullable_string>             313108 ns       313097 ns         2229
 BM_filterInPlace<type_nullable_string>     271534 ns       271424 ns
 */

 /// Register BM_filter and BM_index side by side for each column type.
 /// The BM_filterInPlace registrations stay disabled because that benchmark is commented out above.
 BENCHMARK_TEMPLATE(BM_filter, type_int64);
 BENCHMARK_TEMPLATE(BM_index, type_int64);
 // BENCHMARK_TEMPLATE(BM_filterInPlace, type_int64);

 BENCHMARK_TEMPLATE(BM_filter, type_nullable_int64);
 BENCHMARK_TEMPLATE(BM_index, type_nullable_int64);
 // BENCHMARK_TEMPLATE(BM_filterInPlace, type_nullable_int64);

 BENCHMARK_TEMPLATE(BM_filter, type_string);
 BENCHMARK_TEMPLATE(BM_index, type_string);
 // BENCHMARK_TEMPLATE(BM_filterInPlace, type_string);

 BENCHMARK_TEMPLATE(BM_filter, type_nullable_string);
 BENCHMARK_TEMPLATE(BM_index, type_nullable_string);
 // BENCHMARK_TEMPLATE(BM_filterInPlace, type_nullable_string);


 /// For a mask of the form [0]*[1]* (all set bits form one run ending at the least significant bit)
 /// returns the length of that run: 0 for an empty mask, 64 for an all-ones mask.
 /// For any other bit pattern returns the special value 0xFF.
 static inline uint8_t prefixToCopy(UInt64 mask)
 {
     constexpr UInt64 all_ones = static_cast<UInt64>(-1);

     if (mask == 0)
         return 0;
     if (mask == all_ones)
         return 64;

     /// The row with index 0 corresponds to the least significant bit,
     /// so a valid prefix has length 64 - #(leading zeroes).
     const UInt64 leading_zeroes = __builtin_clzll(mask);
     const UInt64 expected_prefix = (all_ones << leading_zeroes) >> leading_zeroes;
     if (mask != expected_prefix)
         return 0xFF;
     return 64 - leading_zeroes;
 }

 /// Applies prefixToCopy() to the inverted mask. A result of 64 or more (which includes the 0xFF
 /// special value) is returned unchanged; otherwise returns 64 minus that prefix length.
 static inline uint8_t suffixToCopy(UInt64 mask)
 {
     const auto inverted_prefix = prefixToCopy(~mask);
     if (inverted_prefix >= 64)
         return inverted_prefix;
     return 64 - inverted_prefix;
 }


 /// Returns `mask` with its lowest set bit cleared.
 /// Uses the _blsr_u64 intrinsic when __BMI__ is defined, and the equivalent bit trick otherwise.
 static inline UInt64 blsr(UInt64 mask)
 {
 #ifndef __BMI__
     const UInt64 below = mask - 1;
     return mask & below;
 #else
     return _blsr_u64(mask);
 #endif
 }

 DECLARE_DEFAULT_CODE(

 /// Scalar version: stores into `indices`, starting at indices[0] and in increasing order,
 /// the positions selected by filt[start, end).
 /// `indices` is only written through operator[], so the caller must size it beforehand.
 /// bytes64MaskToBits64Mask is defined elsewhere; presumably bit i of its result is set when
 /// filt[start + i] is non-zero - confirm.
 template <typename T>
 void myFilterToIndices(const UInt8 * filt, size_t start, size_t end, PaddedPODArray<T> & indices)
 {
     size_t pos = 0;
     /// Main loop: consume 64 filter bytes per step through one 64-bit mask.
     for (; start + 64 <= end; start += 64)
     {
         UInt64 mask = bytes64MaskToBits64Mask(filt + start);
         const uint8_t prefix_to_copy = prefixToCopy(mask);

         /// Fast path 1: the set bits form a run at the low end (or the mask is empty or full).
         if (0xFF != prefix_to_copy)
         {
             for (size_t i = 0; i < prefix_to_copy; ++i)
                 indices[pos++] = start + i;
         }
         else
         {
             /// Fast path 2: the set bits form a run at the high end.
             const uint8_t suffix_to_copy = suffixToCopy(mask);
             if (0xFF != suffix_to_copy)
             {
                 for (size_t i = 64 - suffix_to_copy; i < 64; ++i)
                     indices[pos++] = start + i;
             }
             else
             {
                 /// General case: emit the position of the lowest set bit, clear it, repeat.
                 while (mask)
                 {
                     size_t index = std::countr_zero(mask);
                     indices[pos++] = start + index;
                     mask = blsr(mask);
                 }
             }
         }
     }

     /// Tail: fewer than 64 bytes remain, test them one by one.
     for (; start != end; ++start)
         if (filt[start])
             indices[pos++] = start;
 }
 )

 DECLARE_AVX512F_SPECIFIC_CODE(

 /// AVX-512 version: stores into `indices`, starting at indices[0] and in increasing order,
 /// the positions selected by filt[start, end).
 /// Only T = UInt64 and T = UInt32 are implemented; for any other T the index vectors are never
 /// initialized and nothing is stored by the main loop.
 /// Every store in the main loop writes a full 64-byte register at &indices[pos], so the caller
 /// must size `indices` with spare room after the last selected position.
 /// bytes64MaskToBits64Mask is defined elsewhere; presumably bit i of its result is set when
 /// filt[start + i] is non-zero - confirm.
 template <typename T>
 void myFilterToIndices(const UInt8 * filt, size_t start, size_t end, PaddedPODArray<T> & indices)
 {
     /// A 512-bit register holds 64 / sizeof(T) indices, so one 64-bit filter mask is consumed
     /// in sizeof(T) steps of MASK_BITS_PER_LOOP bits each.
     static constexpr size_t LOOPS_PER_MASK = sizeof(T);
     static constexpr size_t MASK_BITS_PER_LOOP = 64 / LOOPS_PER_MASK;
     static constexpr UInt64 MASK_IN_LOOP = (1ULL << MASK_BITS_PER_LOOP) - 1;

     __m512i index_vec;
     __m512i increment_vec;
     if constexpr (std::is_same_v<T, UInt64>)
     {
         index_vec
             = _mm512_set_epi64(start + 7, start + 6, start + 5, start + 4, start + 3, start + 2, start + 1, start); // Initial index vector
         increment_vec = _mm512_set1_epi64(8); // Increment vector
     }
     else if constexpr (std::is_same_v<T, UInt32>)
     {
         index_vec = _mm512_set_epi32(
             start + 15,
             start + 14,
             start + 13,
             start + 12,
             start + 11,
             start + 10,
             start + 9,
             start + 8,
             start + 7,
             start + 6,
             start + 5,
             start + 4,
             start + 3,
             start + 2,
             start + 1,
             start); // Initial index vector
         /// The increment is added with _mm512_add_epi32, so it has to be broadcast per 32-bit lane.
         /// A 64-bit broadcast would put 16 in the even lanes and 0 in the odd ones.
         increment_vec = _mm512_set1_epi32(16); // Increment vector
     }
     /*
     else if constexpr (std::is_same_v<T, UInt16>)
     {
         index_vec = _mm512_set_epi16(
             start + 31,
             start + 30,
             start + 29,
             start + 28,
             start + 27,
             start + 26,
             start + 25,
             start + 24,
             start + 23,
             start + 22,
             start + 21,
             start + 20,
             start + 19,
             start + 18,
             start + 17,
             start + 16,
             start + 15,
             start + 14,
             start + 13,
             start + 12,
             start + 11,
             start + 10,
             start + 9,
             start + 8,
             start + 7,
             start + 6,
             start + 5,
             start + 4,
             start + 3,
             start + 2,
             start + 1,
             start); // Initial index vector
         increment_vec = _mm512_set1_epi16(32); // Increment vector
     }
     */

     size_t pos = 0;
     /// Main loop: consume 64 filter bytes per step through one 64-bit mask.
     for (; start + 64 <= end; start += 64)
     {
         UInt64 mask64 = bytes64MaskToBits64Mask(filt + start);

         for (size_t i = 0; i < LOOPS_PER_MASK; ++i)
         {
             /// Number of selected rows among the lanes handled in this step.
             auto offset = std::popcount(mask64 & MASK_IN_LOOP);
             if (offset)
             {
                 if constexpr (std::is_same_v<T, UInt64>)
                 {
                     __m512i compressed_indices = _mm512_maskz_compress_epi64(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                     _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                 }
                 else if constexpr (std::is_same_v<T, UInt32>)
                 {
                     __m512i compressed_indices = _mm512_maskz_compress_epi32(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                     _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                 }
                 /*
                 else if constexpr (std::is_same_v<T, UInt16>)
                 {
                     __m512i compressed_indices = _mm512_maskz_compress_epi16(mask64 & MASK_IN_LOOP, index_vec); // Compress indices
                     _mm512_storeu_si512(&indices[pos], compressed_indices); // Store compressed indices
                 }
                 */

                 /// Only the first `offset` stored lanes are kept; the rest get overwritten by later stores.
                 pos += offset;
             }


             if constexpr (std::is_same_v<T, UInt64>)
                 index_vec = _mm512_add_epi64(index_vec, increment_vec); // Increment the index vector
             else if constexpr (std::is_same_v<T, UInt32>)
                 index_vec = _mm512_add_epi32(index_vec, increment_vec); // Increment the index vector
             /*
             else if constexpr (std::is_same_v<T, UInt16>)
                 index_vec = _mm512_add_epi16(index_vec, increment_vec); // Increment the index vector
             */

             mask64 >>= MASK_BITS_PER_LOOP;
         }
     }

     /// Tail: fewer than 64 bytes remain, test them one by one.
     for (; start != end; ++start)
     {
         if (filt[start])
             indices[pos++] = start;
     }
 })

 /// Non-inlined driver for the AVX512F myFilterToIndices kernel.
 /// Sizes `indices` to the number of selected rows plus some spare elements, runs the kernel,
 /// then shrinks `indices` to the exact count. The spare elements give the kernel's full-width
 /// 64-byte stores room past the last selected position.
 /// Always returns 0: the starting offset is passed to the kernel by value and never changes here.
 template <typename T>
 static NO_INLINE size_t myFilterToIndicesAVX512(const IColumn::Filter & filt, PaddedPODArray<T> & indices)
 {
     static constexpr size_t PADDING_ELEMENTS = 64/sizeof(T) - 1;

     if (filt.empty())
         return 0;

     const size_t start = 0;
     const size_t end = filt.size();
     const auto * filter_data = filt.data();
     const size_t selected = countBytesInFilter(filter_data, start, end);

     indices.resize_exact(selected + PADDING_ELEMENTS);
     ::TargetSpecific::AVX512F::myFilterToIndices(filter_data, start, end, indices);
     indices.resize_exact(selected);
     return start;
 }

 /// Non-inlined driver for the Default (scalar) myFilterToIndices kernel.
 /// Sizes `indices` to exactly the number of selected rows and lets the kernel fill it.
 /// Always returns 0: the starting offset is passed to the kernel by value and never changes here.
 template <typename T>
 static NO_INLINE size_t myFilterToIndicesDefault(const IColumn::Filter & filt, PaddedPODArray<T> & indices)
 {
     if (filt.empty())
         return 0;

     const size_t start = 0;
     const size_t end = filt.size();
     const auto * filter_data = filt.data();
     const size_t selected = countBytesInFilter(filter_data, start, end);

     indices.resize_exact(selected);
     ::TargetSpecific::Default::myFilterToIndices(filter_data, start, end, indices);
     return start;
 }

 /// Benchmarks myFilterToIndicesDefault<T>. A new filter of ROWS entries is generated for every
 /// iteration while timing is paused; the construction of the output array is inside the timed part.
 template <typename T>
 static void BM_myFilterToIndicesDefault(benchmark::State & state)
 {
     for (auto _ : state)
     {
         state.PauseTiming();
         auto random_filter = mockFilter(ROWS);
         state.ResumeTiming();

         PaddedPODArray<T> selected_rows;
         auto returned_start = myFilterToIndicesDefault(random_filter, selected_rows);
         benchmark::DoNotOptimize(returned_start);
         benchmark::DoNotOptimize(selected_rows);
     }
 }

 /// Benchmarks myFilterToIndicesAVX512<T>. A new filter of ROWS entries is generated for every
 /// iteration while timing is paused; the construction of the output array is inside the timed part.
 template <typename T>
 static void BM_myFilterToIndicesAVX512(benchmark::State & state)
 {
     for (auto _ : state)
     {
         state.PauseTiming();
         auto random_filter = mockFilter(ROWS);
         state.ResumeTiming();

         PaddedPODArray<T> selected_rows;
         auto returned_start = myFilterToIndicesAVX512(random_filter, selected_rows);
         benchmark::DoNotOptimize(returned_start);
         benchmark::DoNotOptimize(selected_rows);
     }
 }

 /// Only the scalar UInt16 instantiation is currently enabled; the others are commented out below.
 BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt16);

 /*
 BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt32);
 BENCHMARK_TEMPLATE(BM_myFilterToIndicesDefault, UInt64);

 // BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt16);
 BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt32);
 BENCHMARK_TEMPLATE(BM_myFilterToIndicesAVX512, UInt64);
 */


 #endif