blob: f21ee62fd046fbffd932ec7f352ae55cff2a8a99 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined(__x86_64__)
#include <immintrin.h>
#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <base/Decimal.h>
#include <base/extended_types.h>
#include <benchmark/benchmark.h>
#include <Common/PODArray.h>
#include <Common/TargetSpecific.h>
using namespace DB;
/// Uses addOverflow method (if available) to avoid UB for sumWithOverflow()
///
/// Since NO_SANITIZE_UNDEFINED works only for the function itself, without
/// callers, and in case of non-POD type (i.e. Decimal) you have overwritten
/// operator+=(), which will have UB.
/// Generic accumulator step: plain `operator+=`.
/// NO_SANITIZE_UNDEFINED suppresses the UBSan report for signed-integer
/// overflow, which is the intended wrap-around behavior of sumWithOverflow().
template <typename T>
struct MyAdd
{
    static void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(T & lhs, const T & rhs) { lhs += rhs; }
};
/// Decimal specialization: uses addOverflow() instead of `operator+=`,
/// because Decimal's overloaded `operator+=` is defined elsewhere and is not
/// covered by this function's NO_SANITIZE_UNDEFINED (the attribute does not
/// propagate into callees), so it would still trip UBSan on overflow.
template <typename DecimalNativeType>
struct MyAdd<Decimal<DecimalNativeType>>
{
    static void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(Decimal<DecimalNativeType> & lhs, const Decimal<DecimalNativeType> & rhs)
    {
        lhs.addOverflow(rhs);
    }
};
// _Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,bmi2\"))),apply_to=function)")
/// Benchmark fixture mirroring the accumulator used by sum()/sumWithOverflow():
/// `sum` accumulates the elements of a column whose condition byte matches
/// `add_if_zero`. Two implementations are compared: the baseline
/// (addManyConditionalInternal) and a mask-based variant for big integers /
/// big decimals (addManyConditionalInternalNew).
template <typename T>
struct MySumData
{
    using Impl = MyAdd<T>;
    /// Running total; updated through Impl::add to keep overflow UB suppressed.
    T sum{};

    /// Baseline implementation, compiled for several instruction sets by the
    /// MULTITARGET_* macros:
    ///  - narrow integers / narrow decimals: branchless multiply by 0/1;
    ///  - floats: reinterpret as an integer and AND with a 0 / all-ones mask (16x unrolled);
    ///  - everything else (Int128/256, Decimal128/256): scalar conditional tail loop.
    /// NOTE(review): the float branch deliberately does NOT return — it falls
    /// through to the tail loop to process the non-unrolled remainder.
    MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
        MULTITARGET_FUNCTION_HEADER(template <typename Value, bool add_if_zero> void NO_SANITIZE_UNDEFINED NO_INLINE),
        addManyConditionalInternalImpl,
        MULTITARGET_FUNCTION_BODY((
            const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
        {
            ptr += start;
            condition_map += start;
            size_t count = end - start;
            const auto * end_ptr = ptr + count;

            if constexpr (
                (is_integer<T> && !is_big_int_v<T>)
                || (is_decimal<T> && !std::is_same_v<T, Decimal256> && !std::is_same_v<T, Decimal128>))
            {
                /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
                /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
                T local_sum{};
                while (ptr < end_ptr)
                {
                    T multiplier = !*condition_map == add_if_zero;
                    Impl::add(local_sum, *ptr * multiplier);
                    ++ptr;
                    ++condition_map;
                }
                Impl::add(sum, local_sum);
                return;
            }

            if constexpr (std::is_floating_point_v<T>)
            {
                /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
                /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
                static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
                using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

                constexpr size_t unroll_count = 128 / sizeof(T);
                T partial_sums[unroll_count]{};

                const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

                while (ptr < unrolled_end)
                {
                    for (size_t i = 0; i < unroll_count; ++i)
                    {
                        equivalent_integer value;
                        std::memcpy(&value, &ptr[i], sizeof(Value));
                        /// (!cond != add_if_zero) is 1 to keep / 0 to drop; subtracting 1 yields the 0 / all-ones mask.
                        value &= (!condition_map[i] != add_if_zero) - 1;
                        Value d;
                        std::memcpy(&d, &value, sizeof(Value));
                        Impl::add(partial_sums[i], d);
                    }
                    ptr += unroll_count;
                    condition_map += unroll_count;
                }

                for (size_t i = 0; i < unroll_count; ++i)
                    Impl::add(sum, partial_sums[i]);
            }

            /// Tail loop: remainder of the float path plus the generic (big int / big decimal) case.
            T local_sum{};
            while (ptr < end_ptr)
            {
                if (!*condition_map == add_if_zero)
                    Impl::add(local_sum, *ptr);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
        }))

    /// Vectorized version
    /// Dispatcher: picks the best multitarget-compiled variant supported by the host CPU.
    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternal(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
#if USE_MULTITARGET_CODE
        if (isArchSupported(TargetArch::AVX512BW))
        {
            addManyConditionalInternalImplAVX512BW<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX512F))
        {
            addManyConditionalInternalImplAVX512F<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX2))
        {
            addManyConditionalInternalImplAVX2<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::SSE42))
        {
            addManyConditionalInternalImplSSE42<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }
#endif
        addManyConditionalInternalImpl<Value, add_if_zero>(ptr, condition_map, start, end);
    }

    /// "New" implementation under test: replaces the conditional tail loop for
    /// big integers and big decimals with a branchless AND against a
    /// sign-extended 0 / -1 mask (see the benchmark table below for the speedup).
    MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
        MULTITARGET_FUNCTION_HEADER(template <typename Value, bool add_if_zero> void NO_SANITIZE_UNDEFINED NO_INLINE),
        addManyConditionalInternalImplNew,
        MULTITARGET_FUNCTION_BODY((
            const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
        {
            ptr += start;
            condition_map += start;
            size_t count = end - start;
            const auto * end_ptr = ptr + count;

            if constexpr ((is_integer<T> || is_decimal<T>)&&!is_over_big_int<T>)
            {
                /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
                /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
                T local_sum{};
                while (ptr < end_ptr)
                {
                    T multiplier = !*condition_map == add_if_zero;
                    Impl::add(local_sum, *ptr * multiplier);
                    ++ptr;
                    ++condition_map;
                }
                Impl::add(sum, local_sum);
                return;
            }
            else if constexpr (is_integer<T>)
            {
                /// Big integers (128/256 bit): AND with 0 or -1 (sign-extended to full width).
                T local_sum{};
                /// NOTE(review): both conditional_t branches are Int8, so this is
                /// effectively `using MaskType = Int8;` — possibly meant to differ by size.
                using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int8>;
                alignas(64) const MaskType masks[2] = {0, -1};
                while (ptr < end_ptr)
                {
                    Value v = *ptr;
                    if constexpr (!add_if_zero)
                        v &= masks[!!*condition_map];
                    else
                        v &= masks[!*condition_map];
                    Impl::add(local_sum, v);
                    ++ptr;
                    ++condition_map;
                }
                Impl::add(sum, local_sum);
                return;
            }
            else if constexpr (is_decimal<T>)
            {
                /// Big decimals: same masking trick applied to the underlying .value.
                T local_sum{};
                /// NOTE(review): same no-op conditional_t as in the integer branch above.
                using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int8>;
                alignas(64) const MaskType masks[2] = {0, -1};
                while (ptr < end_ptr)
                {
                    Value v = *ptr;
                    if constexpr (!add_if_zero)
                        v.value &= masks[!!*condition_map];
                    else
                        v.value &= masks[!*condition_map];
                    Impl::add(local_sum, v);
                    ++ptr;
                    ++condition_map;
                }
                Impl::add(sum, local_sum);
                return;
            }
            else if constexpr (std::is_floating_point_v<T>)
            {
                /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
                /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
                static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
                using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

                constexpr size_t unroll_count = 128 / sizeof(T);
                T partial_sums[unroll_count]{};

                const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

                while (ptr < unrolled_end)
                {
                    for (size_t i = 0; i < unroll_count; ++i)
                    {
                        equivalent_integer value;
                        std::memcpy(&value, &ptr[i], sizeof(Value));
                        value &= (!condition_map[i] != add_if_zero) - 1;
                        Value d;
                        std::memcpy(&d, &value, sizeof(Value));
                        Impl::add(partial_sums[i], d);
                    }
                    ptr += unroll_count;
                    condition_map += unroll_count;
                }

                for (size_t i = 0; i < unroll_count; ++i)
                    Impl::add(sum, partial_sums[i]);
            }

            /// Tail loop (float remainder); note it uses a branchless ternary here.
            T local_sum{};
            while (ptr < end_ptr)
            {
                Impl::add(local_sum, !*condition_map == add_if_zero ? *ptr : T{});
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
        }))

    /// Vectorized version
    /// Dispatcher for the "New" implementation; mirrors addManyConditionalInternal.
    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternalNew(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
#if USE_MULTITARGET_CODE
        if (isArchSupported(TargetArch::AVX512BW))
        {
            addManyConditionalInternalImplNewAVX512BW<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX512F))
        {
            addManyConditionalInternalImplNewAVX512F<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX2))
        {
            addManyConditionalInternalImplNewAVX2<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::SSE42))
        {
            addManyConditionalInternalImplNewSSE42<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }
#endif
        addManyConditionalInternalImplNew<Value, add_if_zero>(ptr, condition_map, start, end);
    }

    /// Abandoned hand-written intrinsics experiment, kept (disabled) for reference.
    /*
    template <typename Value, bool add_if_zero>
    void NO_SANITIZE_UNDEFINED NO_INLINE addManyConditionalInternalImplSIMD(
        const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
    {
        ptr += start;
        condition_map += start;
        size_t count = end - start;
        const auto * end_ptr = ptr + count;

        if constexpr ((is_integer<T> || is_decimal<T>)&&!is_over_big_int<T>)
        {
            /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
            /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
            T local_sum{};
            while (ptr < end_ptr)
            {
                T multiplier = !*condition_map == add_if_zero;
                Impl::add(local_sum, *ptr * multiplier);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (is_integer<T>)
        {
            T local_sum{};
            using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
            alignas(64) const MaskType masks[2] = {0, -1};
            while (ptr < end_ptr)
            {
                T value = *ptr;
                if constexpr (sizeof(T) == 16)
                {
                    __m128i v = _mm_loadu_si128((__m128i *)&value);
                    __m128i c = _mm_set1_epi8(!*condition_map == add_if_zero);
                    __m128i r = _mm_and_si128(v, c);
                    _mm_storeu_si128((__m128i *)&value, r);
                }
                else
                {
                    __m256i v = _mm256_loadu_si256((__m256i *)&value);
                    __m256i c = _mm256_set1_epi8(!*condition_map == add_if_zero);
                    __m256i r = _mm256_and_si256(v, c);
                    _mm256_storeu_si256((__m256i *)&value, r);
                }
                Impl::add(local_sum, value);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (is_decimal<T>)
        {
            T local_sum{};
            using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
            alignas(64) const MaskType masks[2] = {0, -1};
            while (ptr < end_ptr)
            {
                Value v = *ptr;
                if constexpr (!add_if_zero)
                    v.value &= masks[*condition_map];
                else
                    v.value &= masks[!*condition_map];
                Impl::add(local_sum, v);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (std::is_floating_point_v<T>)
        {
            /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
            /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
            static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
            using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

            constexpr size_t unroll_count = 128 / sizeof(T);
            T partial_sums[unroll_count]{};

            const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

            while (ptr < unrolled_end)
            {
                for (size_t i = 0; i < unroll_count; ++i)
                {
                    equivalent_integer value;
                    std::memcpy(&value, &ptr[i], sizeof(Value));
                    value &= (!condition_map[i] != add_if_zero) - 1;
                    Value d;
                    std::memcpy(&d, &value, sizeof(Value));
                    Impl::add(partial_sums[i], d);
                }
                ptr += unroll_count;
                condition_map += unroll_count;
            }

            for (size_t i = 0; i < unroll_count; ++i)
                Impl::add(sum, partial_sums[i]);
        }

        T local_sum{};
        while (ptr < end_ptr)
        {
            Impl::add(local_sum, !*condition_map == add_if_zero ? *ptr : T{});
            ++ptr;
            ++condition_map;
        }
        Impl::add(sum, local_sum);
    }

    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternalSIMD(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
        addManyConditionalInternalImplSIMD<Value, add_if_zero>(ptr, condition_map, start, end);
    }
    */
};
// _Pragma("clang attribute pop")
/// Number of elements processed per iteration in every conditional-sum benchmark.
static constexpr size_t ROWS = 65536;
/// Fills `cond` with ROWS pseudo-random 0/1 flags acting as the condition map.
static void initCondition(PaddedPODArray<UInt8> & cond)
{
    cond.resize(ROWS);
    for (auto & flag : cond)
        flag = std::rand() % 2;
}
/// Fills `data` with ROWS pseudo-random values converted to type T.
template <typename T>
static void initColumn(PaddedPODArray<T> & data)
{
    data.resize(ROWS);
    for (auto & elem : data)
        elem = static_cast<T>(std::rand());
}
/// Benchmarks the baseline conditional sum (addManyConditionalInternal)
/// over ROWS elements of type T, skipping rows whose condition byte is zero.
template <typename T>
static void BM_SumWithCondition(benchmark::State & state)
{
    PaddedPODArray<T> values;
    initColumn(values);

    PaddedPODArray<UInt8> condition;
    initCondition(condition);

    for (auto _ : state)
    {
        MySumData<T> accumulator;
        accumulator.template addManyConditionalInternal<T, false>(values.data(), condition.data(), 0, ROWS);
        benchmark::DoNotOptimize(accumulator);
    }
}
/// Benchmarks the mask-based conditional sum (addManyConditionalInternalNew)
/// over ROWS elements of type T, skipping rows whose condition byte is zero.
template <typename T>
static void BM_SumWithConditionNew(benchmark::State & state)
{
    PaddedPODArray<T> values;
    initColumn(values);

    PaddedPODArray<UInt8> condition;
    initCondition(condition);

    for (auto _ : state)
    {
        MySumData<T> accumulator;
        accumulator.template addManyConditionalInternalNew<T, false>(values.data(), condition.data(), 0, ROWS);
        benchmark::DoNotOptimize(accumulator);
    }
}
/*
template <typename T>
static void BM_SumWithConditionSIMD(benchmark::State & state)
{
PaddedPODArray<T> data;
initColumn(data);
PaddedPODArray<UInt8> cond;
initCondition(cond);
for (auto _ : state)
{
MySumData<T> sum_data;
sum_data.template addManyConditionalInternalSIMD<T, false>(data.data(), cond.data(), 0, ROWS);
benchmark::DoNotOptimize(sum_data);
}
}
*/
/// Register old-vs-new conditional-sum benchmarks for each element type
/// (results recorded in the comment block below).
BENCHMARK_TEMPLATE(BM_SumWithCondition, Int64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Float64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Float64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Int128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int128);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, Int128);
BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt128);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, UInt128);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Int256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int256);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, Int256);
BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt256);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, UInt256);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal32);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal32);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal128);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal256);
/*
Run on (32 X 2100 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x16)
L1 Instruction 32 KiB (x16)
L2 Unified 1024 KiB (x16)
L3 Unified 11264 KiB (x2)
Load Average: 7.56, 5.47, 5.16
-----------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------
BM_SumWithCondition<Int64> 8930 ns 8929 ns 77846
BM_SumWithConditionNew<Int64> 8768 ns 8767 ns 80325
BM_SumWithCondition<UInt64> 8816 ns 8816 ns 80369
BM_SumWithConditionNew<UInt64> 8725 ns 8724 ns 80186
BM_SumWithCondition<Float64> 10229 ns 10228 ns 68275
BM_SumWithConditionNew<Float64> 10213 ns 10212 ns 68308
BM_SumWithCondition<Int128> 444262 ns 444247 ns 1575
BM_SumWithConditionNew<Int128> 87837 ns 87834 ns 7991
BM_SumWithCondition<UInt128> 433537 ns 433518 ns 1615
BM_SumWithConditionNew<UInt128> 88010 ns 88008 ns 7955
BM_SumWithCondition<Int256> 659032 ns 658995 ns 1048
BM_SumWithConditionNew<Int256> 189202 ns 189195 ns 3713
BM_SumWithCondition<UInt256> 479715 ns 479695 ns 1457
BM_SumWithConditionNew<UInt256> 198451 ns 198447 ns 3696
BM_SumWithCondition<Decimal32> 4662 ns 4662 ns 150015
BM_SumWithConditionNew<Decimal32> 4670 ns 4669 ns 149746
BM_SumWithCondition<Decimal64> 8742 ns 8742 ns 80315
BM_SumWithConditionNew<Decimal64> 8943 ns 8943 ns 76422
BM_SumWithCondition<Decimal128> 445999 ns 445990 ns 1550
BM_SumWithConditionNew<Decimal128> 88954 ns 88952 ns 8002
BM_SumWithCondition<Decimal256> 515128 ns 515111 ns 1371
BM_SumWithConditionNew<Decimal256> 223425 ns 223420 ns 3184
*/
/// "New" = Clang's native _BitInt(N) extended integers; "Old" = ClickHouse's
/// library-implemented wide types (Int128/UInt128/Int256/UInt256), compared
/// in the arithmetic benchmarks below.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wbit-int-extension"
using NewInt128 = signed _BitInt(128);
using NewUInt128 = unsigned _BitInt(128);
using NewInt256 = signed _BitInt(256);
using NewUInt256 = unsigned _BitInt(256);
#pragma clang diagnostic pop
using OldInt128 = Int128;
using OldUInt128 = UInt128;
using OldInt256 = Int256;
using OldUInt256 = UInt256;
/// Produces a value of T with every byte of its representation drawn from
/// std::rand(). Works for any trivially-copyable type (builtin, _BitInt, wide_integer).
template <typename T>
static T generateRandomValue()
{
    T value;
    auto * byte = reinterpret_cast<uint8_t *>(&value);
    const auto * byte_end = byte + sizeof(T);
    while (byte != byte_end)
        *byte++ = static_cast<uint8_t>(std::rand() % 256);
    return value;
}
/// Measures a single `operator+` on two randomly-initialized operands of type T.
template <typename T>
static void BM_Addition(benchmark::State & state)
{
    const T lhs = generateRandomValue<T>();
    const T rhs = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = lhs + rhs;
        benchmark::DoNotOptimize(&result);
    }
}
/// Measures a single `operator-` on two randomly-initialized operands of type T.
template <typename T>
static void BM_Subtraction(benchmark::State & state)
{
    const T lhs = generateRandomValue<T>();
    const T rhs = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = lhs - rhs;
        benchmark::DoNotOptimize(&result);
    }
}
/// Measures a single `operator*` on two randomly-initialized operands of type T.
template <typename T>
static void BM_Multiplication(benchmark::State & state)
{
    const T lhs = generateRandomValue<T>();
    const T rhs = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = lhs * rhs;
        benchmark::DoNotOptimize(&result);
    }
}
/// Measures a single `operator/` on randomly-initialized operands of type T.
///
/// Fix: the previous divisor `generateRandomValue<T>() + 1` did NOT guarantee
/// a non-zero value — a random value of -1 (signed T) or the maximum value
/// (unsigned T, where +1 wraps to 0) still produced a zero divisor and
/// division-by-zero UB. Check explicitly instead.
template <typename T>
static void BM_Division(benchmark::State & state)
{
    T a = generateRandomValue<T>();
    T b = generateRandomValue<T>();
    if (b == T{})
        b = T{1}; // Avoid division by zero
    for (auto _ : state)
    {
        T result = a / b;
        benchmark::DoNotOptimize(&result);
    }
}
/// Register arithmetic benchmarks comparing the library wide types (Old*)
/// with Clang _BitInt (New*) — results recorded in the comment blocks below.
BENCHMARK_TEMPLATE(BM_Addition, OldInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, OldInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, OldInt128);
BENCHMARK_TEMPLATE(BM_Division, OldInt128);
BENCHMARK_TEMPLATE(BM_Addition, NewInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, NewInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, NewInt128);
BENCHMARK_TEMPLATE(BM_Division, NewInt128);
BENCHMARK_TEMPLATE(BM_Addition, OldUInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, OldUInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, OldUInt128);
BENCHMARK_TEMPLATE(BM_Division, OldUInt128);
BENCHMARK_TEMPLATE(BM_Addition, NewUInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, NewUInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, NewUInt128);
BENCHMARK_TEMPLATE(BM_Division, NewUInt128);
BENCHMARK_TEMPLATE(BM_Addition, OldInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, OldInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, OldInt256);
BENCHMARK_TEMPLATE(BM_Division, OldInt256);
BENCHMARK_TEMPLATE(BM_Addition, NewInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, NewInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, NewInt256);
BENCHMARK_TEMPLATE(BM_Division, NewInt256);
BENCHMARK_TEMPLATE(BM_Addition, OldUInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, OldUInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, OldUInt256);
BENCHMARK_TEMPLATE(BM_Division, OldUInt256);
BENCHMARK_TEMPLATE(BM_Addition, NewUInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, NewUInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, NewUInt256);
BENCHMARK_TEMPLATE(BM_Division, NewUInt256);
/*
Running ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine
Run on (32 X 2100 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x16)
L1 Instruction 32 KiB (x16)
L2 Unified 1024 KiB (x16)
L3 Unified 11264 KiB (x2)
Load Average: 4.79, 5.12, 5.49
./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="(Addition|Subtraction|Multiplication|Division)<New.*>"
------------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------------
BM_Addition<NewInt128> 1.43 ns 1.43 ns 488198747
BM_Subtraction<NewInt128> 1.51 ns 1.51 ns 486720421
BM_Multiplication<NewInt128> 1.52 ns 1.52 ns 450071487
BM_Division<NewInt128> 1.48 ns 1.48 ns 471973890
BM_Addition<NewUInt128> 1.46 ns 1.46 ns 480687874
BM_Subtraction<NewUInt128> 1.46 ns 1.46 ns 488204076
BM_Multiplication<NewUInt128> 1.45 ns 1.45 ns 468576127
BM_Division<NewUInt128> 1.48 ns 1.48 ns 477379447
BM_Addition<NewInt256> 2.49 ns 2.48 ns 291377319
BM_Subtraction<NewInt256> 2.52 ns 2.52 ns 284595240
BM_Multiplication<NewInt256> 2.48 ns 2.48 ns 276363723
BM_Division<NewInt256> 2.44 ns 2.44 ns 286877215
BM_Addition<NewUInt256> 2.53 ns 2.53 ns 266497385
BM_Subtraction<NewUInt256> 2.48 ns 2.48 ns 287899525
BM_Multiplication<NewUInt256> 2.45 ns 2.45 ns 287882140
BM_Division<NewUInt256> 2.47 ns 2.47 ns 288479037
*/
/*
./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="(Addition|Subtraction|Multiplication|Division)<Old.*>"
------------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------------
BM_Addition<OldInt128> 1.45 ns 1.45 ns 484711423
BM_Subtraction<OldInt128> 1.45 ns 1.45 ns 475188736
BM_Multiplication<OldInt128> 1.47 ns 1.47 ns 483199322
BM_Division<OldInt128> 1.49 ns 1.49 ns 488830649
BM_Addition<OldUInt128> 1.45 ns 1.45 ns 487019006
BM_Subtraction<OldUInt128> 1.45 ns 1.45 ns 477626299
BM_Multiplication<OldUInt128> 1.47 ns 1.47 ns 475294481
BM_Division<OldUInt128> 1.48 ns 1.48 ns 461236815
BM_Addition<OldInt256> 4.39 ns 4.39 ns 159221253
BM_Subtraction<OldInt256> 5.01 ns 5.01 ns 100000000
BM_Multiplication<OldInt256> 11.3 ns 11.3 ns 54204439
BM_Division<OldInt256> 48.5 ns 48.5 ns 18505649
BM_Addition<OldUInt256> 4.37 ns 4.37 ns 180812154
BM_Subtraction<OldUInt256> 5.41 ns 5.41 ns 133516077
BM_Multiplication<OldUInt256> 2.47 ns 2.47 ns 286377591
BM_Division<OldUInt256> 21.8 ns 21.8 ns 25876643
*/
#endif