| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include <iomanip> |
| #include <iostream> |
| #include <sstream> |
| |
| #include "util/benchmark.h" |
| #include "util/cpu-info.h" |
| |
| #include "runtime/multi-precision.h" |
| #include "common/names.h" |
| |
| // Benchmark to measure operations on different implementation of multi (i.e. > 8) |
| // byte integers. |
| // Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz |
| // Add: Function Rate (iters/ms) Comparison |
| // ---------------------------------------------------------------------- |
| // int128_CPP 1.87e+04 1X |
| // int128_Boost 2842 0.1519X |
| // int96_CPP 1289 0.06891X |
| // int64 2.381e+04 1.273X |
| // int128_Base1B 1.08e+04 0.5775X |
| // doubles 2.559e+04 1.368X |
| // |
| // Multiply: Function Rate (iters/ms) Comparison |
| // ---------------------------------------------------------------------- |
| // int128_CPP 1.263e+04 1X |
| // int128_Boost 4418 0.3497X |
| // int96_CPP 1220 0.09656X |
| // int64 2.913e+04 2.306X |
| // doubles 2.282e+04 1.806X |
| // |
| // Divide: Function Rate (iters/ms) Comparison |
| // ---------------------------------------------------------------------- |
| // int128_CPP 1696 1X |
| // int128_Boost 1112 0.6558X |
| // int96_CPP 561.7 0.3313X |
| // int64 2788 1.644X |
| // double 4565 2.692X |
| |
| |
| using std::max; |
| using std::numeric_limits; |
| using namespace impala; |
| |
| // Multi byte ints encoded using a big base. |
| // The decimal value is: |
| // data[0] + data[1] * BASE^1 + data[2] * BASE^2 + ... |
| // This is not a full implementation of the functionality required. |
| template<uint32_t BASE> |
| struct BaseInt128 { |
| uint32_t data[4]; // least significant first. |
| |
| BaseInt128(uint64_t v = 0) { |
| memset(data, 0, sizeof(data)); |
| int idx = 0; |
| while (v > 0) { |
| data[idx++] = v % BASE; |
| v /= BASE; |
| } |
| } |
| |
| BaseInt128& operator+=(const BaseInt128& rhs) { |
| uint64_t r0 = data[0] + rhs.data[0]; |
| uint64_t r1 = data[1] + rhs.data[1]; |
| uint64_t r2 = data[2] + rhs.data[2]; |
| uint64_t r3 = data[3] + rhs.data[3]; |
| |
| this->data[0] = static_cast<uint32_t>(r0); |
| if (r0 > numeric_limits<uint32_t>::max()) ++r1; |
| this->data[1] = static_cast<uint32_t>(r1); |
| if (r1 > numeric_limits<uint32_t>::max()) ++r2; |
| this->data[2] = static_cast<uint32_t>(r2); |
| if (r2 > numeric_limits<uint32_t>::max()) ++r3; |
| this->data[3] = static_cast<uint32_t>(r3); |
| return *this; |
| } |
| |
| string Print() const { |
| stringstream ss; |
| bool print_padded = false; |
| for (int i = 3; i >= 0; --i) { |
| if (data[i] == 0 && !print_padded) continue; |
| if (print_padded) { |
| ss << setw(9) << data[i]; |
| } else { |
| ss << data[i]; |
| } |
| } |
| ss << data[0]; |
| return ss.str(); |
| } |
| }; |
| |
| // Prototype for int96 by doing the arithmetic as casts to int128. Minimal |
| // implementation to run benchmark. |
| struct int96 { |
| public: |
| int96() { |
| } |
| |
| int96(__int128_t t) { |
| memcpy(data, reinterpret_cast<uint8_t*>(&t), 12); |
| } |
| |
| __int128_t to_int128() const { |
| __int128_t r; |
| // Fill in upper bits and bit shift down to sign extend |
| memcpy(reinterpret_cast<uint8_t*>(&r) + 4, data, 12); |
| r >>= 4; |
| return r; |
| } |
| |
| int96& operator+=(const int96& v) { |
| __int128_t x = to_int128(); |
| __int128_t y = v.to_int128(); |
| __int128_t r = x + y; |
| *this = int96(r); |
| return *this; |
| } |
| |
| int96& operator*=(const int96& v) { |
| __int128_t x = to_int128(); |
| __int128_t y = v.to_int128(); |
| __int128_t r = x * y; |
| *this = int96(r); |
| return *this; |
| } |
| |
| int96 operator/(const int96& v) const { |
| __int128_t x = to_int128(); |
| __int128_t y = v.to_int128(); |
| __int128_t r = x / y; |
| return int96(r); |
| } |
| |
| int96 operator-() const { |
| return int96(-to_int128()); |
| } |
| |
| private: |
| // First (little endian) 12 bytes of int128_t |
| uint8_t data[12]; |
| }; |
| |
| typedef BaseInt128<1000000000> Base1BInt128; |
| |
| struct TestData { |
| // multi precision ints implemented with the boost library. |
| vector<boost::multiprecision::int128_t> boost_add_ints; |
| vector<boost::multiprecision::int128_t> boost_mult_ints; |
| boost::multiprecision::int128_t boost_result; |
| |
| // multi precision ints as defined by the c++ extension |
| vector<__int128_t> cpp_add_ints; |
| vector<__int128_t> cpp_mult_ints; |
| __int128_t cpp_result; |
| |
| vector<int96> cpp96_add_ints; |
| vector<int96> cpp96_mult_ints; |
| int96 cpp96_result; |
| |
| vector<int64_t> int64_ints; |
| int64_t int64_result; |
| |
| vector<double> doubles; |
| double double_result; |
| |
| vector<Base1BInt128> base1b_ints; |
| Base1BInt128 base1b_result; |
| }; |
| |
| // Initialize test data. 1/4 will be negative. 1/2 will require more than 8 bytes. |
| void InitTestData(TestData* data, int n) { |
| for (int i = 0; i < n; ++i) { |
| data->boost_add_ints.push_back(boost::multiprecision::int128_t(i + 1)); |
| data->boost_mult_ints.push_back(boost::multiprecision::int128_t(i + 1)); |
| data->cpp_add_ints.push_back(__int128_t(i + 1)); |
| data->cpp_mult_ints.push_back(__int128_t(i + 1)); |
| data->cpp96_add_ints.push_back(__int128_t(i + 1)); |
| data->cpp96_mult_ints.push_back(__int128_t(i + 1)); |
| |
| if (i % 2 == 0) { |
| data->boost_add_ints[i] *= numeric_limits<int64_t>::max(); |
| data->cpp_add_ints[i] *= numeric_limits<int64_t>::max(); |
| data->cpp96_add_ints[i] *= numeric_limits<int64_t>::max(); |
| } |
| if (i % 4 == 0) { |
| data->boost_add_ints[i] = -data->boost_add_ints[i]; |
| data->cpp_add_ints[i] = -data->cpp_add_ints[i]; |
| data->cpp96_add_ints[i] = -data->cpp96_add_ints[i]; |
| |
| data->boost_mult_ints[i] = -data->boost_mult_ints[i]; |
| data->cpp_mult_ints[i] = -data->cpp_mult_ints[i]; |
| data->cpp96_mult_ints[i] = -data->cpp96_mult_ints[i]; |
| } |
| |
| data->int64_ints.push_back(i + 1); |
| data->base1b_ints.push_back(i + 1); |
| data->doubles.push_back(i + 1); |
| } |
| } |
| |
| #define TEST_ADD(NAME, RESULT, VALS)\ |
| void NAME(int batch_size, void* d) {\ |
| TestData* data = reinterpret_cast<TestData*>(d);\ |
| for (int i = 0; i < batch_size; ++i) {\ |
| data->RESULT = 0;\ |
| for (int j = 0; j < data->VALS.size(); ++j) {\ |
| data->RESULT += data->VALS[j];\ |
| }\ |
| }\ |
| } |
| |
| #define TEST_MULTIPLY(NAME, RESULT, VALS)\ |
| void NAME(int batch_size, void* d) {\ |
| TestData* data = reinterpret_cast<TestData*>(d);\ |
| for (int i = 0; i < batch_size; ++i) {\ |
| data->RESULT = 1;\ |
| for (int j = 0; j < data->VALS.size(); ++j) {\ |
| data->RESULT *= data->VALS[j];\ |
| }\ |
| }\ |
| } |
| |
| #define TEST_DIVIDE(NAME, RESULT, VALS)\ |
| void NAME(int batch_size, void* d) {\ |
| TestData* data = reinterpret_cast<TestData*>(d);\ |
| for (int i = 0; i < batch_size; ++i) {\ |
| data->RESULT = 0;\ |
| for (int j = 0; j < data->VALS.size() - 1; ++j) {\ |
| data->RESULT += data->VALS[j + 1] / data->VALS[j];\ |
| }\ |
| }\ |
| } |
| |
| TEST_ADD(TestBoostAdd, boost_result, boost_add_ints); |
| TEST_ADD(TestCppAdd, cpp_result, cpp_add_ints); |
| TEST_ADD(TestCpp96Add, cpp96_result, cpp96_add_ints); |
| TEST_ADD(TestBaseBillionAdd, base1b_result, base1b_ints); |
| TEST_ADD(TestInt64Add, int64_result, int64_ints); |
| TEST_ADD(TestDoubleAdd, double_result, doubles); |
| |
| TEST_MULTIPLY(TestBoostMultiply, boost_result, boost_mult_ints); |
| TEST_MULTIPLY(TestCppMultiply, cpp_result, cpp_mult_ints); |
| TEST_MULTIPLY(TestCpp96Multiply, cpp96_result, cpp96_mult_ints); |
| TEST_MULTIPLY(TestInt64Multiply, int64_result, int64_ints); |
| TEST_MULTIPLY(TestDoubleMultiply, double_result, doubles); |
| |
| TEST_DIVIDE(TestBoostDivide, boost_result, boost_mult_ints); |
| TEST_DIVIDE(TestCppDivide, cpp_result, cpp_mult_ints); |
| TEST_DIVIDE(TestCpp96Divide, cpp96_result, cpp96_mult_ints); |
| TEST_DIVIDE(TestInt64Divide, int64_result, int64_ints); |
| TEST_DIVIDE(TestDoubleDivide, double_result, doubles); |
| |
| int main(int argc, char** argv) { |
| CpuInfo::Init(); |
| cout << Benchmark::GetMachineInfo() << endl; |
| |
| TestData data; |
| InitTestData(&data, 38); // 38! doesn't overflow int128. |
| |
| Benchmark add_suite("Add"); |
| add_suite.AddBenchmark("int128_CPP", TestCppAdd, &data); |
| add_suite.AddBenchmark("int128_Boost", TestBoostAdd, &data); |
| add_suite.AddBenchmark("int96_CPP", TestCpp96Add, &data); |
| add_suite.AddBenchmark("int64", TestInt64Add, &data); |
| add_suite.AddBenchmark("int128_Base1B", TestBaseBillionAdd, &data); |
| add_suite.AddBenchmark("doubles", TestDoubleAdd, &data); |
| cout << add_suite.Measure() << endl; |
| |
| Benchmark multiply_suite("Multiply"); |
| multiply_suite.AddBenchmark("int128_CPP", TestCppMultiply, &data); |
| multiply_suite.AddBenchmark("int128_Boost", TestBoostMultiply, &data); |
| multiply_suite.AddBenchmark("int96_CPP", TestCpp96Multiply, &data); |
| multiply_suite.AddBenchmark("int64", TestInt64Multiply, &data); |
| multiply_suite.AddBenchmark("doubles", TestDoubleMultiply, &data); |
| cout << multiply_suite.Measure() << endl; |
| |
| Benchmark divide_suite("Divide"); |
| divide_suite.AddBenchmark("int128_CPP", TestCppDivide, &data); |
| divide_suite.AddBenchmark("int128_Boost", TestBoostDivide, &data); |
| divide_suite.AddBenchmark("int96_CPP", TestCpp96Divide, &data); |
| divide_suite.AddBenchmark("int64", TestInt64Divide, &data); |
| divide_suite.AddBenchmark("double", TestDoubleDivide, &data); |
| cout << divide_suite.Measure() << endl; |
| |
| return 0; |
| } |