be/src/benchmarks/overflow-benchmark.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <vector>

 #include "runtime/decimal-value.inline.h"
 #include "runtime/string-value.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 #include "util/string-parser.h"

 #include "common/names.h"

 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif

 using std::numeric_limits;
 using namespace impala;

 // Machine Info: Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz
 // Decimal16 Add Overflow:Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
 //         without_check_overflow               31.27                  1X
 //           builtin_add_overflow               9.901             0.3167X
 //      add_overflow_lookup_table               12.44             0.3979X
 //                   add_overflow               12.62             0.4036X
 //
 // Decimal16 Mul Overflow:Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
 //         without_check_overflow               20.47                  1X
 //           builtin_mul_overflow               7.542             0.3685X
 //         mul_overflow_check_msb               2.022            0.09881X
 //                   mul_overflow               2.198             0.1074X
 //
 // Clz of int128_t:      Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
 //                    clz_branchy           1.879e+13                  1X
 //                clz_branch_free           4.945e+12             0.2632X

 struct TestData {
   int precision;
   int scale;
   double probability_negative;
   vector<Decimal16Value> values;
   vector<Decimal16Value> results;
   vector<bool> overflows;
   vector<int128_t> int128_values;
 };

 double Rand() {
   return rand() / static_cast<double>(RAND_MAX);
 }

 void AddTestData(TestData* data, int n) {
   int128_t max_whole = data->precision > data->scale ?
       DecimalUtil::GetScaleMultiplier<int128_t>(data->precision - data->scale) - 1 : 0;
   int128_t max_fraction = data->scale > 0 ?
       DecimalUtil::GetScaleMultiplier<int128_t>(data->scale) - 1 : 0;
   ColumnType column_type = ColumnType::CreateDecimalType(data->precision, data->scale);
   Decimal16Value val;
   for (int i = 0; i < n; ++i) {
     stringstream ss;
     if (data->probability_negative > Rand()) ss << "-";
     if (max_whole > 0) ss << static_cast<int128_t>(max_whole * Rand());
     if (max_fraction > 0) ss << "." << static_cast<int128_t>(max_fraction * Rand());
     string str = ss.str();
     StringParser::ParseResult dummy;
     val = StringParser::StringToDecimal<int128_t>(
         str.c_str(), str.length(), column_type, false, &dummy);
     data->values.push_back(val);
     data->int128_values.push_back(numeric_limits<int128_t>::max() * Rand());
   }
 }

 template <typename RESULT_T>
 static bool AdjustToSameScale(const Decimal16Value& x, int x_scale,
     const Decimal16Value& y, int y_scale, int result_precision, RESULT_T* x_scaled,
     RESULT_T* y_scaled) {
   int delta_scale = x_scale - y_scale;
   RESULT_T scale_factor = DecimalUtil::GetScaleMultiplier<RESULT_T>(abs(delta_scale));
   if (delta_scale == 0) {
     *x_scaled = x.value();
     *y_scaled = y.value();
   } else if (delta_scale > 0) {
     if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
         DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(y.value())) {
       return true;
     }
     *x_scaled = x.value();
     *y_scaled = y.value() * scale_factor;
   } else {
     if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
         DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(x.value())) {
       return true;
     }
     *x_scaled = x.value() * scale_factor;
     *y_scaled = y.value();
   }
   return false;
 }

 // Accelerate using lookup table.
 template <typename RESULT_T>
 static void AdjustToSameScaleLookupTbl(const Decimal16Value& x, int x_scale,
     const Decimal16Value& y, int y_scale, int result_precision, RESULT_T* x_scaled,
     RESULT_T* y_scaled) {
   int delta_scale = x_scale - y_scale;
   RESULT_T scale_factor = DecimalUtil::GetScaleMultiplier<RESULT_T>(abs(delta_scale));
   if (delta_scale == 0) {
     *x_scaled = x.value();
     *y_scaled = y.value();
   } else if (delta_scale > 0) {
     *x_scaled = x.value();
     *y_scaled = y.value() * scale_factor;
   } else {
     *x_scaled = x.value() * scale_factor;
     *y_scaled = y.value();
   }
 }

 #if 5 <= __GNUC__ || __has_builtin(__builtin_add_overflow)
 #define HAVE_BUILTIN_ADD_OVERFLOW
 template<typename RESULT_T>
 DecimalValue<RESULT_T> BuiltinAdd(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
   RESULT_T x = 0;
   RESULT_T y = 0;
   *overflow |= AdjustToSameScale(val, this_scale, other, other_scale,
       result_precision, &x, &y);
   if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
     RESULT_T result = 0;
     *overflow |= __builtin_add_overflow(x, y, &result);
     *overflow |= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
     return DecimalValue<RESULT_T>(result);
   } else {
     DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
   }
   return DecimalValue<RESULT_T>(x + y);
 }
 #endif

 template<typename RESULT_T>
 DecimalValue<RESULT_T> AddLookupTbl(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
   RESULT_T x = 0;
   RESULT_T y = 0;
   AdjustToSameScaleLookupTbl(val, this_scale, other, other_scale,
       result_precision, &x, &y);
   if (sizeof(RESULT_T) == 16) {
     // Check overflow.
     if (!*overflow && (val.value() < 0) == (other.value() < 0) &&
         result_precision == ColumnType::MAX_PRECISION) {
       // Can only overflow if the signs are the same and result precision reaches
       // max precision.
       *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
       // TODO: faster to return here? We don't care at all about the perf on
       // the overflow case but what makes the normal path faster?
     }
   } else {
     DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
   }
   return DecimalValue<RESULT_T>(x + y);
 }

 template<typename RESULT_T>
 DecimalValue<RESULT_T> Add(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
   RESULT_T x = 0;
   RESULT_T y = 0;
   *overflow |= AdjustToSameScale(val, this_scale, other, other_scale,
       result_precision, &x, &y);
   if (sizeof(RESULT_T) == 16) {
     // Check overflow.
     if (!*overflow && (val.value() < 0) == (other.value() < 0) &&
         result_precision == ColumnType::MAX_PRECISION) {
       // Can only overflow if the signs are the same and result precision reaches
       // max precision.
       *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
       // TODO: faster to return here? We don't care at all about the perf on
       // the overflow case but what makes the normal path faster?
     }
   } else {
     DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
   }
   return DecimalValue<RESULT_T>(x + y);
 }

 // set DISABLE_CHECK_OVERFLOW true will disable checking overflow.
 #define TEST_ADD(NAME, FN, DISABLE_CHECK_OVERFLOW) \
   void NAME(int batch_size, void* d) { \
     TestData* data = reinterpret_cast<TestData*>(d); \
     for (int i = 0; i < batch_size; ++i) { \
       int s1 = data->scale; \
       int s2 = data->scale; \
       int p1 = data->precision; \
       int p2 = data->precision; \
       int s_max = std::max(s1, s2); \
       int max_precision = ColumnType::MAX_PRECISION; \
       int result_precision = std::min(s_max + std::max(p1 - s1, p2 - s2) + 1, \
           max_precision); \
       int result_scale = std::min(result_precision, s_max); \
       for (int j = 0; j < data->values.size() - 1; ++j) { \
         bool overflow = DISABLE_CHECK_OVERFLOW; \
         data->results[j] = FN<int128_t>(data->values[j], data->scale, \
             data->values[j + 1], data->scale, result_precision, \
             result_scale, &overflow); \
         data->overflows[j] = overflow; \
       } \
     } \
   }

 TEST_ADD(TestAdd, Add, true);
 #ifdef HAVE_BUILTIN_ADD_OVERFLOW
 TEST_ADD(TestBuiltinAddOverflow, BuiltinAdd, false);
 #endif
 TEST_ADD(TestAddOverflowLookupTbl, AddLookupTbl, false);
 TEST_ADD(TestAddOverflow, Add, false);

 // Disabled __builtin_mul_overflow since Clang emits a call to __muloti4, which is
 // not implemented in the GCC runtime library.
 #if 5 <= __GNUC__
 #define HAVE_BUILTIN_MUL_OVERFLOW
 template<typename RESULT_T>
 DecimalValue<RESULT_T> BuiltinMultiply(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   // In the non-overflow case, we don't need to adjust by the scale since
   // that is already handled by the FE when it computes the result decimal type.
   // e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
   // 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
   // The result scale in this case is the sum of the input scales.
   RESULT_T x = val.value();
   RESULT_T y = other.value();
   if (x == 0 || y == 0) {
     // Handle zero to avoid divide by zero in the overflow check below.
     return DecimalValue<RESULT_T>(0);
   }
   RESULT_T result = 0;
   if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
     // Check overflow
     *overflow |= __builtin_mul_overflow(x, y, &result);
     *overflow |= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
   } else {
     result = x * y;
   }
   int delta_scale = this_scale + other_scale - result_scale;
   if (UNLIKELY(delta_scale != 0)) {
     // In this case, the required resulting scale is larger than the max we support.
     // We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
     // TODO: we could also return NULL.
     DCHECK_GT(delta_scale, 0);
     result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
   }
   return DecimalValue<RESULT_T>(result);
 }
 #endif

 // Check MSB of decimal values to skip checking overflow.
 template<typename RESULT_T>
 DecimalValue<RESULT_T> MultiplyCheckMSB(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   // In the non-overflow case, we don't need to adjust by the scale since
   // that is already handled by the FE when it computes the result decimal type.
   // e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
   // 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
   // The result scale in this case is the sum of the input scales.
   RESULT_T x = val.value();
   RESULT_T y = other.value();
   if (x == 0 || y == 0) {
     // Handle zero to avoid divide by zero in the overflow check below.
     return DecimalValue<RESULT_T>(0);
   }
   if (sizeof(RESULT_T) == 16) {
     // Check overflow
     if (result_precision == ColumnType::MAX_PRECISION &&
         DecimalUtil::Clz(abs(x)) + DecimalUtil::Clz(abs(y)) < 130) {
       *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
     }
   }
   RESULT_T result = x * y;
   int delta_scale = this_scale + other_scale - result_scale;
   if (UNLIKELY(delta_scale != 0)) {
     // In this case, the required resulting scale is larger than the max we support.
     // We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
     // TODO: we could also return NULL.
     DCHECK_GT(delta_scale, 0);
     result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
   }
   return DecimalValue<RESULT_T>(result);
 }

 template<typename RESULT_T>
 DecimalValue<RESULT_T> Multiply(const Decimal16Value& val, int this_scale,
     const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
     bool* overflow) {
   // In the non-overflow case, we don't need to adjust by the scale since
   // that is already handled by the FE when it computes the result decimal type.
   // e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
   // 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
   // The result scale in this case is the sum of the input scales.
   RESULT_T x = val.value();
   RESULT_T y = other.value();
   if (x == 0 || y == 0) {
     // Handle zero to avoid divide by zero in the overflow check below.
     return DecimalValue<RESULT_T>(0);
   }
   if (sizeof(RESULT_T) == 16) {
     // Check overflow
     if (result_precision == ColumnType::MAX_PRECISION) {
       *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
     }
   }
   RESULT_T result = x * y;
   int delta_scale = this_scale + other_scale - result_scale;
   if (UNLIKELY(delta_scale != 0)) {
     // In this case, the required resulting scale is larger than the max we support.
     // We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
     // TODO: we could also return NULL.
     DCHECK_GT(delta_scale, 0);
     result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
   }
   return DecimalValue<RESULT_T>(result);
 }

 // set DISABLE_CHECK_OVERFLOW true will disable checking overflow.
 #define TEST_MUL(NAME, FN, DISABLE_CHECK_OVERFLOW) \
   void NAME(int batch_size, void* d) { \
     TestData* data = reinterpret_cast<TestData*>(d); \
     for (int i = 0; i < batch_size; ++i) { \
       int max_precision = ColumnType::MAX_PRECISION; \
       int result_precision = std::min(data->precision + data->precision, \
           max_precision); \
       int result_scale = std::min(result_precision, data->scale + data->scale); \
       for (int j = 0; j < data->values.size() - 1; ++j) { \
         bool overflow = DISABLE_CHECK_OVERFLOW; \
         data->results[j] = FN<int128_t>(data->values[j], data->scale, \
             data->values[j + 1], data->scale, result_precision, \
             result_scale, &overflow); \
         data->overflows[j] = overflow; \
       } \
     } \
   }

 TEST_MUL(TestMul, Multiply, true);
 #ifdef HAVE_BUILTIN_MUL_OVERFLOW
 TEST_MUL(TestBuiltinMulOverflow, BuiltinMultiply, false);
 #endif
 TEST_MUL(TestMulOverflowCheckMSB, MultiplyCheckMSB, false);
 TEST_MUL(TestMulOverflow, Multiply, false);

 void TestClzBranchy(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     // Unroll this to focus more on Put performance.
     for (int j = 0; j < data->int128_values.size(); j += 8) {
       DecimalUtil::Clz(data->int128_values[j + 0]);
       DecimalUtil::Clz(data->int128_values[j + 1]);
       DecimalUtil::Clz(data->int128_values[j + 2]);
       DecimalUtil::Clz(data->int128_values[j + 3]);
       DecimalUtil::Clz(data->int128_values[j + 4]);
       DecimalUtil::Clz(data->int128_values[j + 5]);
       DecimalUtil::Clz(data->int128_values[j + 6]);
       DecimalUtil::Clz(data->int128_values[j + 7]);
     }
   }
 }

 inline int ClzBranchFree(const int128_t& v) {
   // GCC leaves __builtin_clz undefined for zero.
   uint64_t hi = static_cast<uint64_t>(v >> 64);
   uint64_t lo = static_cast<uint64_t>(v);
   int retval[3]={
     __builtin_clzll(hi),
     __builtin_clzll(lo)+64,
     128
   };
   int idx = !hi + ((!lo)&(!hi));
   return retval[idx];
 }

 void TestClzBranchFree(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     // Unroll this to focus more on Put performance.
     for (int j = 0; j < data->int128_values.size(); j += 8) {
       ClzBranchFree(data->int128_values[j + 0]);
       ClzBranchFree(data->int128_values[j + 1]);
       ClzBranchFree(data->int128_values[j + 2]);
       ClzBranchFree(data->int128_values[j + 3]);
       ClzBranchFree(data->int128_values[j + 4]);
       ClzBranchFree(data->int128_values[j + 5]);
       ClzBranchFree(data->int128_values[j + 6]);
       ClzBranchFree(data->int128_values[j + 7]);
     }
   }
 }

 int main(int argc, char** argv) {
   CpuInfo::Init();
   cout << Benchmark::GetMachineInfo() << endl;

   TestData data;
   int size = 10000;
   data.precision = ColumnType::MAX_PRECISION;
   data.scale = data.precision / 2;
   data.probability_negative = 0.25;
   AddTestData(&data, size);
   data.results.resize(size - 1);
   data.overflows.resize(size - 1);

   Benchmark add_overflow_suite("Decimal16 Add Overflow");
   add_overflow_suite.AddBenchmark("without_check_overflow", TestAdd, &data);
 #ifdef HAVE_BUILTIN_ADD_OVERFLOW
   add_overflow_suite.AddBenchmark("builtin_add_overflow", TestBuiltinAddOverflow, &data);
 #endif
   add_overflow_suite.AddBenchmark("add_overflow_lookup_table",
       TestAddOverflowLookupTbl, &data);
   add_overflow_suite.AddBenchmark("add_overflow", TestAddOverflow, &data);
   cout << add_overflow_suite.Measure() << endl;

   Benchmark mul_overflow_suite("Decimal16 Mul Overflow");
   mul_overflow_suite.AddBenchmark("without_check_overflow", TestMul, &data);
 #ifdef HAVE_BUILTIN_MUL_OVERFLOW
   mul_overflow_suite.AddBenchmark("builtin_mul_overflow", TestBuiltinMulOverflow, &data);
 #endif
   mul_overflow_suite.AddBenchmark("mul_overflow_check_msb",
       TestMulOverflowCheckMSB, &data);
   mul_overflow_suite.AddBenchmark("mul_overflow", TestMulOverflow, &data);
   cout << mul_overflow_suite.Measure() << endl;

   // Counting the number of leading zeros in a 128-bit integer.
   Benchmark clz_suite("Clz of int128_t");
   clz_suite.AddBenchmark("clz_branchy", TestClzBranchy, &data);
   clz_suite.AddBenchmark("clz_branch_free", TestClzBranchFree, &data);
   cout << clz_suite.Measure() << endl;

   return 0;
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <algorithm>
	#include <iostream>
	#include <sstream>
	#include <vector>

	#include "runtime/decimal-value.inline.h"
	#include "runtime/string-value.h"
	#include "util/benchmark.h"
	#include "util/cpu-info.h"
	#include "util/string-parser.h"

	#include "common/names.h"

	#ifndef __has_builtin
	#define __has_builtin(x) 0
	#endif

	using std::numeric_limits;
	using namespace impala;

	// Machine Info: Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz
	// Decimal16 Add Overflow:Function Rate (iters/ms) Comparison
	// ----------------------------------------------------------------------
	// without_check_overflow 31.27 1X
	// builtin_add_overflow 9.901 0.3167X
	// add_overflow_lookup_table 12.44 0.3979X
	// add_overflow 12.62 0.4036X
	//
	// Decimal16 Mul Overflow:Function Rate (iters/ms) Comparison
	// ----------------------------------------------------------------------
	// without_check_overflow 20.47 1X
	// builtin_mul_overflow 7.542 0.3685X
	// mul_overflow_check_msb 2.022 0.09881X
	// mul_overflow 2.198 0.1074X
	//
	// Clz of int128_t: Function Rate (iters/ms) Comparison
	// ----------------------------------------------------------------------
	// clz_branchy 1.879e+13 1X
	// clz_branch_free 4.945e+12 0.2632X

	struct TestData {
	int precision;
	int scale;
	double probability_negative;
	vector<Decimal16Value> values;
	vector<Decimal16Value> results;
	vector<bool> overflows;
	vector<int128_t> int128_values;
	};

	double Rand() {
	return rand() / static_cast<double>(RAND_MAX);
	}

	void AddTestData(TestData* data, int n) {
	int128_t max_whole = data->precision > data->scale ?
	DecimalUtil::GetScaleMultiplier<int128_t>(data->precision - data->scale) - 1 : 0;
	int128_t max_fraction = data->scale > 0 ?
	DecimalUtil::GetScaleMultiplier<int128_t>(data->scale) - 1 : 0;
	ColumnType column_type = ColumnType::CreateDecimalType(data->precision, data->scale);
	Decimal16Value val;
	for (int i = 0; i < n; ++i) {
	stringstream ss;
	if (data->probability_negative > Rand()) ss << "-";
	if (max_whole > 0) ss << static_cast<int128_t>(max_whole * Rand());
	if (max_fraction > 0) ss << "." << static_cast<int128_t>(max_fraction * Rand());
	string str = ss.str();
	StringParser::ParseResult dummy;
	val = StringParser::StringToDecimal<int128_t>(
	str.c_str(), str.length(), column_type, false, &dummy);
	data->values.push_back(val);
	data->int128_values.push_back(numeric_limits<int128_t>::max() * Rand());
	}
	}

	template <typename RESULT_T>
	static bool AdjustToSameScale(const Decimal16Value& x, int x_scale,
	const Decimal16Value& y, int y_scale, int result_precision, RESULT_T* x_scaled,
	RESULT_T* y_scaled) {
	int delta_scale = x_scale - y_scale;
	RESULT_T scale_factor = DecimalUtil::GetScaleMultiplier<RESULT_T>(abs(delta_scale));
	if (delta_scale == 0) {
	*x_scaled = x.value();
	*y_scaled = y.value();
	} else if (delta_scale > 0) {
	if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
	DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(y.value())) {
	return true;
	}
	*x_scaled = x.value();
	y_scaled = y.value() scale_factor;
	} else {
	if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
	DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(x.value())) {
	return true;
	}
	x_scaled = x.value() scale_factor;
	*y_scaled = y.value();
	}
	return false;
	}

	// Accelerate using lookup table.
	template <typename RESULT_T>
	static void AdjustToSameScaleLookupTbl(const Decimal16Value& x, int x_scale,
	const Decimal16Value& y, int y_scale, int result_precision, RESULT_T* x_scaled,
	RESULT_T* y_scaled) {
	int delta_scale = x_scale - y_scale;
	RESULT_T scale_factor = DecimalUtil::GetScaleMultiplier<RESULT_T>(abs(delta_scale));
	if (delta_scale == 0) {
	*x_scaled = x.value();
	*y_scaled = y.value();
	} else if (delta_scale > 0) {
	*x_scaled = x.value();
	y_scaled = y.value() scale_factor;
	} else {
	x_scaled = x.value() scale_factor;
	*y_scaled = y.value();
	}
	}

	#if 5 <= __GNUC__ \|\| __has_builtin(__builtin_add_overflow)
	#define HAVE_BUILTIN_ADD_OVERFLOW
	template<typename RESULT_T>
	DecimalValue<RESULT_T> BuiltinAdd(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
	RESULT_T x = 0;
	RESULT_T y = 0;
	*overflow \|= AdjustToSameScale(val, this_scale, other, other_scale,
	result_precision, &x, &y);
	if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
	RESULT_T result = 0;
	*overflow \|= __builtin_add_overflow(x, y, &result);
	*overflow \|= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
	return DecimalValue<RESULT_T>(result);
	} else {
	DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
	}
	return DecimalValue<RESULT_T>(x + y);
	}
	#endif

	template<typename RESULT_T>
	DecimalValue<RESULT_T> AddLookupTbl(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
	RESULT_T x = 0;
	RESULT_T y = 0;
	AdjustToSameScaleLookupTbl(val, this_scale, other, other_scale,
	result_precision, &x, &y);
	if (sizeof(RESULT_T) == 16) {
	// Check overflow.
	if (!*overflow && (val.value() < 0) == (other.value() < 0) &&
	result_precision == ColumnType::MAX_PRECISION) {
	// Can only overflow if the signs are the same and result precision reaches
	// max precision.
	*overflow \|= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
	// TODO: faster to return here? We don't care at all about the perf on
	// the overflow case but what makes the normal path faster?
	}
	} else {
	DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
	}
	return DecimalValue<RESULT_T>(x + y);
	}

	template<typename RESULT_T>
	DecimalValue<RESULT_T> Add(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	DCHECK_EQ(result_scale, std::max(this_scale, other_scale));
	RESULT_T x = 0;
	RESULT_T y = 0;
	*overflow \|= AdjustToSameScale(val, this_scale, other, other_scale,
	result_precision, &x, &y);
	if (sizeof(RESULT_T) == 16) {
	// Check overflow.
	if (!*overflow && (val.value() < 0) == (other.value() < 0) &&
	result_precision == ColumnType::MAX_PRECISION) {
	// Can only overflow if the signs are the same and result precision reaches
	// max precision.
	*overflow \|= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
	// TODO: faster to return here? We don't care at all about the perf on
	// the overflow case but what makes the normal path faster?
	}
	} else {
	DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
	}
	return DecimalValue<RESULT_T>(x + y);
	}

	// set DISABLE_CHECK_OVERFLOW true will disable checking overflow.
	#define TEST_ADD(NAME, FN, DISABLE_CHECK_OVERFLOW) \
	void NAME(int batch_size, void* d) { \
	TestData* data = reinterpret_cast<TestData*>(d); \
	for (int i = 0; i < batch_size; ++i) { \
	int s1 = data->scale; \
	int s2 = data->scale; \
	int p1 = data->precision; \
	int p2 = data->precision; \
	int s_max = std::max(s1, s2); \
	int max_precision = ColumnType::MAX_PRECISION; \
	int result_precision = std::min(s_max + std::max(p1 - s1, p2 - s2) + 1, \
	max_precision); \
	int result_scale = std::min(result_precision, s_max); \
	for (int j = 0; j < data->values.size() - 1; ++j) { \
	bool overflow = DISABLE_CHECK_OVERFLOW; \
	data->results[j] = FN<int128_t>(data->values[j], data->scale, \
	data->values[j + 1], data->scale, result_precision, \
	result_scale, &overflow); \
	data->overflows[j] = overflow; \
	} \
	} \
	}

	TEST_ADD(TestAdd, Add, true);
	#ifdef HAVE_BUILTIN_ADD_OVERFLOW
	TEST_ADD(TestBuiltinAddOverflow, BuiltinAdd, false);
	#endif
	TEST_ADD(TestAddOverflowLookupTbl, AddLookupTbl, false);
	TEST_ADD(TestAddOverflow, Add, false);

	// Disabled __builtin_mul_overflow since Clang emits a call to __muloti4, which is
	// not implemented in the GCC runtime library.
	#if 5 <= __GNUC__
	#define HAVE_BUILTIN_MUL_OVERFLOW
	template<typename RESULT_T>
	DecimalValue<RESULT_T> BuiltinMultiply(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	// In the non-overflow case, we don't need to adjust by the scale since
	// that is already handled by the FE when it computes the result decimal type.
	// e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
	// 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
	// The result scale in this case is the sum of the input scales.
	RESULT_T x = val.value();
	RESULT_T y = other.value();
	if (x == 0 \|\| y == 0) {
	// Handle zero to avoid divide by zero in the overflow check below.
	return DecimalValue<RESULT_T>(0);
	}
	RESULT_T result = 0;
	if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
	// Check overflow
	*overflow \|= __builtin_mul_overflow(x, y, &result);
	*overflow \|= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
	} else {
	result = x * y;
	}
	int delta_scale = this_scale + other_scale - result_scale;
	if (UNLIKELY(delta_scale != 0)) {
	// In this case, the required resulting scale is larger than the max we support.
	// We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
	// TODO: we could also return NULL.
	DCHECK_GT(delta_scale, 0);
	result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
	}
	return DecimalValue<RESULT_T>(result);
	}
	#endif

	// Check MSB of decimal values to skip checking overflow.
	template<typename RESULT_T>
	DecimalValue<RESULT_T> MultiplyCheckMSB(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	// In the non-overflow case, we don't need to adjust by the scale since
	// that is already handled by the FE when it computes the result decimal type.
	// e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
	// 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
	// The result scale in this case is the sum of the input scales.
	RESULT_T x = val.value();
	RESULT_T y = other.value();
	if (x == 0 \|\| y == 0) {
	// Handle zero to avoid divide by zero in the overflow check below.
	return DecimalValue<RESULT_T>(0);
	}
	if (sizeof(RESULT_T) == 16) {
	// Check overflow
	if (result_precision == ColumnType::MAX_PRECISION &&
	DecimalUtil::Clz(abs(x)) + DecimalUtil::Clz(abs(y)) < 130) {
	*overflow \|= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
	}
	}
	RESULT_T result = x * y;
	int delta_scale = this_scale + other_scale - result_scale;
	if (UNLIKELY(delta_scale != 0)) {
	// In this case, the required resulting scale is larger than the max we support.
	// We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
	// TODO: we could also return NULL.
	DCHECK_GT(delta_scale, 0);
	result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
	}
	return DecimalValue<RESULT_T>(result);
	}

	template<typename RESULT_T>
	DecimalValue<RESULT_T> Multiply(const Decimal16Value& val, int this_scale,
	const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
	bool* overflow) {
	// In the non-overflow case, we don't need to adjust by the scale since
	// that is already handled by the FE when it computes the result decimal type.
	// e.g. 1.23 * .2 (scale 2, scale 1 respectively) is identical to:
	// 123 * 2 with a resulting scale 3. We can do the multiply on the unscaled values.
	// The result scale in this case is the sum of the input scales.
	RESULT_T x = val.value();
	RESULT_T y = other.value();
	if (x == 0 \|\| y == 0) {
	// Handle zero to avoid divide by zero in the overflow check below.
	return DecimalValue<RESULT_T>(0);
	}
	if (sizeof(RESULT_T) == 16) {
	// Check overflow
	if (result_precision == ColumnType::MAX_PRECISION) {
	*overflow \|= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
	}
	}
	RESULT_T result = x * y;
	int delta_scale = this_scale + other_scale - result_scale;
	if (UNLIKELY(delta_scale != 0)) {
	// In this case, the required resulting scale is larger than the max we support.
	// We cap the resulting scale to the max supported scale (e.g. truncate) in the FE.
	// TODO: we could also return NULL.
	DCHECK_GT(delta_scale, 0);
	result /= DecimalUtil::GetScaleMultiplier<int128_t>(delta_scale);
	}
	return DecimalValue<RESULT_T>(result);
	}

	// set DISABLE_CHECK_OVERFLOW true will disable checking overflow.
	#define TEST_MUL(NAME, FN, DISABLE_CHECK_OVERFLOW) \
	void NAME(int batch_size, void* d) { \
	TestData* data = reinterpret_cast<TestData*>(d); \
	for (int i = 0; i < batch_size; ++i) { \
	int max_precision = ColumnType::MAX_PRECISION; \
	int result_precision = std::min(data->precision + data->precision, \
	max_precision); \
	int result_scale = std::min(result_precision, data->scale + data->scale); \
	for (int j = 0; j < data->values.size() - 1; ++j) { \
	bool overflow = DISABLE_CHECK_OVERFLOW; \
	data->results[j] = FN<int128_t>(data->values[j], data->scale, \
	data->values[j + 1], data->scale, result_precision, \
	result_scale, &overflow); \
	data->overflows[j] = overflow; \
	} \
	} \
	}

	TEST_MUL(TestMul, Multiply, true);
	#ifdef HAVE_BUILTIN_MUL_OVERFLOW
	TEST_MUL(TestBuiltinMulOverflow, BuiltinMultiply, false);
	#endif
	TEST_MUL(TestMulOverflowCheckMSB, MultiplyCheckMSB, false);
	TEST_MUL(TestMulOverflow, Multiply, false);

	void TestClzBranchy(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	// Unroll this to focus more on Put performance.
	for (int j = 0; j < data->int128_values.size(); j += 8) {
	DecimalUtil::Clz(data->int128_values[j + 0]);
	DecimalUtil::Clz(data->int128_values[j + 1]);
	DecimalUtil::Clz(data->int128_values[j + 2]);
	DecimalUtil::Clz(data->int128_values[j + 3]);
	DecimalUtil::Clz(data->int128_values[j + 4]);
	DecimalUtil::Clz(data->int128_values[j + 5]);
	DecimalUtil::Clz(data->int128_values[j + 6]);
	DecimalUtil::Clz(data->int128_values[j + 7]);
	}
	}
	}

	inline int ClzBranchFree(const int128_t& v) {
	// GCC leaves __builtin_clz undefined for zero.
	uint64_t hi = static_cast<uint64_t>(v >> 64);
	uint64_t lo = static_cast<uint64_t>(v);
	int retval[3]={
	__builtin_clzll(hi),
	__builtin_clzll(lo)+64,
	128
	};
	int idx = !hi + ((!lo)&(!hi));
	return retval[idx];
	}

	void TestClzBranchFree(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	// Unroll this to focus more on Put performance.
	for (int j = 0; j < data->int128_values.size(); j += 8) {
	ClzBranchFree(data->int128_values[j + 0]);
	ClzBranchFree(data->int128_values[j + 1]);
	ClzBranchFree(data->int128_values[j + 2]);
	ClzBranchFree(data->int128_values[j + 3]);
	ClzBranchFree(data->int128_values[j + 4]);
	ClzBranchFree(data->int128_values[j + 5]);
	ClzBranchFree(data->int128_values[j + 6]);
	ClzBranchFree(data->int128_values[j + 7]);
	}
	}
	}

	int main(int argc, char** argv) {
	CpuInfo::Init();
	cout << Benchmark::GetMachineInfo() << endl;

	TestData data;
	int size = 10000;
	data.precision = ColumnType::MAX_PRECISION;
	data.scale = data.precision / 2;
	data.probability_negative = 0.25;
	AddTestData(&data, size);
	data.results.resize(size - 1);
	data.overflows.resize(size - 1);

	Benchmark add_overflow_suite("Decimal16 Add Overflow");
	add_overflow_suite.AddBenchmark("without_check_overflow", TestAdd, &data);
	#ifdef HAVE_BUILTIN_ADD_OVERFLOW
	add_overflow_suite.AddBenchmark("builtin_add_overflow", TestBuiltinAddOverflow, &data);
	#endif
	add_overflow_suite.AddBenchmark("add_overflow_lookup_table",
	TestAddOverflowLookupTbl, &data);
	add_overflow_suite.AddBenchmark("add_overflow", TestAddOverflow, &data);
	cout << add_overflow_suite.Measure() << endl;

	Benchmark mul_overflow_suite("Decimal16 Mul Overflow");
	mul_overflow_suite.AddBenchmark("without_check_overflow", TestMul, &data);
	#ifdef HAVE_BUILTIN_MUL_OVERFLOW
	mul_overflow_suite.AddBenchmark("builtin_mul_overflow", TestBuiltinMulOverflow, &data);
	#endif
	mul_overflow_suite.AddBenchmark("mul_overflow_check_msb",
	TestMulOverflowCheckMSB, &data);
	mul_overflow_suite.AddBenchmark("mul_overflow", TestMulOverflow, &data);
	cout << mul_overflow_suite.Measure() << endl;

	// Counting the number of leading zeros in a 128-bit integer.
	Benchmark clz_suite("Clz of int128_t");
	clz_suite.AddBenchmark("clz_branchy", TestClzBranchy, &data);
	clz_suite.AddBenchmark("clz_branch_free", TestClzBranchFree, &data);
	cout << clz_suite.Measure() << endl;

	return 0;
	}