be/src/util/bit-util.h - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.


 #ifndef IMPALA_BIT_UTIL_H
 #define IMPALA_BIT_UTIL_H

 #if defined(__APPLE__)
 #include <machine/endian.h>
 #else
 #include <endian.h>
 #endif

 #include <climits>
 #include <limits>
 #include <typeinfo>
 #include <type_traits>

 #include "common/compiler-util.h"
 #include "gutil/bits.h"
 #include "runtime/multi-precision.h"
 #include "util/cpu-info.h"
 #include "util/sse-util.h"

 namespace impala {

 // Doubles the width of integer types (e.g. int32_t -> int64_t).
 // Currently only works with a few signed types.
 // Feel free to extend it to other types as well.
 template <typename T>
 struct DoubleWidth {};

 template <>
 struct DoubleWidth<int32_t> {
   using type = int64_t;
 };

 template <>
 struct DoubleWidth<int64_t> {
   using type = int128_t;
 };

 template <>
 struct DoubleWidth<int128_t> {
   using type = int256_t;
 };

 /// Utility class to do standard bit tricks
 /// TODO: is this in boost or something else like that?
 class BitUtil {
  public:

   /// Returns the width of the integer portion of the type, not counting the sign bit.
   /// Not safe for use with unknown or non-native types, so make it undefined
   template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
       typename std::enable_if<std::is_integral<CVR_REMOVED>{} ||
                               std::is_same<CVR_REMOVED, unsigned __int128>{} ||
                               std::is_same<CVR_REMOVED, __int128>{}, int>::type = 0>
   constexpr static inline int UnsignedWidth() {
     return std::is_integral<CVR_REMOVED>::value ?
         std::numeric_limits<CVR_REMOVED>::digits :
         std::is_same<CVR_REMOVED, unsigned __int128>::value ? 128 :
         std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
   }

   /// Returns the max value that can be represented in T.
   template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
       typename std::enable_if<std::is_integral<CVR_REMOVED> {}||
                               std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0>
   constexpr static inline CVR_REMOVED Max() {
     return std::is_integral<CVR_REMOVED>::value ?
         std::numeric_limits<CVR_REMOVED>::max() :
         std::is_same<CVR_REMOVED, __int128>::value ?
             static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
   }

   /// Return an integer signifying the sign of the value, returning +1 for
   /// positive integers (and zero), -1 for negative integers.
   /// The extra shift is to silence GCC warnings about full width shift on
   /// unsigned types. It compiles out in optimized builds into the expected increment.
   template<typename T>
   constexpr static inline T Sign(T value) {
     return 1 | ((value >> (UnsignedWidth<T>() - 1)) >> 1);
   }

   /// Returns the ceil of value/divisor
   constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) {
     return value / divisor + (value % divisor != 0);
   }

   /// Returns 'value' rounded up to the nearest multiple of 'factor'
   constexpr static inline int64_t RoundUp(int64_t value, int64_t factor) {
     return (value + (factor - 1)) / factor * factor;
   }

   /// Returns 'value' rounded down to the nearest multiple of 'factor'
   constexpr static inline int64_t RoundDown(int64_t value, int64_t factor) {
     return (value / factor) * factor;
   }

   /// Returns the smallest power of two that contains v. If v is a power of two, v is
   /// returned. Taken from
   /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
   static inline int64_t RoundUpToPowerOfTwo(int64_t v) {
     --v;
     v |= v >> 1;
     v |= v >> 2;
     v |= v >> 4;
     v |= v >> 8;
     v |= v >> 16;
     v |= v >> 32;
     ++v;
     return v;
   }

   /// Returns the largest power of two <= v.
   static inline int64_t RoundDownToPowerOfTwo(int64_t v) {
     int64_t v_rounded_up = RoundUpToPowerOfTwo(v);
     return v_rounded_up == v ? v : v_rounded_up / 2;
   }

   /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
   /// a power of two
   static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
     DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
     return (value + (factor - 1)) & ~(factor - 1);
   }

   static inline int64_t RoundDownToPowerOf2(int64_t value, int64_t factor) {
     DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
     return value & ~(factor - 1);
   }

   constexpr static inline bool IsPowerOf2(int64_t value) {
     return (value & (value - 1)) == 0;
   }

   /// Specialized round up and down functions for frequently used factors,
   /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
   /// Returns the rounded up number of bytes that fit the number of bits.
   constexpr static inline uint32_t RoundUpNumBytes(uint32_t bits) {
     return (bits + 7) >> 3;
   }

   /// Returns the rounded down number of bytes that fit the number of bits.
   constexpr static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; }

   /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32.
   constexpr static inline uint32_t RoundUpNumi32(uint32_t bits) {
     return (bits + 31) >> 5;
   }

   /// Returns the rounded up 32 multiple.
   constexpr static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; }

   /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64.
   constexpr static inline uint32_t RoundUpNumi64(uint32_t bits) {
     return (bits + 63) >> 6;
   }

   /// Returns the rounded down to 64 multiple.
   constexpr static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; }

   /// Non hw accelerated pop count.
   /// TODO: we don't use this in any perf sensitive code paths currently.  There
   /// might be a much faster way to implement this.
   static inline int PopcountNoHw(uint64_t x) {
     int count = 0;
     for (; x != 0; ++count) x &= x-1;
     return count;
   }

   /// Returns the number of set bits in x
   static inline int Popcount(uint64_t x) {
     if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) {
       return POPCNT_popcnt_u64(x);
     } else {
       return PopcountNoHw(x);
     }
   }

   // Compute correct population count for various-width signed integers
   template<typename T>
   static inline int PopcountSigned(T v) {
     // Converting to same-width unsigned then extending preserves the bit pattern.
     return BitUtil::Popcount(static_cast<UnsignedType<T>>(v));
   }

   /// Returns the 'num_bits' least-significant bits of 'v'.
   /// Force inlining - GCC does not always inline this into hot loops.
   static ALWAYS_INLINE uint64_t TrailingBits(uint64_t v, int num_bits) {
     if (UNLIKELY(num_bits >= 64)) return v;
     return ((1UL << num_bits) - 1) & v;
   }

   /// Swaps the byte order (i.e. endianess)
   static inline int64_t ByteSwap(int64_t value) {
     return __builtin_bswap64(value);
   }
   static inline uint64_t ByteSwap(uint64_t value) {
     return __builtin_bswap64(value);
   }
   static inline int32_t ByteSwap(int32_t value) {
     return __builtin_bswap32(value);
   }
   static inline uint32_t ByteSwap(uint32_t value) {
     return __builtin_bswap32(value);
   }
   static inline int16_t ByteSwap(int16_t value) {
     return __builtin_bswap16(value);
   }
   static inline uint16_t ByteSwap(uint16_t value) {
     return __builtin_bswap16(value);
   }

   /// Write the swapped bytes into dest; source and dest must not overlap.
   /// This function is optimized for len <= 16. It reverts to a slow loop-based
   /// swap for len > 16.
   static void ByteSwap(void* dest, const void* source, int len);

 /// Converts to big endian format (if not already in big endian) from the
 /// machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
   static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
   static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
   static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
   static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
   static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
   static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
 #else
   static inline int64_t ToBigEndian(int64_t val) { return val; }
   static inline uint64_t ToBigEndian(uint64_t val) { return val; }
   static inline int32_t ToBigEndian(int32_t val) { return val; }
   static inline uint32_t ToBigEndian(uint32_t val) { return val; }
   static inline int16_t ToBigEndian(int16_t val) { return val; }
   static inline uint16_t ToBigEndian(uint16_t val) { return val; }
 #endif

 /// Converts from big endian format to the machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
   static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
   static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
   static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
   static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
   static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
   static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
 #else
   static inline int64_t FromBigEndian(int64_t val) { return val; }
   static inline uint64_t FromBigEndian(uint64_t val) { return val; }
   static inline int32_t FromBigEndian(int32_t val) { return val; }
   static inline uint32_t FromBigEndian(uint32_t val) { return val; }
   static inline int16_t FromBigEndian(int16_t val) { return val; }
   static inline uint16_t FromBigEndian(uint16_t val) { return val; }
 #endif

   /// Returns true if 'value' is a non-negative 32-bit integer.
   constexpr static inline bool IsNonNegative32Bit(int64_t value) {
     return static_cast<uint64_t>(value) <= std::numeric_limits<int32_t>::max();
   }

   /// Logical right shift for signed integer types
   /// This is needed because the C >> operator does arithmetic right shift
   /// Negative shift amounts lead to undefined behavior
   template <typename T>
   constexpr static T ShiftRightLogical(T v, int shift) {
     // Conversion to unsigned ensures most significant bits always filled with 0's
     return static_cast<UnsignedType<T>>(v) >> shift;
   }

   /// Get an specific bit of a numeric type
   template<typename T>
   static inline int8_t GetBit(T v, int bitpos) {
     T masked = v & (static_cast<T>(0x1) << bitpos);
     return static_cast<int8_t>(ShiftRightLogical(masked, bitpos));
   }

   /// Set a specific bit to 1
   /// Behavior when bitpos is negative is undefined
   template <typename T>
   constexpr static T SetBit(T v, int bitpos) {
     return v | (static_cast<T>(0x1) << bitpos);
   }

   /// Set a specific bit to 0
   /// Behavior when bitpos is negative is undefined
   template <typename T>
   constexpr static T UnsetBit(T v, int bitpos) {
     return v & ~(static_cast<T>(0x1) << bitpos);
   }

   /// Wrappers around __builtin_ctz, which returns an undefined value when the argument is
   /// zero.
   static inline int CountTrailingZeros(
       unsigned int v, int otherwise = sizeof(unsigned int) * CHAR_BIT) {
     if (UNLIKELY(v == 0)) return otherwise;
     return __builtin_ctz(v);
   }

   static inline int CountTrailingZeros(
       unsigned long v, int otherwise = sizeof(unsigned long) * CHAR_BIT) {
     if (UNLIKELY(v == 0)) return otherwise;
     return __builtin_ctzl(v);
   }

   static inline int CountTrailingZeros(
       unsigned long long v, int otherwise = sizeof(unsigned long long) * CHAR_BIT) {
     if (UNLIKELY(v == 0)) return otherwise;
     return __builtin_ctzll(v);
   }

   template<typename T>
   static inline int CountLeadingZeros(T v) {
     DCHECK(v >= 0);
     if (sizeof(T) == 4) {
       uint32_t orig = static_cast<uint32_t>(v);
       return __builtin_clz(orig);
     } else if (sizeof(T) == 8) {
       uint64_t orig = static_cast<uint64_t>(v);
       return __builtin_clzll(orig);
     } else {
       DCHECK(sizeof(T) == 16);
       if (UNLIKELY(v == 0)) return 128;
       unsigned __int128 orig = static_cast<unsigned __int128>(v);
       unsigned __int128 shifted = orig >> 64;
       if (shifted != 0) {
         return __builtin_clzll(shifted);
       } else {
         return __builtin_clzll(orig) + 64;
       }
     }
   }

   // Wrap the gutil/ version for convenience.
   static inline int Log2Floor(uint32_t n) {
     return Bits::Log2Floor(n);
   }

   // Wrap the gutil/ version for convenience.
   static inline int Log2Floor64(uint64_t n) {
     return Bits::Log2Floor64(n);
   }

   // Wrap the gutil/ version for convenience.
   static inline int Log2FloorNonZero64(uint64_t n) {
     return Bits::Log2FloorNonZero64(n);
   }

   /// More efficient version of similar functions found in gutil/
   static inline int Log2Ceiling(uint32 n) {
     int floor = Log2Floor(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {
       return floor;
     } else {
       return floor + 1;
     }
   }

   static inline int Log2Ceiling64(uint64_t n) {
     int floor = Log2Floor64(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {
       return floor;
     } else {
       return floor + 1;
     }
   }

   static inline int Log2CeilingNonZero64(uint64_t n) {
     int floor = Log2FloorNonZero64(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {
       return floor;
     } else {
       return floor + 1;
     }
   }
 };

 template<>
 inline int256_t BitUtil::Sign(int256_t value) {
   return value < 0 ? -1 : 1;
 }

 /// An encapsulation class of SIMD byteswap functions
 class SimdByteSwap {
  public:
   /// Byteswap an array in the scalar way with some builtin optimization for arrays of
   /// length <= 16 bytes.
   ///   const void* src: the source address of the input array;
   ///   int len: the length of the input array;
   ///   void* dst: the destination address of the output array;
   static void ByteSwapScalar(const void* src, int len, void* dst);

   /// SIMD ByteSwap functions:
   /// ByteSwap128 is to byteswap an array of 16 bytes(128 bits) using SSSE3 intrinsics;
   /// ByteSwap256 is to byteswap an array of 32 bytes(256 bits) using AVX2 intrinsics;
   /// Function parameters have the same meaning as ByteSwapScalar.
   static void ByteSwap128(const uint8_t* src, uint8_t* dst);
   static void ByteSwap256(const uint8_t* src, uint8_t* dst);

   /// Template function ByteSwapSimd is the entry point function to byteswap an array
   /// using SIMD approach.
   /// Template parameter:
   ///   int TEMPLATE_DATA_WIDTH: only 16 or 32 are valid now;
   ///   16 means using ByteSwap128 as the internal SIMD implementation;
   ///   32 means using ByteSwap256 as the internal SIMD implementation;
   /// Function parameters have the same meaning as ByteSwapScalar.
   template <int TEMPLATE_DATA_WIDTH>
   static void ByteSwapSimd(const void* src, const int len, void* dst);
 };
 }

 #endif
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.


	#ifndef IMPALA_BIT_UTIL_H
	#define IMPALA_BIT_UTIL_H

	#if defined(__APPLE__)
	#include <machine/endian.h>
	#else
	#include <endian.h>
	#endif

	#include <climits>
	#include <limits>
	#include <typeinfo>
	#include <type_traits>

	#include "common/compiler-util.h"
	#include "gutil/bits.h"
	#include "runtime/multi-precision.h"
	#include "util/cpu-info.h"
	#include "util/sse-util.h"

	namespace impala {

	// Doubles the width of integer types (e.g. int32_t -> int64_t).
	// Currently only works with a few signed types.
	// Feel free to extend it to other types as well.
	template <typename T>
	struct DoubleWidth {};

	template <>
	struct DoubleWidth<int32_t> {
	using type = int64_t;
	};

	template <>
	struct DoubleWidth<int64_t> {
	using type = int128_t;
	};

	template <>
	struct DoubleWidth<int128_t> {
	using type = int256_t;
	};

	/// Utility class to do standard bit tricks
	/// TODO: is this in boost or something else like that?
	class BitUtil {
	public:

	/// Returns the width of the integer portion of the type, not counting the sign bit.
	/// Not safe for use with unknown or non-native types, so make it undefined
	template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
	typename std::enable_if<std::is_integral<CVR_REMOVED>{} \|\|
	std::is_same<CVR_REMOVED, unsigned __int128>{} \|\|
	std::is_same<CVR_REMOVED, __int128>{}, int>::type = 0>
	constexpr static inline int UnsignedWidth() {
	return std::is_integral<CVR_REMOVED>::value ?
	std::numeric_limits<CVR_REMOVED>::digits :
	std::is_same<CVR_REMOVED, unsigned __int128>::value ? 128 :
	std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
	}

	/// Returns the max value that can be represented in T.
	template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
	typename std::enable_if<std::is_integral<CVR_REMOVED> {}\|\|
	std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0>
	constexpr static inline CVR_REMOVED Max() {
	return std::is_integral<CVR_REMOVED>::value ?
	std::numeric_limits<CVR_REMOVED>::max() :
	std::is_same<CVR_REMOVED, __int128>::value ?
	static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
	}

	/// Return an integer signifying the sign of the value, returning +1 for
	/// positive integers (and zero), -1 for negative integers.
	/// The extra shift is to silence GCC warnings about full width shift on
	/// unsigned types. It compiles out in optimized builds into the expected increment.
	template<typename T>
	constexpr static inline T Sign(T value) {
	return 1 \| ((value >> (UnsignedWidth<T>() - 1)) >> 1);
	}

	/// Returns the ceil of value/divisor
	constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) {
	return value / divisor + (value % divisor != 0);
	}

	/// Returns 'value' rounded up to the nearest multiple of 'factor'
	constexpr static inline int64_t RoundUp(int64_t value, int64_t factor) {
	return (value + (factor - 1)) / factor * factor;
	}

	/// Returns 'value' rounded down to the nearest multiple of 'factor'
	constexpr static inline int64_t RoundDown(int64_t value, int64_t factor) {
	return (value / factor) * factor;
	}

	/// Returns the smallest power of two that contains v. If v is a power of two, v is
	/// returned. Taken from
	/// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
	static inline int64_t RoundUpToPowerOfTwo(int64_t v) {
	--v;
	v \|= v >> 1;
	v \|= v >> 2;
	v \|= v >> 4;
	v \|= v >> 8;
	v \|= v >> 16;
	v \|= v >> 32;
	++v;
	return v;
	}

	/// Returns the largest power of two <= v.
	static inline int64_t RoundDownToPowerOfTwo(int64_t v) {
	int64_t v_rounded_up = RoundUpToPowerOfTwo(v);
	return v_rounded_up == v ? v : v_rounded_up / 2;
	}

	/// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
	/// a power of two
	static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
	DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
	return (value + (factor - 1)) & ~(factor - 1);
	}

	static inline int64_t RoundDownToPowerOf2(int64_t value, int64_t factor) {
	DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
	return value & ~(factor - 1);
	}

	constexpr static inline bool IsPowerOf2(int64_t value) {
	return (value & (value - 1)) == 0;
	}

	/// Specialized round up and down functions for frequently used factors,
	/// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
	/// Returns the rounded up number of bytes that fit the number of bits.
	constexpr static inline uint32_t RoundUpNumBytes(uint32_t bits) {
	return (bits + 7) >> 3;
	}

	/// Returns the rounded down number of bytes that fit the number of bits.
	constexpr static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; }

	/// Returns the rounded up to 32 multiple. Used for conversions of bits to i32.
	constexpr static inline uint32_t RoundUpNumi32(uint32_t bits) {
	return (bits + 31) >> 5;
	}

	/// Returns the rounded up 32 multiple.
	constexpr static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; }

	/// Returns the rounded up to 64 multiple. Used for conversions of bits to i64.
	constexpr static inline uint32_t RoundUpNumi64(uint32_t bits) {
	return (bits + 63) >> 6;
	}

	/// Returns the rounded down to 64 multiple.
	constexpr static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; }

	/// Non hw accelerated pop count.
	/// TODO: we don't use this in any perf sensitive code paths currently. There
	/// might be a much faster way to implement this.
	static inline int PopcountNoHw(uint64_t x) {
	int count = 0;
	for (; x != 0; ++count) x &= x-1;
	return count;
	}

	/// Returns the number of set bits in x
	static inline int Popcount(uint64_t x) {
	if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) {
	return POPCNT_popcnt_u64(x);
	} else {
	return PopcountNoHw(x);
	}
	}

	// Compute correct population count for various-width signed integers
	template<typename T>
	static inline int PopcountSigned(T v) {
	// Converting to same-width unsigned then extending preserves the bit pattern.
	return BitUtil::Popcount(static_cast<UnsignedType<T>>(v));
	}

	/// Returns the 'num_bits' least-significant bits of 'v'.
	/// Force inlining - GCC does not always inline this into hot loops.
	static ALWAYS_INLINE uint64_t TrailingBits(uint64_t v, int num_bits) {
	if (UNLIKELY(num_bits >= 64)) return v;
	return ((1UL << num_bits) - 1) & v;
	}

	/// Swaps the byte order (i.e. endianess)
	static inline int64_t ByteSwap(int64_t value) {
	return __builtin_bswap64(value);
	}
	static inline uint64_t ByteSwap(uint64_t value) {
	return __builtin_bswap64(value);
	}
	static inline int32_t ByteSwap(int32_t value) {
	return __builtin_bswap32(value);
	}
	static inline uint32_t ByteSwap(uint32_t value) {
	return __builtin_bswap32(value);
	}
	static inline int16_t ByteSwap(int16_t value) {
	return __builtin_bswap16(value);
	}
	static inline uint16_t ByteSwap(uint16_t value) {
	return __builtin_bswap16(value);
	}

	/// Write the swapped bytes into dest; source and dest must not overlap.
	/// This function is optimized for len <= 16. It reverts to a slow loop-based
	/// swap for len > 16.
	static void ByteSwap(void* dest, const void* source, int len);

	/// Converts to big endian format (if not already in big endian) from the
	/// machine's native endian format.
	#if __BYTE_ORDER == __LITTLE_ENDIAN
	static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
	static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
	static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
	static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
	static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
	static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
	#else
	static inline int64_t ToBigEndian(int64_t val) { return val; }
	static inline uint64_t ToBigEndian(uint64_t val) { return val; }
	static inline int32_t ToBigEndian(int32_t val) { return val; }
	static inline uint32_t ToBigEndian(uint32_t val) { return val; }
	static inline int16_t ToBigEndian(int16_t val) { return val; }
	static inline uint16_t ToBigEndian(uint16_t val) { return val; }
	#endif

	/// Converts from big endian format to the machine's native endian format.
	#if __BYTE_ORDER == __LITTLE_ENDIAN
	static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
	static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
	static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
	static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
	static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
	static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
	#else
	static inline int64_t FromBigEndian(int64_t val) { return val; }
	static inline uint64_t FromBigEndian(uint64_t val) { return val; }
	static inline int32_t FromBigEndian(int32_t val) { return val; }
	static inline uint32_t FromBigEndian(uint32_t val) { return val; }
	static inline int16_t FromBigEndian(int16_t val) { return val; }
	static inline uint16_t FromBigEndian(uint16_t val) { return val; }
	#endif

	/// Returns true if 'value' is a non-negative 32-bit integer.
	constexpr static inline bool IsNonNegative32Bit(int64_t value) {
	return static_cast<uint64_t>(value) <= std::numeric_limits<int32_t>::max();
	}

	/// Logical right shift for signed integer types
	/// This is needed because the C >> operator does arithmetic right shift
	/// Negative shift amounts lead to undefined behavior
	template <typename T>
	constexpr static T ShiftRightLogical(T v, int shift) {
	// Conversion to unsigned ensures most significant bits always filled with 0's
	return static_cast<UnsignedType<T>>(v) >> shift;
	}

	/// Get an specific bit of a numeric type
	template<typename T>
	static inline int8_t GetBit(T v, int bitpos) {
	T masked = v & (static_cast<T>(0x1) << bitpos);
	return static_cast<int8_t>(ShiftRightLogical(masked, bitpos));
	}

	/// Set a specific bit to 1
	/// Behavior when bitpos is negative is undefined
	template <typename T>
	constexpr static T SetBit(T v, int bitpos) {
	return v \| (static_cast<T>(0x1) << bitpos);
	}

	/// Set a specific bit to 0
	/// Behavior when bitpos is negative is undefined
	template <typename T>
	constexpr static T UnsetBit(T v, int bitpos) {
	return v & ~(static_cast<T>(0x1) << bitpos);
	}

	/// Wrappers around __builtin_ctz, which returns an undefined value when the argument is
	/// zero.
	static inline int CountTrailingZeros(
	unsigned int v, int otherwise = sizeof(unsigned int) * CHAR_BIT) {
	if (UNLIKELY(v == 0)) return otherwise;
	return __builtin_ctz(v);
	}

	static inline int CountTrailingZeros(
	unsigned long v, int otherwise = sizeof(unsigned long) * CHAR_BIT) {
	if (UNLIKELY(v == 0)) return otherwise;
	return __builtin_ctzl(v);
	}

	static inline int CountTrailingZeros(
	unsigned long long v, int otherwise = sizeof(unsigned long long) * CHAR_BIT) {
	if (UNLIKELY(v == 0)) return otherwise;
	return __builtin_ctzll(v);
	}

	template<typename T>
	static inline int CountLeadingZeros(T v) {
	DCHECK(v >= 0);
	if (sizeof(T) == 4) {
	uint32_t orig = static_cast<uint32_t>(v);
	return __builtin_clz(orig);
	} else if (sizeof(T) == 8) {
	uint64_t orig = static_cast<uint64_t>(v);
	return __builtin_clzll(orig);
	} else {
	DCHECK(sizeof(T) == 16);
	if (UNLIKELY(v == 0)) return 128;
	unsigned __int128 orig = static_cast<unsigned __int128>(v);
	unsigned __int128 shifted = orig >> 64;
	if (shifted != 0) {
	return __builtin_clzll(shifted);
	} else {
	return __builtin_clzll(orig) + 64;
	}
	}
	}

	// Wrap the gutil/ version for convenience.
	static inline int Log2Floor(uint32_t n) {
	return Bits::Log2Floor(n);
	}

	// Wrap the gutil/ version for convenience.
	static inline int Log2Floor64(uint64_t n) {
	return Bits::Log2Floor64(n);
	}

	// Wrap the gutil/ version for convenience.
	static inline int Log2FloorNonZero64(uint64_t n) {
	return Bits::Log2FloorNonZero64(n);
	}

	/// More efficient version of similar functions found in gutil/
	static inline int Log2Ceiling(uint32 n) {
	int floor = Log2Floor(n);
	// Check if zero or a power of two. This pattern is recognised by gcc and optimised
	// into branch-free code.
	if (0 == (n & (n - 1))) {
	return floor;
	} else {
	return floor + 1;
	}
	}

	static inline int Log2Ceiling64(uint64_t n) {
	int floor = Log2Floor64(n);
	// Check if zero or a power of two. This pattern is recognised by gcc and optimised
	// into branch-free code.
	if (0 == (n & (n - 1))) {
	return floor;
	} else {
	return floor + 1;
	}
	}

	static inline int Log2CeilingNonZero64(uint64_t n) {
	int floor = Log2FloorNonZero64(n);
	// Check if zero or a power of two. This pattern is recognised by gcc and optimised
	// into branch-free code.
	if (0 == (n & (n - 1))) {
	return floor;
	} else {
	return floor + 1;
	}
	}
	};

	template<>
	inline int256_t BitUtil::Sign(int256_t value) {
	return value < 0 ? -1 : 1;
	}

	/// An encapsulation class of SIMD byteswap functions
	class SimdByteSwap {
	public:
	/// Byteswap an array in the scalar way with some builtin optimization for arrays of
	/// length <= 16 bytes.
	/// const void* src: the source address of the input array;
	/// int len: the length of the input array;
	/// void* dst: the destination address of the output array;
	static void ByteSwapScalar(const void* src, int len, void* dst);

	/// SIMD ByteSwap functions:
	/// ByteSwap128 is to byteswap an array of 16 bytes(128 bits) using SSSE3 intrinsics;
	/// ByteSwap256 is to byteswap an array of 32 bytes(256 bits) using AVX2 intrinsics;
	/// Function parameters have the same meaning as ByteSwapScalar.
	static void ByteSwap128(const uint8_t* src, uint8_t* dst);
	static void ByteSwap256(const uint8_t* src, uint8_t* dst);

	/// Template function ByteSwapSimd is the entry point function to byteswap an array
	/// using SIMD approach.
	/// Template parameter:
	/// int TEMPLATE_DATA_WIDTH: only 16 or 32 are valid now;
	/// 16 means using ByteSwap128 as the internal SIMD implementation;
	/// 32 means using ByteSwap256 as the internal SIMD implementation;
	/// Function parameters have the same meaning as ByteSwapScalar.
	template <int TEMPLATE_DATA_WIDTH>
	static void ByteSwapSimd(const void* src, const int len, void* dst);
	};
	}

	#endif