| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // This file is copied from |
| // https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/bit-util.h |
| // and modified by Doris |
| |
| #pragma once |
| |
| #include <type_traits> |
| |
| #include "vec/core/wide_integer.h" |
| #ifndef __APPLE__ |
| #include <endian.h> |
| #endif |
| |
| #include "common/compiler_util.h" // IWYU pragma: keep |
| #include "gutil/bits.h" |
| #include "gutil/endian.h" |
| #include "util/cpu_info.h" |
| #include "util/sse_util.hpp" |
| |
| namespace doris { |
| |
| // Utility class to do standard bit tricks |
| // TODO: is this in boost or something else like that? |
| class BitUtil { |
| public: |
| // Returns the ceil of value/divisor |
| static inline int64_t ceil(int64_t value, int64_t divisor) { |
| return value / divisor + (value % divisor != 0); |
| } |
| |
| // Returns 'value' rounded up to the nearest multiple of 'factor' |
| static inline int64_t round_up(int64_t value, int64_t factor) { |
| return (value + (factor - 1)) / factor * factor; |
| } |
| |
| // Returns the smallest power of two that contains v. Taken from |
| // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 |
| // TODO: Pick a better name, as it is not clear what happens when the input is |
| // already a power of two. |
| static inline int64_t next_power_of_two(int64_t v) { |
| --v; |
| v |= v >> 1; |
| v |= v >> 2; |
| v |= v >> 4; |
| v |= v >> 8; |
| v |= v >> 16; |
| v |= v >> 32; |
| ++v; |
| return v; |
| } |
| |
| // Non hw accelerated pop count. |
| // TODO: we don't use this in any perf sensitive code paths currently. There |
| // might be a much faster way to implement this. |
| static inline int popcount_no_hw(uint64_t x) { |
| int count = 0; |
| |
| for (; x != 0; ++count) { |
| x &= x - 1; |
| } |
| |
| return count; |
| } |
| |
| // Returns the number of set bits in x |
| static inline int popcount(uint64_t x) { |
| if (LIKELY(CpuInfo::is_supported(CpuInfo::POPCNT))) { |
| return __builtin_popcountl(x); |
| } else { |
| return popcount_no_hw(x); |
| } |
| } |
| |
| // Returns the 'num_bits' least-significant bits of 'v'. |
| static inline uint64_t trailing_bits(uint64_t v, int num_bits) { |
| if (UNLIKELY(num_bits == 0)) { |
| return 0; |
| } |
| |
| if (UNLIKELY(num_bits >= 64)) { |
| return v; |
| } |
| |
| int n = 64 - num_bits; |
| return (v << n) >> n; |
| } |
| |
| template <typename T> |
| static std::string IntToByteBuffer(T input) { |
| std::string buffer; |
| T value = input; |
| for (int i = 0; i < sizeof(value); ++i) { |
| // Applies a mask for a byte range on the input. |
| signed char value_to_save = value & 0XFF; |
| buffer.push_back(value_to_save); |
| // Remove the just processed part from the input so that we can exit early if there |
| // is nothing left to process. |
| value >>= 8; |
| if (value == 0 && value_to_save >= 0) { |
| break; |
| } |
| if (value == -1 && value_to_save < 0) { |
| break; |
| } |
| } |
| std::reverse(buffer.begin(), buffer.end()); |
| return buffer; |
| } |
| |
| // Returns ceil(log2(x)). |
| // TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up |
| // in a hot path. |
| static inline int log2(uint64_t x) { |
| DCHECK_GT(x, 0); |
| |
| if (x == 1) { |
| return 0; |
| } |
| |
| // Compute result = ceil(log2(x)) |
| // = floor(log2(x - 1)) + 1, for x > 1 |
| // by finding the position of the most significant bit (1-indexed) of x - 1 |
| // (floor(log2(n)) = MSB(n) (0-indexed)) |
| --x; |
| int result = 1; |
| |
| while (x >>= 1) { |
| ++result; |
| } |
| |
| return result; |
| } |
| |
| // Swaps the byte order (i.e. endianess) |
| static inline int64_t byte_swap(int64_t value) { return __builtin_bswap64(value); } |
| static inline uint64_t byte_swap(uint64_t value) { |
| return static_cast<uint64_t>(__builtin_bswap64(value)); |
| } |
| static inline int32_t byte_swap(int32_t value) { return __builtin_bswap32(value); } |
| static inline uint32_t byte_swap(uint32_t value) { |
| return static_cast<uint32_t>(__builtin_bswap32(value)); |
| } |
| static inline int16_t byte_swap(int16_t value) { |
| return (((value >> 8) & 0xff) | ((value & 0xff) << 8)); |
| } |
| static inline uint16_t byte_swap(uint16_t value) { |
| return static_cast<uint16_t>(byte_swap(static_cast<int16_t>(value))); |
| } |
| |
| // Write the swapped bytes into dst. len must be 1, 2, 4 or 8. |
| static inline void byte_swap(void* dst, void* src, int len) { |
| switch (len) { |
| case 1: |
| *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<int8_t*>(src); |
| break; |
| |
| case 2: |
| *reinterpret_cast<int16_t*>(dst) = byte_swap(*reinterpret_cast<int16_t*>(src)); |
| break; |
| |
| case 4: |
| *reinterpret_cast<int32_t*>(dst) = byte_swap(*reinterpret_cast<int32_t*>(src)); |
| break; |
| |
| case 8: |
| *reinterpret_cast<int64_t*>(dst) = byte_swap(*reinterpret_cast<int64_t*>(src)); |
| break; |
| |
| default: |
| DCHECK(false); |
| } |
| } |
| |
| // Returns the rounded up to 64 multiple. Used for conversions of bits to i64. |
| static inline uint32_t round_up_numi64(uint32_t bits) { return (bits + 63) >> 6; } |
| |
| // Returns the rounded up to 32 multiple. Used for conversions of bits to i32. |
| constexpr static inline uint32_t round_up_numi32(uint32_t bits) { return (bits + 31) >> 5; } |
| |
| #if __BYTE_ORDER == __LITTLE_ENDIAN |
| // Converts to big endian format (if not already in big endian). |
| static inline int64_t big_endian(int64_t value) { return byte_swap(value); } |
| static inline uint64_t big_endian(uint64_t value) { return byte_swap(value); } |
| static inline int32_t big_endian(int32_t value) { return byte_swap(value); } |
| static inline uint32_t big_endian(uint32_t value) { return byte_swap(value); } |
| static inline int16_t big_endian(int16_t value) { return byte_swap(value); } |
| static inline uint16_t big_endian(uint16_t value) { return byte_swap(value); } |
| #else |
| static inline int64_t big_endian(int64_t val) { return val; } |
| static inline uint64_t big_endian(uint64_t val) { return val; } |
| static inline int32_t big_endian(int32_t val) { return val; } |
| static inline uint32_t big_endian(uint32_t val) { return val; } |
| static inline int16_t big_endian(int16_t val) { return val; } |
| static inline uint16_t big_endian(uint16_t val) { return val; } |
| #endif |
| |
| template <typename T> |
| static T big_endian_to_host(T value) { |
| if constexpr (std::is_same_v<T, wide::Int256>) { |
| return BigEndian::ToHost256(value); |
| } else if constexpr (std::is_same_v<T, wide::UInt256>) { |
| return BigEndian::ToHost256(value); |
| } else if constexpr (std::is_same_v<T, __int128>) { |
| return BigEndian::ToHost128(value); |
| } else if constexpr (std::is_same_v<T, unsigned __int128>) { |
| return BigEndian::ToHost128(value); |
| } else if constexpr (std::is_same_v<T, int64_t>) { |
| return BigEndian::ToHost64(value); |
| } else if constexpr (std::is_same_v<T, uint64_t>) { |
| return BigEndian::ToHost64(value); |
| } else if constexpr (std::is_same_v<T, int32_t>) { |
| return BigEndian::ToHost32(value); |
| } else if constexpr (std::is_same_v<T, uint32_t>) { |
| return BigEndian::ToHost32(value); |
| } else if constexpr (std::is_same_v<T, int16_t>) { |
| return BigEndian::ToHost16(value); |
| } else if constexpr (std::is_same_v<T, uint16_t>) { |
| return BigEndian::ToHost16(value); |
| } else if constexpr (std::is_same_v<T, int8_t>) { |
| return value; |
| } else if constexpr (std::is_same_v<T, uint8_t>) { |
| return value; |
| } else { |
| throw Exception(Status::FatalError("__builtin_unreachable")); |
| } |
| } |
| |
| /// Returns the smallest power of two that contains v. If v is a power of two, v is |
| /// returned. Taken from |
| /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 |
| static inline int64_t RoundUpToPowerOfTwo(int64_t v) { |
| --v; |
| v |= v >> 1; |
| v |= v >> 2; |
| v |= v >> 4; |
| v |= v >> 8; |
| v |= v >> 16; |
| v |= v >> 32; |
| ++v; |
| return v; |
| } |
| |
| // Wrap the gutil/ version for convenience. |
| static inline int Log2FloorNonZero64(uint64_t n) { return Bits::Log2FloorNonZero64(n); } |
| |
| // Wrap the gutil/ version for convenience. |
| static inline int Log2Floor64(uint64_t n) { return Bits::Log2Floor64(n); } |
| |
| static inline int Log2Ceiling64(uint64_t n) { |
| int floor = Log2Floor64(n); |
| // Check if zero or a power of two. This pattern is recognised by gcc and optimised |
| // into branch-free code. |
| if (0 == (n & (n - 1))) { |
| return floor; |
| } else { |
| return floor + 1; |
| } |
| } |
| |
| static inline int Log2CeilingNonZero64(uint64_t n) { |
| int floor = Log2FloorNonZero64(n); |
| // Check if zero or a power of two. This pattern is recognised by gcc and optimised |
| // into branch-free code. |
| if (0 == (n & (n - 1))) { |
| return floor; |
| } else { |
| return floor + 1; |
| } |
| } |
| |
| // Returns the rounded up to 64 multiple. Used for conversions of bits to i64. |
| static inline uint32_t round_up_numi_64(uint32_t bits) { return (bits + 63) >> 6; } |
| |
| constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) { |
| return value / divisor + (value % divisor != 0); |
| } |
| |
| constexpr static inline bool IsPowerOf2(int64_t value) { return (value & (value - 1)) == 0; } |
| |
| constexpr static inline int64_t RoundDown(int64_t value, int64_t factor) { |
| return (value / factor) * factor; |
| } |
| |
| /// Specialized round up and down functions for frequently used factors, |
| /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64) |
| /// Returns the rounded up number of bytes that fit the number of bits. |
| constexpr static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; } |
| |
| /// Non hw accelerated pop count. |
| /// TODO: we don't use this in any perf sensitive code paths currently. There |
| /// might be a much faster way to implement this. |
| static inline int PopcountNoHw(uint64_t x) { |
| int count = 0; |
| for (; x != 0; ++count) x &= x - 1; |
| return count; |
| } |
| |
| /// Returns the number of set bits in x |
| static inline int Popcount(uint64_t x) { |
| //if (LIKELY(CpuInfo::is_supported(CpuInfo::POPCNT))) { |
| // return POPCNT_popcnt_u64(x); |
| //} else { |
| return PopcountNoHw(x); |
| // } |
| } |
| |
| // Compute correct population count for various-width signed integers |
| template <typename T> |
| static inline int PopcountSigned(T v) { |
| // Converting to same-width unsigned then extending preserves the bit pattern. |
| return BitUtil::Popcount(static_cast<typename std::make_unsigned<T>::type>(v)); |
| } |
| |
| /// Logical right shift for signed integer types |
| /// This is needed because the C >> operator does arithmetic right shift |
| /// Negative shift amounts lead to undefined behavior |
| template <typename T> |
| constexpr static T ShiftRightLogical(T v, int shift) { |
| // Conversion to unsigned ensures most significant bits always filled with 0's |
| return static_cast<typename std::make_unsigned<T>::type>(v) >> shift; |
| } |
| |
| /// Get an specific bit of a numeric type |
| template <typename T> |
| static inline int8_t GetBit(T v, int bitpos) { |
| T masked = v & (static_cast<T>(0x1) << bitpos); |
| return static_cast<int8_t>(ShiftRightLogical(masked, bitpos)); |
| } |
| |
| /// Set a specific bit to 1 |
| /// Behavior when bitpos is negative is undefined |
| template <typename T> |
| constexpr static T SetBit(T v, int bitpos) { |
| return v | (static_cast<T>(0x1) << bitpos); |
| } |
| |
| /// Set a specific bit to 0 |
| /// Behavior when bitpos is negative is undefined |
| template <typename T> |
| constexpr static T UnsetBit(T v, int bitpos) { |
| return v & ~(static_cast<T>(0x1) << bitpos); |
| } |
| |
| /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is |
| /// a power of two |
| static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) { |
| DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); |
| return (value + (factor - 1)) & ~(factor - 1); |
| } |
| |
| // speed up function compute for SIMD |
| static inline size_t RoundUpToPowerOf2Int32(size_t value, size_t factor) { |
| DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); |
| return (value + (factor - 1)) & ~(factor - 1); |
| } |
| |
| template <typename T> |
| static inline T RoundDownToPowerOf2(T value, T factor) { |
| static_assert(std::is_integral<T>::value, "T must be an integral type"); |
| DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); |
| return value & ~(factor - 1); |
| } |
| |
| // Returns the ceil of value/divisor |
| static inline int Ceil(int value, int divisor) { |
| return value / divisor + (value % divisor != 0); |
| } |
| |
| // Returns the 'num_bits' least-significant bits of 'v'. |
| static inline uint64_t TrailingBits(uint64_t v, int num_bits) { |
| if (PREDICT_FALSE(num_bits == 0)) return 0; |
| if (PREDICT_FALSE(num_bits >= 64)) return v; |
| int n = 64 - num_bits; |
| return (v << n) >> n; |
| } |
| |
| static inline uint64_t ShiftLeftZeroOnOverflow(uint64_t v, int num_bits) { |
| if (PREDICT_FALSE(num_bits >= 64)) return 0; |
| return v << num_bits; |
| } |
| |
| static inline uint64_t ShiftRightZeroOnOverflow(uint64_t v, int num_bits) { |
| if (PREDICT_FALSE(num_bits >= 64)) return 0; |
| return v >> num_bits; |
| } |
| |
| static void ByteSwapScalar(void* dest, const void* source, int len) { |
| uint8_t* dst = reinterpret_cast<uint8_t*>(dest); |
| const uint8_t* src = reinterpret_cast<const uint8_t*>(source); |
| switch (len) { |
| case 1: |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src); |
| return; |
| case 2: |
| *reinterpret_cast<uint16_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src)); |
| return; |
| case 3: |
| *reinterpret_cast<uint16_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 2); |
| return; |
| case 4: |
| *reinterpret_cast<uint32_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src)); |
| return; |
| case 5: |
| *reinterpret_cast<uint32_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 4); |
| return; |
| case 6: |
| *reinterpret_cast<uint32_t*>(dst + 2) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src)); |
| *reinterpret_cast<uint16_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 4)); |
| return; |
| case 7: |
| *reinterpret_cast<uint32_t*>(dst + 3) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src)); |
| *reinterpret_cast<uint16_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 4)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 6); |
| return; |
| case 8: |
| *reinterpret_cast<uint64_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| return; |
| case 9: |
| *reinterpret_cast<uint64_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 8); |
| return; |
| case 10: |
| *reinterpret_cast<uint64_t*>(dst + 2) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint16_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 8)); |
| return; |
| case 11: |
| *reinterpret_cast<uint64_t*>(dst + 3) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint16_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 8)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 10); |
| return; |
| case 12: |
| *reinterpret_cast<uint64_t*>(dst + 4) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint32_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src + 8)); |
| return; |
| case 13: |
| *reinterpret_cast<uint64_t*>(dst + 5) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint32_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src + 8)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 12); |
| return; |
| case 14: |
| *reinterpret_cast<uint64_t*>(dst + 6) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint32_t*>(dst + 2) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src + 8)); |
| *reinterpret_cast<uint16_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 12)); |
| return; |
| case 15: |
| *reinterpret_cast<uint64_t*>(dst + 7) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint32_t*>(dst + 3) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint32_t*>(src + 8)); |
| *reinterpret_cast<uint16_t*>(dst + 1) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint16_t*>(src + 12)); |
| *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 14); |
| return; |
| case 16: |
| *reinterpret_cast<uint64_t*>(dst + 8) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src)); |
| *reinterpret_cast<uint64_t*>(dst) = |
| BitUtil::byte_swap(*reinterpret_cast<const uint64_t*>(src + 8)); |
| return; |
| default: |
| // Revert to slow loop-based swap. |
| ByteSwapScalarLoop(source, len, dest); |
| return; |
| } |
| } |
| |
| static void ByteSwapScalarLoop(const void* src, int len, void* dst) { |
| //TODO: improve the performance of following code further using BSWAP intrinsic |
| uint8_t* d = reinterpret_cast<uint8_t*>(dst); |
| const uint8_t* s = reinterpret_cast<const uint8_t*>(src); |
| for (int i = 0; i < len; ++i) d[i] = s[len - i - 1]; |
| } |
| }; |
| |
| } // namespace doris |