#include <stddef.h>

#include <arm_neon.h>

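/// A minimal subset of SSE intrinsics emulated with ARM NEON, so that the implementation below,
/// written with _mm_* intrinsics, also compiles on AArch64. Only the operations used by
/// inline_memcpy are provided: a 128-bit unaligned load and 128-bit stores (the "aligned" store
/// maps to an ordinary NEON store, which is fine because NEON loads/stores do not require alignment).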
typedef int64x2_t __m128i;

#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)

static inline __attribute__((always_inline)) void _mm_storeu_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

static inline __attribute__((always_inline)) void _mm_store_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

/** Custom memcpy implementation for ClickHouse.
  * It has the following benefits over using glibc's implementation:
  * 1. Avoiding a dependency on a specific version of a glibc symbol, like memcpy@@GLIBC_2.14, for portability.
  * 2. Avoiding an indirect call via the PLT due to shared linking, which can be less efficient.
  * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
  * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and 0.7%..1% on average across all queries.
  *
  * Writing our own memcpy is extremely difficult for the following reasons:
  * 1. The optimal variant depends on the specific CPU model.
  * 2. The optimal variant depends on the distribution of size arguments.
  * 3. It depends on the number of threads copying data concurrently.
  * 4. It also depends on how the calling code uses the copied data and how the different memcpy calls are related to each other.
  * The vast range of scenarios makes proper testing especially difficult.
  * When writing our own memcpy there is a risk of overoptimizing it
  * on non-representative microbenchmarks while making real-world use cases actually worse.
  *
  * Most of the benchmarks for memcpy on the internet are wrong.
  *
  * Let's look at the details:
  *
  * For small sizes, the order of branches in the code is important.
  * There are variants with a specific order of branches (like here or in glibc),
  * with a jump table (in asm code; see the example from Cosmopolitan libc:
  * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61),
  * or with Duff's device in C (see https://github.com/skywind3000/FastMemcpy/).
  *
  * It's also important how uneven sizes are copied.
  * Almost every implementation, including this one, uses two overlapping moves.
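  * For example, for a size in the range [8, 16] the whole chunk is covered by two 8-byte moves
  * that may overlap in the middle (this is exactly what the code below does):
  *
  *     __builtin_memcpy(dst + size - 8, src + size - 8, 8);
  *     __builtin_memcpy(dst, src, 8);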
  *
  * It is important to disable -ftree-loop-distribute-patterns when compiling the memcpy implementation,
  * otherwise the compiler can replace the internal loops with a call to memcpy, which leads to infinite recursion.
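  * As an illustration only (the file name and the other flags here are hypothetical), with GCC
  * disabling it for this translation unit could look like:
  *
  *     g++ -O3 -fno-tree-loop-distribute-patterns -c memcpy.cpp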
  *
  * For larger sizes it's important to choose which instructions are used:
  * - SSE, AVX or AVX-512;
  * - rep movsb.
  * Performance will depend on the size threshold, on the CPU model, and on the "erms" flag
  * ("Enhanced Rep MovS" - it indicates that the performance of "rep movsb" is decent for large sizes)
  * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
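  *
  * As an illustration only (not used in this implementation, and the helper name is ours),
  * "rep movsb" can be emitted from C++ with GCC/Clang inline assembly on x86_64:
  *
  *     static inline void rep_movsb(char * __restrict dst, const char * __restrict src, size_t n)
  *     {
  *         asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(n) : : "memory");
  *     }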
  *
  * Using AVX-512 can be bad due to throttling.
  * Using AVX can be bad if most of the code uses SSE, due to the transition penalty
  * (it also depends on the usage of the "vzeroupper" instruction).
  * But in some cases AVX gives a win.
  *
  * It also depends on how many times the loop is unrolled.
  * We are unrolling the loop 8 times (by the number of available registers), but it is not always the best.
  *
  * It also depends on the usage of aligned or unaligned loads/stores.
  * We are using unaligned loads and aligned stores.
  *
  * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD.
  * Setting up the correct offset for prefetching is non-obvious.
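  * As an illustration only (not used here, and the 512-byte look-ahead offset is arbitrary):
  *
  *     __builtin_prefetch(src + 512);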
  *
  * Non-temporal (cache-bypassing) stores can be used for very large sizes (more than half of the L3 cache).
  * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
  * because the L3 cache is shared (and the L2 cache is partially shared).
  *
  * A very large memcpy size typically indicates suboptimal (not cache-friendly) algorithms in the code or unrealistic scenarios,
  * so we don't pay attention to using non-temporal stores.
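  * For illustration only (not used in this implementation), on x86 with SSE2 a non-temporal copy
  * would issue streaming stores and fence afterwards:
  *
  *     _mm_stream_si128(reinterpret_cast<__m128i *>(dst), value);   /// Bypasses the cache hierarchy.
  *     _mm_sfence();   /// Orders the weakly-ordered streaming stores before subsequent operations.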
  *
  * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial,
  * even compared to non-temporal aligned unrolled stores with the widest registers.
  *
  * memcpy can be written in asm, C or C++. The latter can also use inline asm.
  * The asm implementation can be better for making sure that the compiler won't make the code worse,
  * and for ensuring the order of branches, the code layout and the usage of all required registers.
  * But if it is located in a separate translation unit, inlining will not be possible
  * (inline asm can be used to overcome this limitation).
  * Sometimes C or C++ code can be further optimized by the compiler.
  * For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
  *
  * Please note that the compiler can replace plain code with a call to memcpy and vice versa:
  * - memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy;
  *   it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
  *   This is often used to implement unaligned load/store without undefined behaviour in C++ (see the sketch after this list).
  * - a loop copying bytes can be recognized and replaced by a call to memcpy;
  *   it is controlled by -ftree-loop-distribute-patterns.
  * - also note that a loop copying bytes can be unrolled, peeled and vectorized, which will give you
  *   inline code somewhat similar to a decent implementation of memcpy.
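  *
  * A minimal sketch of the unaligned-load idiom mentioned in the first bullet
  * (the function name is ours; uint64_t requires <cstdint>):
  *
  *     static inline uint64_t unaligned_load64(const void * address)
  *     {
  *         uint64_t res;
  *         __builtin_memcpy(&res, address, sizeof(res));
  *         return res;
  *     }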
  *
  * This description is up to date as of Mar 2021.
  *
  * How to test the memcpy implementation for performance:
  * 1. Test on a real production workload.
  * 2. For synthetic tests, see utils/memcpy-bench, but make sure you do your best to cover a wide range of scenarios.
  *
  * TODO: Add self-tuning memcpy with a Bayesian bandits algorithm for large sizes.
  * See https://habr.com/en/company/yandex/blog/457612/
  */


static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
    /// We will use pointer arithmetic, so a char pointer will be used.
    /// Note that __restrict makes sense (otherwise the compiler will reload data from memory
    /// instead of using the values in registers due to possible aliasing).
    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);

    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
    /// for inlining and removing this single instruction (see the usage sketch at the end of this file).
    void * ret = dst;

tail:
    /// Small sizes and tails after the loop for large sizes.
    /// The order of branches is important, but in fact the optimal order depends on the distribution of sizes in your application.
    /// This order of branches is taken from the disassembly of glibc's code.
    /// We copy chunks of possibly uneven size with two overlapping moves.
    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy the tail [1, 2, 3, 4] first and then the head [0, 1, 2, 3].
    if (size <= 16)
    {
        if (size >= 8)
        {
            /// Chunks of 8..16 bytes.
            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
            __builtin_memcpy(dst, src, 8);
        }
        else if (size >= 4)
        {
            /// Chunks of 4..7 bytes.
            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
            __builtin_memcpy(dst, src, 4);
        }
        else if (size >= 2)
        {
            /// Chunks of 2..3 bytes.
            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
            __builtin_memcpy(dst, src, 2);
        }
        else if (size >= 1)
        {
            /// A single byte.
            *dst = *src;
        }
        /// No bytes remaining.
    }
    else
    {
        /// Medium and large sizes.
        if (size <= 128)
        {
            /// Medium size, not enough for full loop unrolling.

            /// We will copy the last 16 bytes.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

            /// Then we will copy every 16 bytes from the beginning in a loop.
            /// The last loop iteration will possibly overwrite some part of the already copied last 16 bytes.
            /// This is OK, similar to the code for small sizes above.
            while (size > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
                dst += 16;
                src += 16;
                size -= 16;
            }
        }
        else
        {
            /// Large size with a fully unrolled loop.

            /// Align the destination to a 16-byte boundary.
            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
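            /// Example: if dst & 15 == 9, then padding == (16 - 9) & 15 == 7, so dst + 7 is 16-byte aligned;
            /// if dst is already aligned, padding == 0 and the head copy below is skipped.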

            /// If it is not aligned, we will copy the first 16 bytes with an unaligned store.
            if (padding > 0)
            {
                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
                dst += padding;
                src += padding;
                size -= padding;
            }

            /// Aligned unrolled copy. We will use half of the available SSE registers.
            /// In general it's not possible to have both src and dst aligned,
            /// so we will use aligned stores and unaligned loads.
            __m128i c0, c1, c2, c3, c4, c5, c6, c7;

            while (size >= 128)
            {
                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
                src += 128;
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
                dst += 128;

                size -= 128;
            }

            /// The remaining 0..127 bytes will be processed by the small and medium size code above.
            goto tail;
        }
    }

    return ret;
}

extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
{
    return inline_memcpy(dst, src, size);
}
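
/** Usage sketch (not part of the original implementation; the wrapper function here is hypothetical):
  * including this header and calling inline_memcpy directly allows the compiler to inline the size
  * branches into the caller and avoids materializing the returned pointer, e.g.
  *
  *     void copy_row(char * dst, const char * src, size_t row_size)
  *     {
  *         inline_memcpy(dst, src, row_size);
  *     }
  */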