// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once

#ifdef __AVX2__
#include <emmintrin.h>
#include <immintrin.h>
#endif
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "common/compiler_util.h"
namespace doris {
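
/// Drop-in replacement for memcpy that the compiler can fully inline. As with
/// standard memcpy, source and destination must not overlap (note the
/// __restrict qualifiers).
///
/// Illustrative usage (a minimal sketch; `buffer` and `row` are hypothetical names):
///
///     char buffer[16];
///     const char row[] = "hello";
///     doris::memcpy_inlined(buffer, row, sizeof(row));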
ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
                                         size_t size) {
    auto dst = static_cast<uint8_t*>(_dst);
    auto src = static_cast<const uint8_t*>(_src);

    [[maybe_unused]] tail:
    /// Small sizes and tails after the loop for large sizes.
    /// The order of branches is important; the optimal order in fact depends on the
    /// distribution of sizes in your application. This order of branches is taken from the
    /// disassembly of glibc's code.
    /// We copy chunks of possibly uneven size with two overlapping movs.
    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we copy the tail [1, 2, 3, 4] first and then
    /// the head [0, 1, 2, 3].
    if (size <= 16) {
        if (size >= 8) {
            /// Chunks of 8..16 bytes.
            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
            __builtin_memcpy(dst, src, 8);
        } else if (size >= 4) {
            /// Chunks of 4..7 bytes.
            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
            __builtin_memcpy(dst, src, 4);
        } else if (size >= 2) {
            /// Chunks of 2..3 bytes.
            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
            __builtin_memcpy(dst, src, 2);
        } else if (size >= 1) {
            /// A single byte.
            *dst = *src;
        }
        /// No bytes remaining.
    } else {
#ifdef __AVX2__
        if (size <= 256) {
            if (size <= 32) {
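                /// 17..32 bytes: copy the first 16 bytes here; the tail code
                /// above then handles the remaining 1..16 bytes.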
                __builtin_memcpy(dst, src, 8);
                __builtin_memcpy(dst + 8, src + 8, 8);
                size -= 16;
                dst += 16;
                src += 16;
                goto tail;
            }
            /// 33..256 bytes: copy 32 bytes at a time from the beginning in a loop, then copy
            /// the last 32 bytes with one unaligned store. That final store may rewrite part
            /// of what the loop already copied; this is OK, similar to the overlapping copies
            /// for small sizes above.
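            /// Worked example (illustrative): size = 40 → the loop copies bytes [0, 32);
            /// the final store copies bytes [8, 40), rewriting bytes 8..31.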
            while (size > 32) {
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
                                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
                dst += 32;
                src += 32;
                size -= 32;
            }
            _mm256_storeu_si256(
                    reinterpret_cast<__m256i*>(dst + size - 32),
                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
        } else {
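            /// Medium-large copies: "rep movsb" is fast on CPUs with enhanced rep movsb
            /// (ERMSB). The 512 KiB .. 2 MiB window is a tuning heuristic, presumably chosen
            /// around typical L2/L3 cache sizes; outside it, the unrolled AVX loop below is
            /// used instead.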
            if (size >= 512 * 1024 && size <= 2048 * 1024) {
                asm volatile("rep movsb"
                             : "=D"(dst), "=S"(src), "=c"(size)
                             : "0"(dst), "1"(src), "2"(size)
                             : "memory");
            } else {
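                /// Align dst to a 32-byte boundary: copy one full unaligned 32-byte head but
                /// advance the pointers only by `padding`, so the aligned loop below rewrites
                /// the overlap. Safe because size > 256 at this point.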
                size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
                if (padding > 0) {
                    __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
                    dst += padding;
                    src += padding;
                    size -= padding;
                }
                /// Aligned unrolled copy. We use half of the available AVX registers.
                /// After aligning dst, src is in general still unaligned, so we use aligned
                /// stores and unaligned loads.
                __m256i c0, c1, c2, c3, c4, c5, c6, c7;
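                /// Issuing all eight loads before the stores in each iteration presumably
                /// helps hide load latency and keeps loads and stores from interleaving.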
                while (size >= 256) {
                    c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
                    c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
                    c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
                    c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
                    c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
                    c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
                    c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
                    c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
                    src += 256;
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst), c0);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 32), c1);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 64), c2);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 96), c3);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 128), c4);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 160), c5);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 192), c6);
                    _mm256_store_si256(reinterpret_cast<__m256i*>(dst + 224), c7);
                    dst += 256;
                    size -= 256;
                }
                goto tail;
            }
        }
#else
        memcpy(dst, src, size);
#endif
    }
}
} // namespace doris