blob: a389739bed873955ee26986d5683b3dd4064b9e0 [file] [log] [blame]
/*
* memcpy.c
*/
#include <string.h>
#include <stdint.h>
/*
 * memcpy - copy n bytes from src to dst and return dst.
 *
 * The source and destination are expected not to overlap (the standard
 * memcpy contract).  Note that the copy direction differs by
 * architecture: the x86 paths copy ascending, the Cortex-M paths copy
 * from the highest offset downwards.
 *
 * All inline assembly uses the __asm__ spelling so the file also builds
 * in strict ISO modes (-std=c99/-std=c11), and declares a "memory"
 * clobber because the asm reads and writes the buffers behind the
 * compiler's back.
 */
void *memcpy(void *dst, const void *src, size_t n)
{
	const char *p = src;
	char *q = dst;

#if defined(__i386__)
	size_t nl = n >> 2;	/* number of whole 32-bit words */

	/*
	 * cld makes the string instructions ascend.  "rep movsl" copies the
	 * words, then the count register is reloaded with the 0..3 leftover
	 * bytes for "rep movsb".
	 */
	__asm__ __volatile__("cld ; rep ; movsl ; movl %3,%0 ; rep ; movsb"
			     : "+c"(nl), "+S"(p), "+D"(q)
			     : "r"(n & 3)
			     : "memory", "cc");
#elif defined(__x86_64__)
	size_t nq = n >> 3;	/* number of whole 64-bit words */

	/*
	 * Same scheme with 64-bit words.  The leftover count (0..7) is
	 * passed as a 32-bit value so that "movl ...,%ecx" gets a 32-bit
	 * register operand; writing %ecx zero-extends into %rcx.
	 */
	__asm__ __volatile__("cld ; rep ; movsq ; movl %3,%%ecx ; rep ; movsb"
			     : "+c"(nq), "+S"(p), "+D"(q)
			     : "r"((uint32_t)(n & 7))
			     : "memory", "cc");
#elif defined(ARCH_cortex_m0) || defined(ARCH_cortex_m3) || defined(ARCH_cortex_m4) || defined(ARCH_cortex_m7)
	/*
	 * "off" is the running byte offset; it starts at n and counts down,
	 * so data is copied from the end of the buffers towards the start.
	 * Registers are chosen by the compiler through operand constraints
	 * ("l" = low register, as required by the 16-bit Thumb encodings)
	 * instead of assuming the arguments are still in r0-r2.
	 *
	 * NOTE: the loops terminate on the sign of the offset (bpl), so
	 * counts above 2 GiB are not handled.
	 */
	size_t off = n;
	uint32_t tmp;

#if defined(ARCH_cortex_m3) || defined(ARCH_cortex_m4) || defined(ARCH_cortex_m7)
	/*
	 * For Cortex-M3/4/7 we can speed up a bit by moving 32-bit words
	 * since it supports unaligned access.  On exit "off" holds the
	 * 0..3 bytes still to be copied by the byte loop below.
	 */
	__asm__ __volatile__(".syntax unified \n"
			     "	b	2f \n"
			     "1:	ldr	%[tmp], [%[src], %[off]] \n"
			     "	str	%[tmp], [%[dst], %[off]] \n"
			     "2:	subs	%[off], #4 \n"
			     "	bpl	1b \n"
			     "	add	%[off], #4 \n"
			     : [off] "+l"(off), [tmp] "=&l"(tmp)
			     : [src] "l"(p), [dst] "l"(q)
			     : "memory", "cc");
#endif
	/* Byte loop: copies everything on Cortex-M0, the tail otherwise. */
	__asm__ __volatile__(".syntax unified \n"
			     "	b	4f \n"
			     "3:	ldrb	%[tmp], [%[src], %[off]] \n"
			     "	strb	%[tmp], [%[dst], %[off]] \n"
			     "4:	subs	%[off], #1 \n"
			     "	bpl	3b \n"
			     : [off] "+l"(off), [tmp] "=&l"(tmp)
			     : [src] "l"(p), [dst] "l"(q)
			     : "memory", "cc");
#else
	/* Portable fallback: plain ascending byte copy. */
	while (n--) {
		*q++ = *p++;
	}
#endif

	return dst;
}