| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #ifndef DORIS_BE_SRC_OLAP_BHP_LIB_H |
| #define DORIS_BE_SRC_OLAP_BHP_LIB_H |
| |
| #include <stddef.h> |
| |
| namespace doris { |
| |
| /* |
|  * Compares the first count bytes of buf1 and buf2 using hand-written x86-64 |
|  * inline assembly (pcmpeqb / pmovmskb for the 16- and 8-byte paths). |
|  * |
|  * The length is dispatched into four ranges: 16 or more, 8 to 15, 4 to 7 and |
|  * 0 to 3 bytes. The 16- and 8-byte paths cover a short tail by re-reading a |
|  * window that ends exactly at the last byte, so bytes already known to be |
|  * equal are compared a second time instead of reading beyond count. |
|  * |
|  * buf1, buf2: start of the two regions (bound to rdi and rsi). |
|  * count:      number of bytes to compare (bound to edx). |
|  * |
|  * Returns 0 when the regions are equal. Otherwise the sign follows the first |
|  * differing byte, compared as unsigned: most paths return buf1[i] - buf2[i], |
|  * while the 4-byte word path returns exactly 1 or -1. |
|  */ inline int memcmp_sse(const void* buf1, const void* buf2, unsigned int count) { |
| int result ; /* receives eax when the asm block finishes */ |
| |
| __asm__ __volatile__( |
| "cmpl $16, %%edx;" |
| "jb 9f;" /* fewer than 16 bytes: use the narrower paths */ |
| "16:" /* len >= 16 */ |
| "movdqu (%%rdi), %%xmm0;" |
| "movdqu (%%rsi), %%xmm1;" |
| "pcmpeqb %%xmm1, %%xmm0;" /* 0xff in every byte lane that matches */ |
| "pmovmskb %%xmm0,%%rcx;" /* one mask bit per byte lane */ |
| "xorl $0xffff, %%ecx;" /* invert: set bits now mark mismatching lanes */ |
| "jz 15f;" |
| "bsf %%ecx, %%ecx;" /* diff: index of the first mismatching byte */ |
| "movzb (%%rsi, %%rcx), %%edx;" |
| "movzb (%%rdi, %%rcx), %%eax;" |
| "subl %%edx, %%eax;" /* result = buf1 byte - buf2 byte */ |
| "jmp 0f;" |
| "15:" /* same */ |
| "subl $16, %%edx;" |
| "jbe 1f;" /* nothing left, or the overlapped tail window was just checked */ |
| "movq $16, %%rcx;" |
| "cmpl $16, %%edx;" |
| "jae 14f;" |
| "movl %%edx, %%ecx;" /* short tail: advance by the remainder so the next window overlaps */ |
| "14:" |
| //"addq %%rcx, %%rdi;" |
| "lea (%%rdi,%%rcx), %%rdi;" |
| "addq %%rcx, %%rsi;" |
| "jmp 16b;" |
| |
| "9:" /* 8 <= len <= 15 */ |
| "cmpl $8, %%edx;" |
| "jb 5f;" |
| "8:" |
| "movq (%%rdi), %%xmm0;" |
| "movq (%%rsi), %%xmm1;" |
| "pcmpeqb %%xmm1, %%xmm0;" |
| "pmovmskb %%xmm0, %%rcx;" |
| "and $0xff, %%ecx;" /* only the low 8 lanes were loaded */ |
| "xorl $0xff, %%ecx;" |
| "je 7f;" |
| "bsf %%ecx, %%ecx;" /* diff */ |
| "movzb (%%rsi, %%rcx), %%edx;" |
| "movzb (%%rdi, %%rcx), %%eax;" |
| "subl %%edx, %%eax;" |
| "jmp 0f;" |
| |
| "7:" /* first 8 bytes equal */ |
| "subl $8, %%edx;" |
| "jz 1f;" /* len was exactly 8 */ |
| "movl %%edx, %%ecx;" /* ecx = len - 8: offset of the window holding the last 8 bytes */ |
| "movq (%%rdi, %%rcx), %%xmm0;" |
| "movq (%%rsi, %%rcx), %%xmm1;" |
| "pcmpeqb %%xmm1, %%xmm0;" |
| "pmovmskb %%xmm0, %%rcx;" |
| "and $0xff, %%ecx;" |
| "xorl $0xff, %%ecx;" |
| "je 1f;" |
| "bsf %%ecx, %%ecx;" |
| "addl %%edx, %%ecx;" /* turn the in-window index into an offset from the start */ |
| "movzb (%%rsi, %%rcx), %%edx;" |
| "movzb (%%rdi, %%rcx), %%eax;" |
| "subl %%edx, %%eax;" |
| "jmp 0f;" |
| |
| "5:" /* len < 8 */ |
| "cmpl $4, %%edx;" |
| "jb 13f;" |
| "4:" |
| "subl $4, %%edx;" |
| "movl (%%rdi), %%eax;" |
| "movl (%%rsi), %%ecx;" |
| "cmpl %%ecx, %%eax;" |
| "je 3f;" |
| "bswap %%eax;" /* byte-swap both words so the lowest-addressed byte is most significant */ |
| "bswap %%ecx;" |
| "cmpl %%ecx, %%eax;" |
| "ja 17f;" |
| "mov $-1, %%eax;" |
| "jmp 0f;" |
| "17:" |
| "mov $1, %%eax;" |
| "jmp 0f;" |
| "3:" /* 4-byte words equal: step past them */ |
| "addq $4, %%rdi;" |
| "lea 4(%%rsi), %%rsi;" |
| "13:" |
| "cmpl $0, %%edx;" |
| "je 1f;" |
| "2:" /* at most three bytes remain; compare them one at a time */ |
| |
| "movzbl (%%rdi), %%eax;" |
| "movzbl (%%rsi), %%ecx;" |
| "subl %%ecx, %%eax;" |
| "jne 0f;" |
| "subl $1, %%edx;" |
| "jz 1f;" |
| "movzbl 1(%%rdi), %%eax;" |
| "movzbl 1(%%rsi), %%ecx;" |
| "subl %%ecx, %%eax;" |
| "jne 0f;" |
| "subl $1, %%edx;" |
| "jz 1f;" |
| "movzbl 2(%%rdi), %%eax;" |
| "movzbl 2(%%rsi), %%ecx;" |
| "subl %%ecx, %%eax;" /* last possible byte: its difference (0 if equal) is the result */ |
| "jmp 0f;" |
| |
| "1:" /* equal */ |
| "xorl %%eax, %%eax;" |
| "0:" /* done: result is in eax */ |
| :"=a"(result), "=D"(buf1), "=S"(buf2), "=d"(count) /* rdi, rsi and edx are modified, so they are listed as outputs too */ |
| :"D"(buf1), "S"(buf2), "d"(count) |
| :"%rcx", "%xmm1", "%xmm0", "memory" |
| ); |
| return result; |
| } |
| |
| //count must be between 0 and 2GB |
| /* |
|  * Compares the first count bytes of buf1 and buf2, 16 bytes per step, with |
|  * the pcmpestri instruction (emitted as raw bytes; the commented line above |
|  * the .byte directive gives the mnemonic, control byte 0x18). |
|  * |
|  * buf1, buf2: start of the two regions (bound to rdi and rsi). |
|  * count:      number of bytes to compare (bound to edx). |
|  * |
|  * Returns 0 when equal, otherwise buf1[i] - buf2[i] for the first differing |
|  * byte, both read as unsigned bytes. |
|  * |
|  * NOTE(review): every loop iteration loads a full 16 bytes from both pointers |
|  * before the remaining length is considered, so memory after buf + count is |
|  * read (the bytes are not compared). Confirm callers can tolerate this. |
|  */ /*__attribute__((always_inline))*/ inline int memcmp_sse32(const void* buf1, const void* buf2, |
| int count) |
| |
| { |
| int result ; /* receives eax when the asm block finishes */ |
| __asm__ __volatile__( |
| //".align 8;" |
| "cmp $1, %%edx;" |
| "jbe 6f;" /* count is 0 or 1 (unsigned compare): handled without SSE */ |
| |
| "addl $16, %%edx ;" /* pre-bias by 16; the loop subtracts 16 before each compare */ |
| "movl %%edx, %%eax ;" /* eax and edx carry the same remaining length */ |
| "xor %%rcx, %%rcx ;" |
| |
| "2: " |
| "movdqu (%%rdi), %%xmm1;" |
| "movdqu (%%rsi), %%xmm2;" |
| "subl $16, %%edx ;" |
| "subl $16, %%eax ;" |
| |
| // " pcmpestri $0x18, %%xmm2, %%xmm1 ;" |
| ".byte 0x66, 0x0f, 0x3a, 0x61, 0xca, 0x18;" |
| " lea 16(%%rsi), %%rsi ;" /* lea advances the pointers without touching the flags */ |
| " lea 16(%%rdi), %%rdi ;" |
| //zflag=0 and cflag=0;no diff and no end, so continue the loop |
| " ja 2b ;" |
| // if cflag=1, jmp; no end but diff |
| " jc 1f ;" |
| |
| "xorl %%eax, %%eax;" /* end reached without a difference */ |
| "jmp 0f;" |
| |
| "6:" /* count <= 1 */ |
| "xor %%eax, %%eax;" |
| "test %%edx, %%edx ;" |
| "jz 0f ;" /* count == 0: equal */ |
| "movzbl (%%rdi), %%eax;" |
| "movzbl (%%rsi), %%edx;" |
| "subl %%edx, %%eax;" |
| "jmp 0f;" |
| |
| "1:" /* difference at index rcx of the chunk; the pointers are already 16 past it */ |
| "movzbl -16(%%rsi, %%rcx), %%edx ;" |
| "movzbl -16(%%rdi, %%rcx), %%eax ;" |
| "subl %%edx, %%eax ;" |
| |
| "0:" |
| //"mov %%eax, %0;" |
| |
| :"=a"(result), "=D"(buf1), "=S"(buf2), "=d"(count) |
| :"D"(buf1), "S"(buf2), "d"(count) |
| :"%rcx", "memory", "xmm1", "xmm2" |
| ); |
| return result; |
| } |
| |
| /* |
|  * Variant of the 16-bytes-per-step pcmpestri comparison that takes a size_t |
|  * count and keeps its counters in the 64-bit rax and rdx registers. |
|  * |
|  * buf1, buf2: start of the two regions (bound to rdi and rsi). |
|  * count:      number of bytes to compare (bound to rdx). |
|  * |
|  * Returns 0 when equal, otherwise buf1[i] - buf2[i] for the first differing |
|  * byte, both read as unsigned bytes. |
|  * |
|  * NOTE(review): the encoded instruction bytes carry no REX.W prefix, so the |
|  * length operands are presumably taken from eax and edx only. Verify the |
|  * behavior for counts that do not fit in 31 bits. |
|  * NOTE(review): each iteration loads a full 16 bytes from both pointers, so |
|  * memory after buf + count is read. Confirm callers can tolerate this. |
|  */ /*__attribute__((always_inline))*/ inline int memcmp_sse64(const void* buf1, const void* buf2, |
| size_t count) { |
| |
| int result ; /* receives eax when the asm block finishes */ |
| __asm__ __volatile__( |
| "cmp $1, %%rdx;" |
| "jbe 6f;" /* count is 0 or 1: handled without SSE */ |
| |
| "addq $16, %%rdx;" /* pre-bias by 16; the loop subtracts 16 before each compare */ |
| "movq %%rdx,%%rax;" |
| //"xor %%rcx, %%rcx ;" |
| |
| "2: " |
| "movdqu (%%rdi), %%xmm1;" |
| "movdqu (%%rsi), %%xmm2;" |
| |
| "subq $16, %%rax;" |
| "subq $16, %%rdx;" |
| |
| //"addq $16, %%rsi;" |
| //"addq $16, %%rdi;" |
| // " pcmpestri $0x18, %%xmm2, %%xmm1 ;" |
| ".byte 0x66, 0x0f, 0x3a, 0x61, 0xca, 0x18;" |
| "lea 16(%%rsi), %%rsi;" /* lea advances the pointers without touching the flags */ |
| "lea 16(%%rdi), %%rdi;" |
| "ja 2b;" //no diff and no end, so continue the loop |
| "jc 1f;" // no end but diff |
| |
| "xorl %%eax, %%eax;" /* end reached without a difference */ |
| "jmp 0f;" |
| |
| "6:" /* count <= 1 */ |
| "xor %%eax, %%eax;" |
| "test %%edx, %%edx ;" |
| "jz 0f ;" /* count == 0: equal */ |
| "movzbl (%%rdi), %%eax;" |
| "movzbl (%%rsi), %%edx;" |
| "subl %%edx, %%eax;" |
| "jmp 0f;" |
| |
| "1:" /* difference at index rcx of the chunk; the pointers are already 16 past it */ |
| "movzbl -16(%%rsi, %%rcx), %%edx ;" |
| "movzbl -16(%%rdi, %%rcx), %%eax ;" |
| "subl %%edx, %%eax ;" |
| |
| "0:" |
| //"mov %%eax, %0;" |
| |
| :"=a"(result), "=D"(buf1), "=S"(buf2), "=d"(count) |
| :"D"(buf1), "S"(buf2), "d"(count) |
| :"%rcx", "memory", "xmm1", "xmm2" |
| ); |
| return result; |
| } |
| |
| /* |
|  * Searches the first len bytes of s for the byte value (c & 0xff), scanning |
|  * 16 bytes per step with pcmpestri (control byte 0x00, emitted as raw bytes). |
|  * |
|  * s:   start of the region (bound to rdi). |
|  * c:   byte to look for; only the low 8 bits are used (bound to esi). |
|  * len: number of bytes to search (bound to edx). |
|  * |
|  * Returns the zero-based index of the first match, or -1 (0xffffffff) when |
|  * the byte does not occur in the region. |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after s + len |
|  * is read (those bytes are excluded from the match). Confirm this is safe for |
|  * callers. |
|  */ /*__attribute__((always_inline))*/ inline int find_chr_from_mem(const char* s, int c, int len) { |
| //len : edx; c: esi; s:rdi |
| int index; |
| __asm__ __volatile__( |
| "and $0xff, %%esi;" //clear upper bytes |
| "movd %%esi, %%xmm1;" /* xmm1 holds the single byte being searched for */ |
| |
| "mov $1, %%eax;" /* eax = 1: the needle in xmm1 is one byte long */ |
| "add $16, %%edx;" /* pre-bias; the loop subtracts 16 before each compare */ |
| "mov %%rdi ,%%r8;" /* remember the start so an index can be computed later */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm2;" |
| "sub $16, %%edx;" |
| "addq $16, %%rdi;" |
| //"pcmpestri $0x0, %%xmm2,%%xmm1;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x61 ,0xca ,0x00;" |
| //"lea 16(%%rdi), %%rdi;" |
| "ja 1b;" //Res2==0:no match and zflag==0: s is not end |
| "jc 3f;" //Res2==1: match and s is not end |
| |
| "mov $0xffffffff, %%eax;"//no match |
| "jmp 0f;" |
| |
| "3:" |
| "sub %%r8, %%rdi;" /* rdi = bytes consumed so far, including the current chunk */ |
| "lea -16(%%edi,%%ecx),%%eax;" /* index = consumed - 16 + position inside the chunk */ |
| |
| "0:" |
| // "mov %%eax, %0;" |
| :"=a"(index), "=D"(s), "=S"(c), "=d"(len) |
| :"D"(s), "S"(c), "d"(len) |
| :"rcx", "r8", "memory", "xmm1", "xmm2" |
| ); |
| return index; |
| } |
| |
| /* |
|  * Searches a NUL-terminated string for the byte value (c & 0xff), looking at |
|  * no more than len bytes. Scans 16 bytes per step with pcmpistri (control |
|  * byte 0x00, emitted as raw bytes), which stops at a NUL inside the chunk. |
|  * |
|  * s:   start of the string (bound to rdi). |
|  * c:   byte to look for; only the low 8 bits are used (bound to esi). |
|  * len: upper bound on the number of bytes to search (bound to edx). |
|  * |
|  * Returns the zero-based index of the first match, or -1 (0xffffffff) when |
|  * the terminator or the len limit is reached first. |
|  * |
|  * NOTE(review): with c == 0 the needle register is all zero; presumably this |
|  * never reports a match. Confirm against the instruction reference. |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after the |
|  * terminator or after s + len may be read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline int find_chr_from_str(const char* s, int c, int len) { |
| //s:rdi; c:rsi; len:rdx |
| int index; |
| __asm__ __volatile__( |
| "and $0xff, %%esi;" //clear upper bytes |
| "movd %%esi, %%xmm1;" /* xmm1 holds the single byte being searched for */ |
| "xor %%r8d,%%r8d;" /* r8d counts the bytes consumed so far */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm2;" |
| "add $16, %%r8d;" |
| "addq $16, %%rdi;" |
| // "pcmpistri $0x0, %%xmm2,%%xmm1;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x63 ,0xca ,0x00;" |
| //"lea 16(%%rdi), %%rdi;" |
| "ja 4f;" // not null and no match, so clarify whether over the end |
| "jc 2f;" //match |
| "jmp 3f;" //null and no match |
| |
| "4:" |
| "cmp %%r8d,%%edx;" |
| "ja 1b;" /* keep scanning while len is above the bytes consumed (unsigned compare) */ |
| |
| "3:" |
| "mov $0xffffffff, %%eax;" // the end and no match |
| "jmp 0f;" |
| |
| "2:" |
| |
| "lea -16(%%r8d, %%ecx), %%eax;" /* index = consumed - 16 + position inside the chunk */ |
| "cmp %%edx, %%eax;" |
| "jae 3b;" /* a match at or beyond len does not count */ |
| |
| "0:" |
| // "mov %%eax, %0;" |
| |
| :"=a"(index), "=D"(s), "=S"(c), "=d"(len) |
| :"D"(s), "S"(c), "d"(len) |
| :"rcx", "r8", "memory", "xmm1", "xmm2" |
| ); |
| return index; |
| } |
| |
| /* |
|  * Finds the first occurrence of the byte (c & 0xff) in the NUL-terminated |
|  * string s, scanning 16 bytes per step with pcmpistri (raw bytes). |
|  * |
|  * Two separate loops are used: when the byte is 0 the code looks for the |
|  * terminator itself (control byte 0x08 against an all-zero register); for any |
|  * other byte it uses control byte 0x00 and stops at a match or at the end. |
|  * |
|  * s: start of the string (bound to rdi). |
|  * c: byte to look for; only the low 8 bits are used (bound to esi). |
|  * |
|  * Returns a pointer to the matching byte (the terminator when the byte is 0), |
|  * or a null pointer when a non-zero byte does not occur. |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after the |
|  * terminator is read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline char* strchr_sse(const char* s, int c) { |
| //s:rdi; c:rsi |
| char* ret; |
| __asm__ __volatile__( |
| "and $0xff, %%esi;" //clear upper bytes |
| //c==0 |
| "test %%esi, %%esi;" |
| "jnz 0f ;" /* non-zero byte: use the general search further down */ |
| "movq %%rdi, %%rax;" |
| "pxor %%xmm1, %%xmm1;" /* all-zero register to compare each chunk against */ |
| "3:" |
| "movdqu (%%rdi), %%xmm2;" |
| |
| "addq $16, %%rdi;" |
| // "pcmpistri $0x8, %%xmm2,%%xmm1;" |
| ".byte 0x66, 0x0f, 0x3a, 0x63, 0xca, 0x08;" |
| "jnz 3b;" /* zero flag clear: no terminator in this chunk yet */ |
| |
| "leaq -16(%%rdi,%%rcx), %%rax;" /* address of the terminator */ |
| "jmp 2f;" |
| |
| "0:" |
| "movd %%esi, %%xmm1;" |
| //"xor %%rcx, %%rcx;" |
| "xor %%rax, %%rax;" /* default result: null pointer */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm2;" |
| |
| "addq $16, %%rdi;" |
| // "pcmpistri $0x0, %%xmm2,%%xmm1;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x63 ,0xca ,0x00;" |
| "ja 1b;" /* no match and no terminator: next chunk */ |
| "jnc 2f;" /* terminator reached without a match: keep the null result */ |
| "lea -16(%%rdi, %%rcx), %%rax;" /* address of the match */ |
| "2:" |
| |
| :"=a"(ret), "=D"(s), "=S"(c) |
| :"D"(s), "S"(c) |
| :"rcx", "memory", "xmm1", "xmm2" |
| ); |
| return ret; |
| } |
| |
| /* |
|  * Finds the last occurrence of the byte (c & 0xff) in the NUL-terminated |
|  * string s, scanning forward 16 bytes per step with pcmpistri (raw bytes) and |
|  * remembering the most recent match. |
|  * |
|  * When the byte is 0 the code looks for the terminator itself (control byte |
|  * 0x08 against an all-zero register). Otherwise it uses control byte 0x40, |
|  * which per the commented mnemonic selects the highest matching index of a |
|  * chunk. |
|  * |
|  * s: start of the string (bound to rdi). |
|  * c: byte to look for; only the low 8 bits are used (bound to esi). |
|  * |
|  * Returns a pointer to the last matching byte (the terminator when the byte |
|  * is 0), or a null pointer when a non-zero byte does not occur. |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after the |
|  * terminator is read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline char* strrchr_sse(const char* s, int c) { |
| //s:rdi; c:rsi |
| char* ret; |
| __asm__ __volatile__( |
| "and $0xff, %%esi;" //clear upper bytes |
| //c==0 |
| "test %%esi, %%esi;" |
| "jnz 0f ;" /* non-zero byte: use the general search further down */ |
| |
| "movq %%rdi, %%rax;" |
| "pxor %%xmm1, %%xmm1;" /* all-zero register to compare each chunk against */ |
| "3:" |
| "movdqu (%%rdi), %%xmm2;" |
| |
| "addq $16, %%rdi;" |
| // "pcmpistri $0x8, %%xmm2,%%xmm1;" |
| ".byte 0x66, 0x0f, 0x3a, 0x63, 0xca, 0x08;" |
| "jnz 3b;" /* zero flag clear: no terminator in this chunk yet */ |
| |
| "leaq -16(%%rdi,%%rcx), %%rax;" /* address of the terminator */ |
| "jmp 3f;" /* forward reference: the final label 3 at the end of the block */ |
| |
| "0:" |
| "movd %%esi, %%xmm1;" |
| //"xor %%rcx, %%rcx;" |
| "xor %%rax, %%rax;" /* default result: null pointer */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm2;" |
| |
| "addq $16, %%rdi;" |
| // "pcmpistri $0x40, %%xmm2,%%xmm1;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x63 ,0xca ,0x40;" |
| "ja 1b;" //zflag =0 and cflag =0, it means no end and no match |
| |
| "jz 2f;" //zflag =1, the end of string |
| "lea -16(%%rdi, %%rcx), %%rax;" //cflag =1 |
| "jmp 1b;" /* remember this match and keep scanning for a later one */ |
| |
| "2:" /* final chunk: the flags are still those of the last compare */ |
| "jnc 3f;" |
| "lea -16(%%rdi, %%rcx), %%rax;" |
| "3:" |
| //"mov %%rax, %0;" |
| :"=a"(ret), "=D"(s), "=S"(c) |
| :"D"(s), "S"(c) |
| :"rcx", "memory", "xmm1", "xmm2" |
| ); |
| return ret; |
| } |
| |
| /* |
|  * Searches the range from b up to, but not including, e for the last |
|  * occurrence of c, working backwards from e. |
|  * |
|  * Ranges of 8 bytes or more are scanned in 16-byte windows with pcmpestri |
|  * (control byte 0x40, emitted as raw bytes); a remainder of 1 to 7 bytes is |
|  * checked with an unrolled chain of single-byte compares. |
|  * |
|  * b: first byte of the range (bound to rdi). |
|  * e: one past the last byte of the range (bound to rsi). |
|  * c: byte to look for (passed in a compiler-chosen register, operand 5). |
|  * |
|  * Returns a pointer to the last matching byte, or a null pointer when b is |
|  * null, when e is not above b, or when c does not occur. |
|  * |
|  * NOTE(review): the window at label 5 loads 16 bytes starting at the front of |
|  * the remaining range even when only 8 to 15 bytes remain, so memory at and |
|  * after the original e may be read. Confirm this is safe for callers. |
|  */ inline char* strrchr_end_sse(char const* b, char const* e, char c) { |
| //b:rdi; e:rsi; c:rdx |
| char* ret; |
| |
| __asm__ __volatile__( |
| |
| // "movzbq %5, %%rdx;" |
| // "mov %%rdx, %%r8;" |
| "movzbq %5, %%r8;" /* r8 = c zero-extended */ |
| |
| "cmp $0, %%rdi;" |
| "jbe 1f;" /* b is null */ |
| |
| //calculate rdx, decide where to go |
| "mov %%rsi, %%rdx;" |
| "subq %%rdi, %%rdx;" /* rdx = e - b: bytes still to search */ |
| "jbe 1f;" // if begin >= end, return |
| "cmp $7, %%rdx;" |
| "jna 2f;" |
| |
| // rdx >= 8 |
| "movd %%r8, %%xmm1;" |
| "mov $1, %%rax;" /* rax = 1: the needle in xmm1 is one byte long */ |
| "cmp $16, %%rdx;" |
| "ja 3f;" // if rdx > 16, jmp to 3f |
| |
| "5:" |
| // 8 <= rdx <= 16 |
| "subq %%rdx, %%rsi;" /* move rsi back to the front of what remains */ |
| "movdqu (%%rsi), %%xmm2;" |
| // "pcmpestri $0x40, %%xmm2, %%xmm1;" |
| ".byte 0x66, 0x0f, 0x3a, 0x61, 0xca, 0x40;" |
| "jnc 1f; " // if cflag=0, not match, jmp to 1f |
| "lea (%%rsi, %%rcx), %%rax;" // matched |
| "jmp 0f;" |
| |
| // after 16-bytes compare |
| "4:" |
| "subq $16, %%rdx;" |
| "cmp $7, %%rdx;" |
| "jna 2f;" // if rdx < 8, jmp to 2f |
| "cmp $16, %%rdx;" |
| "jna 5b;" |
| |
| "3:" |
| "subq $16, %%rsi;" /* step the window 16 bytes towards b */ |
| "movdqu (%%rsi), %%xmm2;" |
| // "pcmpestri $0x40, %%xmm2, %%xmm1;" |
| ".byte 0x66, 0x0f, 0x3a, 0x61, 0xca, 0x40;" |
| "ja 4b;" // cflag = 0:not match && zflag = 0:not end >>> loopback |
| "lea (%%rsi, %%rcx), %%rax;" // rdx > 16, zflag always = 0, match |
| "jmp 0f;" |
| |
| "2:" |
| // 0 < rdx < 8 |
| "mov %%r8, %%rax;" /* al = c for the byte compares below */ |
| |
| // switch rdx; |
| "cmpb -1(%%rsi), %%al;" |
| "jne 11f;" |
| "lea -1(%%rsi), %%rax;" |
| "jmp 0f;" |
| "11:" |
| "cmp $1, %%rdx;" |
| "je 1f;" /* that was the only remaining byte */ |
| |
| "cmpb -2(%%rsi), %%al;" |
| "jne 12f;" |
| "lea -2(%%rsi), %%rax;" |
| "jmp 0f;" |
| "12:" |
| "cmp $2, %%rdx;" |
| "je 1f;" |
| |
| "cmpb -3(%%rsi), %%al;" |
| "jne 13f;" |
| "lea -3(%%rsi), %%rax;" |
| "jmp 0f;" |
| "13:" |
| "cmp $3, %%rdx;" |
| "je 1f;" |
| |
| "cmpb -4(%%rsi), %%al;" |
| "jne 14f;" |
| "lea -4(%%rsi), %%rax;" |
| "jmp 0f;" |
| "14:" |
| "cmp $4, %%rdx;" |
| "je 1f;" |
| |
| "cmpb -5(%%rsi), %%al;" |
| "jne 15f;" |
| "lea -5(%%rsi), %%rax;" |
| "jmp 0f;" |
| "15:" |
| "cmp $5, %%rdx;" |
| "je 1f;" |
| |
| "cmpb -6(%%rsi), %%al;" |
| "jne 16f;" |
| "lea -6(%%rsi), %%rax;" |
| "jmp 0f;" |
| "16:" |
| "cmp $6, %%rdx;" |
| "je 1f;" |
| |
| "cmpb -7(%%rsi), %%al;" |
| "jne 1f;" |
| "lea -7(%%rsi), %%rax;" |
| "jmp 0f;" |
| |
| // failed return |
| "1:" |
| "xor %%rax, %%rax;" // return null |
| |
| // success return |
| "0:" |
| |
| :"=a"(ret), "=D"(b), "=S"(e) //,"=d"(c) |
| :"D"(b), "S"(e), "r"(c) |
| :"r8", "rcx", "memory", "xmm1", "xmm2", "rdx" |
| ); |
| return ret; |
| } |
| |
| /* |
|  * Searches the first n bytes of s for the byte value (c & 0xff), scanning 16 |
|  * bytes per step with pcmpestri (control byte 0x00, emitted as raw bytes). |
|  * |
|  * s: start of the region (bound to rdi). |
|  * c: byte to look for; only the low 8 bits are used (bound to esi). |
|  * n: number of bytes to search (bound to rdx). |
|  * |
|  * Returns a pointer to the first matching byte, or a null pointer when the |
|  * byte does not occur in the region. |
|  * |
|  * NOTE(review): the encoded instruction bytes carry no REX.W prefix, so the |
|  * length operands are presumably taken from eax and edx only. Verify the |
|  * behavior for n that does not fit in 31 bits. |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after s + n |
|  * is read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline void* memchr_sse(const void* s, int c, size_t n) { |
| //s:rdi; c:rsi; n:rdx |
| void* ret; |
| __asm__ __volatile__( |
| "and $0xff, %%esi;" //clear upper bytes |
| "movd %%esi, %%xmm1;" /* xmm1 holds the single byte being searched for */ |
| |
| "mov $1, %%rax;" /* rax = 1: the needle in xmm1 is one byte long */ |
| "add $16, %%rdx;" /* pre-bias; the loop subtracts 16 before each compare */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm2;" |
| "sub $16, %%rdx;" |
| "addq $16, %%rdi;" |
| //"pcmpestri $0x0, %%xmm2,%%xmm1;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x61 ,0xca ,0x00;" |
| //"lea 16(%%rdi), %%rdi;" |
| "ja 1b;" //Res2==0:no match and zflag==0: s is not end |
| "jc 3f;" //Res2==1: match and s is not end |
| |
| "mov $0x0, %%rax;"//no match |
| "jmp 0f;" |
| |
| "3:" |
| |
| "lea -16(%%rdi,%%rcx),%%rax;" /* rdi is already 16 past the chunk; rcx is the index inside it */ |
| |
| "0:" |
| //"mov %%rax, %0;" |
| :"=a"(ret), "=D"(s), "=S"(c), "=d"(n) |
| :"D"(s), "S"(c), "d"(n) |
| :"rcx", "memory", "xmm1", "xmm2" |
| ); |
| return ret; |
| } |
| |
| /* |
|  * Returns the length of the NUL-terminated string s, not counting the |
|  * terminator. Scans 16 bytes per step with pcmpistri (control byte 0x08, |
|  * emitted as raw bytes) against an all-zero register. |
|  * |
|  * s: start of the string (bound to rdi). |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes, so memory after the |
|  * terminator is read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline size_t strlen_sse(const char* s) { |
| //s:rdi |
| size_t ret ; |
| __asm__ __volatile__( |
| "movq $-16, %%rax;" /* start at -16 so the first add below makes rax 0 */ |
| //"xor %%rcx, %%rcx;" |
| "pxor %%xmm0, %%xmm0;" /* all-zero register to compare each chunk against */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm1;" |
| "addq $16, %%rax;" /* rax = offset of the chunk being examined */ |
| "addq $16, %%rdi;" |
| //"pcmpistri $0x8, %%xmm1,%%xmm0;" |
| ".byte 0x66, 0x0f, 0x3a, 0x63, 0xc1, 0x08;" |
| //"lea 16(%%rdi), %%rdi ;" |
| //"lea 16(%%rax), %%rax ;" |
| "jnz 1b;" /* zero flag clear: no terminator in this chunk yet */ |
| |
| "addq %%rcx, %%rax;" /* chunk offset + index of the terminator inside the chunk */ |
| //"mov %%rax, %0;" |
| :"=a"(ret), "=D"(s) |
| :"D"(s) |
| :"rcx", "memory", "xmm0", "xmm1" |
| ); |
| return ret; |
| } |
| |
| /* |
|  * Compares the NUL-terminated strings s1 and s2, 16 bytes per step, with |
|  * pcmpistri (control byte 0x18, emitted as raw bytes). |
|  * |
|  * s1, s2: the two strings (bound to rdi and rsi). |
|  * |
|  * Returns 0 when the strings are equal, 1 when the first differing byte of s1 |
|  * is above that of s2, and -1 when it is below; bytes are compared as |
|  * unsigned values. |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes from both strings, so |
|  * memory after either terminator is read. Confirm this is safe for callers. |
|  */ /*__attribute__((always_inline))*/ inline int strcmp_sse(const char* s1, const char* s2) |
| |
| { |
| //s1:rdi; s2:rsi |
| int result ; |
| __asm__ __volatile__( |
| "xor %%rax, %%rax ;" /* default result: equal */ |
| //"xor %%rcx, %%rcx ;" |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm1;" |
| "movdqu (%%rsi), %%xmm2;" |
| "addq $16, %%rsi;" |
| "addq $16, %%rdi;" |
| // " pcmpistri $0x18, %%xmm2, %%xmm1 ;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x63 ,0xca ,0x18;" |
| " ja 1b ;" /* no difference and no terminator: next chunk */ |
| |
| "jnc 0f;" /* terminator reached without a difference: equal */ |
| "movzbq -16(%%rsi, %%rcx), %%rdx ;" |
| "movzbq -16(%%rdi, %%rcx), %%rax ;" |
| // "sub %%rdx, %%rax ;" |
| "movl $1, %%ecx;" |
| "movl $-1, %%edi;" |
| "cmp %%rdx, %%rax;" |
| "cmova %%ecx, %%eax;" /* s1 byte above s2 byte: result 1 */ |
| "cmovb %%edi, %%eax;" /* s1 byte below s2 byte: result -1 */ |
| |
| "0:" |
| //"mov %%eax, %0;" |
| |
| :"=a"(result), "=D"(s1), "=S"(s2) |
| :"D"(s1), "S"(s2) |
| :"rcx", "rdx", "memory", "xmm1", "xmm2" |
| ); |
| return result; |
| } |
| |
| /* |
|  * Compares at most n bytes of the NUL-terminated strings s1 and s2, 16 bytes |
|  * per step, with pcmpistri (control byte 0x18, emitted as raw bytes). |
|  * |
|  * s1, s2: the two strings (bound to rdi and rsi). |
|  * n:      maximum number of bytes to compare (bound to rdx). |
|  * |
|  * Returns 0 when the strings agree over the compared range, otherwise the |
|  * difference s1[i] - s2[i] of the first differing bytes read as unsigned. |
|  * For n of 0 or 1 no SSE instruction is used: 0 returns 0 and 1 returns the |
|  * difference of the first bytes. |
|  * |
|  * NOTE(review): each iteration loads a full 16 bytes from both strings, so |
|  * memory after a terminator or after n bytes is read. Confirm this is safe |
|  * for callers. |
|  */ /*__attribute__((always_inline))*/ inline int strncmp_sse(const char* s1, const char* s2, size_t n) |
| |
| { |
| //s1:rdi; s2:rsi; n:rdx |
| int result ; |
| __asm__ __volatile__( |
| "cmp $1, %%rdx;" |
| "jbe 3f;" /* n is 0 or 1: handled without SSE */ |
| |
| "xor %%rax, %%rax ;" /* default result: equal */ |
| |
| "1:" |
| "movdqu (%%rdi), %%xmm1;" |
| "movdqu (%%rsi), %%xmm2;" |
| "addq $16, %%rdi;" |
| "addq $16, %%rsi;" |
| // " pcmpistri $0x18, %%xmm2, %%xmm1 ;" |
| ".byte 0x66 ,0x0f ,0x3a ,0x63 ,0xca ,0x18;" |
| // "lea 16(%%rsi), %%rsi;" |
| // "lea 16(%%rdi), %%rdi;" |
| "ja 2f ;" //both 16Byte data elements are valid and identical |
| "jnc 0f;" //Both 16byte data elements have EOS and identical |
| |
| //the following situation is Both 16byte data elements differ at offset X (ecx). |
| |
| "cmp %%rdx, %%rcx;" /* rdx holds the bytes of n still uncompared at the start of this chunk */ |
| "jae 0f;" // X is out of n |
| |
| "movzbq -16(%%rsi, %%rcx), %%rdx ;" // X is in the range of n |
| "movzbq -16(%%rdi, %%rcx), %%rax ;" |
| "subq %%rdx, %%rax ;" |
| "jmp 0f;" |
| |
| "2:" |
| "subq $16, %%rdx;" |
| "jbe 0f;" /* n exhausted: equal over the compared range */ |
| "ja 1b;" |
| |
| "3:" /* n <= 1 */ |
| "xor %%eax, %%eax;" |
| "test %%rdx, %%rdx ;" |
| "jz 0f ;" /* n == 0: equal */ |
| "movzbl (%%rdi), %%eax;" |
| "movzbl (%%rsi), %%edx;" |
| "subl %%edx, %%eax;" |
| |
| "0:" |
| // "mov %%eax, %0;" |
| |
| :"=a"(result), "=D"(s1), "=S"(s2), "=d"(n) |
| :"D"(s1), "S"(s2), "d"(n) |
| :"rcx", "memory", "xmm1", "xmm2" |
| ); |
| return result; |
| } |
| |
/**
 * Folds `length` bytes starting at `src` into a running CRC value, one byte
 * at a time, using the SSE4.2 `crc32b` instruction (emitted as raw bytes).
 *
 * The function applies no initial or final inversion; the caller supplies the
 * starting value in `crc` and post-processes the result if needed.
 *
 * @param src    first byte to process
 * @param crc    running CRC value to continue from
 * @param length number of bytes to process; values <= 0 process nothing
 * @return the updated CRC value (`crc` unchanged when length <= 0)
 */
/*__attribute__((always_inline))*/ inline int baidu_crc32_byte(char const* src, int crc,
                                                               int length) {
    // The asm loop tests its counter only after processing a byte, so it must
    // never be entered with nothing to do: a zero or negative count would read
    // one byte and then keep going until the 32-bit counter wraps to zero.
    if (length <= 0) {
        return crc;
    }

    int crc_out;
    __asm__ __volatile__(
        "1:"
        "movzbl (%%rdi), %%ecx;"               // next input byte into cl
        //"crc32b %%cl, %%esi;"
        ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1;"   // fold cl into the CRC held in esi
        "add $1, %%rdi;"
        "sub $1, %%edx;"
        "jnz 1b;"
        "movl %%esi,%%eax;"                    // return the accumulated CRC
        : "=a"(crc_out), "=D"(src), "=S"(crc), "=d"(length)
        : "D"(src), "S"(crc), "d"(length)
        : "memory", "ecx", "cc");

    return crc_out;
}
| |
/**
 * Folds `qwlen` 8-byte words starting at `src` into a running CRC value using
 * the SSE4.2 `crc32q` instruction (emitted as raw bytes).
 *
 * The function applies no initial or final inversion; the caller supplies the
 * starting value in `crc` and post-processes the result if needed.
 *
 * @param src   first byte to process; qwlen * 8 bytes are read
 * @param crc   running CRC value to continue from
 * @param qwlen number of 8-byte words to process; 0 processes nothing
 * @return the updated CRC value (`crc` unchanged when qwlen is 0)
 */
inline int crc32c_qw(char const* src, int crc, unsigned int qwlen) {
    // The asm loop tests its counter only after processing a word, so a zero
    // count would read 8 bytes and then keep going until the counter wraps.
    if (qwlen == 0) {
        return crc;
    }

    int crc_out;
    __asm__ __volatile__(
        "1:"
        // "crc32q (%%rdi), %%rsi;"
        ".byte 0xf2 ,0x48 ,0x0f ,0x38 ,0xf1, 0x37;"   // fold 8 bytes at (rdi) into rsi
        "addq $8, %%rdi;"
        "subl $1, %%edx;"
        "jnz 1b;"
        "mov %%esi,%%eax;"                            // return the accumulated CRC
        : "=a"(crc_out), "=D"(src), "=S"(crc), "=d"(qwlen)
        : "D"(src), "S"(crc), "d"(qwlen)
        : "memory", "cc");
    return crc_out;
}
| |
| inline int baidu_crc32_qw(char const* src, int crc, unsigned int length) { |
| unsigned int iquotient = length >> 3; |
| unsigned int iremainder = length & 0x7; |
| char const* p ; |
| |
| if (iquotient) { |
| crc = crc32c_qw(src, crc, iquotient); |
| } |
| |
| if (iremainder) { |
| |
| p = src + (length - iremainder); |
| crc = baidu_crc32_byte(p, crc, iremainder); |
| } |
| |
| return crc; |
| } |
| |
| } // namespace doris |
| |
| #endif // DORIS_BE_SRC_OLAP_BHP_LIB_H |