blob: 290b8a619955d7a554785f5e03123c74920debbf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Portions of this file are from http://www.evanjones.ca/crc32c.html under
* the BSD license:
* Copyright 2008,2009,2010 Massachusetts Institute of Technology.
* All rights reserved. Use of this source code is governed by a
* BSD-style license that can be found in the LICENSE file.
*/
#include <assert.h>
#include <stddef.h> // for size_t
#include "bulk_crc32.h"
#include "gcc_optimizations.h"
#include "gcc_optimizations.h"
///////////////////////////////////////////////////////////////////////////
// Begin code for SSE4.2 specific hardware support of CRC32C
///////////////////////////////////////////////////////////////////////////
# define SSE42_FEATURE_BIT (1 << 20)
# define CPUID_FEATURES 1
/**
* Call the cpuid instruction to determine CPU feature flags.
*/
static uint32_t cpuid(uint32_t eax_in) {
uint32_t eax, ebx, ecx, edx;
# if defined(__PIC__) && !defined(__LP64__)
// 32-bit PIC code uses the ebx register for the base offset --
// have to save and restore it on the stack
asm("pushl %%ebx\n\t"
"cpuid\n\t"
"movl %%ebx, %[ebx]\n\t"
"popl %%ebx" : "=a" (eax), [ebx] "=r"(ebx), "=c"(ecx), "=d"(edx) : "a" (eax_in)
: "cc");
# else
asm("cpuid" : "=a" (eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(eax_in)
: "cc");
# endif
return ecx;
}
//
// Definitions of the SSE4.2 crc32 operations. Using these instead of
// the GCC __builtin_* intrinsics allows this code to compile without
// -msse4.2, since we do dynamic CPU detection at runtime.
//
# ifdef __LP64__
inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) {
asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
# endif
inline uint32_t _mm_crc32_u32(uint32_t crc, uint32_t value) {
asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) {
asm("crc32w %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) {
asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
return crc;
}
# ifdef __LP64__
/**
* Pipelined version of hardware-accelerated CRC32C calculation using
* the 64 bit crc32q instruction.
* One crc32c instruction takes three cycles, but two more with no data
* dependency can be in the pipeline to achieve something close to single
* instruction/cycle. Here we feed three blocks in RR.
*
* crc1, crc2, crc3 : Store initial checksum for each block before
* calling. When it returns, updated checksums are stored.
* p_buf : The base address of the data buffer. The buffer should be
* at least as big as block_size * num_blocks.
* block_size : The size of each block in bytes.
* num_blocks : The number of blocks to work on. Min = 1, Max = 3
*/
static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
uint64_t c1 = *crc1;
uint64_t c2 = *crc2;
uint64_t c3 = *crc3;
uint64_t *data = (uint64_t*)p_buf;
int counter = block_size / sizeof(uint64_t);
int remainder = block_size % sizeof(uint64_t);
uint8_t *bdata;
/* We do switch here because the loop has to be tight in order
* to fill the pipeline. Any other statement inside the loop
* or inbetween crc32 instruction can slow things down. Calling
* individual crc32 instructions three times from C also causes
* gcc to insert other instructions inbetween.
*
* Do not rearrange the following code unless you have verified
* the generated machine code is as efficient as before.
*/
switch (num_blocks) {
case 3:
/* Do three blocks */
while (likely(counter)) {
__asm__ __volatile__(
"crc32q (%7), %0;\n\t"
"crc32q (%7,%6,1), %1;\n\t"
"crc32q (%7,%6,2), %2;\n\t"
: "=r"(c1), "=r"(c2), "=r"(c3)
: "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(data)
);
data++;
counter--;
}
/* Take care of the remainder. They are only up to seven bytes,
* so performing byte-level crc32 won't take much time.
*/
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%7), %0;\n\t"
"crc32b (%7,%6,1), %1;\n\t"
"crc32b (%7,%6,2), %2;\n\t"
: "=r"(c1), "=r"(c2), "=r"(c3)
: "0"(c1), "1"(c2), "2"(c3), "r"(block_size), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 2:
/* Do two blocks */
while (likely(counter)) {
__asm__ __volatile__(
"crc32q (%5), %0;\n\t"
"crc32q (%5,%4,1), %1;\n\t"
: "=r"(c1), "=r"(c2)
: "0"(c1), "1"(c2), "r"(block_size), "r"(data)
);
data++;
counter--;
}
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%5), %0;\n\t"
"crc32b (%5,%4,1), %1;\n\t"
: "=r"(c1), "=r"(c2)
: "0"(c1), "1"(c2), "r"(block_size), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 1:
/* single block */
while (likely(counter)) {
__asm__ __volatile__(
"crc32q (%2), %0;\n\t"
: "=r"(c1)
: "0"(c1), "r"(data)
);
data++;
counter--;
}
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%2), %0;\n\t"
: "=r"(c1)
: "0"(c1), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 0:
return;
default:
assert(0 && "BUG: Invalid number of checksum blocks");
}
*crc1 = c1;
*crc2 = c2;
*crc3 = c3;
return;
}
# else // 32-bit
/**
* Pipelined version of hardware-accelerated CRC32C calculation using
* the 32 bit crc32l instruction.
* One crc32c instruction takes three cycles, but two more with no data
* dependency can be in the pipeline to achieve something close to single
* instruction/cycle. Here we feed three blocks in RR.
*
* crc1, crc2, crc3 : Store initial checksum for each block before
* calling. When it returns, updated checksums are stored.
* data : The base address of the data buffer. The buffer should be
* at least as big as block_size * num_blocks.
* block_size : The size of each block in bytes.
* num_blocks : The number of blocks to work on. Min = 1, Max = 3
*/
static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, const uint8_t *p_buf, size_t block_size, int num_blocks) {
uint32_t c1 = *crc1;
uint32_t c2 = *crc2;
uint32_t c3 = *crc3;
int counter = block_size / sizeof(uint32_t);
int remainder = block_size % sizeof(uint32_t);
uint32_t *data = (uint32_t*)p_buf;
uint8_t *bdata;
/* We do switch here because the loop has to be tight in order
* to fill the pipeline. Any other statement inside the loop
* or inbetween crc32 instruction can slow things down. Calling
* individual crc32 instructions three times from C also causes
* gcc to insert other instructions inbetween.
*
* Do not rearrange the following code unless you have verified
* the generated machine code is as efficient as before.
*/
switch (num_blocks) {
case 3:
/* Do three blocks */
while (likely(counter)) {
__asm__ __volatile__(
"crc32l (%7), %0;\n\t"
"crc32l (%7,%6,1), %1;\n\t"
"crc32l (%7,%6,2), %2;\n\t"
: "=r"(c1), "=r"(c2), "=r"(c3)
: "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(data)
);
data++;
counter--;
}
/* Take care of the remainder. They are only up to three bytes,
* so performing byte-level crc32 won't take much time.
*/
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%7), %0;\n\t"
"crc32b (%7,%6,1), %1;\n\t"
"crc32b (%7,%6,2), %2;\n\t"
: "=r"(c1), "=r"(c2), "=r"(c3)
: "r"(c1), "r"(c2), "r"(c3), "r"(block_size), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 2:
/* Do two blocks */
while (likely(counter)) {
__asm__ __volatile__(
"crc32l (%5), %0;\n\t"
"crc32l (%5,%4,1), %1;\n\t"
: "=r"(c1), "=r"(c2)
: "r"(c1), "r"(c2), "r"(block_size), "r"(data)
);
data++;
counter--;
}
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%5), %0;\n\t"
"crc32b (%5,%4,1), %1;\n\t"
: "=r"(c1), "=r"(c2)
: "r"(c1), "r"(c2), "r"(block_size), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 1:
/* single block */
while (likely(counter)) {
__asm__ __volatile__(
"crc32l (%2), %0;\n\t"
: "=r"(c1)
: "r"(c1), "r"(data)
);
data++;
counter--;
}
bdata = (uint8_t*)data;
while (likely(remainder)) {
__asm__ __volatile__(
"crc32b (%2), %0;\n\t"
: "=r"(c1)
: "r"(c1), "r"(bdata)
);
bdata++;
remainder--;
}
break;
case 0:
return;
default:
assert(0 && "BUG: Invalid number of checksum blocks");
}
*crc1 = c1;
*crc2 = c2;
*crc3 = c3;
return;
}
# endif // 64-bit vs 32-bit
/**
* On library load, initiailize the cached function pointer
* if cpu supports SSE4.2's crc32 instruction.
*/
typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int);
extern crc_pipelined_func_t pipelined_crc32c_func;
void __attribute__ ((constructor)) init_cpu_support_flag(void) {
uint32_t ecx = cpuid(CPUID_FEATURES);
if (ecx & SSE42_FEATURE_BIT) pipelined_crc32c_func = pipelined_crc32c;
}