blob: 8c52a27e1eec937dd4ad93274525b1a26508333a [file] [log] [blame]
/*******************************************************************************
* Copyright 2014 Trevor Robinson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#include "crc32c_sse42.h"
#include <boost/version.hpp>
#if BOOST_VERSION >= 105500
#include <boost/predef.h>
#else
#if _MSC_VER
#pragma message("Boost version is < 1.55, disable CRC32C")
#else
#warning "Boost version is < 1.55, disable CRC32C"
#endif
#endif
#include <assert.h>
#include <stdlib.h>
#include "gf2.hpp"
#include "lib/checksum/crc32c_sw.h"
#if BOOST_ARCH_X86_64 && !defined(__arm64__)
#define PULSAR_X86_64
#include <nmmintrin.h> // SSE4.2
#include <wmmintrin.h> // PCLMUL
#else
#ifdef _MSC_VER
#pragma message("BOOST_ARCH_X86_64 is not defined, CRC32C will be disabled")
#else
#warning "BOOST_ARCH_X86_64 is not defined, CRC32C SSE4.2 will be disabled"
#endif
#endif
#ifdef _MSC_VER
#include <intrin.h>
#elif defined(PULSAR_X86_64)
#include <cpuid.h>
#endif
//#define CRC32C_DEBUG
#define CRC32C_PCLMULQDQ
#ifdef CRC32C_DEBUG
#include <stdio.h>
#define DEBUG_PRINTF1(fmt, v1) printf(fmt, v1)
#define DEBUG_PRINTF2(fmt, v1, v2) printf(fmt, v1, v2)
#define DEBUG_PRINTF3(fmt, v1, v2, v3) printf(fmt, v1, v2, v3)
#define DEBUG_PRINTF4(fmt, v1, v2, v3, v4) printf(fmt, v1, v2, v3, v4)
#else
#define DEBUG_PRINTF1(fmt, v1)
#define DEBUG_PRINTF2(fmt, v1, v2)
#define DEBUG_PRINTF3(fmt, v1, v2, v3)
#define DEBUG_PRINTF4(fmt, v1, v2, v3, v4)
#endif
namespace pulsar {
// CPU feature detection state shared by the functions in this file.
// Set to true by crc32c_initialize() after the first detection pass so that
// later calls skip the CPUID query.
static bool initialized = false;
// CPUID leaf 1, ECX bit 20 as read by crc32c_initialize(); this is the value
// that crc32c_initialize() returns.
static bool has_sse42 = false;
// CPUID leaf 1, ECX bit 1 as read by crc32c_initialize(); crc32c_chunk() uses
// it to choose between the carry-less multiply path and the table path.
static bool has_pclmulqdq = false;
// Queries the CPU (first call only) for the SSE4.2 and PCLMULQDQ feature bits
// and caches the answers in has_sse42 / has_pclmulqdq.
//
// Returns the cached SSE4.2 flag. On builds that are neither MSVC nor
// PULSAR_X86_64 both flags are forced to false.
bool crc32c_initialize() {
    // Detection already ran: hand back the cached answer.
    if (initialized) {
        return has_sse42;
    }

#ifdef _MSC_VER
    const uint32_t sse42_mask = (1 << 20);
    const uint32_t pclmulqdq_mask = (1 << 1);
    // __cpuid fills {EAX, EBX, ECX, EDX}; the feature bits live in ECX.
    int regs[4] = {};
    __cpuid(regs, 1);
    has_sse42 = (regs[2] & sse42_mask) != 0;
    has_pclmulqdq = (regs[2] & pclmulqdq_mask) != 0;
#elif defined(PULSAR_X86_64)
    const uint32_t sse42_mask = (1 << 20);
    const uint32_t pclmulqdq_mask = (1 << 1);
    unsigned int reg_a, reg_b, reg_c, reg_d;
    // When the query fails the flags keep whatever value they already had.
    if (__get_cpuid(1, &reg_a, &reg_b, &reg_c, &reg_d)) {
        has_sse42 = (reg_c & sse42_mask) != 0;
        has_pclmulqdq = (reg_c & pclmulqdq_mask) != 0;
    }
#else
    has_sse42 = false;
    has_pclmulqdq = false;
#endif

    DEBUG_PRINTF1("has_sse42 = %d\n", has_sse42);
    DEBUG_PRINTF1("has_pclmulqdq = %d\n", has_pclmulqdq);
    initialized = true;
    return has_sse42;
}
// Builds a chunk configuration for chunks of `words` 64-bit words and links it
// to `next`, which (when present) must describe a strictly smaller chunk.
// Also precomputes the two lookup tables used when merging the interleaved CRC
// streams: shift1 for one stream length and shift2 for twice that length.
chunk_config::chunk_config(size_t words, const chunk_config *next) : words(words), next(next) {
    assert(words > 0);
    // Chained configurations must get strictly smaller.
    assert(!(next && next->words >= words));

    // Each loop iteration consumes one 8-byte word per stream.
    const size_t stream_bytes = 8 * loops();
    make_shift_table(stream_bytes, shift1);
    make_shift_table(2 * stream_bytes, shift2);
}
void chunk_config::make_shift_table(size_t bytes, uint32_t table[256]) {
bitmatrix<32, 32> op;
op.lower_shift();
op[0] = 0x82f63b78; // reversed CRC-32C polynomial
bitmatrix<32, 32> m;
pow(m, op, bytes * 8);
for (unsigned int i = 0; i < 256; ++i) table[i] = (const bitvector<32>)mul(m, bitvector<32>(i));
}
#ifdef PULSAR_X86_64
// Folds one chunk of 64-bit words starting at buf into the running value crc
// and returns the updated value.
//
// The chunk is read as three consecutive runs of config.loops() words each.
// Every loop iteration feeds one word from each run into its own accumulator
// (crc0, crc1, crc2), so the three _mm_crc32_u64 chains do not depend on one
// another. crc0 and crc1 are then merged into the third stream by XOR-ing
// values derived from config.shift2 / config.shift1 into the word that follows
// the third run, before that word is fed to crc2.
//
// config.extra() decides whether one extra word is consumed before the runs
// (extra() > 1) and one more after the merge word (extra() > 2).
// NOTE(review): loops() and extra() are declared outside this file; presumably
// config.words == 3 * loops() + extra() -- confirm against the header.
//
// This function uses the SSE4.2 CRC intrinsics unconditionally; it does not
// consult has_sse42. buf is dereferenced as const uint64_t*; crc32c() aligns
// the pointer to 8 bytes before calling.
static uint32_t crc32c_chunk(uint32_t crc, const void *buf, const chunk_config &config) {
DEBUG_PRINTF3(" crc32c_chunk(crc = 0x%08x, buf = %p, config.words = " SIZE_T_FORMAT ")", crc, buf,
config.words);
const uint64_t *pq = (const uint64_t *)buf;
// Stream 0 starts from the incoming crc, optionally after one leading word.
uint64_t crc0 = config.extra() > 1 ? _mm_crc32_u64(crc, *pq++) : crc;
// Streams 1 and 2 start from zero; they are combined with stream 0 below.
uint64_t crc1 = 0;
uint64_t crc2 = 0;
const size_t loops = config.loops();
// pq advances one word per iteration, so pq[k * loops] walks through run k.
for (unsigned int i = 0; i < loops; ++i, ++pq) {
crc1 = _mm_crc32_u64(crc1, pq[1 * loops]);
crc2 = _mm_crc32_u64(crc2, pq[2 * loops]);
crc0 = _mm_crc32_u64(crc0, pq[0 * loops]);
}
// pq now sits at the end of run 0; skip over runs 1 and 2.
pq += 2 * loops;
// Word that follows the three runs; the crc0/crc1 merge terms are XOR-ed into
// it before it is fed to stream 2.
uint64_t tmp = *pq++;
#ifdef CRC32C_PCLMULQDQ
if (has_pclmulqdq) {
// k: high qword = shift1[1], low qword = shift2[1].
__m128i k = _mm_set_epi64x(config.shift1[1], config.shift2[1]);
// imm 0x10: low qword of the first operand (crc1) times high qword of k.
__m128i mul1 = _mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)crc1), k, 0x10);
// imm 0x00: low qword of the first operand (crc0) times low qword of k.
__m128i mul0 = _mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)crc0), k, 0x00);
// Only the low 64 bits of each product are used.
tmp ^= (uint64_t)_mm_cvtsi128_si64(mul1);
tmp ^= (uint64_t)_mm_cvtsi128_si64(mul0);
} else
#endif
{
// Table path: each of the low four bytes of crc1 indexes shift1 and the entry
// is XOR-ed in at that byte's bit offset; the same is done for crc0 with
// shift2.
tmp ^= config.shift1[crc1 & 0xff];
tmp ^= ((uint64_t)config.shift1[(crc1 >> 8) & 0xff]) << 8;
tmp ^= ((uint64_t)config.shift1[(crc1 >> 16) & 0xff]) << 16;
tmp ^= ((uint64_t)config.shift1[(crc1 >> 24) & 0xff]) << 24;
tmp ^= config.shift2[crc0 & 0xff];
tmp ^= ((uint64_t)config.shift2[(crc0 >> 8) & 0xff]) << 8;
tmp ^= ((uint64_t)config.shift2[(crc0 >> 16) & 0xff]) << 16;
tmp ^= ((uint64_t)config.shift2[(crc0 >> 24) & 0xff]) << 24;
}
// Feeding the adjusted word to stream 2 completes the merge of all three.
crc2 = _mm_crc32_u64(crc2, tmp);
if (config.extra() > 2) // only if words is divisible by 3
crc2 = _mm_crc32_u64(crc2, *pq);
// The result is the low 32 bits of the stream 2 accumulator.
crc = (uint32_t)crc2;
DEBUG_PRINTF1(" = 0x%08x\n", crc);
return crc;
}
// Folds `count` consecutive 64-bit words starting at buf into the running
// value crc, one _mm_crc32_u64 per word, and returns the updated value.
//
// buf is dereferenced as const uint64_t*; crc32c() aligns the pointer to
// 8 bytes before calling.
//
// A count of zero is valid and returns crc unchanged. The previous
// switch-into-do-while form required count > 0: with assertions compiled out,
// a zero count made the loop counter wrap around and read far past the buffer.
// Each step depends on the previous crc value, so the plain loop performs the
// same sequence of CRC instructions as the manually unrolled form.
static uint32_t crc32c_words(uint32_t crc, const void *buf, size_t count) {
    DEBUG_PRINTF3(" crc32c_words(crc = 0x%08x, buf = %p, count = " SIZE_T_FORMAT ")", crc, buf, count);
    const uint64_t *pq = (const uint64_t *)buf;
    const uint64_t *const end = pq + count;
    while (pq != end) {
        crc = (uint32_t)_mm_crc32_u64(crc, *pq++);
    }
    DEBUG_PRINTF1(" = 0x%08x\n", crc);
    return crc;
}
// Folds `count` consecutive bytes starting at buf into the running value crc,
// one _mm_crc32_u8 per byte, and returns the updated value. crc32c() uses it
// for the unaligned head and the sub-word tail of a buffer.
//
// A count of zero is valid and returns crc unchanged. The previous
// switch-into-do-while form required count > 0: with assertions compiled out,
// a zero count made the loop counter wrap around and read far past the buffer.
// Each step depends on the previous crc value, so the plain loop performs the
// same sequence of CRC instructions as the manually unrolled form.
static uint32_t crc32c_bytes(uint32_t crc, const void *buf, size_t count) {
    DEBUG_PRINTF3(" crc32c_bytes(crc = 0x%08x, buf = %p, count = " SIZE_T_FORMAT ")", crc, buf, count);
    const uint8_t *pc = (const uint8_t *)buf;
    const uint8_t *const end = pc + count;
    while (pc != end) {
        crc = (uint32_t)_mm_crc32_u8(crc, *pc++);
    }
    DEBUG_PRINTF1(" = 0x%08x\n", crc);
    return crc;
}
uint32_t crc32c(uint32_t init, const void *buf, size_t len, const chunk_config *config) {
DEBUG_PRINTF3("crc32c(init = 0x%08x, buf = %p, len = " SIZE_T_FORMAT ")\n", init, buf, len);
uint32_t crc = ~init;
const char *pc = (const char *)buf;
if (len >= 24) {
if ((uintptr_t)pc & 7) {
size_t unaligned = 8 - ((uintptr_t)pc & 7);
crc = crc32c_bytes(crc, pc, unaligned);
pc += unaligned;
len -= unaligned;
}
size_t words = len / 8;
while (config) {
while (words >= config->words) {
crc = crc32c_chunk(crc, pc, *config);
pc += config->words * 8;
words -= config->words;
}
config = config->next;
}
if (words > 0) {
crc = crc32c_words(crc, pc, words);
pc += words * 8;
}
len &= 7;
}
if (len) crc = crc32c_bytes(crc, pc, len);
crc = ~crc;
DEBUG_PRINTF1("crc = 0x%08x\n", crc);
return crc;
}
#else // ! PULSAR_X86_64
// Build without PULSAR_X86_64: the SSE4.2 intrinsics are not compiled in, so
// the checksum is delegated to the software implementation. `config` is
// accepted only to keep the signature identical to the hardware variant.
uint32_t crc32c(uint32_t init, const void *buf, size_t len, const chunk_config *config) {
    (void)config;  // not used by the software path
    const uint32_t checksum = crc32c_sw(init, buf, len);
    return checksum;
}
#endif
} // namespace pulsar