| From 8463fb7b14f797de715d11cca2df4b9534f98d17 Mon Sep 17 00:00:00 2001 |
| From: Sebastian Pop <spop@amazon.com> |
| Date: Fri, 7 Apr 2023 21:26:54 +0000 |
| Subject: [PATCH 1/2] [arm64] use a better translation for move_mask |
| |
| No changes | With the patch | Speedup |
| $ python3 ./tests/test_ext.py | | |
| .bitshuffle 64 : 4.94 s/GB, 0.20 GB/s | 1.53 s/GB, 0.65 GB/s | 3.25x |
| .bitunshuffle 64 : 5.09 s/GB, 0.20 GB/s | 1.53 s/GB, 0.65 GB/s | 3.25x |
| .compress 64 : 5.26 s/GB, 0.19 GB/s | 1.80 s/GB, 0.55 GB/s | 2.89x |
| .compress zstd 64 : 8.02 s/GB, 0.12 GB/s | 4.80 s/GB, 0.21 GB/s | 1.75x |
| .decompress 64 : 5.72 s/GB, 0.17 GB/s | 2.21 s/GB, 0.45 GB/s | 2.64x |
| .decompress zstd 64 : 5.71 s/GB, 0.18 GB/s | 2.18 s/GB, 0.46 GB/s | 2.55x |
| --- |
| src/bitshuffle_core.c | 89 +++++++++++++++++++++++++++++-------------- |
| 1 file changed, 60 insertions(+), 29 deletions(-) |
| |
| diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c |
| index ba41473..22203db 100644 |
| --- a/src/bitshuffle_core.c |
| +++ b/src/bitshuffle_core.c |
| @@ -605,44 +605,59 @@ int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, |
| } |
| } |
| |
| - |
| -/* Creates a mask made up of the most significant |
| - * bit of each byte of 'input' |
| - */ |
| -int32_t move_byte_mask_neon(uint8x16_t input) { |
| - |
| - return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3) |
| - | (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7) |
| - | (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11) |
| - | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15) |
| - ); |
| +uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) { |
| + const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, |
| + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; |
| + uint8x16_t t0 = vandq_u8(p0, bitmask); |
| + uint8x16_t t1 = vandq_u8(p1, bitmask); |
| + uint8x16_t t2 = vandq_u8(p2, bitmask); |
| + uint8x16_t t3 = vandq_u8(p3, bitmask); |
| + uint8x16_t sum0 = vpaddq_u8(t0, t1); |
| + uint8x16_t sum1 = vpaddq_u8(t2, t3); |
| + sum0 = vpaddq_u8(sum0, sum1); |
| + sum0 = vpaddq_u8(sum0, sum0); |
| + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); |
| } |
| |
| /* Transpose bits within bytes. */ |
| int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, |
| const size_t elem_size) { |
| |
| - size_t ii, kk; |
| + size_t ii; |
| const char* in_b = (const char*) in; |
| char* out_b = (char*) out; |
| - uint16_t* out_ui16; |
| - |
| int64_t count; |
| - |
| size_t nbyte = elem_size * size; |
| |
| CHECK_MULT_EIGHT(nbyte); |
| |
| - int16x8_t xmm; |
| - int32_t bt; |
| + const uint8x16_t a0 = vdupq_n_u8(0x80); |
| + const uint8x16_t a1 = vdupq_n_u8(0x40); |
| + const uint8x16_t a2 = vdupq_n_u8(0x20); |
| + const uint8x16_t a3 = vdupq_n_u8(0x10); |
| + const uint8x16_t a4 = vdupq_n_u8(0x8); |
| + const uint8x16_t a5 = vdupq_n_u8(0x4); |
| + const uint8x16_t a6 = vdupq_n_u8(0x2); |
| + const uint8x16_t a7 = vdupq_n_u8(0x1); |
| |
| for (ii = 0; ii + 15 < nbyte; ii += 16) { |
| - xmm = vld1q_s16((int16_t *) (in_b + ii)); |
| + uint8x16_t x = vld1q_u8((uint8_t *) (in_b + ii)); |
| + uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0)); |
| + uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1)); |
| + uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2)); |
| + uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3)); |
| + uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4)); |
| + uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5)); |
| + uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6)); |
| + uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7)); |
| + |
| + uint64_t out[2]; |
| + out[0] = neonmovemask_bulk(x0, x1, x2, x3); |
| + out[1] = neonmovemask_bulk(x4, x5, x6, x7); |
| + int kk; |
| for (kk = 0; kk < 8; kk++) { |
| - bt = move_byte_mask_neon((uint8x16_t) xmm); |
| - xmm = vshlq_n_s16(xmm, 1); |
| - out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| - *out_ui16 = bt; |
| + uint16_t *out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| + *out_ui16 = ((uint16_t*)out)[kk]; |
| } |
| } |
| count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, |
| @@ -785,21 +800,37 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t |
| size_t ii, jj, kk; |
| size_t nbyte = elem_size * size; |
| |
| - int16x8_t xmm; |
| - int32_t bt; |
| - |
| if (elem_size % 2) { |
| bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); |
| } else { |
| + const uint8x16_t a0 = vdupq_n_u8(0x80); |
| + const uint8x16_t a1 = vdupq_n_u8(0x40); |
| + const uint8x16_t a2 = vdupq_n_u8(0x20); |
| + const uint8x16_t a3 = vdupq_n_u8(0x10); |
| + const uint8x16_t a4 = vdupq_n_u8(0x8); |
| + const uint8x16_t a5 = vdupq_n_u8(0x4); |
| + const uint8x16_t a6 = vdupq_n_u8(0x2); |
| + const uint8x16_t a7 = vdupq_n_u8(0x1); |
| for (ii = 0; ii + 8 * elem_size - 1 < nbyte; |
| ii += 8 * elem_size) { |
| for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { |
| - xmm = vld1q_s16((int16_t *) &in_b[ii + jj]); |
| + uint8x16_t x = vld1q_u8((uint8_t *) &in_b[ii + jj]); |
| + uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0)); |
| + uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1)); |
| + uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2)); |
| + uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3)); |
| + uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4)); |
| + uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5)); |
| + uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6)); |
| + uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7)); |
| + |
| + uint64_t out[2]; |
| + out[0] = neonmovemask_bulk(x0, x1, x2, x3); |
| + out[1] = neonmovemask_bulk(x4, x5, x6, x7); |
| + |
| for (kk = 0; kk < 8; kk++) { |
| - bt = move_byte_mask_neon((uint8x16_t) xmm); |
| - xmm = vshlq_n_s16(xmm, 1); |
| size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); |
| - out_ui16[ind / 2] = bt; |
| + out_ui16[ind / 2] = ((uint16_t *)out)[kk]; |
| } |
| } |
| } |
| -- |
| 2.31.1 |
| |
| |
| From db32ce8d0f0ab3f245008da58f0fe983243f6823 Mon Sep 17 00:00:00 2001 |
| From: Sebastian Pop <spop@amazon.com> |
| Date: Sat, 15 Apr 2023 13:42:06 +0000 |
| Subject: [PATCH 2/2] fix aliasing bug |
| |
| Patch from Andrew Pinski <pinskia@gcc.gnu.org>. |
| --- |
| src/bitshuffle_core.c | 14 ++++++++------ |
| 1 file changed, 8 insertions(+), 6 deletions(-) |
| |
| diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c |
| index 22203db..f3b6ca0 100644 |
| --- a/src/bitshuffle_core.c |
| +++ b/src/bitshuffle_core.c |
| @@ -49,6 +49,8 @@ typedef int64_t omp_size_t; |
| typedef size_t omp_size_t; |
| #endif |
| |
| +typedef uint16_t alias_uint16_t __attribute__((may_alias)); |
| + |
| // Macros. |
| #define CHECK_MULT_EIGHT(n) if (n % 8) return -80; |
| #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) |
| @@ -656,8 +658,8 @@ int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, |
| out[1] = neonmovemask_bulk(x4, x5, x6, x7); |
| int kk; |
| for (kk = 0; kk < 8; kk++) { |
| - uint16_t *out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| - *out_ui16 = ((uint16_t*)out)[kk]; |
| + alias_uint16_t *out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| + *out_ui16 = ((alias_uint16_t*)out)[kk]; |
| } |
| } |
| count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, |
| @@ -795,7 +797,7 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t |
| // With a bit of care, this could be written such that such that it is |
| // in_buf = out_buf safe. |
| const char* in_b = (const char*) in; |
| - uint16_t* out_ui16 = (uint16_t*) out; |
| + alias_uint16_t* out_ui16 = (alias_uint16_t*) out; |
| |
| size_t ii, jj, kk; |
| size_t nbyte = elem_size * size; |
| @@ -830,7 +832,7 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t |
| |
| for (kk = 0; kk < 8; kk++) { |
| size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); |
| - out_ui16[ind / 2] = ((uint16_t *)out)[kk]; |
| + out_ui16[ind / 2] = ((alias_uint16_t *)out)[kk]; |
| } |
| } |
| } |
| @@ -1145,7 +1147,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, |
| size_t ii, kk; |
| const char* in_b = (const char*) in; |
| char* out_b = (char*) out; |
| - uint16_t* out_ui16; |
| + alias_uint16_t* out_ui16; |
| |
| int64_t count; |
| |
| @@ -1161,7 +1163,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, |
| for (kk = 0; kk < 8; kk++) { |
| bt = _mm_movemask_epi8(xmm); |
| xmm = _mm_slli_epi16(xmm, 1); |
| - out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| + out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; |
| *out_ui16 = bt; |
| } |
| } |
| -- |
| 2.31.1 |
| |