|
6 | 6 | */
|
7 | 7 |
|
8 | 8 | #include "crc32_simd.h"
|
9 |
| - |
10 |
| -#if defined(CRC32_SIMD_SSE42_PCLMUL) |
| 9 | +#if defined(CRC32_SIMD_AVX512_PCLMUL) |
11 | 10 |
|
12 | 11 | /*
|
13 |
| - * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
14 |
| - * length must be at least 64, and a multiple of 16. Based on: |
| 12 | + * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer |
| 13 | + * length must be at least 256, and a multiple of 64. Based on: |
15 | 14 | *
|
16 | 15 | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
17 | 16 | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
|
18 | 17 | */
|
19 | 18 |
|
| 19 | +#include <emmintrin.h> |
| 20 | +#include <smmintrin.h> |
| 21 | +#include <wmmintrin.h> |
| 22 | +#include <immintrin.h> |
| 23 | + |
| 24 | +uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ |
| 25 | + const unsigned char *buf, |
| 26 | + z_size_t len, |
| 27 | + uint32_t crc) |
| 28 | +{ |
| 29 | + /* |
| 30 | + * Definitions of the bit-reflected domain constants k1,k2,k3,k4 |
| 31 | + * are similar to those given at the end of the paper, and remaining |
| 32 | + * constants and CRC32+Barrett polynomials remain unchanged. |
| 33 | + * |
| 34 | + * Replace the index of x from 128 to 512. As follows: |
| 35 | + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a |
| 36 | + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 |
| 37 | + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
| 38 | + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
| 39 | + */ |
| 40 | + static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, |
| 41 | + 0x011542778a, 0x01322d1430, |
| 42 | + 0x011542778a, 0x01322d1430, |
| 43 | + 0x011542778a, 0x01322d1430 }; |
| 44 | + static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, |
| 45 | + 0x0154442bd4, 0x01c6e41596, |
| 46 | + 0x0154442bd4, 0x01c6e41596, |
| 47 | + 0x0154442bd4, 0x01c6e41596 }; |
| 48 | + static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; |
| 49 | + static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; |
| 50 | + static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
| 51 | + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| 52 | + __m128i a0, a1, a2, a3; |
| 53 | + |
| 54 | + /* |
| 55 | + * There's at least one block of 256. |
| 56 | + */ |
| 57 | + x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| 58 | + x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| 59 | + x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| 60 | + x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| 61 | + |
| 62 | + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); |
| 63 | + |
| 64 | + x0 = _mm512_load_si512((__m512i *)k1k2); |
| 65 | + |
| 66 | + buf += 256; |
| 67 | + len -= 256; |
| 68 | + |
| 69 | + /* |
| 70 | + * Parallel fold blocks of 256, if any. |
| 71 | + */ |
| 72 | + while (len >= 256) |
| 73 | + { |
| 74 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 75 | + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); |
| 76 | + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); |
| 77 | + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); |
| 78 | + |
| 79 | + |
| 80 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 81 | + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); |
| 82 | + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); |
| 83 | + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); |
| 84 | + |
| 85 | + y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| 86 | + y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| 87 | + y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| 88 | + y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| 89 | + |
| 90 | + x1 = _mm512_xor_si512(x1, x5); |
| 91 | + x2 = _mm512_xor_si512(x2, x6); |
| 92 | + x3 = _mm512_xor_si512(x3, x7); |
| 93 | + x4 = _mm512_xor_si512(x4, x8); |
| 94 | + |
| 95 | + x1 = _mm512_xor_si512(x1, y5); |
| 96 | + x2 = _mm512_xor_si512(x2, y6); |
| 97 | + x3 = _mm512_xor_si512(x3, y7); |
| 98 | + x4 = _mm512_xor_si512(x4, y8); |
| 99 | + |
| 100 | + buf += 256; |
| 101 | + len -= 256; |
| 102 | + } |
| 103 | + |
| 104 | + /* |
| 105 | + * Fold into 512-bits. |
| 106 | + */ |
| 107 | + x0 = _mm512_load_si512((__m512i *)k3k4); |
| 108 | + |
| 109 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 110 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 111 | + x1 = _mm512_xor_si512(x1, x2); |
| 112 | + x1 = _mm512_xor_si512(x1, x5); |
| 113 | + |
| 114 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 115 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 116 | + x1 = _mm512_xor_si512(x1, x3); |
| 117 | + x1 = _mm512_xor_si512(x1, x5); |
| 118 | + |
| 119 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 120 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 121 | + x1 = _mm512_xor_si512(x1, x4); |
| 122 | + x1 = _mm512_xor_si512(x1, x5); |
| 123 | + |
| 124 | + /* |
| 125 | + * Single fold blocks of 64, if any. |
| 126 | + */ |
| 127 | + while (len >= 64) |
| 128 | + { |
| 129 | + x2 = _mm512_loadu_si512((__m512i *)buf); |
| 130 | + |
| 131 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 132 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 133 | + x1 = _mm512_xor_si512(x1, x2); |
| 134 | + x1 = _mm512_xor_si512(x1, x5); |
| 135 | + |
| 136 | + buf += 64; |
| 137 | + len -= 64; |
| 138 | + } |
| 139 | + |
| 140 | + /* |
| 141 | + * Fold 512-bits to 384-bits. |
| 142 | + */ |
| 143 | + a0 = _mm_load_si128((__m128i *)k5k6); |
| 144 | + |
| 145 | + a1 = _mm512_extracti32x4_epi32(x1, 0); |
| 146 | + a2 = _mm512_extracti32x4_epi32(x1, 1); |
| 147 | + |
| 148 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 149 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 150 | + |
| 151 | + a1 = _mm_xor_si128(a1, a3); |
| 152 | + a1 = _mm_xor_si128(a1, a2); |
| 153 | + |
| 154 | + /* |
| 155 | + * Fold 384-bits to 256-bits. |
| 156 | + */ |
| 157 | + a2 = _mm512_extracti32x4_epi32(x1, 2); |
| 158 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 159 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 160 | + a1 = _mm_xor_si128(a1, a3); |
| 161 | + a1 = _mm_xor_si128(a1, a2); |
| 162 | + |
| 163 | + /* |
| 164 | + * Fold 256-bits to 128-bits. |
| 165 | + */ |
| 166 | + a2 = _mm512_extracti32x4_epi32(x1, 3); |
| 167 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 168 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 169 | + a1 = _mm_xor_si128(a1, a3); |
| 170 | + a1 = _mm_xor_si128(a1, a2); |
| 171 | + |
| 172 | + /* |
| 173 | + * Fold 128-bits to 64-bits. |
| 174 | + */ |
| 175 | + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); |
| 176 | + a3 = _mm_setr_epi32(~0, 0, ~0, 0); |
| 177 | + a1 = _mm_srli_si128(a1, 8); |
| 178 | + a1 = _mm_xor_si128(a1, a2); |
| 179 | + |
| 180 | + a0 = _mm_loadl_epi64((__m128i*)k7k8); |
| 181 | + a2 = _mm_srli_si128(a1, 4); |
| 182 | + a1 = _mm_and_si128(a1, a3); |
| 183 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 184 | + a1 = _mm_xor_si128(a1, a2); |
| 185 | + |
| 186 | + /* |
| 187 | + * Barret reduce to 32-bits. |
| 188 | + */ |
| 189 | + a0 = _mm_load_si128((__m128i*)poly); |
| 190 | + |
| 191 | + a2 = _mm_and_si128(a1, a3); |
| 192 | + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); |
| 193 | + a2 = _mm_and_si128(a2, a3); |
| 194 | + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); |
| 195 | + a1 = _mm_xor_si128(a1, a2); |
| 196 | + |
| 197 | + /* |
| 198 | + * Return the crc32. |
| 199 | + */ |
| 200 | + return _mm_extract_epi32(a1, 1); |
| 201 | +} |
| 202 | + |
| 203 | +#elif defined(CRC32_SIMD_SSE42_PCLMUL) |
| 204 | + |
| 205 | +/* |
| 206 | + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
| 207 | + * length must be at least 64, and a multiple of 16. |
| 208 | + */ |
| 209 | + |
20 | 210 | #include <emmintrin.h>
|
21 | 211 | #include <smmintrin.h>
|
22 | 212 | #include <wmmintrin.h>
|
|
0 commit comments