Skip to content

Commit c8f9497

Browse files
committed
apacheGH-40095: [C++][Parquet] Remove AVX512 variants of BYTE_STREAM_SPLIT encoding
Two reasons: * the SSE2 and AVX2 variants are already fast enough (on the order of 10 GB/s) * the AVX512 variants do not seem faster, and can even be slower, on tested Intel machines
1 parent a03d957 commit c8f9497

File tree

3 files changed

+2
-251
lines changed

3 files changed

+2
-251
lines changed

cpp/src/arrow/util/byte_stream_split_internal.h

+2-220
Original file line numberDiff line numberDiff line change
@@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu
332332
}
333333
#endif // ARROW_HAVE_AVX2
334334

335-
#if defined(ARROW_HAVE_AVX512)
336-
template <int kNumStreams>
337-
void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
338-
uint8_t* out) {
339-
static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
340-
constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2);
341-
constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
342-
343-
const int64_t size = num_values * kNumStreams;
344-
if (size < kBlockSize) // Back to AVX2 for small size
345-
return ByteStreamSplitDecodeAvx2<kNumStreams>(data, num_values, stride, out);
346-
const int64_t num_blocks = size / kBlockSize;
347-
348-
// First handle suffix.
349-
const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
350-
for (int64_t i = num_processed_elements; i < num_values; ++i) {
351-
uint8_t gathered_byte_data[kNumStreams];
352-
for (int b = 0; b < kNumStreams; ++b) {
353-
const int64_t byte_index = b * stride + i;
354-
gathered_byte_data[b] = data[byte_index];
355-
}
356-
memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams);
357-
}
358-
359-
// Processed hierarchically using the unpack, then two shuffles.
360-
__m512i stage[kNumStreamsLog2 + 1][kNumStreams];
361-
__m512i shuffle[kNumStreams];
362-
__m512i final_result[kNumStreams];
363-
constexpr int kNumStreamsHalf = kNumStreams / 2U;
364-
365-
for (int64_t i = 0; i < num_blocks; ++i) {
366-
for (int j = 0; j < kNumStreams; ++j) {
367-
stage[0][j] = _mm512_loadu_si512(
368-
reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
369-
}
370-
371-
for (int step = 0; step < kNumStreamsLog2; ++step) {
372-
for (int j = 0; j < kNumStreamsHalf; ++j) {
373-
stage[step + 1][j * 2] =
374-
_mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
375-
stage[step + 1][j * 2 + 1] =
376-
_mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
377-
}
378-
}
379-
380-
if constexpr (kNumStreams == 8) {
381-
// path for double, 128i index:
382-
// {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
383-
// {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
384-
// {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
385-
// {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
386-
shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
387-
stage[kNumStreamsLog2][1], 0b01000100);
388-
shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
389-
stage[kNumStreamsLog2][3], 0b01000100);
390-
shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
391-
stage[kNumStreamsLog2][5], 0b01000100);
392-
shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
393-
stage[kNumStreamsLog2][7], 0b01000100);
394-
shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
395-
stage[kNumStreamsLog2][1], 0b11101110);
396-
shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
397-
stage[kNumStreamsLog2][3], 0b11101110);
398-
shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
399-
stage[kNumStreamsLog2][5], 0b11101110);
400-
shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
401-
stage[kNumStreamsLog2][7], 0b11101110);
402-
403-
final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
404-
final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
405-
final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
406-
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
407-
final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
408-
final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
409-
final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
410-
final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
411-
} else {
412-
// path for float, 128i index:
413-
// {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
414-
// {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
415-
shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
416-
stage[kNumStreamsLog2][1], 0b01000100);
417-
shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
418-
stage[kNumStreamsLog2][3], 0b01000100);
419-
shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
420-
stage[kNumStreamsLog2][1], 0b11101110);
421-
shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
422-
stage[kNumStreamsLog2][3], 0b11101110);
423-
424-
final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
425-
final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
426-
final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
427-
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
428-
}
429-
430-
for (int j = 0; j < kNumStreams; ++j) {
431-
_mm512_storeu_si512(
432-
reinterpret_cast<__m512i*>(out + (i * kNumStreams + j) * sizeof(__m512i)),
433-
final_result[j]);
434-
}
435-
}
436-
}
437-
438-
template <int kNumStreams>
439-
void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const int64_t num_values,
440-
uint8_t* output_buffer_raw) {
441-
static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
442-
constexpr int kBlockSize = sizeof(__m512i) * kNumStreams;
443-
444-
const int64_t size = num_values * kNumStreams;
445-
446-
if (size < kBlockSize) // Back to AVX2 for small size
447-
return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, num_values,
448-
output_buffer_raw);
449-
450-
const int64_t num_blocks = size / kBlockSize;
451-
const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
452-
__m512i* output_buffer_streams[kNumStreams];
453-
for (int i = 0; i < kNumStreams; ++i) {
454-
output_buffer_streams[i] =
455-
reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
456-
}
457-
458-
// First handle suffix.
459-
const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
460-
for (int64_t i = num_processed_elements; i < num_values; ++i) {
461-
for (int j = 0; j < kNumStreams; ++j) {
462-
const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
463-
output_buffer_raw[j * num_values + i] = byte_in_value;
464-
}
465-
}
466-
467-
constexpr int KNumUnpack = (kNumStreams == 8) ? 2 : 3;
468-
__m512i final_result[kNumStreams];
469-
__m512i unpack[KNumUnpack + 1][kNumStreams];
470-
__m512i permutex[kNumStreams];
471-
__m512i permutex_mask;
472-
if constexpr (kNumStreams == 8) {
473-
// use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
474-
permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
475-
0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
476-
0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
477-
0x00190011, 0x00090001, 0x00180010, 0x00080000);
478-
} else {
479-
permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
480-
0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
481-
}
482-
483-
for (int64_t block_index = 0; block_index < num_blocks; ++block_index) {
484-
for (int i = 0; i < kNumStreams; ++i) {
485-
unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
486-
}
487-
488-
for (int unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
489-
for (int i = 0; i < kNumStreams / 2; ++i) {
490-
unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
491-
unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
492-
unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
493-
unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
494-
}
495-
}
496-
497-
if constexpr (kNumStreams == 8) {
498-
// path for double
499-
// 1. unpack to epi16 block
500-
// 2. permutexvar_epi16 to 128i block
501-
// 3. shuffle 128i to final 512i target, index:
502-
// {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
503-
// {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
504-
// {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
505-
// {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
506-
for (int i = 0; i < kNumStreams; ++i)
507-
permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
508-
509-
__m512i shuffle[kNumStreams];
510-
shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
511-
shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
512-
shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
513-
shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
514-
shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
515-
shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
516-
shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
517-
shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
518-
519-
final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
520-
final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
521-
final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
522-
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
523-
final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
524-
final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
525-
final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
526-
final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
527-
} else {
528-
// Path for float.
529-
// 1. Processed hierarchically to 32i block using the unpack intrinsics.
530-
// 2. Pack 128i block using _mm256_permutevar8x32_epi32.
531-
// 3. Pack final 256i block with _mm256_permute2x128_si256.
532-
for (int i = 0; i < kNumStreams; ++i)
533-
permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
534-
535-
final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
536-
final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
537-
final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
538-
final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
539-
}
540-
541-
for (int i = 0; i < kNumStreams; ++i) {
542-
_mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
543-
}
544-
}
545-
}
546-
#endif // ARROW_HAVE_AVX512
547-
548335
#if defined(ARROW_HAVE_SIMD_SPLIT)
549336
template <int kNumStreams>
550337
void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
551338
int64_t stride, uint8_t* out) {
552-
#if defined(ARROW_HAVE_AVX512)
553-
return ByteStreamSplitDecodeAvx512<kNumStreams>(data, num_values, stride, out);
554-
#elif defined(ARROW_HAVE_AVX2)
339+
#if defined(ARROW_HAVE_AVX2)
555340
return ByteStreamSplitDecodeAvx2<kNumStreams>(data, num_values, stride, out);
556341
#elif defined(ARROW_HAVE_SSE4_2)
557342
return ByteStreamSplitDecodeSse2<kNumStreams>(data, num_values, stride, out);
@@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
563348
template <int kNumStreams>
564349
void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t num_values,
565350
uint8_t* output_buffer_raw) {
566-
#if defined(ARROW_HAVE_AVX512)
567-
return ByteStreamSplitEncodeAvx512<kNumStreams>(raw_values, num_values,
568-
output_buffer_raw);
569-
#elif defined(ARROW_HAVE_AVX2)
351+
#if defined(ARROW_HAVE_AVX2)
570352
return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, num_values,
571353
output_buffer_raw);
572354
#elif defined(ARROW_HAVE_SSE4_2)

cpp/src/arrow/util/byte_stream_split_test.cc

-4
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,6 @@ class TestByteStreamSplitSpecialized : public ::testing::Test {
8181
#if defined(ARROW_HAVE_AVX2)
8282
encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2<kWidth>});
8383
decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2<kWidth>});
84-
#endif
85-
#if defined(ARROW_HAVE_AVX512)
86-
encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512<kWidth>});
87-
decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512<kWidth>});
8884
#endif
8985
}
9086

cpp/src/parquet/encoding_benchmark.cc

-27
Original file line numberDiff line numberDiff line change
@@ -468,33 +468,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE);
468468
BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE);
469469
#endif
470470

471-
#if defined(ARROW_HAVE_AVX512)
472-
static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) {
473-
BM_ByteStreamSplitDecode<float>(
474-
state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512<sizeof(float)>);
475-
}
476-
477-
static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) {
478-
BM_ByteStreamSplitDecode<double>(
479-
state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512<sizeof(double)>);
480-
}
481-
482-
static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) {
483-
BM_ByteStreamSplitEncode<float>(
484-
state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512<sizeof(float)>);
485-
}
486-
487-
static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) {
488-
BM_ByteStreamSplitEncode<double>(
489-
state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512<sizeof(double)>);
490-
}
491-
492-
BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
493-
BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
494-
BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
495-
BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
496-
#endif
497-
498471
template <typename DType>
499472
static auto MakeDeltaBitPackingInputFixed(size_t length) {
500473
using T = typename DType::c_type;

0 commit comments

Comments
 (0)