@@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu
332
332
}
333
333
#endif // ARROW_HAVE_AVX2
334
334
335
- #if defined(ARROW_HAVE_AVX512)
336
- template <int kNumStreams >
337
- void ByteStreamSplitDecodeAvx512 (const uint8_t * data, int64_t num_values, int64_t stride,
338
- uint8_t * out) {
339
- static_assert (kNumStreams == 4 || kNumStreams == 8 , " Invalid number of streams." );
340
- constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2 );
341
- constexpr int64_t kBlockSize = sizeof (__m512i) * kNumStreams ;
342
-
343
- const int64_t size = num_values * kNumStreams ;
344
- if (size < kBlockSize ) // Back to AVX2 for small size
345
- return ByteStreamSplitDecodeAvx2<kNumStreams >(data, num_values, stride, out);
346
- const int64_t num_blocks = size / kBlockSize ;
347
-
348
- // First handle suffix.
349
- const int64_t num_processed_elements = (num_blocks * kBlockSize ) / kNumStreams ;
350
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
351
- uint8_t gathered_byte_data[kNumStreams ];
352
- for (int b = 0 ; b < kNumStreams ; ++b) {
353
- const int64_t byte_index = b * stride + i;
354
- gathered_byte_data[b] = data[byte_index];
355
- }
356
- memcpy (out + i * kNumStreams , gathered_byte_data, kNumStreams );
357
- }
358
-
359
- // Processed hierarchically using the unpack, then two shuffles.
360
- __m512i stage[kNumStreamsLog2 + 1 ][kNumStreams ];
361
- __m512i shuffle[kNumStreams ];
362
- __m512i final_result[kNumStreams ];
363
- constexpr int kNumStreamsHalf = kNumStreams / 2U ;
364
-
365
- for (int64_t i = 0 ; i < num_blocks; ++i) {
366
- for (int j = 0 ; j < kNumStreams ; ++j) {
367
- stage[0 ][j] = _mm512_loadu_si512 (
368
- reinterpret_cast <const __m512i*>(&data[i * sizeof (__m512i) + j * stride]));
369
- }
370
-
371
- for (int step = 0 ; step < kNumStreamsLog2 ; ++step) {
372
- for (int j = 0 ; j < kNumStreamsHalf ; ++j) {
373
- stage[step + 1 ][j * 2 ] =
374
- _mm512_unpacklo_epi8 (stage[step][j], stage[step][kNumStreamsHalf + j]);
375
- stage[step + 1 ][j * 2 + 1 ] =
376
- _mm512_unpackhi_epi8 (stage[step][j], stage[step][kNumStreamsHalf + j]);
377
- }
378
- }
379
-
380
- if constexpr (kNumStreams == 8 ) {
381
- // path for double, 128i index:
382
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
383
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
384
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
385
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
386
- shuffle[0 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
387
- stage[kNumStreamsLog2 ][1 ], 0b01000100 );
388
- shuffle[1 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
389
- stage[kNumStreamsLog2 ][3 ], 0b01000100 );
390
- shuffle[2 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][4 ],
391
- stage[kNumStreamsLog2 ][5 ], 0b01000100 );
392
- shuffle[3 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][6 ],
393
- stage[kNumStreamsLog2 ][7 ], 0b01000100 );
394
- shuffle[4 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
395
- stage[kNumStreamsLog2 ][1 ], 0b11101110 );
396
- shuffle[5 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
397
- stage[kNumStreamsLog2 ][3 ], 0b11101110 );
398
- shuffle[6 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][4 ],
399
- stage[kNumStreamsLog2 ][5 ], 0b11101110 );
400
- shuffle[7 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][6 ],
401
- stage[kNumStreamsLog2 ][7 ], 0b11101110 );
402
-
403
- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
404
- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
405
- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
406
- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
407
- final_result[4 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b10001000 );
408
- final_result[5 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b10001000 );
409
- final_result[6 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b11011101 );
410
- final_result[7 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b11011101 );
411
- } else {
412
- // path for float, 128i index:
413
- // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
414
- // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
415
- shuffle[0 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
416
- stage[kNumStreamsLog2 ][1 ], 0b01000100 );
417
- shuffle[1 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
418
- stage[kNumStreamsLog2 ][3 ], 0b01000100 );
419
- shuffle[2 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
420
- stage[kNumStreamsLog2 ][1 ], 0b11101110 );
421
- shuffle[3 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
422
- stage[kNumStreamsLog2 ][3 ], 0b11101110 );
423
-
424
- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
425
- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
426
- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
427
- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
428
- }
429
-
430
- for (int j = 0 ; j < kNumStreams ; ++j) {
431
- _mm512_storeu_si512 (
432
- reinterpret_cast <__m512i*>(out + (i * kNumStreams + j) * sizeof (__m512i)),
433
- final_result[j]);
434
- }
435
- }
436
- }
437
-
438
- template <int kNumStreams >
439
- void ByteStreamSplitEncodeAvx512 (const uint8_t * raw_values, const int64_t num_values,
440
- uint8_t * output_buffer_raw) {
441
- static_assert (kNumStreams == 4 || kNumStreams == 8 , " Invalid number of streams." );
442
- constexpr int kBlockSize = sizeof (__m512i) * kNumStreams ;
443
-
444
- const int64_t size = num_values * kNumStreams ;
445
-
446
- if (size < kBlockSize ) // Back to AVX2 for small size
447
- return ByteStreamSplitEncodeAvx2<kNumStreams >(raw_values, num_values,
448
- output_buffer_raw);
449
-
450
- const int64_t num_blocks = size / kBlockSize ;
451
- const __m512i* raw_values_simd = reinterpret_cast <const __m512i*>(raw_values);
452
- __m512i* output_buffer_streams[kNumStreams ];
453
- for (int i = 0 ; i < kNumStreams ; ++i) {
454
- output_buffer_streams[i] =
455
- reinterpret_cast <__m512i*>(&output_buffer_raw[num_values * i]);
456
- }
457
-
458
- // First handle suffix.
459
- const int64_t num_processed_elements = (num_blocks * kBlockSize ) / kNumStreams ;
460
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
461
- for (int j = 0 ; j < kNumStreams ; ++j) {
462
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
463
- output_buffer_raw[j * num_values + i] = byte_in_value;
464
- }
465
- }
466
-
467
- constexpr int KNumUnpack = (kNumStreams == 8 ) ? 2 : 3 ;
468
- __m512i final_result[kNumStreams ];
469
- __m512i unpack[KNumUnpack + 1 ][kNumStreams ];
470
- __m512i permutex[kNumStreams ];
471
- __m512i permutex_mask;
472
- if constexpr (kNumStreams == 8 ) {
473
- // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
474
- permutex_mask = _mm512_set_epi32 (0x001F0017 , 0x000F0007 , 0x001E0016 , 0x000E0006 ,
475
- 0x001D0015 , 0x000D0005 , 0x001C0014 , 0x000C0004 ,
476
- 0x001B0013 , 0x000B0003 , 0x001A0012 , 0x000A0002 ,
477
- 0x00190011 , 0x00090001 , 0x00180010 , 0x00080000 );
478
- } else {
479
- permutex_mask = _mm512_set_epi32 (0x0F , 0x0B , 0x07 , 0x03 , 0x0E , 0x0A , 0x06 , 0x02 , 0x0D ,
480
- 0x09 , 0x05 , 0x01 , 0x0C , 0x08 , 0x04 , 0x00 );
481
- }
482
-
483
- for (int64_t block_index = 0 ; block_index < num_blocks; ++block_index) {
484
- for (int i = 0 ; i < kNumStreams ; ++i) {
485
- unpack[0 ][i] = _mm512_loadu_si512 (&raw_values_simd[block_index * kNumStreams + i]);
486
- }
487
-
488
- for (int unpack_lvl = 0 ; unpack_lvl < KNumUnpack; ++unpack_lvl) {
489
- for (int i = 0 ; i < kNumStreams / 2 ; ++i) {
490
- unpack[unpack_lvl + 1 ][i * 2 ] = _mm512_unpacklo_epi8 (
491
- unpack[unpack_lvl][i * 2 ], unpack[unpack_lvl][i * 2 + 1 ]);
492
- unpack[unpack_lvl + 1 ][i * 2 + 1 ] = _mm512_unpackhi_epi8 (
493
- unpack[unpack_lvl][i * 2 ], unpack[unpack_lvl][i * 2 + 1 ]);
494
- }
495
- }
496
-
497
- if constexpr (kNumStreams == 8 ) {
498
- // path for double
499
- // 1. unpack to epi16 block
500
- // 2. permutexvar_epi16 to 128i block
501
- // 3. shuffle 128i to final 512i target, index:
502
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
503
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
504
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
505
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
506
- for (int i = 0 ; i < kNumStreams ; ++i)
507
- permutex[i] = _mm512_permutexvar_epi16 (permutex_mask, unpack[KNumUnpack][i]);
508
-
509
- __m512i shuffle[kNumStreams ];
510
- shuffle[0 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b01000100 );
511
- shuffle[1 ] = _mm512_shuffle_i32x4 (permutex[4 ], permutex[6 ], 0b01000100 );
512
- shuffle[2 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b11101110 );
513
- shuffle[3 ] = _mm512_shuffle_i32x4 (permutex[4 ], permutex[6 ], 0b11101110 );
514
- shuffle[4 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b01000100 );
515
- shuffle[5 ] = _mm512_shuffle_i32x4 (permutex[5 ], permutex[7 ], 0b01000100 );
516
- shuffle[6 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b11101110 );
517
- shuffle[7 ] = _mm512_shuffle_i32x4 (permutex[5 ], permutex[7 ], 0b11101110 );
518
-
519
- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
520
- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
521
- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
522
- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
523
- final_result[4 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b10001000 );
524
- final_result[5 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b11011101 );
525
- final_result[6 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b10001000 );
526
- final_result[7 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b11011101 );
527
- } else {
528
- // Path for float.
529
- // 1. Processed hierarchically to 32i block using the unpack intrinsics.
530
- // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
531
- // 3. Pack final 256i block with _mm256_permute2x128_si256.
532
- for (int i = 0 ; i < kNumStreams ; ++i)
533
- permutex[i] = _mm512_permutexvar_epi32 (permutex_mask, unpack[KNumUnpack][i]);
534
-
535
- final_result[0 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b01000100 );
536
- final_result[1 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b11101110 );
537
- final_result[2 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b01000100 );
538
- final_result[3 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b11101110 );
539
- }
540
-
541
- for (int i = 0 ; i < kNumStreams ; ++i) {
542
- _mm512_storeu_si512 (&output_buffer_streams[i][block_index], final_result[i]);
543
- }
544
- }
545
- }
546
- #endif // ARROW_HAVE_AVX512
547
-
548
335
#if defined(ARROW_HAVE_SIMD_SPLIT)
549
336
template <int kNumStreams >
550
337
void inline ByteStreamSplitDecodeSimd (const uint8_t * data, int64_t num_values,
551
338
int64_t stride, uint8_t * out) {
552
- #if defined(ARROW_HAVE_AVX512)
553
- return ByteStreamSplitDecodeAvx512<kNumStreams >(data, num_values, stride, out);
554
- #elif defined(ARROW_HAVE_AVX2)
339
+ #if defined(ARROW_HAVE_AVX2)
555
340
return ByteStreamSplitDecodeAvx2<kNumStreams >(data, num_values, stride, out);
556
341
#elif defined(ARROW_HAVE_SSE4_2)
557
342
return ByteStreamSplitDecodeSse2<kNumStreams >(data, num_values, stride, out);
@@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
563
348
template <int kNumStreams >
564
349
void inline ByteStreamSplitEncodeSimd (const uint8_t * raw_values, const int64_t num_values,
565
350
uint8_t * output_buffer_raw) {
566
- #if defined(ARROW_HAVE_AVX512)
567
- return ByteStreamSplitEncodeAvx512<kNumStreams >(raw_values, num_values,
568
- output_buffer_raw);
569
- #elif defined(ARROW_HAVE_AVX2)
351
+ #if defined(ARROW_HAVE_AVX2)
570
352
return ByteStreamSplitEncodeAvx2<kNumStreams >(raw_values, num_values,
571
353
output_buffer_raw);
572
354
#elif defined(ARROW_HAVE_SSE4_2)
0 commit comments