@@ -115,7 +115,7 @@ void kernel_accumulate_block(
115
115
116
116
#else
117
117
118
- // a special version for NQ=1.
118
+ // a special version for NQ=1.
119
119
// Despite the function being large in the text form, it compiles to a very
120
120
// compact assembler code.
121
121
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
@@ -143,10 +143,8 @@ void kernel_accumulate_block_avx512_nq1(
143
143
144
144
// process "nsq - scaler.nscale" part
145
145
const int nsq_minus_nscale = nsq - scaler.nscale ;
146
- const int nsq_minus_nscale_8 =
147
- (nsq_minus_nscale / 8 ) * 8 ;
148
- const int nsq_minus_nscale_4 =
149
- (nsq_minus_nscale / 4 ) * 4 ;
146
+ const int nsq_minus_nscale_8 = (nsq_minus_nscale / 8 ) * 8 ;
147
+ const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4 ) * 4 ;
150
148
151
149
// process in chunks of 8
152
150
for (int sq = 0 ; sq < nsq_minus_nscale_8; sq += 8 ) {
@@ -291,7 +289,7 @@ void kernel_accumulate_block_avx512_nq1(
291
289
accu[q][3 ] += scaler.scale_hi (res1); // handle vectors 48..63
292
290
}
293
291
}
294
-
292
+
295
293
for (int q = 0 ; q < NQ; q++) {
296
294
// load LUTs for 4 quantizers
297
295
simd64uint8 lut (LUT);
@@ -352,12 +350,16 @@ void kernel_accumulate_block_avx512_nq1(
352
350
LUT += 32 ;
353
351
354
352
simd32uint8 res0 = scaler.lookup (lut, clo);
355
- accu[q][0 ] += simd32uint16 (scaler.scale_lo (res0)); // handle vectors 0..7
356
- accu[q][1 ] += simd32uint16 (scaler.scale_hi (res0)); // handle vectors 8..15
353
+ accu[q][0 ] +=
354
+ simd32uint16 (scaler.scale_lo (res0)); // handle vectors 0..7
355
+ accu[q][1 ] +=
356
+ simd32uint16 (scaler.scale_hi (res0)); // handle vectors 8..15
357
357
358
358
simd32uint8 res1 = scaler.lookup (lut, chi);
359
- accu[q][2 ] += simd32uint16 (scaler.scale_lo (res1)); // handle vectors 16..23
360
- accu[q][3 ] += simd32uint16 (scaler.scale_hi (res1)); // handle vectors 24..31
359
+ accu[q][2 ] += simd32uint16 (
360
+ scaler.scale_lo (res1)); // handle vectors 16..23
361
+ accu[q][3 ] += simd32uint16 (
362
+ scaler.scale_hi (res1)); // handle vectors 24..31
361
363
}
362
364
}
363
365
@@ -385,7 +387,6 @@ void kernel_accumulate_block_avx512_nqx(
385
387
const uint8_t * LUT,
386
388
ResultHandler& res,
387
389
const Scaler& scaler) {
388
-
389
390
// dummy alloc to keep the windows compiler happy
390
391
constexpr int NQA = NQ > 0 ? NQ : 1 ;
391
392
// distance accumulators
@@ -400,8 +401,7 @@ void kernel_accumulate_block_avx512_nqx(
400
401
401
402
// process "nsq - scaler.nscale" part
402
403
const int nsq_minus_nscale = nsq - scaler.nscale ;
403
- const int nsq_minus_nscale_4 =
404
- (nsq_minus_nscale / 4 ) * 4 ;
404
+ const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4 ) * 4 ;
405
405
406
406
// process in chunks of 4
407
407
for (int sq = 0 ; sq < nsq_minus_nscale_4; sq += 4 ) {
@@ -518,12 +518,16 @@ void kernel_accumulate_block_avx512_nqx(
518
518
LUT += 32 ;
519
519
520
520
simd32uint8 res0 = scaler.lookup (lut, clo);
521
- accu[q][0 ] += simd32uint16 (scaler.scale_lo (res0)); // handle vectors 0..7
522
- accu[q][1 ] += simd32uint16 (scaler.scale_hi (res0)); // handle vectors 8..15
521
+ accu[q][0 ] +=
522
+ simd32uint16 (scaler.scale_lo (res0)); // handle vectors 0..7
523
+ accu[q][1 ] +=
524
+ simd32uint16 (scaler.scale_hi (res0)); // handle vectors 8..15
523
525
524
526
simd32uint8 res1 = scaler.lookup (lut, chi);
525
- accu[q][2 ] += simd32uint16 (scaler.scale_lo (res1)); // handle vectors 16..23
526
- accu[q][3 ] += simd32uint16 (scaler.scale_hi (res1)); // handle vectors 24..31
527
+ accu[q][2 ] += simd32uint16 (
528
+ scaler.scale_lo (res1)); // handle vectors 16..23
529
+ accu[q][3 ] += simd32uint16 (
530
+ scaler.scale_hi (res1)); // handle vectors 24..31
527
531
}
528
532
}
529
533
@@ -542,16 +546,13 @@ void kernel_accumulate_block(
542
546
const uint8_t * codes,
543
547
const uint8_t * LUT,
544
548
ResultHandler& res,
545
- const Scaler& scaler
546
- ) {
549
+ const Scaler& scaler) {
547
550
if constexpr (NQ == 1 ) {
548
551
kernel_accumulate_block_avx512_nq1<ResultHandler, Scaler>(
549
- nsq, codes, LUT, res, scaler
550
- );
552
+ nsq, codes, LUT, res, scaler);
551
553
} else {
552
554
kernel_accumulate_block_avx512_nqx<NQ, ResultHandler, Scaler>(
553
- nsq, codes, LUT, res, scaler
554
- );
555
+ nsq, codes, LUT, res, scaler);
555
556
}
556
557
}
557
558
0 commit comments