|
41 | 41 | * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
|
42 | 42 | * of the adler s1 s2 of uint32_t type (see adler32.c).
|
43 | 43 | */
|
| 44 | +/* Copyright (C) 2023 SiFive, Inc. All rights reserved. |
| 45 | + * For conditions of distribution and use, see copyright notice in zlib.h |
| 46 | + */ |
44 | 47 |
|
45 | 48 | #include "adler32_simd.h"
|
46 | 49 |
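The [2] note in the hunk above is the same property the new RVV path relies on: because BASE (65521) sits just under 2^16, the running sums s1 and s2 fit in plain uint32_t for up to NMAX = 5552 input bytes, so the modulo only has to be taken once per block instead of once per byte. A minimal scalar sketch of that blocking idea, for reference only (this is not zlib's adler32_z(); BASE and NMAX are the values from zlib's adler32.c, the function name and everything else is illustrative):

```c
#include <stddef.h>
#include <stdint.h>

#define BASE 65521u /* largest prime smaller than 65536 */
#define NMAX 5552   /* max bytes we can sum before s2 could overflow uint32_t */

static uint32_t adler32_blocked(uint32_t adler, const unsigned char *buf, size_t len)
{
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;

    while (len > 0) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        while (n--) {       /* no modulo inside the block */
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= BASE;         /* one reduction per block instead of per byte */
        s2 %= BASE;
    }
    return s1 | (s2 << 16);
}
```

NMAX = 5552 is the largest n for which 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in a uint32_t, i.e. the worst case for s2 after n bytes of 0xff starting from an already-reduced state.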
|
@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
|
363 | 366 | return s1 | (s2 << 16);
|
364 | 367 | }
|
365 | 368 |
|
| 369 | +#elif defined(ADLER32_SIMD_RVV) |
| 370 | +#include <riscv_vector.h> |
| 371 | +/* adler32_rvv.c - RVV version of Adler-32 |
| 372 | + * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com> |
| 373 | + * on https://github.com/zlib-ng/zlib-ng/pull/1532 |
| 374 | + * Port from Simon Hosie's fork: |
| 375 | + * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1 |
| 376 | + */ |
| 377 | + |
| 378 | +uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */ |
| 379 | + uint32_t adler, |
| 380 | + const unsigned char *buf, |
| 381 | + unsigned long len) |
| 382 | +{ |
| 383 | + /* split Adler-32 into component sums */ |
| 384 | + uint32_t sum2 = (adler >> 16) & 0xffff; |
| 385 | + adler &= 0xffff; |
| 386 | + |
| 387 | + size_t left = len; |
| 388 | + size_t vl = __riscv_vsetvlmax_e8m1(); |
| 389 | + vl = vl > 256 ? 256 : vl; |
| 390 | + vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl); |
| 391 | + vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl); |
| 392 | + vuint16m2_t v_buf16_accu; |
| 393 | + |
| 394 | + /* |
| 395 | + * We accumulate 8-bit bytes, and to avoid overflow the running sums must eventually live in |
| 396 | + * 32-bit accumulators. Widening every byte straight into 32 bits is slow, so we stage the |
| 397 | + * work through 16-bit accumulators and fold them into the 32-bit ones. |
| 398 | + * |
| 399 | + * block_size is the largest multiple of vl that does not exceed 256, so each 16-bit lane |
| 400 | + * accumulates at most 256 bytes per block: 255 * 256 = 65280 <= UINT16_MAX, hence no overflow. |
| 401 | + * |
| 402 | + * Within a block we accumulate bytes into the 16-bit accumulator and widen that partial sum |
| 403 | + * into the 32-bit accumulator only once the block is complete. |
| 404 | + */ |
| 405 | + size_t block_size = (256 / vl) * vl; |
| 406 | + size_t nmax_limit = (NMAX / block_size); |
| 407 | + size_t cnt = 0; |
| 408 | + while (left >= block_size) { |
| 409 | + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); |
| 410 | + size_t subprob = block_size; |
| 411 | + while (subprob > 0) { |
| 412 | + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); |
| 413 | + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); |
| 414 | + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); |
| 415 | + buf += vl; |
| 416 | + subprob -= vl; |
| 417 | + } |
| 418 | + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl); /* bytes from earlier blocks count once per iteration of this block */ |
| 419 | + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); |
| 420 | + left -= block_size; |
| 421 | + /* reduce modulo BASE once per NMAX-sized chunk to keep the 32-bit sums from overflowing */ |
| 422 | + if (++cnt >= nmax_limit) { |
| 423 | + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); |
| 424 | + cnt = 0; |
| 425 | + } |
| 426 | + } |
| 427 | + /* fewer than block_size (<= 256) bytes remain, so the 16-bit accumulator is still safe */ |
| 428 | + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); |
| 429 | + size_t res = left; |
| 430 | + while (left >= vl) { |
| 431 | + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); |
| 432 | + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); |
| 433 | + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); |
| 434 | + buf += vl; |
| 435 | + left -= vl; |
| 436 | + } |
| 437 | + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl); |
| 438 | + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); |
| 439 | + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); |
| 440 | + |
| 441 | + vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl); /* lane indices 0, 1, ..., vl-1 */ |
| 442 | + vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl); /* reverse weights vl, vl-1, ..., 1 */ |
| 443 | + vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl); /* weight each lane's byte sum by its reverse lane index */ |
| 444 | + |
| 445 | + v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl); /* each earlier-iteration count spans vl byte positions */ |
| 446 | + |
| 447 | + vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl); |
| 448 | + v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl); |
| 449 | + uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum); |
| 450 | + |
| 451 | + sum2 += (sum2_sum + adler * (len - left)); /* the incoming adler contributes once for every vector-processed byte */ |
| 452 | + |
| 453 | + vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl); |
| 454 | + v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl); |
| 455 | + uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum); |
| 456 | + |
| 457 | + adler += adler_sum; |
| 458 | + |
| 459 | + while (left--) { |
| 460 | + adler += *buf++; |
| 461 | + sum2 += adler; |
| 462 | + } |
| 463 | + |
| 464 | + sum2 %= BASE; |
| 465 | + adler %= BASE; |
| 466 | + |
| 467 | + return adler | (sum2 << 16); |
| 468 | +} |
| 469 | + |
366 | 470 | #endif /* ADLER32_SIMD_SSSE3 */
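The least obvious part of the diff is the final reduction (v_rev_seq, the two multiplies, and the vredsum calls). What it computes is the standard Adler-32 block identity: appending n bytes b[0..n-1] to a state (s1, s2) gives

    s1' = s1 + sum(b[i])
    s2' = s2 + n*s1 + sum((n - i) * b[i]),   i = 0..n-1

where the (n - i) weight is split into a within-iteration part (vl - lane), supplied by v_rev_seq, and an across-iteration part vl * (iterations still to come), accumulated in v_adler32_prev_accu. A scalar reference for that identity, purely illustrative (the helper name is made up, and the periodic modulo and 16-bit staging of the real code are omitted):

```c
#include <stddef.h>
#include <stdint.h>

/* Reference for the identity only; it will overflow for large n, which is
 * exactly why the vector code reduces modulo BASE periodically. */
static void adler_append_reference(uint32_t *s1, uint32_t *s2,
                                   const unsigned char *b, size_t n)
{
    uint32_t byte_sum = 0;     /* role of the reduced v_buf32_accu (adler_sum) */
    uint32_t weighted_sum = 0; /* role of the reduced v_sum32_accu (sum2_sum)  */

    for (size_t i = 0; i < n; i++) {
        byte_sum += b[i];
        weighted_sum += (uint32_t)(n - i) * b[i];
    }
    *s2 += weighted_sum + *s1 * (uint32_t)n; /* mirrors: sum2 += sum2_sum + adler * (len - left) */
    *s1 += byte_sum;                         /* mirrors: adler += adler_sum */
}
```

Read this way, the vmacc calls in the diff are bookkeeping for the across-iteration weights: every byte already counted in v_buf32_accu must be counted once more for each later iteration, and each iteration advances the byte position by vl, hence the final scaling of v_adler32_prev_accu by vl.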
|