Skip to content

Commit 38161c3

Browse files
deps: update zlib to 1.3.0.1-motley-24c07df
PR-URL: #52199 Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com> Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
1 parent af48641 commit 38161c3

8 files changed

+183
-26
lines changed

deps/zlib/CMakeLists.txt

+32-14
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
7474

7575
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto")
7676
endif()
77+
78+
# RISC-V 64-bit targets: define the macros that select the RVV code paths
# (RISCV_RVV, DEFLATE_SLIDE_HASH_RVV, ADLER32_SIMD_RVV) and append the
# target/-march options to CMAKE_C_FLAGS.
# NOTE(review): --target= looks like a Clang-style driver option; presumably
# this branch assumes a Clang toolchain -- confirm before using another compiler.
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
79+
add_definitions(-DRISCV_RVV)
80+
add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
81+
add_definitions(-DADLER32_SIMD_RVV)
82+
#TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
83+
# Required by CPU features detection code.
84+
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
85+
endif()
86+
7787
endif()
7888

7989
#
@@ -180,20 +190,28 @@ set(ZLIB_SRCS
180190
# Update list of source files if optimizations were enabled
181191
#============================================================================
182192
if (ENABLE_SIMD_OPTIMIZATIONS)
183-
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
184-
185-
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
186-
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
187-
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
188-
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
189-
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
190-
191-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
192-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
193-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
194-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
195-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
196-
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
193+
# Select the extra sources/headers for SIMD builds.
# riscv64: only the Adler-32 SIMD and CPU-feature files are appended;
# inflate.c stays in ZLIB_SRCS.
# Everything else: inflate.c is removed from ZLIB_SRCS and the
# contrib/optimizations inflate, chunk-copy, CRC-32 and CRC folding files are
# appended alongside the Adler-32 SIMD and CPU-feature files.
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
194+
message("RISCVV: Add optimizations.")
195+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
196+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
197+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
198+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
199+
else()
200+
# Swap the stock inflate.c for the optimized one appended further down.
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
201+
202+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
203+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
204+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
205+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
206+
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
207+
208+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
209+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
210+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
211+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
212+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
213+
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
214+
endif()
197215
endif()
198216

199217
# parse the full version number from zlib.h and include in ZLIB_FULL_VERSION

deps/zlib/adler32.c

+9-4
Original file line numberDiff line numberDiff line change
@@ -58,20 +58,24 @@
5858
#endif
5959

6060
#include "cpu_features.h"
61-
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
61+
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV)
6262
#include "adler32_simd.h"
6363
#endif
6464

6565
/* ========================================================================= */
6666
uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
6767
unsigned long sum2;
6868
unsigned n;
69-
69+
/* TODO(cavalcantii): verify whether these lengths are optimal for current CPUs. */
70+
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
71+
|| defined(ADLER32_SIMD_RVV)
7072
#if defined(ADLER32_SIMD_SSSE3)
7173
if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
72-
return adler32_simd_(adler, buf, len);
7374
#elif defined(ADLER32_SIMD_NEON)
7475
if (buf != Z_NULL && len >= 64)
76+
#elif defined(ADLER32_SIMD_RVV)
77+
if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv)
78+
#endif
7579
return adler32_simd_(adler, buf, len);
7680
#endif
7781

@@ -90,7 +94,8 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
9094
return adler | (sum2 << 16);
9195
}
9296

93-
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
97+
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
98+
|| defined(RISCV_RVV)
9499
/*
95100
* Use SIMD to compute the adler32. Since this function can be
96101
* freely used, check CPU features here. zlib convention is to

deps/zlib/adler32_simd.c

+104
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@
4141
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
4242
* of the adler s1 s2 of uint32_t type (see adler32.c).
4343
*/
44+
/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
45+
* For conditions of distribution and use, see copyright notice in zlib.h
46+
*/
4447

4548
#include "adler32_simd.h"
4649

@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
363366
return s1 | (s2 << 16);
364367
}
365368

369+
#elif defined(ADLER32_SIMD_RVV)
370+
#include <riscv_vector.h>
371+
/* adler32_rvv.c - RVV version of Adler-32
372+
* RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
373+
* on https://github.com/zlib-ng/zlib-ng/pull/1532
374+
* Port from Simon Hosie's fork:
375+
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
376+
*/
377+
378+
/*
 * RVV implementation of adler32_simd_(): folds |len| bytes from |buf| into
 * the Adler-32 value |adler| and returns the updated checksum, with sum2 in
 * the upper 16 bits and the byte sum in the lower 16 bits.
 * Input is consumed vl bytes at a time by the vector loops; a tail shorter
 * than vl is handled by the scalar loop at the end.
 */
uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
379+
uint32_t adler,
380+
const unsigned char *buf,
381+
unsigned long len)
382+
{
383+
/* split Adler-32 into component sums */
384+
uint32_t sum2 = (adler >> 16) & 0xffff;
385+
adler &= 0xffff;
386+
387+
size_t left = len;
388+
size_t vl = __riscv_vsetvlmax_e8m1();
389+
/* clamp vl to 256: for a larger vl, (256 / vl) * vl below would evaluate to 0 */
vl = vl > 256 ? 256 : vl;
390+
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
391+
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
392+
vuint16m2_t v_buf16_accu;
393+
394+
/*
395+
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
396+
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
397+
* accumulators to boost performance.
398+
*
399+
* The block_size is the largest multiple of vl that is <= 256, because overflow would occur when
400+
* vl > 256 (255 * 256 <= UINT16_MAX).
401+
*
402+
* We accumulate 8-bit data into a 16-bit accumulator and then
403+
* move the data into the 32-bit accumulator at the last iteration.
404+
*/
405+
size_t block_size = (256 / vl) * vl;
406+
size_t nmax_limit = (NMAX / block_size);
407+
size_t cnt = 0;
408+
while (left >= block_size) {
409+
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
410+
size_t subprob = block_size;
411+
while (subprob > 0) {
412+
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
413+
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
414+
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
415+
buf += vl;
416+
subprob -= vl;
417+
}
418+
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
419+
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
420+
left -= block_size;
421+
/* reduce modulo BASE once every nmax_limit blocks (roughly NMAX bytes) */
422+
if (++cnt >= nmax_limit) {
423+
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
424+
cnt = 0;
425+
}
426+
}
427+
/* fewer than block_size (<= 256) bytes remain, so the 16-bit accumulator is safe */
428+
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
429+
size_t res = left;
430+
while (left >= vl) {
431+
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
432+
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
433+
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
434+
buf += vl;
435+
left -= vl;
436+
}
437+
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
438+
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
439+
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
440+
441+
/* v_seq[i] = i and v_rev_seq[i] = vl - i: each lane's byte sum is weighted by vl - i */
vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
442+
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
443+
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
444+
445+
v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
446+
447+
/* horizontal reduction of the per-lane sum2 contributions into a scalar */
vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
448+
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
449+
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
450+
451+
/* (len - left) is the number of bytes consumed by the vector loops */
sum2 += (sum2_sum + adler * (len - left));
452+
453+
/* horizontal reduction of the per-lane byte sums into a scalar */
vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
454+
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
455+
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
456+
457+
adler += adler_sum;
458+
459+
/* scalar tail: the remaining (fewer than vl) bytes */
while (left--) {
460+
adler += *buf++;
461+
sum2 += adler;
462+
}
463+
464+
sum2 %= BASE;
465+
adler %= BASE;
466+
467+
return adler | (sum2 << 16);
468+
}
469+
366470
#endif /* ADLER32_SIMD_SSSE3 */

deps/zlib/cpu_features.c

+28-4
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,13 @@ int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
3333
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
3434
int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
3535

36+
int ZLIB_INTERNAL riscv_cpu_enable_rvv = 0;
37+
int ZLIB_INTERNAL riscv_cpu_enable_vclmul = 0;
38+
3639
#ifndef CPU_NO_SIMD
3740

38-
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
41+
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
42+
defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
3943
#include <pthread.h>
4044
#endif
4145

@@ -62,7 +66,10 @@ int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
6266
static void _cpu_check_features(void);
6367
#endif
6468

65-
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS)
69+
#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
70+
defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || \
71+
defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) || \
72+
defined(RISCV_RVV)
6673
#if !defined(ARMV8_OS_MACOS)
6774
// _cpu_check_features() doesn't need to do anything on mac/arm since all
6875
// features are known at build time, so don't call it.
@@ -184,6 +191,23 @@ static void _cpu_check_features(void)
184191
x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
185192
#endif
186193
}
194+
#endif // x86 & NO_SIMD
195+
196+
#elif defined(RISCV_RVV)
197+
#include <sys/auxv.h>
198+
199+
#ifndef ZLIB_HWCAP_RVV
200+
#define ZLIB_HWCAP_RVV (1 << ('v' - 'a'))
187201
#endif
188-
#endif
189-
#endif
202+
203+
/* TODO(cavalcantii)
204+
* - add support for Android@RISCV i.e. __riscv_hwprobe().
205+
* - detect vclmul (crypto extensions).
206+
*/
207+
/*
 * RISC-V feature probe: reads the AT_HWCAP entry via getauxval() and sets
 * riscv_cpu_enable_rvv to 1 when the ZLIB_HWCAP_RVV bit is set, 0 otherwise.
 * riscv_cpu_enable_vclmul is not written here.
 */
static void _cpu_check_features(void)
208+
{
209+
unsigned long features = getauxval(AT_HWCAP);
210+
riscv_cpu_enable_rvv = !!(features & ZLIB_HWCAP_RVV);
211+
}
212+
#endif // ARM | x86 | RISCV
213+
#endif // NO SIMD CPU

deps/zlib/cpu_features.h

+3
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3;
1616
extern int x86_cpu_enable_simd;
1717
extern int x86_cpu_enable_avx512;
1818

19+
extern int riscv_cpu_enable_rvv;
20+
extern int riscv_cpu_enable_vclmul;
21+
1922
void cpu_check_features(void);

deps/zlib/crc32.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,8 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
706706
* place to cache CPU features if needed for those later, more
707707
* interesting crc32() calls.
708708
*/
709-
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
709+
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
710+
|| defined(RISCV_RVV)
710711
/*
711712
* Since this routine can be freely used, check CPU features here.
712713
*/
@@ -1085,7 +1086,8 @@ unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
10851086
/* Some bots compile with optimizations disabled, others will emulate
10861087
* ARM on x86 and other weird combinations.
10871088
*/
1088-
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
1089+
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
1090+
|| defined(RISCV_RVV)
10891091
/* We have to verify CPU features, so exploit the common usage pattern
10901092
* of calling this function with Z_NULL for an initial valid crc value.
10911093
* This allows caching the result of the feature check and avoiding extraneous

deps/zlib/deflate.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,8 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
401401
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
402402
// Feature detection is not triggered while using RAW mode (i.e. we never
403403
// call crc32() with a NULL buffer).
404-
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
404+
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) \
405+
|| defined(RISCV_RVV)
405406
cpu_check_features();
406407
#endif
407408

src/zlib_version.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
// Refer to tools/dep_updaters/update-zlib.sh
33
#ifndef SRC_ZLIB_VERSION_H_
44
#define SRC_ZLIB_VERSION_H_
5-
#define ZLIB_VERSION "1.3.0.1-motley-24342f6"
5+
#define ZLIB_VERSION "1.3.0.1-motley-24c07df"
66
#endif // SRC_ZLIB_VERSION_H_

0 commit comments

Comments
 (0)