| author | Adenilson Cavalcanti <cavalcantii@chromium.org> | 2024-04-09 20:51:21 +0000 |
|---|---|---|
| committer | Copybara-Service <copybara-worker@google.com> | 2024-04-09 13:57:49 -0700 |
| commit | 37d9855c8db5a130571971e78fde2740314cd98a (patch) | |
| tree | 445bb75640dc1c08ed4c9dd68564818bd1b2300b | |
| parent | 29a30d38714cec7dd641d0c9e172b7e88b06a7f6 (diff) | |
| download | zlib-37d9855c8db5a130571971e78fde2740314cd98a.tar.gz | |
[zlib][riscv] Import superior Adler-32 implementation
Replace the SiFive code with an alternative checksum implementation
that works in short 22-iteration batches, thus avoiding overflow of
its 16-bit counters.
As a result, it has better parallelism in the inner loop, yielding
a +20% checksum speedup on a K230 board.
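For intuition, the 22-iteration bound is easy to verify with plain
integer arithmetic: within one batch the per-lane byte sum (`a_batch`)
and the sum-of-sums (`b_batch`) live in 16-bit lanes, and even with
every input byte at 0xff a 22-iteration batch stays below UINT16_MAX.
A minimal standalone check of that worst case (illustrative only; not
code from this patch):

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    /* Worst case for one 22-iteration batch: every input byte is 0xff.
     * b_batch accumulates the running a_batch *before* each new byte is
     * added, mirroring the order of operations in the vector inner loop. */
    uint32_t a_batch = 0, b_batch = 0;
    for (int i = 0; i < 22; i++) {
        b_batch += a_batch;   /* sum of sums */
        a_batch += 0xff;      /* per-lane byte sum */
    }
    assert(a_batch == 5610);  /* 22 * 255            <= UINT16_MAX */
    assert(b_batch == 58905); /* 255 * (21 * 22 / 2) <= UINT16_MAX */
    return 0;
}
```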
The average *decompression* gain when using the zlib wrapper on the
snappy data corpus was +2.15%, with nearly +4% for HTML content.
Patch by Simon Hosie, from:
https://github.com/cloudflare/zlib/pull/55
Bug: 329282661
Change-Id: I72e2ce9bb9b3d8626dedb33cf026f1af9b9b4a33
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5433273
Reviewed-by: Hans Wennborg <hans@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1284684}
NOKEYCHECK=True
GitOrigin-RevId: f68eb88e6ac1139355bad9d1f1eff784e9e82afb
| -rw-r--r-- | adler32_simd.c | 166 |
|---|---|---|

1 file changed, 76 insertions(+), 90 deletions(-)
```diff
diff --git a/adler32_simd.c b/adler32_simd.c
index 9970ea9..b3e1f0a 100644
--- a/adler32_simd.c
+++ b/adler32_simd.c
@@ -41,9 +41,6 @@
  * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
  * of the adler s1 s2 of uint32_t type (see adler32.c).
  */
-/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
 
 #include "adler32_simd.h"
 
@@ -368,11 +365,10 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
 
 #elif defined(ADLER32_SIMD_RVV)
 #include <riscv_vector.h>
-/* adler32_rvv.c - RVV version of Adler-32
- * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
- * on https://github.com/zlib-ng/zlib-ng/pull/1532
- * Port from Simon Hosie's fork:
- * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+
+/*
+ * Patch by Simon Hosie, from:
+ * https://github.com/cloudflare/zlib/pull/55
  */
 
 uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
@@ -380,91 +376,81 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
     const unsigned char *buf,
     unsigned long len)
 {
-    /* split Adler-32 into component sums */
-    uint32_t sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    size_t left = len;
-    size_t vl = __riscv_vsetvlmax_e8m1();
-    vl = vl > 256 ? 256 : vl;
-    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint16m2_t v_buf16_accu;
-
-    /*
-     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
-     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
-     * accumulators to boost performance.
-     *
-     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
-     * vl > 256 (255 * 256 <= UINT16_MAX).
-     *
-     * We accumulate 8-bit data into a 16-bit accumulator and then
-     * move the data into the 32-bit accumulator at the last iteration.
+    size_t vl = __riscv_vsetvlmax_e8m2();
+    const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
+    vuint16m4_t a_sum = zero16;
+    vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);
+
+    /* Deal with the part which is not a multiple of vl first; because it's
+     * easier to zero-stuff the beginning of the checksum than it is to tweak the
+     * multipliers and sums for odd lengths afterwards.
+     */
+    size_t head = len & (vl - 1);
+    if (head > 0) {
+        vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
+        vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
+        in = __riscv_vslideup(zero8, in, vl - head, vl);
+        vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
+        a_sum = in16;
+        buf += head;
+    }
+
+    /* We have a 32-bit accumulator, and in each iteration we add 22-times a
+     * 16-bit value, plus another 16-bit value. We periodically subtract up to
+     * 65535 times BASE to avoid overflow. b_overflow estimates how often we
+     * need to do this subtraction.
+     */
+    const int b_overflow = BASE / 23;
+    int fixup = b_overflow;
+    ssize_t iters = (len - head) / vl;
+    while (iters > 0) {
+        const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
+        int batch = iters < 22 ? iters : 22;
+        iters -= batch;
+        b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
+        vuint16m4_t a_batch = zero16, b_batch = zero16;
+
+        /* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
+         * register. Then add them back into the main accumulators.
          */
-    size_t block_size = (256 / vl) * vl;
-    size_t nmax_limit = (NMAX / block_size);
-    size_t cnt = 0;
-    while (left >= block_size) {
-        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-        size_t subprob = block_size;
-        while (subprob > 0) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
-            subprob -= vl;
-        }
-        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
-        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-        left -= block_size;
-        /* do modulo once each block of NMAX size */
-        if (++cnt >= nmax_limit) {
-            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-            cnt = 0;
-        }
+        while (batch-- > 0) {
+            vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
+            buf += vl;
+            b_batch = __riscv_vadd(b_batch, a_batch, vl);
+            a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
         }
-    /* the left len <= 256 now, we can use 16-bit accum safely */
-    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-    size_t res = left;
-    while (left >= vl) {
-        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-        buf += vl;
-        left -= vl;
+        vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
+        a_sum = __riscv_vadd(a_sum, a_batch, vl);
+        a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
+        b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
+        if (--fixup <= 0) {
+            b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
+            fixup = b_overflow;
         }
-    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
-    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-
-    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
-    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
-    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
-
-    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
-
-    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
-    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
-
-    sum2 += (sum2_sum + adler * (len - left));
-
-    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
-    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
-
-    adler += adler_sum;
-
-    while (left--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-
-    sum2 %= BASE;
-    adler %= BASE;
-
-    return adler | (sum2 << 16);
+    }
+    /* Adjust per-lane sums to have appropriate offsets from the end of the
+     * buffer.
+     */
+    const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
+    vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+    b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
+                         __riscv_vwmulu(bsum16, vl, vl), vl);
+    bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+
+    /* And finally, do a horizontal sum across the registers for the final
+     * result.
+     */
+    uint32_t a = adler & 0xffff;
+    uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
+    vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
+    vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
+    sca = __riscv_vwredsumu(a_sum, sca, vl);
+    scb = __riscv_vwredsumu(bsum16, scb, vl);
+    a = __riscv_vmv_x(sca);
+    b = __riscv_vmv_x(scb);
+    a %= BASE;
+    b %= BASE;
+    return (b << 16) | a;
 }
 
 #endif /* ADLER32_SIMD_SSSE3 */
```
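The `b_overflow = BASE / 23` fixup interval in the new inner loop can be
sanity-checked the same way: each outer iteration folds at most 23
16-bit quantities per lane into the 32-bit `b_sum` (22 copies of an
`a_sum` lane via `vwmaccu`, plus a `b_batch` lane), so up to `BASE / 23`
iterations fit in a 32-bit lane before the `vnmsac` reduction must run.
A standalone sketch of that budget, assuming 65535 as the worst-case
16-bit lane value (illustrative only; not code from this patch):

```c
#include <stdint.h>
#include <stdio.h>

#define BASE 65521U /* largest prime smaller than 65536, as in adler32.c */

int main(void) {
    /* Each outer iteration grows a 32-bit b_sum lane by at most
     * 22 * 65535 (the vwmaccu term) plus 65535 (the b_batch term). */
    const uint64_t per_iter = 23ULL * 65535ULL;
    const uint64_t fixup_interval = BASE / 23; /* b_overflow == 2848 */
    const uint64_t worst_case = fixup_interval * per_iter;

    /* 2848 * 23 * 65535 = 4292804640 < 2^32 = 4294967296, so a lane
     * cannot overflow before the periodic reduction subtracts
     * (b_sum >> 16) * BASE from it. */
    printf("worst-case lane growth: %llu\n", (unsigned long long)worst_case);
    return worst_case < (1ULL << 32) ? 0 : 1;
}
```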