From c6652d3a9d4ac854dd2a83935110be76683c06a5 Mon Sep 17 00:00:00 2001 From: Liz Fong-Jones Date: Sun, 4 Dec 2022 06:07:26 -0500 Subject: [PATCH 1/2] [zstd] sync xxhash with final accepted patch upstream Syncs with https://github.com/cespare/xxhash/pull/51 --- zstd/internal/xxhash/xxhash_arm64.s | 83 +++++++++++++++-------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/zstd/internal/xxhash/xxhash_arm64.s b/zstd/internal/xxhash/xxhash_arm64.s index 4d64a17d69..90830ab6e5 100644 --- a/zstd/internal/xxhash/xxhash_arm64.s +++ b/zstd/internal/xxhash/xxhash_arm64.s @@ -4,10 +4,10 @@ // Register allocation. #define digest R1 -#define h R2 // Return value. -#define p R3 // Input pointer. +#define h R2 // Return value. +#define p R3 // Input pointer. #define len R4 -#define nblocks R5 // len / 32. +#define nblocks R5 // len / 32. #define prime1 R7 #define prime2 R8 #define prime3 R9 @@ -22,56 +22,58 @@ #define x3 R22 #define x4 R23 -#define round(acc, x) \ - MADD prime2, acc, x, acc \ - ROR $64-31, acc \ - MUL prime1, acc \ +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc \ // x = round(0, x). -#define round0(x) \ - MUL prime2, x \ - ROR $64-31, x \ - MUL prime1, x \ +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x \ -#define mergeRound(x) \ - round0(x) \ - EOR x, h \ - MADD h, prime4, prime1, h \ +#define mergeRound(x) \ + round0(x) \ + EOR x, h \ + MADD h, prime4, prime1, h \ // Update v[1-4] with 32-byte blocks. Assumes len >= 32. -#define blocksLoop() \ - LSR $5, len, nblocks \ - PCALIGN $16 \ - loop: \ - LDP.P 32(p), (x1, x2) \ - round(v1, x1) \ - LDP -16(p), (x3, x4) \ - round(v2, x2) \ - SUB $1, nblocks \ - round(v3, x3) \ - round(v4, x4) \ - CBNZ nblocks, loop \ +#define blocksLoop() \ + LSR $5, len, nblocks \ + PCALIGN $16 \ +loop: \ + LDP.P 32(p), (x1, x2) \ + round(v1, x1) \ + LDP -16(p), (x3, x4) \ + round(v2, x2) \ + round(v3, x3) \ + round(v4, x4) \ + SUB $1, nblocks \ + CBNZ nblocks, loop \ + // The primes are repeated here to ensure that they're stored // in a contiguous array, so we can load them with LDP. -DATA primes<> +0(SB)/8, $11400714785074694791 -DATA primes<> +8(SB)/8, $14029467366897019727 -DATA primes<>+16(SB)/8, $1609587929392839161 -DATA primes<>+24(SB)/8, $9650029242287828579 -DATA primes<>+32(SB)/8, $2870177450012600261 +DATA primes<> +0(SB)/8, $11400714785074694791 +DATA primes<> +8(SB)/8, $14029467366897019727 +DATA primes<>+16(SB)/8, $1609587929392839161 +DATA primes<>+24(SB)/8, $9650029242287828579 +DATA primes<>+32(SB)/8, $2870177450012600261 GLOBL primes<>(SB), NOPTR+RODATA, $40 + // func Sum64(b []byte) uint64 TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 - LDP b_base+0(FP), (p, len) + LDP b_base+0(FP), (p, len) LDP primes<> +0(SB), (prime1, prime2) LDP primes<>+16(SB), (prime3, prime4) MOVD primes<>+32(SB), prime5 CMP $32, len - CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } - BLO afterLoop + CSEL LT, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } + BLT afterLoop ADD prime1, prime2, v1 MOVD prime2, v2 @@ -154,23 +156,24 @@ try1: end: EOR h >> 33, h - MUL prime2, h + MUL prime2, h EOR h >> 29, h - MUL prime3, h + MUL prime3, h EOR h >> 32, h MOVD h, ret+24(FP) RET + // func writeBlocks(d *Digest, b []byte) int // // Assumes len(b) >= 32. TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 - LDP primes<>(SB), (prime1, prime2) + LDP primes<>(SB), (prime1, prime2) // Load state. Assume v[1-4] are stored contiguously. MOVD d+0(FP), digest - LDP 0(digest), (v1, v2) + LDP 0(digest), (v1, v2) LDP 16(digest), (v3, v4) LDP b_base+8(FP), (p, len) @@ -178,7 +181,7 @@ TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 blocksLoop() // Store updated state. - STP (v1, v2), 0(digest) + STP (v1, v2), 0(digest) STP (v3, v4), 16(digest) BIC $31, len From cf02dfdad8a88600d344622942b5f1e4afdcbe53 Mon Sep 17 00:00:00 2001 From: Liz Fong-Jones Date: Sun, 4 Dec 2022 22:12:40 +1100 Subject: [PATCH 2/2] asmfmt --- zstd/internal/xxhash/xxhash_arm64.s | 79 ++++++++++++++--------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/zstd/internal/xxhash/xxhash_arm64.s b/zstd/internal/xxhash/xxhash_arm64.s index 90830ab6e5..36f2effee0 100644 --- a/zstd/internal/xxhash/xxhash_arm64.s +++ b/zstd/internal/xxhash/xxhash_arm64.s @@ -4,10 +4,10 @@ // Register allocation. #define digest R1 -#define h R2 // Return value. -#define p R3 // Input pointer. +#define h R2 // Return value. +#define p R3 // Input pointer. #define len R4 -#define nblocks R5 // len / 32. +#define nblocks R5 // len / 32. #define prime1 R7 #define prime2 R8 #define prime3 R9 @@ -22,50 +22,48 @@ #define x3 R22 #define x4 R23 -#define round(acc, x) \ - MADD prime2, acc, x, acc \ - ROR $64-31, acc \ - MUL prime1, acc \ +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc \ // x = round(0, x). -#define round0(x) \ - MUL prime2, x \ - ROR $64-31, x \ - MUL prime1, x \ +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x \ -#define mergeRound(x) \ - round0(x) \ - EOR x, h \ - MADD h, prime4, prime1, h \ +#define mergeRound(x) \ + round0(x) \ + EOR x, h \ + MADD h, prime4, prime1, h \ // Update v[1-4] with 32-byte blocks. Assumes len >= 32. -#define blocksLoop() \ - LSR $5, len, nblocks \ - PCALIGN $16 \ -loop: \ - LDP.P 32(p), (x1, x2) \ - round(v1, x1) \ - LDP -16(p), (x3, x4) \ - round(v2, x2) \ - round(v3, x3) \ - round(v4, x4) \ - SUB $1, nblocks \ - CBNZ nblocks, loop \ - +#define blocksLoop() \ + LSR $5, len, nblocks \ + PCALIGN $16 \ + loop: \ + LDP.P 32(p), (x1, x2) \ + round(v1, x1) \ + LDP -16(p), (x3, x4) \ + round(v2, x2) \ + round(v3, x3) \ + round(v4, x4) \ + SUB $1, nblocks \ + CBNZ nblocks, loop \ // The primes are repeated here to ensure that they're stored // in a contiguous array, so we can load them with LDP. -DATA primes<> +0(SB)/8, $11400714785074694791 -DATA primes<> +8(SB)/8, $14029467366897019727 -DATA primes<>+16(SB)/8, $1609587929392839161 -DATA primes<>+24(SB)/8, $9650029242287828579 -DATA primes<>+32(SB)/8, $2870177450012600261 +DATA primes<> +0(SB)/8, $11400714785074694791 +DATA primes<> +8(SB)/8, $14029467366897019727 +DATA primes<>+16(SB)/8, $1609587929392839161 +DATA primes<>+24(SB)/8, $9650029242287828579 +DATA primes<>+32(SB)/8, $2870177450012600261 GLOBL primes<>(SB), NOPTR+RODATA, $40 - // func Sum64(b []byte) uint64 TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 - LDP b_base+0(FP), (p, len) + LDP b_base+0(FP), (p, len) LDP primes<> +0(SB), (prime1, prime2) LDP primes<>+16(SB), (prime3, prime4) @@ -156,24 +154,23 @@ try1: end: EOR h >> 33, h - MUL prime2, h + MUL prime2, h EOR h >> 29, h - MUL prime3, h + MUL prime3, h EOR h >> 32, h MOVD h, ret+24(FP) RET - // func writeBlocks(d *Digest, b []byte) int // // Assumes len(b) >= 32. TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 - LDP primes<>(SB), (prime1, prime2) + LDP primes<>(SB), (prime1, prime2) // Load state. Assume v[1-4] are stored contiguously. MOVD d+0(FP), digest - LDP 0(digest), (v1, v2) + LDP 0(digest), (v1, v2) LDP 16(digest), (v3, v4) LDP b_base+8(FP), (p, len) @@ -181,7 +178,7 @@ TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 blocksLoop() // Store updated state. - STP (v1, v2), 0(digest) + STP (v1, v2), 0(digest) STP (v3, v4), 16(digest) BIC $31, len