From 0c17641ec2f4d8f3d46df535c3ded707f9f85168 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Wed, 16 Feb 2022 21:35:17 +0100
Subject: [PATCH 1/2] internal/lz4block: arm64 decoder improvements

Use fast loop after dict copy. Checking for its possibility costs as many
instructions as jumping over it.

Move SUBS close to conditional branches for CPUs that fuse these
instructions. Shave one instruction off the remainder handling code after
this loop.

A load from register base+register offset has the same latency and
throughput as a load from register+constant offset, at least on
Cortex-A72.
---
 decode_arm64.s | 11 +++++------
 decode_asm.go  |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/decode_arm64.s b/decode_arm64.s
index aa73a1c2..8a784c49 100644
--- a/decode_arm64.s
+++ b/decode_arm64.s
@@ -163,23 +163,22 @@ copyMatchTry8:
 	AND	$7, len, lenRem
 	SUB	$8, len
 copyMatchLoop8:
-	SUBS	$8, len
 	MOVD.P	8(match), tmp1
 	MOVD.P	tmp1, 8(dst)
+	SUBS	$8, len
 	BPL	copyMatchLoop8
 
-	ADD	lenRem, match
+	MOVD	(match)(len), tmp2	// match+len == match+lenRem-8.
 	ADD	lenRem, dst
-	MOVD	-8(match), tmp2
 	MOVD	tmp2, -8(dst)
 	B	copyMatchDone
 
 copyMatchLoop1:
-	// Finish with a byte-at-a-time copy.
-	SUB	$1, len
+	// Byte-at-a-time copy for small offsets.
 	MOVBU.P	1(match), tmp2
 	MOVB.P	tmp2, 1(dst)
-	CBNZ	len, copyMatchLoop1
+	SUBS	$1, len
+	BNE	copyMatchLoop1
 
 copyMatchDone:
 	CMP	src, srcend
diff --git a/decode_asm.go b/decode_asm.go
index eb05ace0..4f0bb37b 100644
--- a/decode_asm.go
+++ b/decode_asm.go
@@ -1,3 +1,4 @@
+//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm
 // +build amd64 arm arm64
 // +build !appengine
 // +build gc

From 72f1f00ac24c239a6774e77167f4776e8b283832 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Wed, 16 Feb 2022 22:21:02 +0100
Subject: [PATCH 2/2] internal/lz4block: Detect match with no space for offset

This corner case wasn't detected by any of the decoders.
---
 decode_arm.s   | 8 +++++---
 decode_arm64.s | 8 ++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/decode_arm.s b/decode_arm.s
index a5a1ce07..5b5e0138 100644
--- a/decode_arm.s
+++ b/decode_arm.s
@@ -109,13 +109,13 @@ copyLiteralFinish:
 	MOVB.NE	tmp2, -1(dst)
 
 copyLiteralDone:
-	CMP	src, srcend
-	BEQ	end
-
 	// Initial part of match length.
 	// This frees up the token register for reuse as offset.
 	AND	$15, token, len
 
+	CMP	src, srcend
+	BEQ	end
+
 	// Read offset.
 	ADD.S	$2, src
 	BCS	shortSrc
@@ -188,6 +188,8 @@ copyMatchDone:
 	BNE	loop
 
 end:
+	CMP	$0, len
+	BNE	corrupt
 	SUB	dstorig, dst, tmp1
 	MOVW	tmp1, ret+24(FP)
 	RET
diff --git a/decode_arm64.s b/decode_arm64.s
index 8a784c49..abc306bf 100644
--- a/decode_arm64.s
+++ b/decode_arm64.s
@@ -112,6 +112,9 @@ copyLiteralShortEnd:
 	MOVB.P	tmp4, 1(dst)
 
 copyLiteralDone:
+	// Initial part of match length.
+	AND	$15, token, len
+
 	CMP	src, srcend
 	BEQ	end
 
@@ -123,8 +126,7 @@ copyLiteralDone:
 	MOVHU	-2(src), offset
 	CBZ	offset, corrupt
 
-	// Read match length.
-	AND	$15, token, len
+	// Read rest of match length.
 	CMP	$15, len
 	BNE	readMatchlenDone
 
@@ -170,6 +172,7 @@ copyMatchLoop8:
 
 	MOVD	(match)(len), tmp2	// match+len == match+lenRem-8.
 	ADD	lenRem, dst
+	MOVD	$0, len
 	MOVD	tmp2, -8(dst)
 	B	copyMatchDone
 
@@ -185,6 +188,7 @@ copyMatchDone:
 	BNE	loop
 
 end:
+	CBNZ	len, corrupt
 	SUB	dstorig, dst, tmp1
 	MOVD	tmp1, ret+48(FP)
 	RET
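
Note on PATCH 1/2: the scheduling claims (SUBS next to its conditional branch,
register base+register offset addressing) are easiest to check with a decoder
benchmark. Below is a minimal sketch, assuming the public Compressor /
UncompressBlock API of github.com/pierrec/lz4/v4; the file name and the sample
input are placeholders, not part of this series.

// decode_bench_test.go -- not part of this series; a sketch for comparing
// block-decoding throughput before and after the arm64 scheduling changes.
package lz4_test

import (
	"bytes"
	"testing"

	"github.com/pierrec/lz4/v4"
)

func BenchmarkUncompressBlock(b *testing.B) {
	// Compressible sample input; any representative corpus works as well.
	raw := bytes.Repeat([]byte("hello, lz4 decoder benchmark "), 1<<12)

	comp := make([]byte, lz4.CompressBlockBound(len(raw)))
	var c lz4.Compressor
	n, err := c.CompressBlock(raw, comp)
	if err != nil || n == 0 {
		b.Fatal("failed to compress sample input")
	}
	comp = comp[:n]

	dst := make([]byte, len(raw))
	b.SetBytes(int64(len(raw)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := lz4.UncompressBlock(comp, dst); err != nil {
			b.Fatal(err)
		}
	}
}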
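
Note on PATCH 2/2: a minimal regression-test sketch for the rejected input
shape -- a token whose match-length nibble is nonzero, followed by its
literals, with the block ending before the 2-byte offset. It assumes the
public UncompressBlock API of v4 and only asserts that an error is returned;
decoders not touched by this series may need the same check before the test
passes everywhere.

// corrupt_offset_test.go -- not part of this series; a sketch of the corner
// case the second patch detects.
package lz4_test

import (
	"testing"

	"github.com/pierrec/lz4/v4"
)

func TestMatchWithoutOffset(t *testing.T) {
	// Token 0x11: literal length 1, match-length nibble 1. The single
	// literal 'A' follows, then the input ends, leaving no room for the
	// 2-byte little-endian offset the nonzero match nibble requires.
	src := []byte{0x11, 'A'}
	dst := make([]byte, 16)

	if _, err := lz4.UncompressBlock(src, dst); err == nil {
		t.Fatal("expected an error for a match with no space for its offset")
	}
}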