From 2f437394175f7017ca57afd7afded5775ae4c432 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 9 May 2022 03:17:54 -0700 Subject: [PATCH] zstd: Faster decoding memcopy in asm (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a faster method for `copyMemory` and `copyOverlappedMemory`: advance the source and destination pointers directly and count the length down, instead of indexing both through a separate offset register that is compared against the length. ``` λ go test -short -bench=seqdec_execute >after.txt&&benchcmp before.txt after.txt benchmark old ns/op new ns/op delta Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 129910 130620 +0.55% Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 139487 134032 -3.91% Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 37155 38636 +3.99% Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 16318 15788 -3.25% Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 44386 43959 -0.96% Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 100065 96156 -3.91% Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 8119 7373 -9.19% Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 84669 83034 -1.93% Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 2914 2773 -4.84% Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 4318 3824 -11.44% Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 7851 7203 -8.25% Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 15161 14315 -5.58% Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 23920 20065 -16.12% Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 53268 52768 -0.94% ``` --- zstd/_generate/gen.go | 57 +++++++++---------- zstd/decoder_test.go | 6 +- zstd/seqdec_amd64.s 
| 124 +++++++++++++++++++++--------------------- 3 files changed, 89 insertions(+), 98 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index bc74530f17..40f573711e 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -874,13 +874,6 @@ type executeSimple struct { safeMem bool } -// copySize returns register size used to fast copy. -// -// See copyMemory() -func (e executeSimple) copySize() int { - return 16 -} - func (e executeSimple) generateProcedure(name string) { Package("github.com/klauspost/compress/zstd") TEXT(name, 0, "func (ctx *executeAsmContext) bool") @@ -1155,42 +1148,44 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle Comment("Copy non-overlapping match") { + ADDQ(ml, c.outPosition) if e.safeMem { e.copyMemoryPrecise("2", src, c.outBase, ml) + ADDQ(ml, c.outBase) } else { - e.copyMemory("2", src, c.outBase, ml) + dst := GP64() + MOVQ(c.outBase, dst) + ADDQ(ml, c.outBase) + e.copyMemory("2", src, dst, ml) } - ADDQ(ml, c.outBase) - ADDQ(ml, c.outPosition) + JMP(LabelRef("handle_loop")) } Comment("Copy overlapping match") Label("copy_overlapping_match") { - e.copyOverlappedMemory("3", src, c.outBase, ml) - ADDQ(ml, c.outBase) ADDQ(ml, c.outPosition) + e.copyOverlappedMemory("3", src, c.outBase, ml) } } } // copyMemory will copy memory in blocks of 16 bytes, // overwriting up to 15 extra bytes. +// src and dst are updated. length will be zero or less. func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual) { label := "copy_" + suffix - ofs := GP64() - s := Mem{Base: src, Index: ofs, Scale: 1} - d := Mem{Base: dst, Index: ofs, Scale: 1} - XORQ(ofs, ofs) Label(label) t := XMM() - MOVUPS(s, t) - MOVUPS(t, d) - ADDQ(U8(e.copySize()), ofs) - CMPQ(ofs, length) - JB(LabelRef(label)) + MOVUPS(Mem{Base: src}, t) + MOVUPS(t, Mem{Base: dst}) + ADDQ(U8(16), src) + ADDQ(U8(16), dst) + SUBQ(U8(16), length) + // jump if (CF == 0 and ZF == 0). 
+ JHI(LabelRef(label)) } // copyMemoryPrecise will copy memory in blocks of 16 bytes, @@ -1246,27 +1241,25 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV t := XMM() MOVUPS(s, t) MOVUPS(t, d) - ADDQ(U8(e.copySize()), ofs) + ADDQ(U8(16), ofs) Label("copy_" + suffix + "_test") CMPQ(ofs, length) JB(LabelRef(label)) } // copyOverlappedMemory will copy one byte at the time from src to dst. +// src and dst are updated. length will be zero. func (e executeSimple) copyOverlappedMemory(suffix string, src, dst, length reg.GPVirtual) { label := "copy_slow_" + suffix - ofs := GP64() - s := Mem{Base: src, Index: ofs, Scale: 1} - d := Mem{Base: dst, Index: ofs, Scale: 1} - t := GP64() + tmp := GP64() - XORQ(ofs, ofs) Label(label) - MOVB(s, t.As8()) - MOVB(t.As8(), d) - INCQ(ofs) - CMPQ(ofs, length) - JB(LabelRef(label)) + MOVB(Mem{Base: src}, tmp.As8()) + MOVB(tmp.As8(), Mem{Base: dst}) + INCQ(src) + INCQ(dst) + DECQ(length) + JNZ(LabelRef(label)) } type decodeSync struct { diff --git a/zstd/decoder_test.go b/zstd/decoder_test.go index cba5dd80f7..04141628bd 100644 --- a/zstd/decoder_test.go +++ b/zstd/decoder_test.go @@ -1317,7 +1317,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) { if err != nil { b.Error(err) } - _, err = dec.DecodeAll(encoded, nil) + raw, err := dec.DecodeAll(encoded, nil) if err != nil { b.Error(err) } @@ -1326,7 +1326,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) { b.ReportAllocs() b.ResetTimer() b.RunParallel(func(pb *testing.PB) { - buf := make([]byte, len(raw)) + buf := make([]byte, cap(raw)) var err error for pb.Next() { buf, err = dec.DecodeAll(encoded, buf[:0]) @@ -1373,7 +1373,7 @@ func BenchmarkDecoder_DecodeAllParallel(b *testing.B) { b.ReportAllocs() b.ResetTimer() b.RunParallel(func(pb *testing.PB) { - got := make([]byte, len(got)) + got := make([]byte, cap(got)) for pb.Next() { _, err = dec.DecodeAll(in, got[:0]) if err != nil { diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 
01cc23fa8a..2585b2e988 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -1326,30 +1326,30 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - XORQ R12, R12 + ADDQ R13, DI + MOVQ BX, R12 + ADDQ R13, BX copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) + MOVUPS (R11), X0 + MOVUPS X0, (R12) + ADDQ $0x10, R11 ADDQ $0x10, R12 - CMPQ R12, R13 - JB copy_2 - ADDQ R13, BX - ADDQ R13, DI + SUBQ $0x10, R13 + JHI copy_2 JMP handle_loop // Copy overlapping match copy_overlapping_match: - XORQ R12, R12 + ADDQ R13, DI copy_slow_3: - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - INCQ R12 - CMPQ R12, R13 - JB copy_slow_3 - ADDQ R13, BX - ADDQ R13, DI + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 handle_loop: ADDQ $0x18, AX @@ -1826,30 +1826,30 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - XORQ CX, CX + ADDQ R13, R12 + MOVQ R10, CX + ADDQ R13, R10 copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) + MOVUPS (AX), X0 + MOVUPS X0, (CX) + ADDQ $0x10, AX ADDQ $0x10, CX - CMPQ CX, R13 - JB copy_2 - ADDQ R13, R10 - ADDQ R13, R12 + SUBQ $0x10, R13 + JHI copy_2 JMP handle_loop // Copy overlapping match copy_overlapping_match: - XORQ CX, CX + ADDQ R13, R12 copy_slow_3: - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - INCQ CX - CMPQ CX, R13 - JB copy_slow_3 - ADDQ R13, R10 - ADDQ R13, R12 + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), AX @@ -2333,30 +2333,30 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - XORQ R12, R12 + ADDQ R13, R11 + MOVQ R9, R12 + ADDQ R13, R9 copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) + MOVUPS (CX), X0 + MOVUPS X0, (R12) + ADDQ $0x10, CX ADDQ $0x10, R12 - CMPQ R12, R13 - JB copy_2 - ADDQ R13, R9 - ADDQ R13, R11 + SUBQ $0x10, R13 + JHI copy_2 JMP handle_loop // Copy overlapping match copy_overlapping_match: - XORQ R12, R12 + ADDQ R13, R11 copy_slow_3: - 
MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - INCQ R12 - CMPQ R12, R13 - JB copy_slow_3 - ADDQ R13, R9 - ADDQ R13, R11 + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), CX @@ -2862,6 +2862,7 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match + ADDQ R13, R12 XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_2_word @@ -2900,21 +2901,19 @@ copy_2_test: CMPQ CX, R13 JB copy_2 ADDQ R13, R10 - ADDQ R13, R12 JMP handle_loop // Copy overlapping match copy_overlapping_match: - XORQ CX, CX + ADDQ R13, R12 copy_slow_3: - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - INCQ CX - CMPQ CX, R13 - JB copy_slow_3 - ADDQ R13, R10 - ADDQ R13, R12 + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), AX @@ -3398,6 +3397,7 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match + ADDQ R13, R11 XORQ R12, R12 TESTQ $0x00000001, R13 JZ copy_2_word @@ -3436,21 +3436,19 @@ copy_2_test: CMPQ R12, R13 JB copy_2 ADDQ R13, R9 - ADDQ R13, R11 JMP handle_loop // Copy overlapping match copy_overlapping_match: - XORQ R12, R12 + ADDQ R13, R11 copy_slow_3: - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - INCQ R12 - CMPQ R12, R13 - JB copy_slow_3 - ADDQ R13, R9 - ADDQ R13, R11 + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), CX