From 66dadfca81b0b5284b49cdba11b214e437257dcf Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 12 May 2022 12:05:08 +0200 Subject: [PATCH] zstd: Copy literal in 16 byte blocks when possible Also reduces literal overalloc when full allocs are allowed. ``` benchmark old ns/op new ns/op delta BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 14572 13898 -4.63% BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 3946 3682 -6.69% BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 45150 43296 -4.11% BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 33525 36679 +9.41% BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 11952 10496 -12.18% BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 14081 13339 -5.27% BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 12111 11745 -3.02% BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1073 1037 -3.36% BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 1759 1841 +4.66% BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 43722 39755 -9.07% BenchmarkDecoder_DecodeAllParallel/html.zst-32 4144 3756 -9.36% BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 1240 1240 +0.00% BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32 250426 240012 -4.16% BenchmarkDecoder_DecodeAll/geo.protodata.zst-32 71861 65548 -8.79% BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32 829878 736934 -11.20% BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32 609402 683505 +12.16% BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32 231636 189146 -18.34% BenchmarkDecoder_DecodeAll/alice29.txt.zst-32 245022 226451 -7.58% BenchmarkDecoder_DecodeAll/html_x_4.zst-32 229709 216421 -5.78% BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32 18400 17850 -2.99% BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32 9682 9801 +1.23% BenchmarkDecoder_DecodeAll/urls.10K.zst-32 924472 796913 -13.80% BenchmarkDecoder_DecodeAll/html.zst-32 77728 66831 -14.02% BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32 7985 7432 -6.93% Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 130498 106559 -18.34% Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 136475 121699 -10.83% Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 43119 33598 -22.08% Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 15723 14472 -7.96% Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 25968 19734 -24.01% Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 88906 79506 -10.57% Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 7385 7269 -1.57% Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 83133 64295 -22.66% Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 2899 2881 -0.62% Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 3951 3961 +0.25% Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 7063 6809 -3.60% Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 14045 14050 +0.04% Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 19679 18611 -5.43% Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 48841 45545 -6.75% Benchmark_seqdec_decodeSync/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 276464 273620 -1.03% Benchmark_seqdec_decodeSync/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 270905 269049 -0.69% Benchmark_seqdec_decodeSync/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 146061 145878 -0.13% Benchmark_seqdec_decodeSync/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 30686 27367 -10.82% Benchmark_seqdec_decodeSync/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 88493 87167 -1.50% Benchmark_seqdec_decodeSync/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 195326 195764 +0.22% Benchmark_seqdec_decodeSync/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 14081 13925 -1.11% Benchmark_seqdec_decodeSync/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 297178 298192 +0.34% Benchmark_seqdec_decodeSync/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 2935 2921 -0.48% Benchmark_seqdec_decodeSync/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 4856 4467 -8.01% Benchmark_seqdec_decodeSync/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 14059 14050 -0.06% Benchmark_seqdec_decodeSync/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 35636 33427 -6.20% Benchmark_seqdec_decodeSync/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 88618 85660 -3.34% Benchmark_seqdec_decodeSync/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 162282 160568 -1.06% ``` `lcet10.txt` doesn't like it, otherwise mostly positive. Streams before/after: ``` BenchmarkDecoderEnwik9-32 1 1288277200 ns/op 776.23 MB/s 59552 B/op 44 allocs/op BenchmarkDecoderEnwik9/multithreaded-writer-32 1 1191034000 ns/op 839.61 MB/s 13993224 B/op 113 allocs/op BenchmarkDecoderSilesia-32 5 209913160 ns/op 1009.69 MB/s 46715 B/op 38 allocs/op BenchmarkDecoderSilesia/multithreaded-writer-32 5 201394480 ns/op 1052.40 MB/s 5129462 B/op 77 allocs/op ``` --- zstd/_generate/gen.go | 28 +++- zstd/blockdec.go | 30 ++-- zstd/decoder_test.go | 71 +++++++-- zstd/seqdec_amd64.go | 16 +- zstd/seqdec_amd64.s | 346 ++++++++++++++++++++++++++++++++---------- zstd/seqdec_test.go | 2 +- 6 files changed, 381 insertions(+), 112 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 40f573711e..2089402cea 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -66,6 +66,8 @@ func main() { safeMem: false, } exec.generateProcedure("sequenceDecs_executeSimple_amd64") + exec.safeMem = true + exec.generateProcedure("sequenceDecs_executeSimple_safe_amd64") decodeSync := decodeSync{} decodeSync.setBMI2(false) @@ -1032,7 +1034,11 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle TESTQ(ll, ll) JZ(LabelRef("check_offset")) // TODO: Investigate if it is possible to consistently overallocate literals. - e.copyMemoryPrecise("1", c.literals, c.outBase, ll) + if e.safeMem { + e.copyMemoryPrecise("1", c.literals, c.outBase, ll) + } else { + e.copyMemoryND("1", c.literals, c.outBase, ll) + } ADDQ(ll, c.literals) ADDQ(ll, c.outBase) ADDQ(ll, c.outPosition) @@ -1188,6 +1194,26 @@ func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual) JHI(LabelRef(label)) } +// copyMemoryND will copy memory in blocks of 16 bytes, +// overwriting up to 15 extra bytes. +// All parameters are preserved. +func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtual) { + label := "copy_" + suffix + + ofs := GP64() + s := Mem{Base: src, Index: ofs, Scale: 1} + d := Mem{Base: dst, Index: ofs, Scale: 1} + + XORQ(ofs, ofs) + Label(label) + t := XMM() + MOVUPS(s, t) + MOVUPS(t, d) + ADDQ(U8(16), ofs) + CMPQ(ofs, length) + JB(LabelRef(label)) +} + // copyMemoryPrecise will copy memory in blocks of 16 bytes, // without overwriting nor overreading. func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) { diff --git a/zstd/blockdec.go b/zstd/blockdec.go index b2bca33018..1e79976561 100644 --- a/zstd/blockdec.go +++ b/zstd/blockdec.go @@ -49,11 +49,8 @@ const ( // Maximum possible block size (all Raw+Uncompressed). maxBlockSize = (1 << 21) - 1 - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header - maxCompressedLiteralSize = 1 << 18 - maxRLELiteralSize = 1 << 20 - maxMatchLen = 131074 - maxSequences = 0x7f00 + 0xffff + maxMatchLen = 131074 + maxSequences = 0x7f00 + 0xffff // We support slightly less than the reference decoder to be able to // use ints on 32 bit archs. @@ -368,14 +365,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err } if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, litRegenSize) + b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc) } else { - if litRegenSize > maxCompressedLiteralSize { - // Exceptional - b.literalBuf = make([]byte, litRegenSize) - } else { - b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize) - } + b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc) } } literals = b.literalBuf[:litRegenSize] @@ -405,14 +397,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err // Ensure we have space to store it. if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) } else { - b.literalBuf = make([]byte, 0, maxCompressedLiteralSize) + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) } } var err error // Use our out buffer. - huff.MaxDecodedSize = maxCompressedBlockSize + huff.MaxDecodedSize = litRegenSize if fourStreams { literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) } else { @@ -437,9 +429,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err // Ensure we have space to store it. if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) } else { - b.literalBuf = make([]byte, 0, maxCompressedBlockSize) + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) } } huff := hist.huffTree @@ -456,7 +448,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err return in, err } hist.huffTree = huff - huff.MaxDecodedSize = maxCompressedBlockSize + huff.MaxDecodedSize = litRegenSize // Use our out buffer. if fourStreams { literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) @@ -471,6 +463,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err if len(literals) != litRegenSize { return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) } + // Re-cap to get extra size. + literals = b.literalBuf[:len(literals)] if debugDecoder { printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize) } diff --git a/zstd/decoder_test.go b/zstd/decoder_test.go index 614c5903c0..7a7a79c082 100644 --- a/zstd/decoder_test.go +++ b/zstd/decoder_test.go @@ -1402,12 +1402,7 @@ func benchmarkDecoderWithFile(path string, b *testing.B) { if err != nil { b.Fatal(err) } - dec, err := NewReader(nil, WithDecoderLowmem(false)) - if err != nil { - b.Fatal(err) - } - defer dec.Close() - err = dec.Reset(bytes.NewBuffer(data)) + dec, err := NewReader(bytes.NewBuffer(data), WithDecoderLowmem(false), WithDecoderConcurrency(1)) if err != nil { b.Fatal(err) } @@ -1416,19 +1411,69 @@ func benchmarkDecoderWithFile(path string, b *testing.B) { b.Fatal(err) } - b.SetBytes(n) - b.ReportAllocs() - b.ResetTimer() - for i := 0; i < b.N; i++ { - err = dec.Reset(bytes.NewBuffer(data)) + b.Run("multithreaded-writer", func(b *testing.B) { + dec, err := NewReader(nil) if err != nil { b.Fatal(err) } - _, err := io.CopyN(ioutil.Discard, dec, n) + + b.SetBytes(n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err = dec.Reset(bytes.NewBuffer(data)) + if err != nil { + b.Fatal(err) + } + _, err := io.CopyN(ioutil.Discard, dec, n) + if err != nil { + b.Fatal(err) + } + } + }) + + b.Run("singlethreaded-writer", func(b *testing.B) { + dec, err := NewReader(nil, WithDecoderConcurrency(1)) if err != nil { b.Fatal(err) } - } + + b.SetBytes(n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err = dec.Reset(bytes.NewBuffer(data)) + if err != nil { + b.Fatal(err) + } + _, err := io.CopyN(ioutil.Discard, dec, n) + if err != nil { + b.Fatal(err) + } + } + }) + + b.Run("singlethreaded-writerto", func(b *testing.B) { + dec, err := NewReader(nil, WithDecoderConcurrency(1)) + if err != nil { + b.Fatal(err) + } + + b.SetBytes(n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err = dec.Reset(bytes.NewBuffer(data)) + if err != nil { + b.Fatal(err) + } + // io.Copy will use io.WriterTo + _, err := io.Copy(ioutil.Discard, dec) + if err != nil { + b.Fatal(err) + } + } + }) } func BenchmarkDecoderSilesia(b *testing.B) { diff --git a/zstd/seqdec_amd64.go b/zstd/seqdec_amd64.go index 4676b09cc1..847b322ae3 100644 --- a/zstd/seqdec_amd64.go +++ b/zstd/seqdec_amd64.go @@ -62,6 +62,10 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { useSafe = true } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + br := s.br maxBlockSize := maxCompressedBlockSize @@ -301,6 +305,10 @@ type executeAsmContext struct { //go:noescape func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool +// Same as above, but with safe memcopies +//go:noescape +func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool + // executeSimple handles cases when dictionary is not used. func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { // Ensure we have enough output size... @@ -327,8 +335,12 @@ func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { literals: s.literals, windowSize: s.windowSize, } - - ok := sequenceDecs_executeSimple_amd64(&ctx) + var ok bool + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + ok = sequenceDecs_executeSimple_safe_amd64(&ctx) + } else { + ok = sequenceDecs_executeSimple_amd64(&ctx) + } if !ok { return fmt.Errorf("match offset (%d) bigger than current history (%d)", seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist)) diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 2585b2e988..9665833df1 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -1162,6 +1162,228 @@ TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 // outBase += outPosition ADDQ DI, BX +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (SI)(R14*1), X0 + MOVUPS X0, (BX)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, R11 + JB copy_1 + ADDQ R11, SI + ADDQ R11, BX + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JGE copy_all_from_history + XORQ R11, R11 + TESTQ $0x00000001, R13 + JZ copy_4_word + MOVB (R14)(R11*1), R12 + MOVB R12, (BX)(R11*1) + ADDQ $0x01, R11 + +copy_4_word: + TESTQ $0x00000002, R13 + JZ copy_4_dword + MOVW (R14)(R11*1), R12 + MOVW R12, (BX)(R11*1) + ADDQ $0x02, R11 + +copy_4_dword: + TESTQ $0x00000004, R13 + JZ copy_4_qword + MOVL (R14)(R11*1), R12 + MOVL R12, (BX)(R11*1) + ADDQ $0x04, R11 + +copy_4_qword: + TESTQ $0x00000008, R13 + JZ copy_4_test + MOVQ (R14)(R11*1), R12 + MOVQ R12, (BX)(R11*1) + ADDQ $0x08, R11 + JMP copy_4_test + +copy_4: + MOVUPS (R14)(R11*1), X0 + MOVUPS X0, (BX)(R11*1) + ADDQ $0x10, R11 + +copy_4_test: + CMPQ R11, R13 + JB copy_4 + ADDQ R13, DI + ADDQ R13, BX + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + XORQ R15, R15 + TESTQ $0x00000001, R11 + JZ copy_5_word + MOVB (R14)(R15*1), BP + MOVB BP, (BX)(R15*1) + ADDQ $0x01, R15 + +copy_5_word: + TESTQ $0x00000002, R11 + JZ copy_5_dword + MOVW (R14)(R15*1), BP + MOVW BP, (BX)(R15*1) + ADDQ $0x02, R15 + +copy_5_dword: + TESTQ $0x00000004, R11 + JZ copy_5_qword + MOVL (R14)(R15*1), BP + MOVL BP, (BX)(R15*1) + ADDQ $0x04, R15 + +copy_5_qword: + TESTQ $0x00000008, R11 + JZ copy_5_test + MOVQ (R14)(R15*1), BP + MOVQ BP, (BX)(R15*1) + ADDQ $0x08, R15 + JMP copy_5_test + +copy_5: + MOVUPS (R14)(R15*1), X0 + MOVUPS X0, (BX)(R15*1) + ADDQ $0x10, R15 + +copy_5_test: + CMPQ R15, R11 + JB copy_5 + ADDQ R11, BX + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + TESTQ R13, R13 + JZ handle_loop + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, DI + MOVQ BX, R12 + ADDQ R13, BX + +copy_2: + MOVUPS (R11), X0 + MOVUPS X0, (R12) + ADDQ $0x10, R11 + ADDQ $0x10, R12 + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, DI + +copy_slow_3: + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + main_loop: MOVQ (AX), R11 MOVQ 16(AX), R12 @@ -1326,18 +1548,46 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI - MOVQ BX, R12 - ADDQ R13, BX + ADDQ R13, DI + XORQ R12, R12 + TESTQ $0x00000001, R13 + JZ copy_2_word + MOVB (R11)(R12*1), R14 + MOVB R14, (BX)(R12*1) + ADDQ $0x01, R12 + +copy_2_word: + TESTQ $0x00000002, R13 + JZ copy_2_dword + MOVW (R11)(R12*1), R14 + MOVW R14, (BX)(R12*1) + ADDQ $0x02, R12 + +copy_2_dword: + TESTQ $0x00000004, R13 + JZ copy_2_qword + MOVL (R11)(R12*1), R14 + MOVL R14, (BX)(R12*1) + ADDQ $0x04, R12 + +copy_2_qword: + TESTQ $0x00000008, R13 + JZ copy_2_test + MOVQ (R11)(R12*1), R14 + MOVQ R14, (BX)(R12*1) + ADDQ $0x08, R12 + JMP copy_2_test copy_2: - MOVUPS (R11), X0 - MOVUPS X0, (R12) - ADDQ $0x10, R11 + MOVUPS (R11)(R12*1), X0 + MOVUPS X0, (BX)(R12*1) ADDQ $0x10, R12 - SUBQ $0x10, R13 - JHI copy_2 - JMP handle_loop + +copy_2_test: + CMPQ R12, R13 + JB copy_2 + ADDQ R13, BX + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1673,45 +1923,16 @@ sequenceDecs_decodeSync_amd64_match_len_ofs_ok: TESTQ AX, AX JZ check_offset XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_test - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test copy_1: MOVUPS (R11)(R14*1), X0 MOVUPS X0, (R10)(R14*1) ADDQ $0x10, R14 - -copy_1_test: - CMPQ R14, AX - JB copy_1 - ADDQ AX, R11 - ADDQ AX, R10 - ADDQ AX, R12 + CMPQ R14, AX + JB copy_1 + ADDQ AX, R11 + ADDQ AX, R10 + ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: @@ -2180,45 +2401,16 @@ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: TESTQ CX, CX JZ check_offset XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_test - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - JMP copy_1_test copy_1: MOVUPS (R10)(R14*1), X0 MOVUPS X0, (R9)(R14*1) ADDQ $0x10, R14 - -copy_1_test: - CMPQ R14, CX - JB copy_1 - ADDQ CX, R10 - ADDQ CX, R9 - ADDQ CX, R11 + CMPQ R14, CX + JB copy_1 + ADDQ CX, R10 + ADDQ CX, R9 + ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: diff --git a/zstd/seqdec_test.go b/zstd/seqdec_test.go index fc9470f4eb..82e41ff53a 100644 --- a/zstd/seqdec_test.go +++ b/zstd/seqdec_test.go @@ -150,7 +150,7 @@ func readDecoders(tb testing.TB, buf *bytes.Buffer, ref testSequence) sequenceDe matchLengths: sequenceDec{fse: &fseDecoder{}}, prevOffset: ref.prevOffsets, dict: nil, - literals: make([]byte, ref.lits), + literals: make([]byte, ref.lits, ref.lits+compressedBlockOverAlloc), out: nil, nSeqs: ref.n, br: nil,