Skip to content

Commit

Permalink
zstd: Copy literal in 16 byte blocks when possible (#592)
Browse files Browse the repository at this point in the history
Also reduces literal overalloc when full allocs are allowed.

```
benchmark                                                                                          old ns/op     new ns/op     delta
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32                                                14572         13898         -4.63%
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32                                            3946          3682          -6.69%
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32                                             45150         43296         -4.11%
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32                                               33525         36679         +9.41%
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32                                             11952         10496         -12.18%
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32                                              14081         13339         -5.27%
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32                                                 12111         11745         -3.02%
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32                                           1073          1037          -3.36%
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32                                           1759          1841          +4.66%
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32                                                 43722         39755         -9.07%
BenchmarkDecoder_DecodeAllParallel/html.zst-32                                                     4144          3756          -9.36%
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32                                            1240          1240          +0.00%
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32                                                        250426        240012        -4.16%
BenchmarkDecoder_DecodeAll/geo.protodata.zst-32                                                    71861         65548         -8.79%
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32                                                     829878        736934        -11.20%
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32                                                       609402        683505        +12.16%
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32                                                     231636        189146        -18.34%
BenchmarkDecoder_DecodeAll/alice29.txt.zst-32                                                      245022        226451        -7.58%
BenchmarkDecoder_DecodeAll/html_x_4.zst-32                                                         229709        216421        -5.78%
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32                                                   18400         17850         -2.99%
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32                                                   9682          9801          +1.23%
BenchmarkDecoder_DecodeAll/urls.10K.zst-32                                                         924472        796913        -13.80%
BenchmarkDecoder_DecodeAll/html.zst-32                                                             77728         66831         -14.02%
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32                                                    7985          7432          -6.93%
Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32           130498        106559        -18.34%
Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32          136475        121699        -10.83%
Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32                  43119         33598         -22.08%
Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32           15723         14472         -7.96%
Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32           25968         19734         -24.01%
Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32            88906         79506         -10.57%
Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32                     7385          7269          -1.57%
Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32        83133         64295         -22.66%
Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32                      2899          2881          -0.62%
Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32                       3951          3961          +0.25%
Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32                   7063          6809          -3.60%
Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32                      14045         14050         +0.04%
Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32                 19679         18611         -5.43%
Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32                       48841         45545         -6.75%
Benchmark_seqdec_decodeSync/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32        276464        273620        -1.03%
Benchmark_seqdec_decodeSync/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32       270905        269049        -0.69%
Benchmark_seqdec_decodeSync/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32               146061        145878        -0.13%
Benchmark_seqdec_decodeSync/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32        30686         27367         -10.82%
Benchmark_seqdec_decodeSync/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32        88493         87167         -1.50%
Benchmark_seqdec_decodeSync/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32         195326        195764        +0.22%
Benchmark_seqdec_decodeSync/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32                  14081         13925         -1.11%
Benchmark_seqdec_decodeSync/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32     297178        298192        +0.34%
Benchmark_seqdec_decodeSync/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32                   2935          2921          -0.48%
Benchmark_seqdec_decodeSync/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32                    4856          4467          -8.01%
Benchmark_seqdec_decodeSync/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32                14059         14050         -0.06%
Benchmark_seqdec_decodeSync/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32                   35636         33427         -6.20%
Benchmark_seqdec_decodeSync/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32              88618         85660         -3.34%
Benchmark_seqdec_decodeSync/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32                    162282        160568        -1.06%
```

`lcet10.txt` regresses (roughly +9% to +12%); otherwise the results are mostly positive.

Streams before/after:
```
BenchmarkDecoderEnwik9-32    	       1	1288277200 ns/op	 776.23 MB/s	   59552 B/op	      44 allocs/op
BenchmarkDecoderEnwik9/multithreaded-writer-32         	       1	1191034000 ns/op	 839.61 MB/s	13993224 B/op	     113 allocs/op

BenchmarkDecoderSilesia-32    	       5	 209913160 ns/op	1009.69 MB/s	   46715 B/op	      38 allocs/op
BenchmarkDecoderSilesia/multithreaded-writer-32         	       5	 201394480 ns/op	1052.40 MB/s	 5129462 B/op	      77 allocs/op
```
  • Loading branch information
klauspost committed May 12, 2022
1 parent fbaccdc commit 6ebbb85
Show file tree
Hide file tree
Showing 6 changed files with 381 additions and 112 deletions.
28 changes: 27 additions & 1 deletion zstd/_generate/gen.go
Expand Up @@ -66,6 +66,8 @@ func main() {
safeMem: false,
}
exec.generateProcedure("sequenceDecs_executeSimple_amd64")
exec.safeMem = true
exec.generateProcedure("sequenceDecs_executeSimple_safe_amd64")

decodeSync := decodeSync{}
decodeSync.setBMI2(false)
Expand Down Expand Up @@ -1032,7 +1034,11 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
TESTQ(ll, ll)
JZ(LabelRef("check_offset"))
// TODO: Investigate if it is possible to consistently overallocate literals.
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
if e.safeMem {
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
} else {
e.copyMemoryND("1", c.literals, c.outBase, ll)
}
ADDQ(ll, c.literals)
ADDQ(ll, c.outBase)
ADDQ(ll, c.outPosition)
Expand Down Expand Up @@ -1188,6 +1194,26 @@ func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual)
JHI(LabelRef(label))
}

// copyMemoryND emits a loop that copies memory from src to dst in blocks of
// 16 bytes.
//
// The loop body runs at least once and repeats while the running offset is
// below length, so the number of bytes moved is length rounded up to a
// multiple of 16: up to 15 bytes beyond src+length are read and up to 15
// bytes beyond dst+length are overwritten. A length of 0 would still move one
// full 16-byte block; the visible caller skips the copy when the length is
// zero.
//
// suffix is appended to "copy_" to form the loop label (presumably it has to
// be unique within the generated procedure - confirm against other callers).
//
// All parameters are preserved: src, dst and length are only read; the loop
// writes a freshly allocated offset register and XMM register.
func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_" + suffix

// ofs is the running byte offset, shared by the source and destination operands.
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}

// Start at offset 0; the label marks the top of the copy loop.
XORQ(ofs, ofs)
Label(label)
// Unaligned 16-byte load from src+ofs, then 16-byte store to dst+ofs.
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(16), ofs)
// Loop again while ofs < length (unsigned comparison).
CMPQ(ofs, length)
JB(LabelRef(label))
}

// copyMemoryPrecise will copy memory in blocks of 16 bytes,
// without overwriting or overreading.
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
Expand Down
30 changes: 12 additions & 18 deletions zstd/blockdec.go
Expand Up @@ -49,11 +49,8 @@ const (
// Maximum possible block size (all Raw+Uncompressed).
maxBlockSize = (1 << 21) - 1

// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
maxCompressedLiteralSize = 1 << 18
maxRLELiteralSize = 1 << 20
maxMatchLen = 131074
maxSequences = 0x7f00 + 0xffff
maxMatchLen = 131074
maxSequences = 0x7f00 + 0xffff

// We support slightly less than the reference decoder to be able to
// use ints on 32 bit archs.
Expand Down Expand Up @@ -368,14 +365,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
b.literalBuf = make([]byte, litRegenSize)
b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
} else {
if litRegenSize > maxCompressedLiteralSize {
// Exceptional
b.literalBuf = make([]byte, litRegenSize)
} else {
b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
}
b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
literals = b.literalBuf[:litRegenSize]
Expand Down Expand Up @@ -405,14 +397,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
b.literalBuf = make([]byte, 0, litRegenSize)
b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
var err error
// Use our out buffer.
huff.MaxDecodedSize = maxCompressedBlockSize
huff.MaxDecodedSize = litRegenSize
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
} else {
Expand All @@ -437,9 +429,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
b.literalBuf = make([]byte, 0, litRegenSize)
b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
huff := hist.huffTree
Expand All @@ -456,7 +448,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
return in, err
}
hist.huffTree = huff
huff.MaxDecodedSize = maxCompressedBlockSize
huff.MaxDecodedSize = litRegenSize
// Use our out buffer.
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
Expand All @@ -471,6 +463,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
if len(literals) != litRegenSize {
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
}
// Re-cap to get extra size.
literals = b.literalBuf[:len(literals)]
if debugDecoder {
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
}
Expand Down
71 changes: 58 additions & 13 deletions zstd/decoder_test.go
Expand Up @@ -1402,12 +1402,7 @@ func benchmarkDecoderWithFile(path string, b *testing.B) {
if err != nil {
b.Fatal(err)
}
dec, err := NewReader(nil, WithDecoderLowmem(false))
if err != nil {
b.Fatal(err)
}
defer dec.Close()
err = dec.Reset(bytes.NewBuffer(data))
dec, err := NewReader(bytes.NewBuffer(data), WithDecoderLowmem(false), WithDecoderConcurrency(1))
if err != nil {
b.Fatal(err)
}
Expand All @@ -1416,19 +1411,69 @@ func benchmarkDecoderWithFile(path string, b *testing.B) {
b.Fatal(err)
}

b.SetBytes(n)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
err = dec.Reset(bytes.NewBuffer(data))
b.Run("multithreaded-writer", func(b *testing.B) {
dec, err := NewReader(nil)
if err != nil {
b.Fatal(err)
}
_, err := io.CopyN(ioutil.Discard, dec, n)

b.SetBytes(n)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
err = dec.Reset(bytes.NewBuffer(data))
if err != nil {
b.Fatal(err)
}
_, err := io.CopyN(ioutil.Discard, dec, n)
if err != nil {
b.Fatal(err)
}
}
})

b.Run("singlethreaded-writer", func(b *testing.B) {
dec, err := NewReader(nil, WithDecoderConcurrency(1))
if err != nil {
b.Fatal(err)
}
}

b.SetBytes(n)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
err = dec.Reset(bytes.NewBuffer(data))
if err != nil {
b.Fatal(err)
}
_, err := io.CopyN(ioutil.Discard, dec, n)
if err != nil {
b.Fatal(err)
}
}
})

b.Run("singlethreaded-writerto", func(b *testing.B) {
dec, err := NewReader(nil, WithDecoderConcurrency(1))
if err != nil {
b.Fatal(err)
}

b.SetBytes(n)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
err = dec.Reset(bytes.NewBuffer(data))
if err != nil {
b.Fatal(err)
}
// io.Copy will use io.WriterTo
_, err := io.Copy(ioutil.Discard, dec)
if err != nil {
b.Fatal(err)
}
}
})
}

func BenchmarkDecoderSilesia(b *testing.B) {
Expand Down
16 changes: 14 additions & 2 deletions zstd/seqdec_amd64.go
Expand Up @@ -62,6 +62,10 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}

br := s.br

maxBlockSize := maxCompressedBlockSize
Expand Down Expand Up @@ -301,6 +305,10 @@ type executeAsmContext struct {
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool

// sequenceDecs_executeSimple_safe_amd64 has the same signature as
// sequenceDecs_executeSimple_amd64, but is the variant with safe memcopies.
// executeSimple selects it when s.literals has less than
// compressedBlockOverAlloc bytes of spare capacity.
//
//go:noescape
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool

// executeSimple handles cases when dictionary is not used.
func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
// Ensure we have enough output size...
Expand All @@ -327,8 +335,12 @@ func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
literals: s.literals,
windowSize: s.windowSize,
}

ok := sequenceDecs_executeSimple_amd64(&ctx)
var ok bool
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
} else {
ok = sequenceDecs_executeSimple_amd64(&ctx)
}
if !ok {
return fmt.Errorf("match offset (%d) bigger than current history (%d)",
seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
Expand Down

0 comments on commit 6ebbb85

Please sign in to comment.