From 7fa9b6162d74b9d103f0907aeea91dffbd359a4f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 20 Feb 2022 18:29:15 +0100 Subject: [PATCH] Use buffers for 1X decodes Also make buffers safe from concurrent decoder use. ``` benchmark old ns/op new ns/op delta BenchmarkDecompress1XTable/digits-32 256367 262489 +2.39% BenchmarkDecompress1XTable/gettysburg-32 5189 5224 +0.67% BenchmarkDecompress1XTable/twain-32 825950 825534 -0.05% BenchmarkDecompress1XTable/low-ent.10k-32 88050 88429 +0.43% BenchmarkDecompress1XTable/superlow-ent-10k-32 23420 23456 +0.15% BenchmarkDecompress1XTable/crash2-32 687 676 -1.59% BenchmarkDecompress1XTable/endzerobits-32 80.9 78.4 -3.07% BenchmarkDecompress1XTable/endnonzero-32 494 511 +3.40% BenchmarkDecompress1XTable/case1-32 1948 1924 -1.23% BenchmarkDecompress1XTable/case2-32 1916 1886 -1.57% BenchmarkDecompress1XTable/case3-32 1945 1921 -1.23% BenchmarkDecompress1XTable/pngdata.001-32 126909 125848 -0.84% BenchmarkDecompress1XTable/normcount2-32 1285 1295 +0.78% BenchmarkDecompress1XNoTable/digits-32 255503 261690 +2.42% BenchmarkDecompress1XNoTable/gettysburg-32 4029 4014 -0.37% BenchmarkDecompress1XNoTable/twain-32 823710 821251 -0.30% BenchmarkDecompress1XNoTable/low-ent.10k-32 87024 87182 +0.18% BenchmarkDecompress1XNoTable/superlow-ent-10k-32 22812 23353 +2.37% BenchmarkDecompress1XNoTable/crash2-32 82.6 67.2 -18.62% BenchmarkDecompress1XNoTable/endzerobits-32 54.3 43.8 -19.47% BenchmarkDecompress1XNoTable/endnonzero-32 59.6 46.6 -21.86% BenchmarkDecompress1XNoTable/case1-32 179 159 -11.23% BenchmarkDecompress1XNoTable/case2-32 144 128 -10.71% BenchmarkDecompress1XNoTable/case3-32 165 145 -12.29% BenchmarkDecompress1XNoTable/pngdata.001-32 123734 123297 -0.35% BenchmarkDecompress1XNoTable/normcount2-32 248 241 -3.14% BenchmarkDecompress4XNoTable/digits-32 152812 151641 -0.77% BenchmarkDecompress4XNoTable/gettysburg-32 2585 2712 +4.91% BenchmarkDecompress4XNoTable/twain-32 529935 550282 +3.84% BenchmarkDecompress4XNoTable/low-ent.10k-32 53602 52664 -1.75% BenchmarkDecompress4XNoTable/superlow-ent-10k-32 14375 14054 -2.23% BenchmarkDecompress4XNoTable/case1-32 254 226 -10.84% BenchmarkDecompress4XNoTable/case2-32 207 182 -12.22% BenchmarkDecompress4XNoTable/case3-32 215 186 -13.14% BenchmarkDecompress4XNoTable/pngdata.001-32 73031 76067 +4.16% BenchmarkDecompress4XNoTable/normcount2-32 309 280 -9.38% BenchmarkDecompress4XNoTableTableLog8/digits-32 152307 150121 -1.44% BenchmarkDecompress4XTable/digits-32 152793 150602 -1.43% BenchmarkDecompress4XTable/gettysburg-32 3861 3924 +1.63% BenchmarkDecompress4XTable/twain-32 536438 550964 +2.71% BenchmarkDecompress4XTable/low-ent.10k-32 54465 53176 -2.37% BenchmarkDecompress4XTable/superlow-ent-10k-32 14904 14677 -1.52% BenchmarkDecompress4XTable/case1-32 2000 2007 +0.35% BenchmarkDecompress4XTable/case2-32 1982 1968 -0.71% BenchmarkDecompress4XTable/case3-32 1992 2014 +1.10% BenchmarkDecompress4XTable/pngdata.001-32 75929 79317 +4.46% BenchmarkDecompress4XTable/normcount2-32 1372 1344 -2.04% benchmark old MB/s new MB/s speedup BenchmarkDecompress1XTable/digits-32 390.08 380.98 0.98x BenchmarkDecompress1XTable/gettysburg-32 298.30 296.30 0.99x BenchmarkDecompress1XTable/twain-32 317.38 317.54 1.00x BenchmarkDecompress1XTable/low-ent.10k-32 454.29 452.34 1.00x BenchmarkDecompress1XTable/superlow-ent-10k-32 448.34 447.66 1.00x BenchmarkDecompress1XTable/crash2-32 21.84 22.19 1.02x BenchmarkDecompress1XTable/endzerobits-32 61.81 63.77 1.03x BenchmarkDecompress1XTable/endnonzero-32 14.17 13.71 0.97x BenchmarkDecompress1XTable/case1-32 28.23 28.59 1.01x BenchmarkDecompress1XTable/case2-32 23.49 23.86 1.02x BenchmarkDecompress1XTable/case3-32 24.67 24.99 1.01x BenchmarkDecompress1XTable/pngdata.001-32 403.44 406.84 1.01x BenchmarkDecompress1XTable/normcount2-32 67.71 67.18 0.99x BenchmarkDecompress1XNoTable/digits-32 391.40 382.14 0.98x BenchmarkDecompress1XNoTable/gettysburg-32 384.26 385.65 1.00x BenchmarkDecompress1XNoTable/twain-32 318.25 319.20 1.00x BenchmarkDecompress1XNoTable/low-ent.10k-32 459.65 458.81 1.00x BenchmarkDecompress1XNoTable/superlow-ent-10k-32 460.29 449.62 0.98x BenchmarkDecompress1XNoTable/crash2-32 181.62 223.18 1.23x BenchmarkDecompress1XNoTable/endzerobits-32 92.04 114.29 1.24x BenchmarkDecompress1XNoTable/endnonzero-32 117.43 150.28 1.28x BenchmarkDecompress1XNoTable/case1-32 307.35 346.21 1.13x BenchmarkDecompress1XNoTable/case2-32 313.02 350.41 1.12x BenchmarkDecompress1XNoTable/case3-32 290.54 331.28 1.14x BenchmarkDecompress1XNoTable/pngdata.001-32 413.79 415.26 1.00x BenchmarkDecompress1XNoTable/normcount2-32 350.06 361.51 1.03x BenchmarkDecompress4XNoTable/digits-32 654.42 659.47 1.01x BenchmarkDecompress4XNoTable/gettysburg-32 598.78 570.81 0.95x BenchmarkDecompress4XNoTable/twain-32 494.67 476.38 0.96x BenchmarkDecompress4XNoTable/low-ent.10k-32 746.25 759.53 1.02x BenchmarkDecompress4XNoTable/superlow-ent-10k-32 730.46 747.10 1.02x BenchmarkDecompress4XNoTable/case1-32 216.72 243.09 1.12x BenchmarkDecompress4XNoTable/case2-32 217.24 247.59 1.14x BenchmarkDecompress4XNoTable/case3-32 223.64 257.53 1.15x BenchmarkDecompress4XNoTable/pngdata.001-32 701.07 673.09 0.96x BenchmarkDecompress4XNoTable/normcount2-32 281.26 310.40 1.10x BenchmarkDecompress4XNoTableTableLog8/digits-32 656.59 666.15 1.01x BenchmarkDecompress4XTable/digits-32 654.50 664.02 1.01x BenchmarkDecompress4XTable/gettysburg-32 400.94 394.53 0.98x BenchmarkDecompress4XTable/twain-32 488.67 475.79 0.97x BenchmarkDecompress4XTable/low-ent.10k-32 734.42 752.22 1.02x BenchmarkDecompress4XTable/superlow-ent-10k-32 704.50 715.40 1.02x BenchmarkDecompress4XTable/case1-32 27.51 27.41 1.00x BenchmarkDecompress4XTable/case2-32 22.71 22.87 1.01x BenchmarkDecompress4XTable/case3-32 24.09 23.84 0.99x BenchmarkDecompress4XTable/pngdata.001-32 674.31 645.51 0.96x BenchmarkDecompress4XTable/normcount2-32 63.43 64.73 1.02x ``` --- huff0/decompress.go | 65 ++++++++++++++++++++++++++++++++++++++++----- huff0/huff0.go | 2 ++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/huff0/decompress.go b/huff0/decompress.go index f94bd0ad40..2668b64d37 100644 --- a/huff0/decompress.go +++ b/huff0/decompress.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "io" + "sync" "github.com/klauspost/compress/fse" ) @@ -216,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder { return &Decoder{ dt: s.dt, actualTableLog: s.actualTableLog, + bufs: &s.decPool, } } @@ -223,7 +225,15 @@ func (s *Scratch) Decoder() *Decoder { type Decoder struct { dt dTable actualTableLog uint8 - buf [4][256]byte + bufs *sync.Pool +} + +func (d *Decoder) buffer() *[4][256]byte { + buf, ok := d.bufs.Get().(*[4][256]byte) + if ok { + return buf + } + return &[4][256]byte{} } // Decompress1X will decompress a 1X encoded stream. @@ -250,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { dt := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 for br.off >= 8 { @@ -278,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -285,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -311,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -320,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { bitsLeft -= nBits dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -342,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 switch d.actualTableLog { @@ -370,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -399,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -427,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -456,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -485,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -514,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -543,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -572,6 +595,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -579,10 +603,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } } default: + d.bufs.Put(bufs) return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog) } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -602,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } if len(dst) >= maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } v := dt[br.peekByteFast()>>shift] @@ -610,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -629,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 const shift = 56 @@ -656,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -664,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -680,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -689,6 +721,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -736,7 +769,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - buf := &d.buf + buf := d.buffer() var off uint8 var decoded int @@ -801,6 +834,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } copy(out, buf[0][:]) @@ -811,6 +845,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -818,6 +853,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } copy(out, buf[0][:off]) @@ -853,6 +889,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -871,6 +908,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -916,7 +954,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - buf := &d.buf + buf := d.buffer() var off uint8 var decoded int @@ -1022,6 +1060,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } copy(out, buf[0][:]) @@ -1032,6 +1071,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -1039,6 +1079,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } copy(out, buf[0][:off]) @@ -1056,6 +1097,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { bitsLeft := int(br.off*8) + int(64-br.bitsRead) for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1076,6 +1118,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1090,9 +1133,11 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -1134,7 +1179,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - buf := &d.buf + buf := d.buffer() var off uint8 var decoded int @@ -1240,6 +1285,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } copy(out, buf[0][:]) @@ -1250,6 +1296,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { decoded += bufoff * 4 // There must at least be 3 buffers left. if len(out) < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } } @@ -1274,6 +1321,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { bitsLeft := int(br.off*8) + int(64-br.bitsRead) for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1294,6 +1342,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { } // end inline... if offset >= len(out) { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1308,9 +1357,11 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } diff --git a/huff0/huff0.go b/huff0/huff0.go index 3ee00ecb47..e8ad17ad08 100644 --- a/huff0/huff0.go +++ b/huff0/huff0.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "math/bits" + "sync" "github.com/klauspost/compress/fse" ) @@ -116,6 +117,7 @@ type Scratch struct { nodes []nodeElt tmpOut [4][]byte fse *fse.Scratch + decPool sync.Pool // *[4][256]byte buffers. huffWeight [maxSymbolValue + 1]byte }