Skip to content

Commit

Permalink
flate: Improve decompression speed 5-10% (#483)
Browse files Browse the repository at this point in the history
* flate: Improve decompression speed 5-10%

```
benchmark                               old ns/op     new ns/op     delta
BenchmarkDecodeDigitsSpeed1e4-32        49461         44204         -10.63%
BenchmarkDecodeDigitsSpeed1e5-32        520488        509001        -2.21%
BenchmarkDecodeDigitsSpeed1e6-32        5152811       5000738       -2.95%
BenchmarkDecodeDigitsDefault1e4-32      50983         47693         -6.45%
BenchmarkDecodeDigitsDefault1e5-32      494800        488243        -1.33%
BenchmarkDecodeDigitsDefault1e6-32      4990322       4752297       -4.77%
BenchmarkDecodeDigitsCompress1e4-32     49973         43992         -11.97%
BenchmarkDecodeDigitsCompress1e5-32     515033        467616        -9.21%
BenchmarkDecodeDigitsCompress1e6-32     5128402       4659296       -9.15%
BenchmarkDecodeTwainSpeed1e4-32         51740         48324         -6.60%
BenchmarkDecodeTwainSpeed1e5-32         532690        513209        -3.66%
BenchmarkDecodeTwainSpeed1e6-32         5304535       5129081       -3.31%
BenchmarkDecodeTwainDefault1e4-32       50613         48007         -5.15%
BenchmarkDecodeTwainDefault1e5-32       488404        476945        -2.35%
BenchmarkDecodeTwainDefault1e6-32       4881062       4710812       -3.49%
BenchmarkDecodeTwainCompress1e4-32      49583         45632         -7.97%
BenchmarkDecodeTwainCompress1e5-32      458843        445645        -2.88%
BenchmarkDecodeTwainCompress1e6-32      4544787       4392530       -3.35%
BenchmarkDecodeRandomSpeed1e4-32        298           305           +2.21%
BenchmarkDecodeRandomSpeed1e5-32        1909          1909          +0.00%
BenchmarkDecodeRandomSpeed1e6-32        19987         19809         -0.89%

benchmark                               old MB/s     new MB/s     speedup
BenchmarkDecodeDigitsSpeed1e4-32        202.18       226.23       1.12x
BenchmarkDecodeDigitsSpeed1e5-32        192.13       196.46       1.02x
BenchmarkDecodeDigitsSpeed1e6-32        194.07       199.97       1.03x
BenchmarkDecodeDigitsDefault1e4-32      196.15       209.68       1.07x
BenchmarkDecodeDigitsDefault1e5-32      202.10       204.82       1.01x
BenchmarkDecodeDigitsDefault1e6-32      200.39       210.42       1.05x
BenchmarkDecodeDigitsCompress1e4-32     200.11       227.31       1.14x
BenchmarkDecodeDigitsCompress1e5-32     194.16       213.85       1.10x
BenchmarkDecodeDigitsCompress1e6-32     194.99       214.62       1.10x
BenchmarkDecodeTwainSpeed1e4-32         193.27       206.94       1.07x
BenchmarkDecodeTwainSpeed1e5-32         187.73       194.85       1.04x
BenchmarkDecodeTwainSpeed1e6-32         188.52       194.97       1.03x
BenchmarkDecodeTwainDefault1e4-32       197.58       208.30       1.05x
BenchmarkDecodeTwainDefault1e5-32       204.75       209.67       1.02x
BenchmarkDecodeTwainDefault1e6-32       204.87       212.28       1.04x
BenchmarkDecodeTwainCompress1e4-32      201.68       219.14       1.09x
BenchmarkDecodeTwainCompress1e5-32      217.94       224.39       1.03x
BenchmarkDecodeTwainCompress1e6-32      220.03       227.66       1.03x
BenchmarkDecodeRandomSpeed1e4-32        33551.69     32828.68     0.98x
BenchmarkDecodeRandomSpeed1e5-32        52391.84     52395.57     1.00x
BenchmarkDecodeRandomSpeed1e6-32        50031.69     50482.80     1.01x
```
  • Loading branch information
klauspost committed Feb 1, 2022
1 parent 61f58c1 commit 60b19fa
Show file tree
Hide file tree
Showing 3 changed files with 536 additions and 455 deletions.
112 changes: 60 additions & 52 deletions flate/_gen/gen_inflate.go
@@ -1,7 +1,8 @@
//go:build generate
// +build generate

//go:generate go run $GOFILE && gofmt -w ../inflate_gen.go
//go:generate go run $GOFILE
//go:generate go fmt ../inflate_gen.go

package main

Expand All @@ -16,9 +17,9 @@ func main() {
panic(err)
}
defer f.Close()
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader", "Reader"}
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader", "GenericReader"}
imports := []string{"bytes", "bufio", "fmt", "strings", "math/bits"}
f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
package flate
Expand All @@ -44,6 +45,11 @@ func (f *decompressor) $FUNCNAME$() {
)
fr := f.r.($TYPE$)
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use fnb and fb,
// inline call to moreBits and reassign fb,fnb back to f on return.
fnb, fb := f.nb, f.b
switch f.stepState {
case stateInit:
goto readLiteral
Expand All @@ -62,41 +68,35 @@ readLiteral:
// cases, the chunks slice will be 0 for the invalid sequence, leading it
// to satisfy the n == 0 check below.
n := uint(f.hl.maxRead)
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
nb, b := f.nb, f.b
for {
for nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
f.err = noEOF(err)
return
}
f.roffset++
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
fb |= uint32(c) << (fnb & regSizeMaskUint32)
fnb += 8
}
chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
n = uint(chunk & huffmanCountMask)
if n > huffmanChunkBits {
chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
n = uint(chunk & huffmanCountMask)
}
if n <= nb {
if n <= fnb {
if n == 0 {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("huffsym: n==0")
}
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
fb = fb >> (n & regSizeMaskUint32)
fnb = fnb - n
v = int(chunk >> huffmanValueShift)
break
}
Expand All @@ -111,10 +111,12 @@ readLiteral:
f.toRead = f.dict.readFlush()
f.step = (*decompressor).$FUNCNAME$
f.stepState = stateInit
f.b, f.nb = fb, fnb
return
}
goto readLiteral
case v == 256:
f.b, f.nb = fb, fnb
f.finishBlock()
return
// otherwise, reference to older data
Expand All @@ -124,48 +126,51 @@ readLiteral:
val := decCodeToLen[(v - 257)]
length = int(val.length) + 3
n := uint(val.extra)
for f.nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits n>0:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
f.b >>= n & regSizeMaskUint32
f.nb -= n
length += int(fb & bitMask32[n])
fb >>= n & regSizeMaskUint32
fnb -= n
default:
if debugDecode {
fmt.Println(v, ">= maxNumLit")
}
f.err = CorruptInputError(f.roffset)
f.b, f.nb = fb, fnb
return
}
var dist uint32
if f.hd == nil {
for f.nb < 5 {
for fnb < 5 {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits f.nb<5:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
f.b >>= 5
f.nb -= 5
dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
fb >>= 5
fnb -= 5
} else {
// Since a huffmanDecoder can be empty or be composed of a degenerate tree
// with single element, huffSym must error on these two edge cases. In both
Expand All @@ -175,38 +180,35 @@ readLiteral:
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use fnb and fb,
// inline call to moreBits and reassign fb,fnb back to f on return.
nb, b := f.nb, f.b
for {
for nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
f.err = noEOF(err)
return
}
f.roffset++
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
fb |= uint32(c) << (fnb & regSizeMaskUint32)
fnb += 8
}
chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
n = uint(chunk & huffmanCountMask)
if n > huffmanChunkBits {
chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
n = uint(chunk & huffmanCountMask)
}
if n <= nb {
if n <= fnb {
if n == 0 {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("huffsym: n==0")
}
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
fb = fb >> (n & regSizeMaskUint32)
fnb = fnb - n
dist = uint32(chunk >> huffmanValueShift)
break
}
Expand All @@ -220,24 +222,27 @@ readLiteral:
nb := uint(dist-2) >> 1
// have 1 bit in bottom of dist, need nb more.
extra := (dist & 1) << (nb & regSizeMaskUint32)
for f.nb < nb {
for fnb < nb {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits f.nb<nb:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
f.b >>= nb & regSizeMaskUint32
f.nb -= nb
extra |= fb & bitMask32[nb]
fb >>= nb & regSizeMaskUint32
fnb -= nb
dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
// slower: dist = bitMask32[nb+1] + 2 + extra
default:
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("dist too big:", dist, maxNumDist)
}
Expand All @@ -247,6 +252,7 @@ readLiteral:
// No check on length; encoding can be prescient.
if dist > uint32(f.dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
}
Expand All @@ -271,10 +277,12 @@ copyHistory:
f.toRead = f.dict.readFlush()
f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
return
}
goto readLiteral
}
// Not reached
}
`
Expand All @@ -290,6 +298,6 @@ copyHistory:
f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
}
f.WriteString("\t\tdefault:\n")
f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
f.WriteString("\t\t\treturn f.huffmanGenericReader\n")
f.WriteString("\t}\n}\n")
}

0 comments on commit 60b19fa

Please sign in to comment.