Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

flate: Improve decompression speed 5-10% #483

Merged
merged 2 commits into from Feb 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
112 changes: 60 additions & 52 deletions flate/_gen/gen_inflate.go
@@ -1,7 +1,8 @@
//go:build generate
// +build generate

//go:generate go run $GOFILE && gofmt -w ../inflate_gen.go
//go:generate go run $GOFILE
//go:generate go fmt ../inflate_gen.go

package main

Expand All @@ -16,9 +17,9 @@ func main() {
panic(err)
}
defer f.Close()
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader", "Reader"}
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader", "GenericReader"}
imports := []string{"bytes", "bufio", "fmt", "strings", "math/bits"}
f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.

package flate
Expand All @@ -44,6 +45,11 @@ func (f *decompressor) $FUNCNAME$() {
)
fr := f.r.($TYPE$)

// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use fnb and fb,
// inline call to moreBits and reassign fb,fnb back to f on return.
fnb, fb := f.nb, f.b

switch f.stepState {
case stateInit:
goto readLiteral
Expand All @@ -62,41 +68,35 @@ readLiteral:
// cases, the chunks slice will be 0 for the invalid sequence, leading it
// to satisfy the n == 0 check below.
n := uint(f.hl.maxRead)
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
nb, b := f.nb, f.b
for {
for nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
f.err = noEOF(err)
return
}
f.roffset++
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
fb |= uint32(c) << (fnb & regSizeMaskUint32)
fnb += 8
}
chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
n = uint(chunk & huffmanCountMask)
if n > huffmanChunkBits {
chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
n = uint(chunk & huffmanCountMask)
}
if n <= nb {
if n <= fnb {
if n == 0 {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("huffsym: n==0")
}
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
fb = fb >> (n & regSizeMaskUint32)
fnb = fnb - n
v = int(chunk >> huffmanValueShift)
break
}
Expand All @@ -111,10 +111,12 @@ readLiteral:
f.toRead = f.dict.readFlush()
f.step = (*decompressor).$FUNCNAME$
f.stepState = stateInit
f.b, f.nb = fb, fnb
return
}
goto readLiteral
case v == 256:
f.b, f.nb = fb, fnb
f.finishBlock()
return
// otherwise, reference to older data
Expand All @@ -124,48 +126,51 @@ readLiteral:
val := decCodeToLen[(v - 257)]
length = int(val.length) + 3
n := uint(val.extra)
for f.nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits n>0:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
f.b >>= n & regSizeMaskUint32
f.nb -= n
length += int(fb & bitMask32[n])
fb >>= n & regSizeMaskUint32
fnb -= n
default:
if debugDecode {
fmt.Println(v, ">= maxNumLit")
}
f.err = CorruptInputError(f.roffset)
f.b, f.nb = fb, fnb
return
}

var dist uint32
if f.hd == nil {
for f.nb < 5 {
for fnb < 5 {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits f.nb<5:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
f.b >>= 5
f.nb -= 5
dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
fb >>= 5
fnb -= 5
} else {
// Since a huffmanDecoder can be empty or be composed of a degenerate tree
// with single element, huffSym must error on these two edge cases. In both
Expand All @@ -175,38 +180,35 @@ readLiteral:
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
// but is smart enough to keep local variables in registers, so use nb and b,
// inline call to moreBits and reassign b,nb back to f on return.
nb, b := f.nb, f.b
for {
for nb < n {
for fnb < n {
c, err := fr.ReadByte()
if err != nil {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
f.err = noEOF(err)
return
}
f.roffset++
b |= uint32(c) << (nb & regSizeMaskUint32)
nb += 8
fb |= uint32(c) << (fnb & regSizeMaskUint32)
fnb += 8
}
chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
n = uint(chunk & huffmanCountMask)
if n > huffmanChunkBits {
chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
n = uint(chunk & huffmanCountMask)
}
if n <= nb {
if n <= fnb {
if n == 0 {
f.b = b
f.nb = nb
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("huffsym: n==0")
}
f.err = CorruptInputError(f.roffset)
return
}
f.b = b >> (n & regSizeMaskUint32)
f.nb = nb - n
fb = fb >> (n & regSizeMaskUint32)
fnb = fnb - n
dist = uint32(chunk >> huffmanValueShift)
break
}
Expand All @@ -220,24 +222,27 @@ readLiteral:
nb := uint(dist-2) >> 1
// have 1 bit in bottom of dist, need nb more.
extra := (dist & 1) << (nb & regSizeMaskUint32)
for f.nb < nb {
for fnb < nb {
c, err := fr.ReadByte()
if err != nil {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("morebits f.nb<nb:", err)
}
f.err = err
return
}
f.roffset++
f.b |= uint32(c) << f.nb
f.nb += 8
fb |= uint32(c) << (fnb&regSizeMaskUint32)
fnb += 8
}
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
f.b >>= nb & regSizeMaskUint32
f.nb -= nb
extra |= fb & bitMask32[nb]
fb >>= nb & regSizeMaskUint32
fnb -= nb
dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
// slower: dist = bitMask32[nb+1] + 2 + extra
default:
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("dist too big:", dist, maxNumDist)
}
Expand All @@ -247,6 +252,7 @@ readLiteral:

// No check on length; encoding can be prescient.
if dist > uint32(f.dict.histSize()) {
f.b, f.nb = fb, fnb
if debugDecode {
fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
}
Expand All @@ -271,10 +277,12 @@ copyHistory:
f.toRead = f.dict.readFlush()
f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
f.stepState = stateDict
f.b, f.nb = fb, fnb
return
}
goto readLiteral
}
// Not reached
}

`
Expand All @@ -290,6 +298,6 @@ copyHistory:
f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
}
f.WriteString("\t\tdefault:\n")
f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
f.WriteString("\t\t\treturn f.huffmanGenericReader\n")
f.WriteString("\t}\n}\n")
}