Skip to content

Commit

Permalink
flate: Improve level 1 speed by ~10% (#482)
Browse files Browse the repository at this point in the history
Mainly through manual inlining.

```
benchmark                            old ns/op     new ns/op     delta
BenchmarkEncodeDigitsSpeed1e4-32     53121         47995         -9.65%
BenchmarkEncodeDigitsSpeed1e5-32     686799        630834        -8.15%
BenchmarkEncodeDigitsSpeed1e6-32     6636421       6054570       -8.77%
BenchmarkEncodeTwainSpeed1e4-32      77016         69146         -10.22%
BenchmarkEncodeTwainSpeed1e5-32      690640        634364        -8.15%
BenchmarkEncodeTwainSpeed1e6-32      6915519       6367504       -7.92%

benchmark                            old MB/s     new MB/s     speedup
BenchmarkEncodeDigitsSpeed1e4-32     188.25       208.35       1.11x
BenchmarkEncodeDigitsSpeed1e5-32     145.60       158.52       1.09x
BenchmarkEncodeDigitsSpeed1e6-32     150.68       165.16       1.10x
BenchmarkEncodeTwainSpeed1e4-32      129.84       144.62       1.11x
BenchmarkEncodeTwainSpeed1e5-32      144.79       157.64       1.09x
BenchmarkEncodeTwainSpeed1e6-32      144.60       157.05       1.09x
```
  • Loading branch information
klauspost committed Jan 31, 2022
1 parent ea5a4d4 commit 61f58c1
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 8 deletions.
2 changes: 1 addition & 1 deletion flate/fast_encoder.go
Expand Up @@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
// matchlenLong will return the match length between offsets s and t in src.
// It is assumed that s > t, that t >= 0 and s < len(src).
func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
if debugDecode {
if debugDeflate {
if t >= s {
panic(fmt.Sprint("t >=s:", t, s))
}
Expand Down
4 changes: 2 additions & 2 deletions flate/huffman_bit_writer.go
Expand Up @@ -833,9 +833,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
bits, nbits, nbytes := w.bits, w.nbits, w.nbytes

for _, t := range tokens {
if t < matchType {
if t < 256 {
//w.writeCode(lits[t.literal()])
c := lits[t.literal()]
c := lits[t]
bits |= uint64(c.code) << (nbits & 63)
nbits += c.len
if nbits >= 48 {
Expand Down
56 changes: 53 additions & 3 deletions flate/level1.go
@@ -1,6 +1,10 @@
package flate

import "fmt"
import (
"encoding/binary"
"fmt"
"math/bits"
)

// fastGen maintains the table for matches,
// and the previous byte block for level 2.
Expand Down Expand Up @@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {

// Extend the 4-byte match as long as possible.
t := candidate.offset - e.cur
l := e.matchlenLong(s+4, t+4, src) + 4
var l = int32(4)
if false {
l = e.matchlenLong(s+4, t+4, src) + 4
} else {
// Manually inlined equivalent of the matchlenLong call above:
a := src[s+4:]
b := src[t+4:]
for len(a) >= 8 {
if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
l += int32(bits.TrailingZeros64(diff) >> 3)
break
}
l += 8
a = a[8:]
b = b[8:]
}
if len(a) < 8 {
b = b[:len(a)]
for i := range a {
if a[i] != b[i] {
break
}
l++
}
}
}

// Extend backwards
for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
Expand All @@ -129,7 +158,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
}

// Save the match found
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
if false {
dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
} else {
// Manually inlined equivalent of the AddMatchLong call above:
xoffset := uint32(s - t - baseMatchOffset)
xlength := l
oc := offsetCode(xoffset)
xoffset |= oc << 16
for xlength > 0 {
xl := xlength
if xl > 258 {
// We need to have at least baseMatchLength left over for next loop.
xl = 258 - baseMatchLength
}
xlength -= xl
xl -= baseMatchLength
dst.extraHist[lengthCodes1[uint8(xl)]]++
dst.offHist[oc]++
dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
dst.n++
}
}
s += l
nextEmit = s
if nextS >= s {
Expand Down
4 changes: 2 additions & 2 deletions flate/token.go
Expand Up @@ -276,7 +276,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
xoffset |= oCode << 16

t.extraHist[lengthCodes1[uint8(xlength)]]++
t.offHist[oCode]++
t.offHist[oCode&31]++
t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
t.n++
}
Expand All @@ -300,7 +300,7 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
xlength -= xl
xl -= baseMatchLength
t.extraHist[lengthCodes1[uint8(xl)]]++
t.offHist[oc]++
t.offHist[oc&31]++
t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
t.n++
}
Expand Down

0 comments on commit 61f58c1

Please sign in to comment.