Skip to content

Commit

Permalink
flate: Improve huffman generation speed by ~5-10% (#490)
Browse files Browse the repository at this point in the history
The gains are mainly for small blocks and the fastest levels.

```
benchmark                               old ns/op     new ns/op     delta
BenchmarkEncodeDigitsConstant1e4-32     16080         15984         -0.60%
BenchmarkEncodeDigitsSpeed1e4-32        48105         45890         -4.60%
BenchmarkEncodeDigitsDefault1e4-32      114016        111234        -2.44%
BenchmarkEncodeDigitsCompress1e4-32     228570        225041        -1.54%
BenchmarkEncodeDigitsSL1e4-32           46695         45006         -3.62%
BenchmarkEncodeTwainConstant1e4-32      23557         22033         -6.47%
BenchmarkEncodeTwainSpeed1e4-32         68594         63177         -7.90%
BenchmarkEncodeTwainDefault1e4-32       112498        111798        -0.62%
BenchmarkEncodeTwainCompress1e4-32      282909        274804        -2.86%
BenchmarkEncodeTwainSL1e4-32            68406         62362         -8.84%

benchmark                               old MB/s     new MB/s     speedup
BenchmarkEncodeDigitsConstant1e4-32     621.89       625.64       1.01x
BenchmarkEncodeDigitsSpeed1e4-32        207.88       217.91       1.05x
BenchmarkEncodeDigitsDefault1e4-32      87.71        89.90        1.02x
BenchmarkEncodeDigitsCompress1e4-32     43.75        44.44        1.02x
BenchmarkEncodeDigitsSL1e4-32           214.16       222.19       1.04x
BenchmarkEncodeTwainConstant1e4-32      424.51       453.87       1.07x
BenchmarkEncodeTwainSpeed1e4-32         145.79       158.29       1.09x
BenchmarkEncodeTwainDefault1e4-32       88.89        89.45        1.01x
BenchmarkEncodeTwainCompress1e4-32      35.35        36.39        1.03x
BenchmarkEncodeTwainSL1e4-32            146.19       160.35       1.10x
```

(The output is byte-for-byte identical.)
  • Loading branch information
klauspost committed Feb 5, 2022
1 parent 24a2710 commit 43829fc
Showing 1 changed file with 20 additions and 9 deletions.
29 changes: 20 additions & 9 deletions flate/huffman_code.go
Expand Up @@ -188,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
// of the level j ancestor.
var leafCounts [maxBitsLimit][maxBitsLimit]int32

// Read the elements in descending index order so that only one bounds check is needed.
l2f := int32(list[2].freq)
l1f := int32(list[1].freq)
l0f := int32(list[0].freq) + int32(list[1].freq)

for level := int32(1); level <= maxBits; level++ {
// For every level, the first two items are the first two characters.
// We initialize the levels as if we had already figured this out.
levels[level] = levelInfo{
level: level,
lastFreq: int32(list[1].freq),
nextCharFreq: int32(list[2].freq),
nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
lastFreq: l1f,
nextCharFreq: l2f,
nextPairFreq: l0f,
}
leafCounts[level][level] = 2
if level == 1 {
Expand All @@ -206,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
// We need a total of 2*n - 2 items at top level and have already generated 2.
levels[maxBits].needed = 2*n - 4

level := maxBits
for {
level := uint32(maxBits)
for level < 16 {
l := &levels[level]
if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
// We've run out of both leafs and pairs.
Expand Down Expand Up @@ -239,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
// more values in the level below
l.lastFreq = l.nextPairFreq
// Take leaf counts from the lower level, except counts[level] remains the same.
copy(leafCounts[level][:level], leafCounts[level-1][:level])
if true {
save := leafCounts[level][level]
leafCounts[level] = leafCounts[level-1]
leafCounts[level][level] = save
} else {
copy(leafCounts[level][:level], leafCounts[level-1][:level])
}
levels[l.level-1].needed = 2
}

Expand Down Expand Up @@ -310,6 +321,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
// maxBits The maximum number of bits to use for any literal.
func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
list := h.freqcache[:len(freq)+1]
codes := h.codes[:len(freq)]
// Number of non-zero literals
count := 0
// Set list to be the set of all non-zero literals and their frequencies
Expand All @@ -318,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
list[count] = literalNode{uint16(i), f}
count++
} else {
list[count] = literalNode{}
h.codes[i].len = 0
codes[i].len = 0
}
}
list[len(freq)] = literalNode{}
list[count] = literalNode{}

list = list[:count]
if count <= 2 {
Expand Down

0 comments on commit 43829fc

Please sign in to comment.