From 4a278e717c4b012008ca42aa1e792db5f281828b Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 31 Jan 2022 12:10:14 +0100 Subject: [PATCH] flate: Improve level 1 speed Mainly through manual inlining. ``` BenchmarkEncodeDigitsSpeed1e4-32 53121 47995 -9.65% BenchmarkEncodeDigitsSpeed1e5-32 686799 630834 -8.15% BenchmarkEncodeDigitsSpeed1e6-32 6636421 6054570 -8.77% BenchmarkEncodeTwainSpeed1e4-32 77016 69146 -10.22% BenchmarkEncodeTwainSpeed1e5-32 690640 634364 -8.15% BenchmarkEncodeTwainSpeed1e6-32 6915519 6367504 -7.92% BenchmarkEncodeDigitsSpeed1e4-32 188.25 208.35 1.11x BenchmarkEncodeDigitsSpeed1e5-32 145.60 158.52 1.09x BenchmarkEncodeDigitsSpeed1e6-32 150.68 165.16 1.10x BenchmarkEncodeTwainSpeed1e4-32 129.84 144.62 1.11x BenchmarkEncodeTwainSpeed1e5-32 144.79 157.64 1.09x BenchmarkEncodeTwainSpeed1e6-32 144.60 157.05 1.09x ``` --- flate/fast_encoder.go | 2 +- flate/huffman_bit_writer.go | 4 +-- flate/level1.go | 56 +++++++++++++++++++++++++++++++++++-- flate/token.go | 4 +-- 4 files changed, 58 insertions(+), 8 deletions(-) diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go index 0b2e54972c..d55ea2a775 100644 --- a/flate/fast_encoder.go +++ b/flate/fast_encoder.go @@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { // matchlenLong will return the match length between offsets and t in src. // It is assumed that s > t, that t >=0 and s < len(src). 
func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { - if debugDecode { + if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) } diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index fd49efd75b..31a9092d90 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -833,9 +833,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits, nbits, nbytes := w.bits, w.nbits, w.nbytes for _, t := range tokens { - if t < matchType { + if t < 256 { //w.writeCode(lits[t.literal()]) - c := lits[t.literal()] + c := lits[t] bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { diff --git a/flate/level1.go b/flate/level1.go index 1e5eea3968..0022c8bb6b 100644 --- a/flate/level1.go +++ b/flate/level1.go @@ -1,6 +1,10 @@ package flate -import "fmt" +import ( + "encoding/binary" + "fmt" + "math/bits" +) // fastGen maintains the table for matches, // and the previous byte block for level 2. @@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + var l = int32(4) + if false { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else { + // inlined: + a := src[s+4:] + b := src[t+4:] + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + l += int32(bits.TrailingZeros64(diff) >> 3) + break + } + l += 8 + a = a[8:] + b = b[8:] + } + if len(a) < 8 { + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + break + } + l++ + } + } + } // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { @@ -129,7 +158,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } // Save the match found - dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + if false { + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + } else { + // Inlined... 
+ xoffset := uint32(s - t - baseMatchOffset) + xlength := l + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + xl = 258 - baseMatchLength + } + xlength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) + dst.n++ + } + } s += l nextEmit = s if nextS >= s { diff --git a/flate/token.go b/flate/token.go index 3a9618ee19..ef69c05a1d 100644 --- a/flate/token.go +++ b/flate/token.go @@ -276,7 +276,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { xoffset |= oCode << 16 t.extraHist[lengthCodes1[uint8(xlength)]]++ - t.offHist[oCode]++ + t.offHist[oCode&31]++ t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) t.n++ }