flate: Improve level 1 speed by ~10% (#482)

Mainly through manual inlining.

```
BenchmarkEncodeDigitsSpeed1e4-32     53121       47995       -9.65%
BenchmarkEncodeDigitsSpeed1e5-32     686799      630834      -8.15%
BenchmarkEncodeDigitsSpeed1e6-32     6636421     6054570     -8.77%
BenchmarkEncodeTwainSpeed1e4-32      77016       69146       -10.22%
BenchmarkEncodeTwainSpeed1e5-32      690640      634364      -8.15%
BenchmarkEncodeTwainSpeed1e6-32      6915519     6367504     -7.92%

BenchmarkEncodeDigitsSpeed1e4-32     188.25      208.35      1.11x
BenchmarkEncodeDigitsSpeed1e5-32     145.60      158.52      1.09x
BenchmarkEncodeDigitsSpeed1e6-32     150.68      165.16      1.10x
BenchmarkEncodeTwainSpeed1e4-32      129.84      144.62      1.11x
BenchmarkEncodeTwainSpeed1e5-32      144.79      157.64      1.09x
BenchmarkEncodeTwainSpeed1e6-32      144.60      157.05      1.09x
```
klauspost · Jan 31, 2022 · 61f58c1 · 61f58c1
1 parent ea5a4d4
commit 61f58c1
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 8 deletions.
diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
@@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}

diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
@@ -833,9 +833,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
 
 	for _, t := range tokens {
-		if t < matchType {
+		if t < 256 {
 			//w.writeCode(lits[t.literal()])
-			c := lits[t.literal()]
+			c := lits[t]
 			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {

diff --git a/flate/level1.go b/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -129,7 +158,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						// We need to have at least baseMatchLength left over for next loop.
+						xl = 258 - baseMatchLength
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {

diff --git a/flate/token.go b/flate/token.go
@@ -276,7 +276,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 	xoffset |= oCode << 16
 
 	t.extraHist[lengthCodes1[uint8(xlength)]]++
-	t.offHist[oCode]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
 	t.n++
 }
@@ -300,7 +300,7 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 		xlength -= xl
 		xl -= baseMatchLength
 		t.extraHist[lengthCodes1[uint8(xl)]]++
-		t.offHist[oc]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
 		t.n++
 	}