From 4a278e717c4b012008ca42aa1e792db5f281828b Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Mon, 31 Jan 2022 12:10:14 +0100
Subject: [PATCH] flate: Improve level 1 speed

Mainly through manual inlining of matchlenLong and AddMatchLong into the
level 1 encode loop.

```
benchmark                            old ns/op     new ns/op     delta
BenchmarkEncodeDigitsSpeed1e4-32     53121         47995         -9.65%
BenchmarkEncodeDigitsSpeed1e5-32     686799        630834        -8.15%
BenchmarkEncodeDigitsSpeed1e6-32     6636421       6054570       -8.77%
BenchmarkEncodeTwainSpeed1e4-32      77016         69146         -10.22%
BenchmarkEncodeTwainSpeed1e5-32      690640        634364        -8.15%
BenchmarkEncodeTwainSpeed1e6-32      6915519       6367504       -7.92%

benchmark                            old MB/s     new MB/s     speedup
BenchmarkEncodeDigitsSpeed1e4-32     188.25       208.35       1.11x
BenchmarkEncodeDigitsSpeed1e5-32     145.60       158.52       1.09x
BenchmarkEncodeDigitsSpeed1e6-32     150.68       165.16       1.10x
BenchmarkEncodeTwainSpeed1e4-32      129.84       144.62       1.11x
BenchmarkEncodeTwainSpeed1e5-32      144.79       157.64       1.09x
BenchmarkEncodeTwainSpeed1e6-32      144.60       157.05       1.09x
```
---
 flate/fast_encoder.go       |  2 +-
 flate/huffman_bit_writer.go |  4 +--
 flate/level1.go             | 56 +++++++++++++++++++++++++++++++++++--
 flate/token.go              |  4 +--
 4 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
index 0b2e54972c..d55ea2a775 100644
--- a/flate/fast_encoder.go
+++ b/flate/fast_encoder.go
@@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index fd49efd75b..31a9092d90 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -833,9 +833,9 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
 
 	for _, t := range tokens {
-		if t < matchType {
+		if t < 256 {
 			//w.writeCode(lits[t.literal()])
-			c := lits[t.literal()]
+			c := lits[t]
 			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
diff --git a/flate/level1.go b/flate/level1.go
index 1e5eea3968..0022c8bb6b 100644
--- a/flate/level1.go
+++ b/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -129,7 +158,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						// We need to have at least baseMatchLength left over for next loop.
+						xl = 258 - baseMatchLength
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {
diff --git a/flate/token.go b/flate/token.go
index 3a9618ee19..ef69c05a1d 100644
--- a/flate/token.go
+++ b/flate/token.go
@@ -276,7 +276,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 	xoffset |= oCode << 16
 
 	t.extraHist[lengthCodes1[uint8(xlength)]]++
-	t.offHist[oCode]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
 	t.n++
 }
@@ -300,7 +300,7 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 		xlength -= xl
 		xl -= baseMatchLength
 		t.extraHist[lengthCodes1[uint8(xl)]]++
-		t.offHist[oc]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
 		t.n++
 	}