From 6caf5f7431ecc1a2f3b82665632e235407fa6c1b Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Wed, 8 Jun 2022 12:02:52 +0200
Subject: [PATCH] deflate: Use compound hcode

~5% Faster for huffman-only, same for rest.

```
github-ranks-backup.bin	gzkp	-2	1862623243	1298789681	5547	320.22
github-ranks-backup.bin	gzkp	-2	1862623243	1298789681	5305	334.83
```

Replaces #619 - more speedup, and no regression.
---
 flate/huffman_bit_writer.go | 47 +++++++++++++++++++------------------
 flate/huffman_code.go       | 38 ++++++++++++++++++++----------
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index 25f6d1108f..ff9f6f609a 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -169,7 +169,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	b := w.offsetEncoding.codes
 	b = b[:len(a)]
 	for i, v := range a {
-		if v != 0 && b[i].len == 0 {
+		if v != 0 && b[i].zero() {
 			return false
 		}
 	}
@@ -178,7 +178,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
 	for i, v := range a {
-		if v != 0 && b[i].len == 0 {
+		if v != 0 && b[i].zero() {
 			return false
 		}
 	}
@@ -186,7 +186,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a = t.litHist[:256]
 	b = w.literalEncoding.codes[:len(a)]
 	for i, v := range a {
-		if v != 0 && b[i].len == 0 {
+		if v != 0 && b[i].zero() {
 			return false
 		}
 	}
@@ -280,12 +280,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
 	// Copy the concatenated code sizes to codegen. Put a marker at the end.
 	cgnl := codegen[:numLiterals]
 	for i := range cgnl {
-		cgnl[i] = uint8(litEnc.codes[i].len)
+		cgnl[i] = litEnc.codes[i].len()
 	}
 
 	cgnl = codegen[numLiterals : numLiterals+numOffsets]
 	for i := range cgnl {
-		cgnl[i] = uint8(offEnc.codes[i].len)
+		cgnl[i] = offEnc.codes[i].len()
 	}
 	codegen[numLiterals+numOffsets] = badCode
 
@@ -428,8 +428,8 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 
 func (w *huffmanBitWriter) writeCode(c hcode) {
 	// The function does not get inlined if we "& 63" the shift.
-	w.bits |= uint64(c.code) << (w.nbits & 63)
-	w.nbits += c.len
+	w.bits |= c.code64() << (w.nbits & 63)
+	w.nbits += c.len()
 	if w.nbits >= 48 {
 		w.writeOutBits()
 	}
@@ -477,7 +477,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n
 	w.writeBits(int32(numCodegens-4), 4)
 
 	for i := 0; i < numCodegens; i++ {
-		value := uint(w.codegenEncoding.codes[codegenOrder[i]].len)
+		value := uint(w.codegenEncoding.codes[codegenOrder[i]].len())
 		w.writeBits(int32(value), 3)
 	}
 
@@ -670,7 +670,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
@@ -854,8 +854,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		if t < 256 {
 			//w.writeCode(lits[t.literal()])
 			c := lits[t]
-			bits |= uint64(c.code) << (nbits & 63)
-			nbits += c.len
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -882,8 +882,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := lengths[lengthCode]
-			bits |= uint64(c.code) << (nbits & 63)
-			nbits += c.len
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -931,8 +931,8 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := offs[offsetCode]
-			bits |= uint64(c.code) << (nbits & 63)
-			nbits += c.len
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -1134,12 +1134,12 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 			nbytes = 0
 		}
 		a, b := encoding[input[0]], encoding[input[1]]
-		bits |= uint64(a.code) << (nbits & 63)
-		bits |= uint64(b.code) << ((nbits + a.len) & 63)
+		bits |= a.code64() << (nbits & 63)
+		bits |= b.code64() << ((nbits + a.len()) & 63)
 		c := encoding[input[2]]
-		nbits += b.len + a.len
-		bits |= uint64(c.code) << (nbits & 63)
-		nbits += c.len
+		nbits += b.len() + a.len()
+		bits |= c.code64() << (nbits & 63)
+		nbits += c.len()
 		input = input[3:]
 	}
 
@@ -1165,10 +1165,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		}
 		// Bitwriting inlined, ~30% speedup
 		c := encoding[t]
-		bits |= uint64(c.code) << (nbits & 63)
-		nbits += c.len
+		bits |= c.code64() << (nbits & 63)
+
+		nbits += c.len()
 		if debugDeflate {
-			count += int(c.len)
+			count += int(c.len())
 		}
 	}
 	// Restore...
diff --git a/flate/huffman_code.go b/flate/huffman_code.go
index 9ab497c275..e0483bef3f 100644
--- a/flate/huffman_code.go
+++ b/flate/huffman_code.go
@@ -16,9 +16,18 @@ const (
 )
 
 // hcode is a huffman code with a bit code and bit length.
-type hcode struct {
-	code uint16
-	len  uint8
+type hcode uint32
+
+func (h hcode) len() uint8 {
+	return uint8(h)
+}
+
+func (h hcode) code64() uint64 {
+	return uint64(h >> 8)
+}
+
+func (h hcode) zero() bool {
+	return h == 0
 }
 
 type huffmanEncoder struct {
@@ -58,8 +67,11 @@ type levelInfo struct {
 
 // set sets the code and length of an hcode.
 func (h *hcode) set(code uint16, length uint8) {
-	h.len = length
-	h.code = code
+	*h = hcode(length) | (hcode(code) << 8)
+}
+
+func newhcode(code uint16, length uint8) hcode {
+	return hcode(length) | (hcode(code) << 8)
 }
 
 func reverseBits(number uint16, bitLength byte) uint16 {
@@ -100,7 +112,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			bits = ch + 192 - 280
 			size = 8
 		}
-		codes[ch] = hcode{code: reverseBits(bits, size), len: size}
+		codes[ch] = newhcode(reverseBits(bits, size), size)
 	}
 	return h
 }
@@ -109,7 +121,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
 	h := newHuffmanEncoder(30)
 	codes := h.codes
 	for ch := range codes {
-		codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
+		codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5)
 	}
 	return h
 }
@@ -121,7 +133,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	var total int
 	for i, f := range freq {
 		if f != 0 {
-			total += int(f) * int(h.codes[i].len)
+			total += int(f) * int(h.codes[i].len())
 		}
 	}
 	return total
@@ -130,7 +142,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
 	var total int
 	for _, f := range b {
-		total += int(h.codes[f].len)
+		total += int(h.codes[f].len())
 	}
 	return total
 }
@@ -141,10 +153,10 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
 	for i, f := range freq {
 		if f != 0 {
 			code := h.codes[i]
-			if code.len == 0 {
+			if code.zero() {
 				return math.MaxInt32
 			}
-			total += int(f) * int(code.len)
+			total += int(f) * int(code.len())
 		}
 	}
 	return total
@@ -308,7 +320,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 
 		sortByLiteral(chunk)
 		for _, node := range chunk {
-			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
+			h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n))
 			code++
 		}
 		list = list[0 : len(list)-int(bits)]
@@ -330,7 +342,7 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 			list[count] = literalNode{uint16(i), f}
 			count++
 		} else {
-			codes[i].len = 0
+			codes[i] = 0
 		}
 	}
 	list[count] = literalNode{}