diff --git a/flate/deflate.go b/flate/deflate.go
index b27f5a93bc..1e9e41e315 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -10,9 +10,6 @@ import (
 	"fmt"
 	"io"
 	"math"
-	"math/bits"
-
-	comp "github.com/klauspost/compress"
 )
 
 const (
@@ -110,6 +107,7 @@ type advancedState struct {
 type compressor struct {
 	compressionLevel
 
+	h *huffmanEncoder
 	w *huffmanBitWriter
 
 	// compression algorithm
@@ -271,7 +269,7 @@ func (d *compressor) fillWindow(b []byte) {
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (length, offset int, ok bool) {
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
 	minMatchLook := maxMatchLength
 	if lookahead < minMatchLook {
 		minMatchLook = lookahead
@@ -297,14 +295,19 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (lengt
 	}
 	offset = 0
 
+	// Some like it higher (CSV), some like it lower (JSON)
+	const baseCost = 6
 	// Base is 4 bytes at with an additional cost.
 	// Matches must be better than this.
-	cGain := minMatchLength*bpb - 12
+	cGain := 0
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
 			if n > length {
-				newGain := n*bpb - bits.Len32(uint32(pos-i))
+				// Calculate gain. Estimate
+				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
 				if newGain > cGain {
 					length = n
 					offset = pos - i
@@ -389,10 +392,17 @@ func (d *compressor) deflateLazy() {
 	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
 		return
 	}
-	s.estBitsPerByte = 8
-	if !d.sync {
-		s.estBitsPerByte = comp.ShannonEntropyBits(d.window[s.index:d.windowEnd])
-		s.estBitsPerByte = int(1 + float64(s.estBitsPerByte)/float64(d.windowEnd-s.index))
+	if d.windowEnd != s.index {
+		// Get literal huffman coder.
+		if d.h == nil {
+			d.h = newHuffmanEncoder(maxFlateBlockTokens)
+		}
+		var tmp [256]uint16
+		for _, v := range d.window[s.index:d.windowEnd] {
+			tmp[v]++
+		}
+		d.h.generate(tmp[:], 15)
+		//s.estBitsPerByte = d.h.bitLength(tmp[:])/(d.windowEnd-s.index) + 1
 	}
 
 	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
@@ -446,7 +456,7 @@ func (d *compressor) deflateLazy() {
 		}
 
 		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead, s.estBitsPerByte); ok {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok {
 				s.length = newLength
 				s.offset = newOffset
 			}
diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index fb1701eecc..fd49efd75b 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -52,18 +52,18 @@ var lengthBase = [32]uint8{
 }
 
 // offset code word extra bits.
-var offsetExtraBits = [64]int8{
+var offsetExtraBits = [32]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
 	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
 	/* extended window */
-	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+	14, 14,
 }
 
 var offsetCombined = [32]uint32{}
 
 func init() {
-	var offsetBase = [64]uint32{
+	var offsetBase = [32]uint32{
 		/* normal deflate */
 		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
 		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
@@ -73,9 +73,7 @@ func init() {
 		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
 
 		/* extended window */
-		0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
-		0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
-		0x100000, 0x180000, 0x200000, 0x300000,
+		0x008000, 0x00c000,
 	}
 
 	for i := range offsetCombined[:] {
diff --git a/flate/huffman_code.go b/flate/huffman_code.go
index 67b2b38728..f35e00261d 100644
--- a/flate/huffman_code.go
+++ b/flate/huffman_code.go
@@ -129,9 +129,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
 	var total int
 	for _, f := range b {
-		if f != 0 {
-			total += int(h.codes[f].len)
-		}
+		total += int(h.codes[f].len)
 	}
 	return total
 }
diff --git a/flate/token.go b/flate/token.go
index eb862d7a92..3a9618ee19 100644
--- a/flate/token.go
+++ b/flate/token.go
@@ -129,11 +129,11 @@ var offsetCodes14 = [256]uint32{
 type token uint32
 
 type tokens struct {
-	nLits     int
 	extraHist [32]uint16  // codes 256->maxnumlit
 	offHist   [32]uint16  // offset codes
 	litHist   [256]uint16 // codes 0->255
-	n         uint16      // Must be able to contain maxStoreBlockSize
+	nFilled   int
+	n         uint16 // Must be able to contain maxStoreBlockSize
 	tokens    [maxStoreBlockSize + 1]token
 }
 
@@ -142,7 +142,7 @@ func (t *tokens) Reset() {
 		return
 	}
 	t.n = 0
-	t.nLits = 0
+	t.nFilled = 0
 	for i := range t.litHist[:] {
 		t.litHist[i] = 0
 	}
@@ -161,12 +161,12 @@ func (t *tokens) Fill() {
 	for i, v := range t.litHist[:] {
 		if v == 0 {
 			t.litHist[i] = 1
-			t.nLits++
+			t.nFilled++
 		}
 	}
 	for i, v := range t.extraHist[:literalCount-256] {
 		if v == 0 {
-			t.nLits++
+			t.nFilled++
 			t.extraHist[i] = 1
 		}
 	}
@@ -202,14 +202,12 @@ func emitLiteral(dst *tokens, lit []byte) {
 		dst.litHist[v]++
 	}
 	dst.n += uint16(len(lit))
-	dst.nLits += len(lit)
 }
 
 func (t *tokens) AddLiteral(lit byte) {
 	t.tokens[t.n] = token(lit)
 	t.litHist[lit]++
 	t.n++
-	t.nLits++
 }
 
 // from https://stackoverflow.com/a/28730362
@@ -230,8 +228,9 @@ func (t *tokens) EstimatedBits() int {
 	shannon := float32(0)
 	bits := int(0)
 	nMatches := 0
-	if t.nLits > 0 {
-		invTotal := 1.0 / float32(t.nLits)
+	total := int(t.n) + t.nFilled
+	if total > 0 {
+		invTotal := 1.0 / float32(total)
 		for _, v := range t.litHist[:] {
 			if v > 0 {
 				n := float32(v)
@@ -275,7 +274,6 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 	}
 	oCode := offsetCode(xoffset)
 	xoffset |= oCode << 16
-	t.nLits++
 
 	t.extraHist[lengthCodes1[uint8(xlength)]]++
 	t.offHist[oCode]++
@@ -301,7 +299,6 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 		}
 		xlength -= xl
 		xl -= baseMatchLength
-		t.nLits++
 		t.extraHist[lengthCodes1[uint8(xl)]]++
 		t.offHist[oc]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)