diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index 25f6d1108f..7906e938dd 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -1009,8 +1009,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		}
 	}
 
-	// Fill is rarely better...
-	const fill = false
 	const numLiterals = endBlockMarker + 1
 	const numOffsets = 1
 
@@ -1019,7 +1017,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Assume header is around 70 bytes:
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
-	histogram(input, w.literalFreq[:numLiterals], fill)
+	histogram(input, w.literalFreq[:numLiterals])
 	ssize, storable := w.storedSize(input)
 	if storable && len(input) > 1024 {
 		// Quick check for incompressible content.
@@ -1045,19 +1043,14 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	}
 	w.literalFreq[endBlockMarker] = 1
 	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
-	if fill {
-		// Clear fill...
-		for i := range w.literalFreq[:numLiterals] {
-			w.literalFreq[i] = 0
-		}
-		histogram(input, w.literalFreq[:numLiterals], false)
-	}
 	estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
-	estBits += w.lastHeader
-	if w.lastHeader == 0 {
-		estBits += guessHeaderSizeBits
+	if estBits < math.MaxInt32 {
+		estBits += w.lastHeader
+		if w.lastHeader == 0 {
+			estBits += guessHeaderSizeBits
+		}
+		estBits += estBits >> w.logNewTablePenalty
 	}
-	estBits += estBits >> w.logNewTablePenalty
 
 	// Store bytes, if we don't get a reasonable improvement.
 	if storable && ssize <= estBits {
diff --git a/flate/huffman_code.go b/flate/huffman_code.go
index 9ab497c275..c0bcdad5be 100644
--- a/flate/huffman_code.go
+++ b/flate/huffman_code.go
@@ -364,21 +364,49 @@ func atLeastOne(v float32) float32 {
 	return v
 }
 
-// Unassigned values are assigned '1' in the histogram.
-func fillHist(b []uint16) {
-	for i, v := range b {
-		if v == 0 {
-			b[i] = 1
+// histogram adds the number of occurrences of each byte value in b
+// to the matching counter in h.
+// Counters are incremented, not reset, and wrap silently on uint16 overflow.
+// h must have capacity for at least 256 entries.
+func histogram(b []byte, h []uint16) {
+	if len(b) >= 8<<10 {
+		// Inputs of 8 KiB or more use the 4-way split loop.
+		histogramSplit(b, h)
+	} else {
+		h = h[:256]
+		for _, t := range b {
+			h[t]++
 		}
 	}
 }
 
-func histogram(b []byte, h []uint16, fill bool) {
+// histogramSplit produces the same counts as the plain loop in histogram,
+// but walks four equal quarters of b in a single loop.
+// h must have capacity for at least 256 entries.
+func histogramSplit(b []byte, h []uint16) {
+	// Tested, and slightly faster than 2-way.
+	// Writing to separate arrays and combining is also slightly slower.
 	h = h[:256]
-	for _, t := range b {
-		h[t]++
+	// Count leading bytes one at a time until len(b) is a multiple of 4.
+	for len(b)&3 != 0 {
+		h[b[0]]++
+		b = b[1:]
 	}
-	if fill {
-		fillHist(h)
+	n := len(b) / 4
+	x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:]
+	// Trim every quarter to len(x); presumably this lets the compiler
+	// drop the bounds checks on y[i], z[i] and w[i] below.
+	y, z, w = y[:len(x)], z[:len(x)], w[:len(x)]
+	for i, t := range x {
+		// Two pointers may alias when quarters hold the same byte value;
+		// each increment is its own read-modify-write, so none is lost.
+		v0 := &h[t]
+		v1 := &h[y[i]]
+		v3 := &h[w[i]]
+		v2 := &h[z[i]]
+		*v0++
+		*v1++
+		*v2++
+		*v3++
 	}
 }