flate: Improve level 1-3 compression (#678)

Use a 5-byte hash instead of a 4-byte hash. This improves compression in most cases and will also yield faster decompression. Little to no performance impact. Before/after: ``` file out level insize outsize millis mb/s nyc-taxi-data-10M.csv gzkp 1 3325605752 922273214 14065 225.49 nyc-taxi-data-10M.csv gzkp 1 3325605752 846471964 14342 221.12 nyc-taxi-data-10M.csv gzkp 2 3325605752 883782053 15683 202.22 nyc-taxi-data-10M.csv gzkp 2 3325605752 815766227 14865 213.35 nyc-taxi-data-10M.csv gzkp 3 3325605752 878726683 17308 183.24 nyc-taxi-data-10M.csv gzkp 3 3325605752 808448239 16882 187.86 nyc-taxi-data-10M.csv gzkp 4 3325605752 789447233 20651 153.57 nyc-taxi-data-10M.csv gzkp 4 3325605752 789447233 20657 153.53 file out level insize outsize millis mb/s enwik9 gzkp 1 1000000000 382781160 5713 166.90 enwik9 gzkp 1 1000000000 374131553 5826 163.69 enwik9 gzkp 2 1000000000 371351753 6131 155.55 enwik9 gzkp 2 1000000000 361881529 5910 161.36 enwik9 gzkp 3 1000000000 364881746 6891 138.39 enwik9 gzkp 3 1000000000 355065173 6960 137.02 enwik9 gzkp 4 1000000000 342732211 8339 114.36 enwik9 gzkp 4 1000000000 342732211 8252 115.57 file reset out level files insize outsize millis mb/s objectfiles true gzkp 1 708 300491980 56114777 1008 284.27 objectfiles true gzkp 1 708 300491980 55300071 998 286.90 objectfiles true gzkp 2 708 300491980 53946448 1147 249.71 objectfiles true gzkp 2 708 300491980 52750260 1109 258.36 objectfiles true gzkp 3 708 300491980 53110452 1220 234.82 objectfiles true gzkp 3 708 300491980 51947585 1211 236.46 One of the few regressions: file out level insize outsize millis mb/s rawstudio-mint14.tar gzkp 1 8558382592 3960117298 36682 222.50 rawstudio-mint14.tar gzkp 1 8558382592 3985295228 36619 222.88 rawstudio-mint14.tar gzkp 2 8558382592 3899597850 38683 210.99 rawstudio-mint14.tar gzkp 2 8558382592 3921716642 36754 222.06 rawstudio-mint14.tar gzkp 3 8558382592 3848762302 46588 175.19 rawstudio-mint14.tar gzkp 3 8558382592 3846475496 45611 178.94 ```
klauspost · Sep 25, 2022 · b8a3c61 · b8a3c61
1 parent 3822c7c
commit b8a3c61
Show file tree

Hide file tree

Showing 9 changed files with 100 additions and 106 deletions.
diff --git a/flate/deflate.go b/flate/deflate.go
@@ -374,6 +374,12 @@ func hash4(b []byte) uint32 {
 	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
 }
 
+// hash4u returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> (32 - h)
+}
+
 // bulkHash4 will compute hashes using the same
 // algorithm as hash4
 func bulkHash4(b []byte, dst []uint32) {

diff --git a/flate/deflate_test.go b/flate/deflate_test.go
@@ -316,13 +316,13 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str
 	}
 	w.Write(input)
 	w.Close()
-	if limit > 0 && buffer.Len() > limit {
-		t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit)
-		return
-	}
 	if limit > 0 {
 		t.Logf("level: %d - Size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len())
 	}
+	if limit > 0 && buffer.Len() > limit {
+		t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit)
+	}
+
 	r := NewReader(&buffer)
 	out, err := io.ReadAll(r)
 	if err != nil {

diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
@@ -58,17 +58,6 @@ const (
 	prime8bytes = 0xcf1bbcdcb7a56463
 )
 
-func load32(b []byte, i int) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
-}
-
 func load3232(b []byte, i int32) uint32 {
 	return binary.LittleEndian.Uint32(b[i:])
 }
@@ -77,10 +66,6 @@ func load6432(b []byte, i int32) uint64 {
 	return binary.LittleEndian.Uint64(b[i:])
 }
 
-func hash(u uint32) uint32 {
-	return (u * 0x1e35a7bd) >> tableShift
-}
-
 type tableEntry struct {
 	offset int32
 }
@@ -115,39 +100,36 @@ func (e *fastGen) addBlock(src []byte) int32 {
 	return s
 }
 
-// hash4 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4u(u uint32, h uint8) uint32 {
-	return (u * prime4bytes) >> (32 - h)
-}
-
 type tableEntryPrev struct {
 	Cur  tableEntry
 	Prev tableEntry
 }
 
-// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4x64(u uint64, h uint8) uint32 {
-	return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32)
-}
-
 // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash7(u uint64, h uint8) uint32 {
 	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))
 }
 
-// hash8 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash8(u uint64, h uint8) uint32 {
-	return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64))
-}
-
-// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash6(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64))
+// hashLen returns a hash of the lowest mls bytes of u, with length output bits.
+// mls must be >=3 and <=8. Any other value will return the hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be constants for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
+	switch mls {
+	case 3:
+		return (uint32(u<<8) * prime3bytes) >> (32 - length)
+	case 5:
+		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
+	case 6:
+		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
+	case 7:
+		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
+	case 8:
+		return uint32((u * prime8bytes) >> (64 - length))
+	default:
+		return (uint32(u) * prime4bytes) >> (32 - length)
+	}
 }
 
 // matchlen will return the match length between offsets and t in src.

diff --git a/flate/level1.go b/flate/level1.go
@@ -19,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
 	)
 	if debugDeflate && e.cur < 0 {
 		panic(fmt.Sprint("e.cur < 0: ", e.cur))
@@ -68,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	cv := load3232(src, s)
+	cv := load6432(src, s)
 
 	for {
 		const skipLog = 5
@@ -77,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hashLen(cv, tableBits, hashBytes)
 			candidate = e.table[nextHash]
 			nextS = s + doEvery + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -86,28 +87,28 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			now := load6432(src, nextS)
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
-			nextHash = hash(uint32(now))
+			nextHash = hashLen(now, tableBits, hashBytes)
 
 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
 
 			// Do one right away...
-			cv = uint32(now)
+			cv = now
 			s = nextS
 			nextS++
 			candidate = e.table[nextHash]
 			now >>= 8
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
 
 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
-			cv = uint32(now)
+			cv = now
 			s = nextS
 		}
 
@@ -198,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			}
 			if s >= sLimit {
 				// Index first pair after match end.
-				if int(s+l+4) < len(src) {
-					cv := load3232(src, s)
-					e.table[hash(cv)] = tableEntry{offset: s + e.cur}
+				if int(s+l+8) < len(src) {
+					cv := load6432(src, s)
+					e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur}
 				}
 				goto emitRemainder
 			}
@@ -213,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			// three load32 calls.
 			x := load6432(src, s-2)
 			o := e.cur + s - 2
-			prevHash := hash(uint32(x))
+			prevHash := hashLen(x, tableBits, hashBytes)
 			e.table[prevHash] = tableEntry{offset: o}
 			x >>= 16
-			currHash := hash(uint32(x))
+			currHash := hashLen(x, tableBits, hashBytes)
 			candidate = e.table[currHash]
 			e.table[currHash] = tableEntry{offset: o + 2}
 
 			offset := s - (candidate.offset - e.cur)
 			if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) {
-				cv = uint32(x >> 8)
+				cv = x >> 8
 				s++
 				break
 			}

diff --git a/flate/level2.go b/flate/level2.go
@@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	cv := load3232(src, s)
+	cv := load6432(src, s)
 	for {
 		// When should we start skipping if we haven't found matches in a long while.
 		const skipLog = 5
@@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash4u(cv, bTableBits)
+			nextHash := hashLen(cv, bTableBits, hashBytes)
 			s = nextS
 			nextS = s + doEvery + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -84,27 +85,27 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			candidate = e.table[nextHash]
 			now := load6432(src, nextS)
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
-			nextHash = hash4u(uint32(now), bTableBits)
+			nextHash = hashLen(now, bTableBits, hashBytes)
 
 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
 
 			// Do one right away...
-			cv = uint32(now)
+			cv = now
 			s = nextS
 			nextS++
 			candidate = e.table[nextHash]
 			now >>= 8
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
 
 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				break
 			}
-			cv = uint32(now)
+			cv = now
 		}
 
 		// A 4-byte match has been found. We'll later see if more than 4 bytes
@@ -154,25 +155,25 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 
 			if s >= sLimit {
 				// Index first pair after match end.
-				if int(s+l+4) < len(src) {
-					cv := load3232(src, s)
-					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur}
+				if int(s+l+8) < len(src) {
+					cv := load6432(src, s)
+					e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur}
 				}
 				goto emitRemainder
 			}
 
 			// Store every second hash in-between, but offset by 1.
 			for i := s - l + 2; i < s-5; i += 7 {
 				x := load6432(src, i)
-				nextHash := hash4u(uint32(x), bTableBits)
+				nextHash := hashLen(x, bTableBits, hashBytes)
 				e.table[nextHash] = tableEntry{offset: e.cur + i}
 				// Skip one
 				x >>= 16
-				nextHash = hash4u(uint32(x), bTableBits)
+				nextHash = hashLen(x, bTableBits, hashBytes)
 				e.table[nextHash] = tableEntry{offset: e.cur + i + 2}
 				// Skip one
 				x >>= 16
-				nextHash = hash4u(uint32(x), bTableBits)
+				nextHash = hashLen(x, bTableBits, hashBytes)
 				e.table[nextHash] = tableEntry{offset: e.cur + i + 4}
 			}
 
@@ -184,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			// three load32 calls.
 			x := load6432(src, s-2)
 			o := e.cur + s - 2
-			prevHash := hash4u(uint32(x), bTableBits)
-			prevHash2 := hash4u(uint32(x>>8), bTableBits)
+			prevHash := hashLen(x, bTableBits, hashBytes)
+			prevHash2 := hashLen(x>>8, bTableBits, hashBytes)
 			e.table[prevHash] = tableEntry{offset: o}
 			e.table[prevHash2] = tableEntry{offset: o + 1}
-			currHash := hash4u(uint32(x>>16), bTableBits)
+			currHash := hashLen(x>>16, bTableBits, hashBytes)
 			candidate = e.table[currHash]
 			e.table[currHash] = tableEntry{offset: o + 2}
 
 			offset := s - (candidate.offset - e.cur)
 			if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) {
-				cv = uint32(x >> 24)
+				cv = x >> 24
 				s++
 				break
 			}