From 1bac876e33c3d3dc25dd9444e97c71baece4d353 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 7 Jun 2022 16:16:33 +0200 Subject: [PATCH] deflate: Separate code and lengths Experiment by adding https://github.com/golang/go/pull/46007 from @teivah Before/after: ``` file out level insize outsize millis mb/s github-ranks-backup.bin gzkp 1 1862623243 458201422 6979 254.51 github-ranks-backup.bin gzkp 1 1862623243 458201422 7273 244.22 enwik9 gzkp 1 1000000000 382781160 5805 164.26 enwik9 gzkp 1 1000000000 382781160 5976 159.57 github-ranks-backup.bin gzkp -2 1862623243 1298789681 5592 317.65 github-ranks-backup.bin gzkp -2 1862623243 1298789681 5420 327.70 ``` Slower for general compression, but faster for huffman only compression. --- flate/huffman_bit_writer.go | 163 +++++++++++++++++------------------- flate/huffman_code.go | 45 +++++----- 2 files changed, 102 insertions(+), 106 deletions(-) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 25f6d1108f..945ab099bd 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -166,27 +166,27 @@ func (w *huffmanBitWriter) reset(writer io.Writer) { func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { a := t.offHist[:offsetCodeCount] - b := w.offsetEncoding.codes + b := w.offsetEncoding.codes.len b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i] == 0 { return false } } a = t.extraHist[:literalCount-256] - b = w.literalEncoding.codes[256:literalCount] + b = w.literalEncoding.codes.len[256:literalCount] b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i] == 0 { return false } } a = t.litHist[:256] - b = w.literalEncoding.codes[:len(a)] + b = w.literalEncoding.codes.len[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i] == 0 { return false } } @@ -200,7 +200,7 @@ func (w *huffmanBitWriter) flush() { } if w.lastHeader > 0 { // We owe an EOB - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } n := w.nbytes @@ -280,12 +280,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // Copy the concatenated code sizes to codegen. Put a marker at the end. cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes.len[i] } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes.len[i] } codegen[numLiterals+numOffsets] = badCode @@ -426,10 +426,10 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { return 0, false } -func (w *huffmanBitWriter) writeCode(c hcode) { +func (w *huffmanBitWriter) writeCode(c hcode, idx int) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << (w.nbits & 63) - w.nbits += c.len + w.bits |= uint64(c.code[idx]) << (w.nbits & 63) + w.nbits += c.len[idx] if w.nbits >= 48 { w.writeOutBits() } @@ -477,18 +477,18 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numCodegens-4), 4) for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + value := uint(w.codegenEncoding.codes.len[codegenOrder[i]]) w.writeBits(int32(value), 3) } i := 0 for { - var codeWord = uint32(w.codegen[i]) + var codeWord = int(w.codegen[i]) i++ if codeWord == badCode { break } - w.writeCode(w.codegenEncoding.codes[codeWord]) + w.writeCode(w.codegenEncoding.codes, codeWord) switch codeWord { case 16: @@ -513,7 +513,7 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { } if w.lastHeader > 0 { // We owe an EOB - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } @@ -542,7 +542,7 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { } if w.lastHeader > 0 { // We owe an EOB - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } @@ -567,7 +567,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { tokens.AddEOB() if w.lastHeader > 0 { // We owe an EOB - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } numLiterals, numOffsets := w.indexTokens(tokens, false) @@ -638,7 +638,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // We cannot reuse pure huffman table, and must mark as EOF. if (w.lastHuffMan || eof) && w.lastHeader > 0 { // We will not try to reuse. - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 w.lastHuffMan = false } @@ -650,7 +650,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // Check if we can reuse... if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) { - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } @@ -670,7 +670,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // Estimate size for using a new table. // Use the previous header size as the best estimate. newSize := w.lastHeader + tokens.EstimatedBits() - newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty + newSize += int(w.literalEncoding.codes.len[endBlockMarker]) + newSize>>w.logNewTablePenalty // The estimated size is calculated as an optimal table. // We add a penalty to make it more realistic and re-use a bit more. @@ -679,7 +679,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // Check if a new table is better. if newSize < reuseSize { // Write the EOB we owe. - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) size = newSize w.lastHeader = 0 } else { @@ -826,7 +826,7 @@ func (w *huffmanBitWriter) generate() { // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. -func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { +func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes hcode) { if w.err != nil { return } @@ -842,20 +842,23 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } // Create slices up to the next power of two to avoid bounds checks. - lits := leCodes[:256] - offs := oeCodes[:32] - lengths := leCodes[lengthCodesStart:] - lengths = lengths[:32] - + litLens := leCodes.len[:256] + litCodes := leCodes.code[:256] + offLens := oeCodes.len[:32] + offCodes := oeCodes.code[:32] + + llLengths := leCodes.len[lengthCodesStart:] + llLengths = llLengths[:32] + llCodes := leCodes.code[lengthCodesStart:] + llCodes = llCodes[:32] // Go 1.16 LOVES having these on stack. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes for _, t := range tokens { if t < 256 { //w.writeCode(lits[t.literal()]) - c := lits[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= uint64(litCodes[t]) << (nbits & 63) + nbits += litLens[t] if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -877,27 +880,23 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the length length := t.length() lengthCode := lengthCode(length) & 31 - if false { - w.writeCode(lengths[lengthCode]) - } else { - // inlined - c := lengths[lengthCode] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len - if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) - //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits - bits >>= 48 - nbits -= 48 - nbytes += 6 - if nbytes >= bufferFlushSize { - if w.err != nil { - nbytes = 0 - return - } - _, w.err = w.writer.Write(w.bytes[:nbytes]) + + // inlined + bits |= uint64(llCodes[lengthCode]) << (nbits & 63) + nbits += llLengths[lengthCode] + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { nbytes = 0 + return } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } } @@ -926,27 +925,22 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the offset offset := t.offset() offsetCode := (offset >> 16) & 31 - if false { - w.writeCode(offs[offsetCode]) - } else { - // inlined - c := offs[offsetCode] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len - if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) - //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits - bits >>= 48 - nbits -= 48 - nbytes += 6 - if nbytes >= bufferFlushSize { - if w.err != nil { - nbytes = 0 - return - } - _, w.err = w.writer.Write(w.bytes[:nbytes]) + // inlined + bits |= uint64(offCodes[offsetCode]) << (nbits & 63) + nbits += offLens[offsetCode] + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { nbytes = 0 + return } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } } @@ -976,7 +970,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) w.bits, w.nbits, w.nbytes = bits, nbits, nbytes if deferEOB { - w.writeCode(leCodes[endBlockMarker]) + w.writeCode(leCodes, endBlockMarker) } } @@ -1077,7 +1071,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes") } // We owe an EOB - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 } else if debugDeflate { fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8) @@ -1104,7 +1098,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } } - encoding := w.literalEncoding.codes[:256] + encodingLens := w.literalEncoding.codes.len[:256] + encodingCodes := w.literalEncoding.codes.code[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes @@ -1133,13 +1129,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { _, w.err = w.writer.Write(w.bytes[:nbytes]) nbytes = 0 } - a, b := encoding[input[0]], encoding[input[1]] - bits |= uint64(a.code) << (nbits & 63) - bits |= uint64(b.code) << ((nbits + a.len) & 63) - c := encoding[input[2]] - nbits += b.len + a.len - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + c, b, a := input[2], input[1], input[0] + aL, bL := encodingLens[a], encodingLens[b] + bits |= uint64(encodingCodes[a]) << (nbits & 63) + bits |= uint64(encodingCodes[b]) << ((nbits + aL) & 63) + nbits += aL + bL + bits |= uint64(encodingCodes[c]) << (nbits & 63) + nbits += encodingLens[c] input = input[3:] } @@ -1164,11 +1160,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } } // Bitwriting inlined, ~30% speedup - c := encoding[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= uint64(encodingCodes[t]) << (nbits & 63) + nbits += encodingLens[t] if debugDeflate { - count += int(c.len) + count += int(encodingCodes[t]) } } // Restore... @@ -1184,7 +1179,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } if eof || sync { - w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.writeCode(w.literalEncoding.codes, endBlockMarker) w.lastHeader = 0 w.lastHuffMan = false } diff --git a/flate/huffman_code.go b/flate/huffman_code.go index 9ab497c275..2dcd6bd0d5 100644 --- a/flate/huffman_code.go +++ b/flate/huffman_code.go @@ -17,12 +17,12 @@ const ( // hcode is a huffman code with a bit code and bit length. type hcode struct { - code uint16 - len uint8 + code []uint16 + len []uint8 } type huffmanEncoder struct { - codes []hcode + codes hcode bitCount [17]int32 // Allocate a reusable buffer with the longest possible frequency table. @@ -56,12 +56,6 @@ type levelInfo struct { needed int32 } -// set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint8) { - h.len = length - h.code = code -} - func reverseBits(number uint16, bitLength byte) uint16 { return bits.Reverse16(number << ((16 - bitLength) & 15)) } @@ -71,7 +65,10 @@ func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} func newHuffmanEncoder(size int) *huffmanEncoder { // Make capacity to next power of two. c := uint(bits.Len32(uint32(size - 1))) - return &huffmanEncoder{codes: make([]hcode, size, 1<