From 2878205bda99754286840d77f1d27331374f5e3a Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sat, 26 Nov 2022 12:18:36 +0100
Subject: [PATCH] zstd: Improve throughput of SpeedBestCompression encoder (#699)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lifted ofCode and mlCode computations out of match.estBits, so that
method will be inlined into its only caller.

Also some changes to eliminate a branch: the last if block becomes two
CMOVs/CSELs on amd64 and arm64.

```
name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    11.1MB/s ± 1%  16.9MB/s ± 1%  +52.23%  (p=0.000 n=10+10)
Encoder_EncodeAllSimple4K/best-8  8.41MB/s ± 1%  10.95MB/s ± 0%  +30.20%  (p=0.000 n=10+10)

name                              old alloc/op   new alloc/op   delta
Encoder_EncodeAllSimple/best-8    20.0B ± 0%     18.0B ± 0%     -10.00%  (p=0.002 n=8+10)
Encoder_EncodeAllSimple4K/best-8  2.00B ± 0%     2.00B ± 0%     ~        (all equal)
```
---
 zstd/enc_best.go | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index b1c3db8f7b..ab0d1b7cd9 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -38,27 +38,32 @@ type match struct {
 const highScore = 25000
 
 // estBits will estimate output bits from predefined tables.
-func (m *match) estBits(bitsPerByte int32) {
-	mlc := mlCode(uint32(m.length - zstdMinMatch))
-	var ofc uint8
-	if m.rep < 0 {
-		ofc = ofCode(uint32(m.s-m.offset) + 3)
-	} else {
-		ofc = ofCode(uint32(m.rep))
-	}
+func (m *match) estBits(bitsPerByte int32, ofc, mlc uint8) {
 	// Cost, excluding
 	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
 
 	// Add cost of match encoding...
-	m.est = int32(ofTT.outBits + mlTT.outBits)
-	m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
+	est := int32(ofTT.outBits + mlTT.outBits)
+	est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
 	// Subtract savings compared to literal encoding...
-	m.est -= (m.length * bitsPerByte) >> 10
-	if m.est > 0 {
+	length := m.length
+	est -= (length * bitsPerByte) >> 10
+	if est > 0 {
 		// Unlikely gain..
-		m.length = 0
-		m.est = highScore
+		est, length = highScore, 0
+	}
+	m.est, m.length = est, length
+}
+
+// ofCode returns the offset code for m: ofCode of the match distance plus 3
+// when m.rep is negative, otherwise ofCode of m.rep itself.
+func (m *match) ofCode() uint8 {
+	offset := uint32(m.rep)
+	// Test the signed field: offset is unsigned, so "offset < 0" is never true.
+	if m.rep < 0 {
+		offset = uint32(m.s-m.offset) + 3
 	}
+	return ofCode(offset)
 }
 
 // bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
@@ -216,7 +221,7 @@ encodeLoop:
 				}
 			}
 			m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
-			m.estBits(bitsPerByte)
+			m.estBits(bitsPerByte, m.ofCode(), mlCode(uint32(m.length-zstdMinMatch)))
 			return m
 		}
 