Skip to content

Commit

Permalink
zstd: Speed up best encoder
Browse files Browse the repository at this point in the history
name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    14.8MB/s ± 3%  20.7MB/s ± 3%   +39.53%  (p=0.000 n=17+19)
Encoder_EncodeAllSimple4K/best-8  11.8MB/s ± 1%  19.2MB/s ± 6%   +62.17%  (p=0.000 n=20+20)

name                              old alloc/op   new alloc/op   delta
Encoder_EncodeAllSimple/best-8       14.0B ± 0%     10.2B ± 8%   -27.07%  (p=0.000 n=16+19)
Encoder_EncodeAllSimple4K/best-8     1.00B ± 0%     0.00B       -100.00%  (p=0.000 n=20+19)

Also, compressing enwik9 takes 6.375% less wall clock time.

Output from silesia corpus and enwik9 is about .05% bigger, due to the
different order in which comparisons are done:

dickens    3222189    3220994 (× 0.99963)
enwik9   259699309  259846164 (× 1.00057)
mozilla   16912341   16912437 (× 1.00001)
mr         3505553    3502823 (× 0.99922)
nci        2289871    2306320 (× 1.00718)
ooffice    2896410    2896907 (× 1.00017)
osdb       3390871    3390548 (× 0.99990)
reymont    1656006    1657380 (× 1.00083)
samba      4326783    4329898 (× 1.00072)
sao        5416932    5416648 (× 0.99995)
webster    9966351    9972808 (× 1.00065)
xml         538378     542277 (× 1.00724)
x-ray      5733061    5733121 (× 1.00001)
total    319554055  319728325 (× 1.00055)

This is still smaller than before klauspost#705.
  • Loading branch information
greatroar committed Mar 12, 2023
1 parent c73f008 commit 503fe67
Showing 1 changed file with 27 additions and 38 deletions.
65 changes: 27 additions & 38 deletions zstd/enc_best.go
Expand Up @@ -32,7 +32,6 @@ type match struct {
length int32
rep int32
est int32
_ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
}

const highScore = 25000
Expand Down Expand Up @@ -189,53 +188,48 @@ encodeLoop:
panic("offset0 was 0")
}

bestOf := func(a, b *match) *match {
if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
return a
}
return b
}
const goodEnough = 100

nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]

matchAt := func(offset int32, s int32, first uint32, rep int32) match {
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
return
}
if debugAsserts {
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
}

m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
best := match{s: s, est: highScore}
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)

if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
m1 := matchAt(spp-offset1, spp, cv32, 1)
m2 := matchAt(spp-offset2, spp, cv32, 2)
m3 := matchAt(spp-offset3, spp, cv32, 3)
best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
m1 := matchAt(spp-offset1, spp, cv32, 1)
m2 := matchAt(spp-offset2, spp, cv32, 2)
m3 := matchAt(spp-offset3, spp, cv32, 3)
best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
// Load next and check...
Expand All @@ -262,18 +256,16 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]

// Short at s+1
m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2
m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
if false {
// Short at s+3.
// Too often worse...
m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
best = bestOf(best, &m)
improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
Expand All @@ -284,13 +276,10 @@ encodeLoop:
// For this compression level 2 yields the best results.
const skipBeginning = 2
if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
bestEnd := bestOf(best, &m)
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
bestEnd = bestOf(bestEnd, &m)
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
}
best = bestEnd
}
}
}
Expand Down

0 comments on commit 503fe67

Please sign in to comment.