Skip to content

Commit

Permalink
s2: Improve "better" compression. (#635)
Browse files Browse the repository at this point in the history
Extend long hash table to 17 bits and do intermediate hashing on matches.

Improves compression in "better" mode, with a minor speed drop.
  • Loading branch information
klauspost committed Sep 8, 2022
1 parent 463e7df commit 49d8cf2
Show file tree
Hide file tree
Showing 4 changed files with 661 additions and 484 deletions.
126 changes: 94 additions & 32 deletions s2/_generate/gen.go
Expand Up @@ -59,12 +59,12 @@ func main() {

o.outputMargin = 6
o.maxSkip = 100 // Blocks can be long, limit max skipping.
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 17, 14, 7, 7, 4<<20)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

// Snappy compatible
o.snappy = true
Expand All @@ -76,12 +76,12 @@ func main() {
o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)

o.maxSkip = 100
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 14, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

o.snappy = false
o.outputMargin = 0
Expand Down Expand Up @@ -785,7 +785,7 @@ func maxLitOverheadFor(n int) int {
return 5
}

func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) {
func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, skipLog, lHashBytes, maxLen int) {
TEXT(name, 0, "func(dst, src []byte) int")
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
fmt.Sprintf("Maximum input %d bytes.", maxLen),
Expand All @@ -797,7 +797,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
var literalMaxOverhead = maxLitOverheadFor(maxLen)

var sTableBits = lTableBits - 2
const sHashBytes = 4
o.maxLen = maxLen

Expand Down Expand Up @@ -998,10 +997,34 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(s, sTab.Idx(hash1, 4))
}

longVal := GP64()
shortVal := GP64()
MOVQ(Mem{Base: src, Index: candidate, Scale: 1}, longVal)
MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal)

// If we have at least 8 bytes match, choose that first.
CMPQ(longVal, cv.As64())
JEQ(LabelRef("candidate_match_" + name))

CMPQ(shortVal, cv.As64())
JNE(LabelRef("no_short_found_" + name))
MOVL(candidateS.As32(), candidate.As32())
JMP(LabelRef("candidate_match_" + name))

Label("no_short_found_" + name)
MOVL(longVal.As32(), longVal.As32())

// En/disable repeat matching.
// Too small improvement
if false {
{
CMPL(repeatL, U8(0))
JEQ(LabelRef("no_repeat_found_" + name))
}
// Check repeat at offset checkRep
const checkRep = 1
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
{
// rep = s - repeat
rep := GP32()
Expand All @@ -1010,10 +1033,13 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash

// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
left, right := GP64(), GP64()
MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
MOVQ(Mem{Base: src, Index: rep, Disp: 0, Scale: 1}, right.As64())
MOVQ(cv, left)
SHRQ(U8(checkRep*8), left)
CMPL(left.As32(), right.As32())
tmp := GP64()
MOVQ(U64(repeatMask), tmp)
ANDQ(tmp, left)
ANDQ(tmp, right)
CMPQ(left.As64(), right.As64())
// BAIL, no repeat.
JNE(LabelRef("no_repeat_found_" + name))
}
Expand Down Expand Up @@ -1057,7 +1083,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
// Extend forward
{
// s += 4 + checkRep
ADDL(U8(4+checkRep), s)
ADDL(U8(wantRepeatBytes+checkRep), s)

if true {
// candidate := s - repeat + 4 + checkRep
Expand Down Expand Up @@ -1097,18 +1123,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
offsetVal := GP32()
MOVL(repeatL, offsetVal)

if !o.snappy {
// if nextEmit == 0 {do copy instead...}
TESTL(nextEmit, nextEmit)
JZ(LabelRef("repeat_as_copy_" + name))

// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

// Emit as copy instead...
Label("repeat_as_copy_" + name)
}
o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

Label("repeat_end_emit_" + name)
// Store new dst and nextEmit
Expand Down Expand Up @@ -1145,11 +1161,11 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
JG(ok)
})

CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
CMPL(longVal.As32(), cv.As32())
JEQ(LabelRef("candidate_match_" + name))

//if uint32(cv) == load32(src, candidateS)
CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32())
CMPL(shortVal.As32(), cv.As32())
JEQ(LabelRef("candidateS_match_" + name))

// No match found, next loop
Expand Down Expand Up @@ -1338,11 +1354,57 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
}
Label("match_nolit_dst_ok_" + name)
// cv must be set to value at base+1 before arriving here
if true {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

index0, index1 := GP64(), GP64()
// index0 := base + 1
LEAQ(Mem{Base: base, Disp: 1}, index0)
// index1 := s - 2
LEAQ(Mem{Base: s, Disp: -2}, index1)
hash0l, hash0s, hash1l, hash1s := GP64(), GP64(), GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 1}, hash0s)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 1}, hash1s)

lHasher.hash(hash0l)
sHasher.hash(hash0s)
lHasher.hash(hash1l)
sHasher.hash(hash1s)

plusone0, plusone1 := GP64(), GP64()
LEAQ(Mem{Base: index0, Disp: 1}, plusone0)
LEAQ(Mem{Base: index1, Disp: 1}, plusone1)
MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))
MOVL(plusone0.As32(), sTab.Idx(hash0s, 4))
MOVL(plusone1.As32(), sTab.Idx(hash1s, 4))

ADDQ(U8(1), index0)
SUBQ(U8(1), index1)

Label("index_loop_" + name)
CMPQ(index0, index1)
JAE(LabelRef("search_loop_" + name))
hash0l, hash1l = GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)

lHasher.hash(hash0l)
lHasher.hash(hash1l)

MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))

ADDQ(U8(2), index0)
SUBQ(U8(2), index1)
JMP(LabelRef("index_loop_" + name))
} else {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

// Index base+1 long, base+2 short...
cv := GP64()
INCL(base)
Expand Down Expand Up @@ -1412,8 +1474,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(sm2, lTab.Idx(hash0, 4))
MOVL(sm1, sTab.Idx(hash1, 4))
MOVL(sm1, lTab.Idx(hash3, 4))
JMP(LabelRef("search_loop_" + name))
}
JMP(LabelRef("search_loop_" + name))

Label("emit_remainder_" + name)
// Bail if we exceed the maximum size.
Expand Down
99 changes: 72 additions & 27 deletions s2/encode_better.go
Expand Up @@ -57,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 16
lTableBits = 17
maxLTableSize = 1 << lTableBits

// Short hash matches.
Expand Down Expand Up @@ -98,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)

valLong := load64(src, candidateL)
valShort := load64(src, candidateS)

// If long matches at least 8 bytes, use that.
if cv == valLong {
break
}
if cv == valShort {
candidateL = candidateS
break
}

// Check repeat at offset checkRep.
const checkRep = 1
if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
// Minimum length of a repeat. Tested with various values.
// While 4-5 offers improvements in some, 6 reduces
// regressions significantly.
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
Expand All @@ -110,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
d += emitLiteral(dst[d:], src[nextEmit:base])

// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
candidate := s - repeat + wantRepeatBytes + checkRep
s += wantRepeatBytes + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
Expand All @@ -128,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
s += 8
candidate += 8
}
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], repeat, s-base)
}
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// Index in-between
index0 := base + 1
index1 := s - 2

cv = load64(src, s)
for index0 < index1 {
cv0 := load64(src, index0)
cv1 := load64(src, index1)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 2
index1 -= 2
}

cv = load64(src, s)
continue
}

if uint32(cv) == load32(src, candidateL) {
// Long likely matches 7, so take that.
if uint32(cv) == uint32(valLong) {
break
}

// Check our short candidate
if uint32(cv) == load32(src, candidateS) {
if uint32(cv) == uint32(valShort) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
Expand Down Expand Up @@ -228,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
Expand Down Expand Up @@ -404,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
Expand Down

0 comments on commit 49d8cf2

Please sign in to comment.