Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

zstd: Faster decoding memcopy in asm #583

Merged
merged 1 commit into from May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
57 changes: 25 additions & 32 deletions zstd/_generate/gen.go
Expand Up @@ -874,13 +874,6 @@ type executeSimple struct {
safeMem bool
}

// copySize reports the width, in bytes, of the register used for fast copies.
//
// See copyMemory()
func (e executeSimple) copySize() int {
	const registerBytes = 16
	return registerBytes
}

func (e executeSimple) generateProcedure(name string) {
Package("github.com/klauspost/compress/zstd")
TEXT(name, 0, "func (ctx *executeAsmContext) bool")
Expand Down Expand Up @@ -1155,42 +1148,44 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle

Comment("Copy non-overlapping match")
{
ADDQ(ml, c.outPosition)
if e.safeMem {
e.copyMemoryPrecise("2", src, c.outBase, ml)
ADDQ(ml, c.outBase)
} else {
e.copyMemory("2", src, c.outBase, ml)
dst := GP64()
MOVQ(c.outBase, dst)
ADDQ(ml, c.outBase)
e.copyMemory("2", src, dst, ml)
}
ADDQ(ml, c.outBase)
ADDQ(ml, c.outPosition)

JMP(LabelRef("handle_loop"))
}

Comment("Copy overlapping match")
Label("copy_overlapping_match")
{
e.copyOverlappedMemory("3", src, c.outBase, ml)
ADDQ(ml, c.outBase)
ADDQ(ml, c.outPosition)
e.copyOverlappedMemory("3", src, c.outBase, ml)
}
}
}

// copyMemory will copy memory in blocks of 16 bytes,
// overwriting up to 15 extra bytes.
// src and dst are updated. length will be zero or less.
func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_" + suffix
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}

XORQ(ofs, ofs)
Label(label)
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(e.copySize()), ofs)
CMPQ(ofs, length)
JB(LabelRef(label))
MOVUPS(Mem{Base: src}, t)
MOVUPS(t, Mem{Base: dst})
ADDQ(U8(16), src)
ADDQ(U8(16), dst)
SUBQ(U8(16), length)
// jump if (CF == 0 and ZF == 0).
JHI(LabelRef(label))
}

// copyMemoryPrecise will copy memory in blocks of 16 bytes,
Expand Down Expand Up @@ -1246,27 +1241,25 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(e.copySize()), ofs)
ADDQ(U8(16), ofs)
Label("copy_" + suffix + "_test")
CMPQ(ofs, length)
JB(LabelRef(label))
}

// copyOverlappedMemory will copy one byte at a time from src to dst.
// src and dst are updated. length will be zero.
func (e executeSimple) copyOverlappedMemory(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_slow_" + suffix
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}
t := GP64()
tmp := GP64()

XORQ(ofs, ofs)
Label(label)
MOVB(s, t.As8())
MOVB(t.As8(), d)
INCQ(ofs)
CMPQ(ofs, length)
JB(LabelRef(label))
MOVB(Mem{Base: src}, tmp.As8())
MOVB(tmp.As8(), Mem{Base: dst})
INCQ(src)
INCQ(dst)
DECQ(length)
JNZ(LabelRef(label))
}

type decodeSync struct {
Expand Down
6 changes: 3 additions & 3 deletions zstd/decoder_test.go
Expand Up @@ -1317,7 +1317,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) {
if err != nil {
b.Error(err)
}
_, err = dec.DecodeAll(encoded, nil)
raw, err := dec.DecodeAll(encoded, nil)
if err != nil {
b.Error(err)
}
Expand All @@ -1326,7 +1326,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
buf := make([]byte, len(raw))
buf := make([]byte, cap(raw))
var err error
for pb.Next() {
buf, err = dec.DecodeAll(encoded, buf[:0])
Expand Down Expand Up @@ -1373,7 +1373,7 @@ func BenchmarkDecoder_DecodeAllParallel(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
got := make([]byte, len(got))
got := make([]byte, cap(got))
for pb.Next() {
_, err = dec.DecodeAll(in, got[:0])
if err != nil {
Expand Down
124 changes: 61 additions & 63 deletions zstd/seqdec_amd64.s
Expand Up @@ -1326,30 +1326,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ R12, R12
ADDQ R13, DI
MOVQ BX, R12
ADDQ R13, BX

copy_2:
MOVUPS (R11)(R12*1), X0
MOVUPS X0, (BX)(R12*1)
MOVUPS (R11), X0
MOVUPS X0, (R12)
ADDQ $0x10, R11
ADDQ $0x10, R12
CMPQ R12, R13
JB copy_2
ADDQ R13, BX
ADDQ R13, DI
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, DI

copy_slow_3:
MOVB (R11)(R12*1), R14
MOVB R14, (BX)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, BX
ADDQ R13, DI
MOVB (R11), R12
MOVB R12, (BX)
INCQ R11
INCQ BX
DECQ R13
JNZ copy_slow_3

handle_loop:
ADDQ $0x18, AX
Expand Down Expand Up @@ -1826,30 +1826,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ CX, CX
ADDQ R13, R12
MOVQ R10, CX
ADDQ R13, R10

copy_2:
MOVUPS (AX)(CX*1), X0
MOVUPS X0, (R10)(CX*1)
MOVUPS (AX), X0
MOVUPS X0, (CX)
ADDQ $0x10, AX
ADDQ $0x10, CX
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
ADDQ R13, R12
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ CX, CX
ADDQ R13, R12

copy_slow_3:
MOVB (AX)(CX*1), R14
MOVB R14, (R10)(CX*1)
INCQ CX
CMPQ CX, R13
JB copy_slow_3
ADDQ R13, R10
ADDQ R13, R12
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -2333,30 +2333,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ R12, R12
ADDQ R13, R11
MOVQ R9, R12
ADDQ R13, R9

copy_2:
MOVUPS (CX)(R12*1), X0
MOVUPS X0, (R9)(R12*1)
MOVUPS (CX), X0
MOVUPS X0, (R12)
ADDQ $0x10, CX
ADDQ $0x10, R12
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
ADDQ R13, R11
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, R11

copy_slow_3:
MOVB (CX)(R12*1), R14
MOVB R14, (R9)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, R9
ADDQ R13, R11
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -2862,6 +2862,7 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
ADDQ R13, R12
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_2_word
Expand Down Expand Up @@ -2900,21 +2901,19 @@ copy_2_test:
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
ADDQ R13, R12
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ CX, CX
ADDQ R13, R12

copy_slow_3:
MOVB (AX)(CX*1), R14
MOVB R14, (R10)(CX*1)
INCQ CX
CMPQ CX, R13
JB copy_slow_3
ADDQ R13, R10
ADDQ R13, R12
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -3398,6 +3397,7 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
ADDQ R13, R11
XORQ R12, R12
TESTQ $0x00000001, R13
JZ copy_2_word
Expand Down Expand Up @@ -3436,21 +3436,19 @@ copy_2_test:
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
ADDQ R13, R11
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, R11

copy_slow_3:
MOVB (CX)(R12*1), R14
MOVB R14, (R9)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, R9
ADDQ R13, R11
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), CX
Expand Down