Skip to content

Commit

Permalink
zstd: Faster decoding memcopy in asm (#583)
Browse files Browse the repository at this point in the history
Use a faster method for `copyMemory` and `copyOverlappedMemory`.

```
λ go test -short -bench=seqdec_execute >after.txt&&benchcmp before.txt after.txt
benchmark                                                                                       old ns/op     new ns/op     delta
Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32        129910        130620        +0.55%
Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32       139487        134032        -3.91%
Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32               37155         38636         +3.99%
Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32        16318         15788         -3.25%
Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32        44386         43959         -0.96%
Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32         100065        96156         -3.91%
Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32                  8119          7373          -9.19%
Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32     84669         83034         -1.93%
Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32                   2914          2773          -4.84%
Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32                    4318          3824          -11.44%
Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32                7851          7203          -8.25%
Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32                   15161         14315         -5.58%
Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32              23920         20065         -16.12%
Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32                    53268         52768         -0.94%
```
  • Loading branch information
klauspost committed May 9, 2022
1 parent 2fa917f commit 2f43739
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 98 deletions.
57 changes: 25 additions & 32 deletions zstd/_generate/gen.go
Expand Up @@ -874,13 +874,6 @@ type executeSimple struct {
safeMem bool
}

// copySize returns register size used to fast copy.
//
// See copyMemory()
func (e executeSimple) copySize() int {
return 16
}

func (e executeSimple) generateProcedure(name string) {
Package("github.com/klauspost/compress/zstd")
TEXT(name, 0, "func (ctx *executeAsmContext) bool")
Expand Down Expand Up @@ -1155,42 +1148,44 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle

Comment("Copy non-overlapping match")
{
ADDQ(ml, c.outPosition)
if e.safeMem {
e.copyMemoryPrecise("2", src, c.outBase, ml)
ADDQ(ml, c.outBase)
} else {
e.copyMemory("2", src, c.outBase, ml)
dst := GP64()
MOVQ(c.outBase, dst)
ADDQ(ml, c.outBase)
e.copyMemory("2", src, dst, ml)
}
ADDQ(ml, c.outBase)
ADDQ(ml, c.outPosition)

JMP(LabelRef("handle_loop"))
}

Comment("Copy overlapping match")
Label("copy_overlapping_match")
{
e.copyOverlappedMemory("3", src, c.outBase, ml)
ADDQ(ml, c.outBase)
ADDQ(ml, c.outPosition)
e.copyOverlappedMemory("3", src, c.outBase, ml)
}
}
}

// copyMemory will copy memory in blocks of 16 bytes,
// overwriting up to 15 extra bytes.
// src and dst are updated. length will be zero or less.
func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_" + suffix
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}

XORQ(ofs, ofs)
Label(label)
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(e.copySize()), ofs)
CMPQ(ofs, length)
JB(LabelRef(label))
MOVUPS(Mem{Base: src}, t)
MOVUPS(t, Mem{Base: dst})
ADDQ(U8(16), src)
ADDQ(U8(16), dst)
SUBQ(U8(16), length)
// jump if (CF == 0 and ZF == 0).
JHI(LabelRef(label))
}

// copyMemoryPrecise will copy memory in blocks of 16 bytes,
Expand Down Expand Up @@ -1246,27 +1241,25 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(e.copySize()), ofs)
ADDQ(U8(16), ofs)
Label("copy_" + suffix + "_test")
CMPQ(ofs, length)
JB(LabelRef(label))
}

// copyOverlappedMemory will copy one byte at a time from src to dst.
// src and dst are updated. length will be zero.
func (e executeSimple) copyOverlappedMemory(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_slow_" + suffix
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}
t := GP64()
tmp := GP64()

XORQ(ofs, ofs)
Label(label)
MOVB(s, t.As8())
MOVB(t.As8(), d)
INCQ(ofs)
CMPQ(ofs, length)
JB(LabelRef(label))
MOVB(Mem{Base: src}, tmp.As8())
MOVB(tmp.As8(), Mem{Base: dst})
INCQ(src)
INCQ(dst)
DECQ(length)
JNZ(LabelRef(label))
}

type decodeSync struct {
Expand Down
6 changes: 3 additions & 3 deletions zstd/decoder_test.go
Expand Up @@ -1317,7 +1317,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) {
if err != nil {
b.Error(err)
}
_, err = dec.DecodeAll(encoded, nil)
raw, err := dec.DecodeAll(encoded, nil)
if err != nil {
b.Error(err)
}
Expand All @@ -1326,7 +1326,7 @@ func BenchmarkDecoder_DecodeAllFilesP(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
buf := make([]byte, len(raw))
buf := make([]byte, cap(raw))
var err error
for pb.Next() {
buf, err = dec.DecodeAll(encoded, buf[:0])
Expand Down Expand Up @@ -1373,7 +1373,7 @@ func BenchmarkDecoder_DecodeAllParallel(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
got := make([]byte, len(got))
got := make([]byte, cap(got))
for pb.Next() {
_, err = dec.DecodeAll(in, got[:0])
if err != nil {
Expand Down
124 changes: 61 additions & 63 deletions zstd/seqdec_amd64.s
Expand Up @@ -1326,30 +1326,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ R12, R12
ADDQ R13, DI
MOVQ BX, R12
ADDQ R13, BX

copy_2:
MOVUPS (R11)(R12*1), X0
MOVUPS X0, (BX)(R12*1)
MOVUPS (R11), X0
MOVUPS X0, (R12)
ADDQ $0x10, R11
ADDQ $0x10, R12
CMPQ R12, R13
JB copy_2
ADDQ R13, BX
ADDQ R13, DI
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, DI

copy_slow_3:
MOVB (R11)(R12*1), R14
MOVB R14, (BX)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, BX
ADDQ R13, DI
MOVB (R11), R12
MOVB R12, (BX)
INCQ R11
INCQ BX
DECQ R13
JNZ copy_slow_3

handle_loop:
ADDQ $0x18, AX
Expand Down Expand Up @@ -1826,30 +1826,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ CX, CX
ADDQ R13, R12
MOVQ R10, CX
ADDQ R13, R10

copy_2:
MOVUPS (AX)(CX*1), X0
MOVUPS X0, (R10)(CX*1)
MOVUPS (AX), X0
MOVUPS X0, (CX)
ADDQ $0x10, AX
ADDQ $0x10, CX
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
ADDQ R13, R12
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ CX, CX
ADDQ R13, R12

copy_slow_3:
MOVB (AX)(CX*1), R14
MOVB R14, (R10)(CX*1)
INCQ CX
CMPQ CX, R13
JB copy_slow_3
ADDQ R13, R10
ADDQ R13, R12
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -2333,30 +2333,30 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
XORQ R12, R12
ADDQ R13, R11
MOVQ R9, R12
ADDQ R13, R9

copy_2:
MOVUPS (CX)(R12*1), X0
MOVUPS X0, (R9)(R12*1)
MOVUPS (CX), X0
MOVUPS X0, (R12)
ADDQ $0x10, CX
ADDQ $0x10, R12
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
ADDQ R13, R11
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, R11

copy_slow_3:
MOVB (CX)(R12*1), R14
MOVB R14, (R9)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, R9
ADDQ R13, R11
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -2862,6 +2862,7 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
ADDQ R13, R12
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_2_word
Expand Down Expand Up @@ -2900,21 +2901,19 @@ copy_2_test:
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
ADDQ R13, R12
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ CX, CX
ADDQ R13, R12

copy_slow_3:
MOVB (AX)(CX*1), R14
MOVB R14, (R10)(CX*1)
INCQ CX
CMPQ CX, R13
JB copy_slow_3
ADDQ R13, R10
ADDQ R13, R12
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), AX
Expand Down Expand Up @@ -3398,6 +3397,7 @@ copy_match:
JA copy_overlapping_match

// Copy non-overlapping match
ADDQ R13, R11
XORQ R12, R12
TESTQ $0x00000001, R13
JZ copy_2_word
Expand Down Expand Up @@ -3436,21 +3436,19 @@ copy_2_test:
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
ADDQ R13, R11
JMP handle_loop

// Copy overlapping match
copy_overlapping_match:
XORQ R12, R12
ADDQ R13, R11

copy_slow_3:
MOVB (CX)(R12*1), R14
MOVB R14, (R9)(R12*1)
INCQ R12
CMPQ R12, R13
JB copy_slow_3
ADDQ R13, R9
ADDQ R13, R11
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3

handle_loop:
MOVQ ctx+16(FP), CX
Expand Down

0 comments on commit 2f43739

Please sign in to comment.