Skip to content

Commit

Permalink
zstd: asm decode - use SSE instructions
Browse files Browse the repository at this point in the history
Benchmarks from an Ice Lake machine (columns: benchmark name, before, after, relative change; lower is better):

Benchmark_seqdec_decodeNoBMI/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-16        128501        125128        -2.62%
Benchmark_seqdec_decodeNoBMI/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-16       134702        130647        -3.01%
Benchmark_seqdec_decodeNoBMI/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-16               120971        119333        -1.35%
Benchmark_seqdec_decodeNoBMI/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-16        12804         12794         -0.08%
Benchmark_seqdec_decodeNoBMI/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-16        29078         28750         -1.13%
Benchmark_seqdec_decodeNoBMI/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-16         83494         83368         -0.15%
Benchmark_seqdec_decodeNoBMI/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-16                  8079          7952          -1.57%
Benchmark_seqdec_decodeNoBMI/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-16     164592        162022        -1.56%
Benchmark_seqdec_decodeNoBMI/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-16                   52.4          52.7          +0.63%
Benchmark_seqdec_decodeNoBMI/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-16                    791           789           -0.21%
Benchmark_seqdec_decodeNoBMI/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-16                7793          7782          -0.14%
Benchmark_seqdec_decodeNoBMI/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-16                   23069         22699         -1.60%
Benchmark_seqdec_decodeNoBMI/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-16              32626         32118         -1.56%
Benchmark_seqdec_decodeNoBMI/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-16                    75567         75241         -0.43%
Benchmark_seqdec_decode/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-16             111331        108619        -2.44%
Benchmark_seqdec_decode/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-16            114481        111280        -2.80%
Benchmark_seqdec_decode/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-16                    108650        105387        -3.00%
Benchmark_seqdec_decode/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-16             11454         11091         -3.17%
Benchmark_seqdec_decode/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-16             26245         25501         -2.83%
Benchmark_seqdec_decode/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-16              71110         69874         -1.74%
Benchmark_seqdec_decode/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-16                       7320          7221          -1.35%
Benchmark_seqdec_decode/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-16          131987        130904        -0.82%
Benchmark_seqdec_decode/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-16                        49.5          49.8          +0.63%
Benchmark_seqdec_decode/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-16                         701           696           -0.74%
Benchmark_seqdec_decode/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-16                     6886          6828          -0.84%
Benchmark_seqdec_decode/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-16                        21162         20897         -1.25%
Benchmark_seqdec_decode/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-16                   29856         29612         -0.82%
Benchmark_seqdec_decode/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-16                         64311         63216         -1.70%
  • Loading branch information
WojciechMula committed Mar 24, 2022
1 parent 80e513b commit f4b0ef0
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 28 deletions.
9 changes: 3 additions & 6 deletions zstd/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,13 +467,10 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual)
CMPQ(offsetB, U8(1))
JBE(LabelRef(name + "_offsetB_1_or_0"))

- // TODO: Test if 1 SSE2 move + write is faster...
- tmp, tmp2 := GP64(), GP64()
- MOVQ(po0.Addr, tmp) // tmp = s.prevOffset[0]
- MOVQ(po1.Addr, tmp2) // tmp2 = s.prevOffset[1]
+ tmp := XMM()
+ MOVUPS(po0.Addr, tmp) // tmp = (s.prevOffset[0], s.prevOffset[1])
  MOVQ(offset, po0.Addr) // s.prevOffset[0] = offset
- MOVQ(tmp, po1.Addr) // s.prevOffset[1] = s.prevOffset[0]
- MOVQ(tmp2, po2.Addr) // s.prevOffset[2] = s.prevOffset[1]
+ MOVUPS(tmp, po1.Addr) // s.prevOffset[1], s.prevOffset[2] = s.prevOffset[0], s.prevOffset[1]
JMP(LabelRef(name + "_end"))
}

Expand Down
40 changes: 18 additions & 22 deletions zstd/seqdec_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// +build !appengine,!noasm,gc,!noasm

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
- // Requires: CMOV
+ // Requires: CMOV, SSE
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
Expand Down Expand Up @@ -215,16 +215,14 @@ sequenceDecs_decode_amd64_ofState_updateState_skip_zero:

sequenceDecs_decode_amd64_skip_update:
// Adjust offset
- MOVQ s+0(FP), CX
- MOVQ 16(R10), R11
- CMPQ AX, $0x01
- JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
- MOVQ 144(CX), AX
- MOVQ 152(CX), R12
- MOVQ R11, 144(CX)
- MOVQ AX, 152(CX)
- MOVQ R12, 160(CX)
- JMP sequenceDecs_decode_amd64_adjust_end
+ MOVQ s+0(FP), CX
+ MOVQ 16(R10), R11
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R11, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decode_amd64_adjust_end

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
Expand Down Expand Up @@ -305,7 +303,7 @@ sequenceDecs_decode_amd64_error_match_len_too_big:
RET

// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
- // Requires: BMI, BMI2, CMOV
+ // Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
Expand Down Expand Up @@ -494,16 +492,14 @@ sequenceDecs_decode_bmi2_fill_3_end:

sequenceDecs_decode_bmi2_skip_update:
// Adjust offset
- MOVQ s+0(FP), CX
- MOVQ 16(R9), R11
- CMPQ R10, $0x01
- JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
- MOVQ 144(CX), R10
- MOVQ 152(CX), R12
- MOVQ R11, 144(CX)
- MOVQ R10, 152(CX)
- MOVQ R12, 160(CX)
- JMP sequenceDecs_decode_bmi2_adjust_end
+ MOVQ s+0(FP), CX
+ MOVQ 16(R9), R11
+ CMPQ R10, $0x01
+ JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R11, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decode_bmi2_adjust_end

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
Expand Down

0 comments on commit f4b0ef0

Please sign in to comment.