From 9a048c1564430066f8c75fc147488742ff337879 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Tue, 12 Jul 2022 16:53:00 +0200 Subject: [PATCH] zstd: Branchless getBits for amd64 w/o BMI2 (#640) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This produces the same number of instructions while requiring less code in the generator. Benchmarks on the Intel Core i7-3770K show a tiny speedup: ``` name old speed new speed delta Decoder_DecoderSmall/kppkn.gtb.zst-8 430MB/s ± 1% 437MB/s ± 1% +1.60% (p=0.000 n=10+9) Decoder_DecoderSmall/geo.protodata.zst-8 1.11GB/s ± 1% 1.13GB/s ± 0% +1.37% (p=0.000 n=9+9) Decoder_DecoderSmall/plrabn12.txt.zst-8 334MB/s ± 1% 339MB/s ± 1% +1.41% (p=0.000 n=9+10) Decoder_DecoderSmall/lcet10.txt.zst-8 392MB/s ± 2% 404MB/s ± 1% +3.05% (p=0.000 n=10+10) Decoder_DecoderSmall/asyoulik.txt.zst-8 355MB/s ± 2% 357MB/s ± 1% ~ (p=0.315 n=10+9) Decoder_DecoderSmall/alice29.txt.zst-8 344MB/s ± 1% 350MB/s ± 1% +1.69% (p=0.000 n=10+10) Decoder_DecoderSmall/html_x_4.zst-8 2.34GB/s ± 1% 2.37GB/s ± 1% +1.10% (p=0.000 n=10+10) Decoder_DecoderSmall/paper-100k.pdf.zst-8 3.75GB/s ± 0% 3.76GB/s ± 1% ~ (p=0.182 n=9+10) Decoder_DecoderSmall/fireworks.jpeg.zst-8 8.59GB/s ± 1% 8.58GB/s ± 1% ~ (p=0.842 n=10+9) Decoder_DecoderSmall/urls.10K.zst-8 561MB/s ± 1% 556MB/s ± 1% -0.82% (p=0.019 n=10+10) Decoder_DecoderSmall/html.zst-8 900MB/s ± 1% 913MB/s ± 1% +1.42% (p=0.000 n=10+9) Decoder_DecoderSmall/comp-data.bin.zst-8 399MB/s ± 1% 395MB/s ± 1% -0.99% (p=0.000 n=10+10) Decoder_DecodeAll/kppkn.gtb.zst-8 518MB/s ± 0% 526MB/s ± 0% +1.52% (p=0.000 n=10+9) Decoder_DecodeAll/geo.protodata.zst-8 1.28GB/s ± 0% 1.27GB/s ± 2% ~ (p=0.739 n=10+10) Decoder_DecodeAll/plrabn12.txt.zst-8 427MB/s ± 1% 433MB/s ± 1% +1.24% (p=0.000 n=10+10) Decoder_DecodeAll/lcet10.txt.zst-8 480MB/s ± 1% 490MB/s ± 1% +2.06% (p=0.000 n=10+10) Decoder_DecodeAll/asyoulik.txt.zst-8 435MB/s ± 0% 447MB/s ± 0% +2.70% (p=0.000 n=7+9) 
Decoder_DecodeAll/alice29.txt.zst-8 422MB/s ± 0% 438MB/s ± 1% +3.96% (p=0.000 n=8+9) Decoder_DecodeAll/html_x_4.zst-8 1.60GB/s ± 0% 1.61GB/s ± 0% +0.99% (p=0.000 n=9+10) Decoder_DecodeAll/paper-100k.pdf.zst-8 4.55GB/s ± 1% 4.44GB/s ± 1% -2.42% (p=0.000 n=10+10) Decoder_DecodeAll/fireworks.jpeg.zst-8 9.52GB/s ± 1% 9.47GB/s ± 2% ~ (p=0.143 n=10+10) Decoder_DecodeAll/urls.10K.zst-8 678MB/s ± 1% 684MB/s ± 0% +0.83% (p=0.000 n=10+10) Decoder_DecodeAll/html.zst-8 1.05GB/s ± 0% 1.07GB/s ± 1% +2.11% (p=0.000 n=10+10) Decoder_DecodeAll/comp-data.bin.zst-8 397MB/s ± 1% 391MB/s ± 1% -1.37% (p=0.000 n=10+10) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8 437MB/s ± 0% 436MB/s ± 1% -0.21% (p=0.025 n=9+9) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8 448MB/s ± 0% 451MB/s ± 0% +0.70% (p=0.000 n=9+9) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8 478MB/s ± 0% 475MB/s ± 0% -0.53% (p=0.000 n=10+10) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8 461MB/s ± 0% 470MB/s ± 0% +2.07% (p=0.000 n=8+9) Decoder_DecodeAllFiles/e.txt/fastest-8 9.62GB/s ± 3% 9.62GB/s ± 2% ~ (p=1.000 n=10+10) Decoder_DecodeAllFiles/e.txt/default-8 391MB/s ± 0% 406MB/s ± 0% +3.81% (p=0.000 n=10+8) Decoder_DecodeAllFiles/e.txt/better-8 438MB/s ± 0% 448MB/s ± 0% +2.39% (p=0.000 n=8+10) Decoder_DecodeAllFiles/e.txt/best-8 500MB/s ± 0% 500MB/s ± 0% ~ (p=0.119 n=9+9) Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8 1.07GB/s ± 1% 1.04GB/s ± 1% -2.61% (p=0.000 n=10+10) Decoder_DecodeAllFiles/fse-artifact3.bin/default-8 1.21GB/s ± 1% 1.19GB/s ± 1% -1.33% (p=0.000 n=10+10) Decoder_DecodeAllFiles/fse-artifact3.bin/better-8 994MB/s ± 0% 990MB/s ± 0% -0.42% (p=0.002 n=10+9) Decoder_DecodeAllFiles/fse-artifact3.bin/best-8 389MB/s ± 0% 381MB/s ± 0% -2.00% (p=0.000 n=8+10) Decoder_DecodeAllFiles/gettysburg.txt/fastest-8 274MB/s ± 1% 274MB/s ± 1% ~ (p=1.000 n=10+10) Decoder_DecodeAllFiles/gettysburg.txt/default-8 224MB/s ± 1% 223MB/s ± 1% -0.64% (p=0.015 n=10+10) 
Decoder_DecodeAllFiles/gettysburg.txt/better-8 228MB/s ± 1% 227MB/s ± 1% -0.40% (p=0.041 n=10+10) Decoder_DecodeAllFiles/gettysburg.txt/best-8 225MB/s ± 1% 223MB/s ± 0% -0.52% (p=0.008 n=10+6) Decoder_DecodeAllFiles/html.txt/fastest-8 599MB/s ± 1% 614MB/s ± 1% +2.41% (p=0.000 n=10+10) Decoder_DecodeAllFiles/html.txt/default-8 601MB/s ± 0% 613MB/s ± 0% +2.01% (p=0.000 n=8+9) Decoder_DecodeAllFiles/html.txt/better-8 626MB/s ± 1% 638MB/s ± 0% +1.99% (p=0.000 n=10+10) Decoder_DecodeAllFiles/html.txt/best-8 601MB/s ± 0% 612MB/s ± 0% +1.87% (p=0.000 n=10+10) Decoder_DecodeAllFiles/pi.txt/fastest-8 9.64GB/s ± 2% 9.66GB/s ± 1% ~ (p=0.529 n=10+10) Decoder_DecodeAllFiles/pi.txt/default-8 390MB/s ± 0% 403MB/s ± 0% +3.48% (p=0.000 n=10+10) Decoder_DecodeAllFiles/pi.txt/better-8 439MB/s ± 0% 451MB/s ± 0% +2.65% (p=0.000 n=10+10) Decoder_DecodeAllFiles/pi.txt/best-8 500MB/s ± 0% 499MB/s ± 0% -0.27% (p=0.009 n=7+10) Decoder_DecodeAllFiles/pngdata.bin/fastest-8 1.70GB/s ± 1% 1.69GB/s ± 1% -0.63% (p=0.013 n=10+9) Decoder_DecodeAllFiles/pngdata.bin/default-8 1.52GB/s ± 1% 1.51GB/s ± 0% -0.75% (p=0.000 n=10+9) Decoder_DecodeAllFiles/pngdata.bin/better-8 1.92GB/s ± 0% 1.90GB/s ± 0% -1.02% (p=0.000 n=10+10) Decoder_DecodeAllFiles/pngdata.bin/best-8 1.47GB/s ± 0% 1.46GB/s ± 0% -0.88% (p=0.000 n=10+9) Decoder_DecodeAllFiles/sharnd.out/fastest-8 9.60GB/s ± 1% 9.67GB/s ± 1% +0.67% (p=0.029 n=10+10) Decoder_DecodeAllFiles/sharnd.out/default-8 9.65GB/s ± 2% 9.71GB/s ± 1% ~ (p=0.353 n=10+10) Decoder_DecodeAllFiles/sharnd.out/better-8 9.67GB/s ± 1% 9.66GB/s ± 0% ~ (p=0.549 n=10+9) Decoder_DecodeAllFiles/sharnd.out/best-8 9.70GB/s ± 1% 9.61GB/s ± 0% -0.91% (p=0.010 n=10+9) [Geo mean] 935MB/s 940MB/s +0.57% ``` --- zstd/_generate/gen.go | 59 ++++-------- zstd/seqdec_amd64.s | 204 ++++++++++++++++++++---------------------- 2 files changed, 111 insertions(+), 152 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 70e1b3a1d4..fda6a55ebd 100644 --- a/zstd/_generate/gen.go 
+++ b/zstd/_generate/gen.go @@ -298,7 +298,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute MOVBQZX(total.As8(), total) // total = llState.As8() + mlState.As8() + ofState.As8() // Read `total` bits - bits := o.getBitsValue(name+"_getBits", total, brValue, brBitsRead) + bits := o.getBits(total, brValue, brBitsRead) // Update states Comment("Update Offset State") @@ -632,14 +632,13 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu } { - lowBits := o.getBits(name+"_getBits", AX, brValue, brBitsRead, LabelRef(name+"_skip_zero")) + lowBits := o.getBits(AX, brValue, brBitsRead) // Check if below tablelog assert(func(ok LabelRef) { CMPQ(lowBits, U32(512)) JB(ok) }) ADDQ(lowBits, DX) - Label(name + "_skip_zero") } // Load table pointer @@ -695,53 +694,25 @@ func (o options) nextState(name string, state, lowBits reg.GPVirtual, table stri } // getBits will return nbits bits from brValue. -// If nbits == 0 it *may* jump to jmpZero, otherwise 0 is returned. -func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual, jmpZero LabelRef) reg.GPVirtual { +func (o options) getBits(nBits, brValue, brBitsRead reg.GPVirtual) reg.GPVirtual { BX := GP64() CX := reg.CL - if o.bmi2 { - LEAQ(Mem{Base: brBitsRead, Index: nBits, Scale: 1}, CX.As64()) - MOVQ(brValue, BX) - MOVQ(CX.As64(), brBitsRead) - ROLQ(CX, BX) - BZHIQ(nBits, BX, BX) - } else { - CMPQ(nBits, U8(0)) - JZ(jmpZero) - MOVQ(brBitsRead, CX.As64()) - ADDQ(nBits, brBitsRead) - MOVQ(brValue, BX) - SHLQ(CX, BX) - MOVQ(nBits, CX.As64()) - NEGQ(CX.As64()) - SHRQ(CX, BX) - } - return BX -} -// getBits will return nbits bits from brValue. -// If nbits == 0 then 0 is returned. 
-func (o options) getBitsValue(name string, nBits, brValue, brBitsRead reg.GPVirtual) reg.GPVirtual { - BX := GP64() - CX := reg.CL + LEAQ(Mem{Base: brBitsRead, Index: nBits, Scale: 1}, CX.As64()) + MOVQ(brValue, BX) + MOVQ(CX.As64(), brBitsRead) + ROLQ(CX, BX) + + // BX &= (1<