Skip to content

Commit

Permalink
zstd: Branchless getBits for amd64 w/o BMI2 (#640)
Browse files Browse the repository at this point in the history
This produces the same number of instructions while requiring less
code in the generator. Benchmarks on the Intel Core i7-3770K show a
tiny speedup:

```
name                                                        old speed      new speed      delta
Decoder_DecoderSmall/kppkn.gtb.zst-8                         430MB/s ± 1%   437MB/s ± 1%  +1.60%  (p=0.000 n=10+9)
Decoder_DecoderSmall/geo.protodata.zst-8                    1.11GB/s ± 1%  1.13GB/s ± 0%  +1.37%  (p=0.000 n=9+9)
Decoder_DecoderSmall/plrabn12.txt.zst-8                      334MB/s ± 1%   339MB/s ± 1%  +1.41%  (p=0.000 n=9+10)
Decoder_DecoderSmall/lcet10.txt.zst-8                        392MB/s ± 2%   404MB/s ± 1%  +3.05%  (p=0.000 n=10+10)
Decoder_DecoderSmall/asyoulik.txt.zst-8                      355MB/s ± 2%   357MB/s ± 1%    ~     (p=0.315 n=10+9)
Decoder_DecoderSmall/alice29.txt.zst-8                       344MB/s ± 1%   350MB/s ± 1%  +1.69%  (p=0.000 n=10+10)
Decoder_DecoderSmall/html_x_4.zst-8                         2.34GB/s ± 1%  2.37GB/s ± 1%  +1.10%  (p=0.000 n=10+10)
Decoder_DecoderSmall/paper-100k.pdf.zst-8                   3.75GB/s ± 0%  3.76GB/s ± 1%    ~     (p=0.182 n=9+10)
Decoder_DecoderSmall/fireworks.jpeg.zst-8                   8.59GB/s ± 1%  8.58GB/s ± 1%    ~     (p=0.842 n=10+9)
Decoder_DecoderSmall/urls.10K.zst-8                          561MB/s ± 1%   556MB/s ± 1%  -0.82%  (p=0.019 n=10+10)
Decoder_DecoderSmall/html.zst-8                              900MB/s ± 1%   913MB/s ± 1%  +1.42%  (p=0.000 n=10+9)
Decoder_DecoderSmall/comp-data.bin.zst-8                     399MB/s ± 1%   395MB/s ± 1%  -0.99%  (p=0.000 n=10+10)
Decoder_DecodeAll/kppkn.gtb.zst-8                            518MB/s ± 0%   526MB/s ± 0%  +1.52%  (p=0.000 n=10+9)
Decoder_DecodeAll/geo.protodata.zst-8                       1.28GB/s ± 0%  1.27GB/s ± 2%    ~     (p=0.739 n=10+10)
Decoder_DecodeAll/plrabn12.txt.zst-8                         427MB/s ± 1%   433MB/s ± 1%  +1.24%  (p=0.000 n=10+10)
Decoder_DecodeAll/lcet10.txt.zst-8                           480MB/s ± 1%   490MB/s ± 1%  +2.06%  (p=0.000 n=10+10)
Decoder_DecodeAll/asyoulik.txt.zst-8                         435MB/s ± 0%   447MB/s ± 0%  +2.70%  (p=0.000 n=7+9)
Decoder_DecodeAll/alice29.txt.zst-8                          422MB/s ± 0%   438MB/s ± 1%  +3.96%  (p=0.000 n=8+9)
Decoder_DecodeAll/html_x_4.zst-8                            1.60GB/s ± 0%  1.61GB/s ± 0%  +0.99%  (p=0.000 n=9+10)
Decoder_DecodeAll/paper-100k.pdf.zst-8                      4.55GB/s ± 1%  4.44GB/s ± 1%  -2.42%  (p=0.000 n=10+10)
Decoder_DecodeAll/fireworks.jpeg.zst-8                      9.52GB/s ± 1%  9.47GB/s ± 2%    ~     (p=0.143 n=10+10)
Decoder_DecodeAll/urls.10K.zst-8                             678MB/s ± 1%   684MB/s ± 0%  +0.83%  (p=0.000 n=10+10)
Decoder_DecodeAll/html.zst-8                                1.05GB/s ± 0%  1.07GB/s ± 1%  +2.11%  (p=0.000 n=10+10)
Decoder_DecodeAll/comp-data.bin.zst-8                        397MB/s ± 1%   391MB/s ± 1%  -1.37%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8   437MB/s ± 0%   436MB/s ± 1%  -0.21%  (p=0.025 n=9+9)
Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8   448MB/s ± 0%   451MB/s ± 0%  +0.70%  (p=0.000 n=9+9)
Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8    478MB/s ± 0%   475MB/s ± 0%  -0.53%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8      461MB/s ± 0%   470MB/s ± 0%  +2.07%  (p=0.000 n=8+9)
Decoder_DecodeAllFiles/e.txt/fastest-8                      9.62GB/s ± 3%  9.62GB/s ± 2%    ~     (p=1.000 n=10+10)
Decoder_DecodeAllFiles/e.txt/default-8                       391MB/s ± 0%   406MB/s ± 0%  +3.81%  (p=0.000 n=10+8)
Decoder_DecodeAllFiles/e.txt/better-8                        438MB/s ± 0%   448MB/s ± 0%  +2.39%  (p=0.000 n=8+10)
Decoder_DecodeAllFiles/e.txt/best-8                          500MB/s ± 0%   500MB/s ± 0%    ~     (p=0.119 n=9+9)
Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8          1.07GB/s ± 1%  1.04GB/s ± 1%  -2.61%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/fse-artifact3.bin/default-8          1.21GB/s ± 1%  1.19GB/s ± 1%  -1.33%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/fse-artifact3.bin/better-8            994MB/s ± 0%   990MB/s ± 0%  -0.42%  (p=0.002 n=10+9)
Decoder_DecodeAllFiles/fse-artifact3.bin/best-8              389MB/s ± 0%   381MB/s ± 0%  -2.00%  (p=0.000 n=8+10)
Decoder_DecodeAllFiles/gettysburg.txt/fastest-8              274MB/s ± 1%   274MB/s ± 1%    ~     (p=1.000 n=10+10)
Decoder_DecodeAllFiles/gettysburg.txt/default-8              224MB/s ± 1%   223MB/s ± 1%  -0.64%  (p=0.015 n=10+10)
Decoder_DecodeAllFiles/gettysburg.txt/better-8               228MB/s ± 1%   227MB/s ± 1%  -0.40%  (p=0.041 n=10+10)
Decoder_DecodeAllFiles/gettysburg.txt/best-8                 225MB/s ± 1%   223MB/s ± 0%  -0.52%  (p=0.008 n=10+6)
Decoder_DecodeAllFiles/html.txt/fastest-8                    599MB/s ± 1%   614MB/s ± 1%  +2.41%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/html.txt/default-8                    601MB/s ± 0%   613MB/s ± 0%  +2.01%  (p=0.000 n=8+9)
Decoder_DecodeAllFiles/html.txt/better-8                     626MB/s ± 1%   638MB/s ± 0%  +1.99%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/html.txt/best-8                       601MB/s ± 0%   612MB/s ± 0%  +1.87%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/pi.txt/fastest-8                     9.64GB/s ± 2%  9.66GB/s ± 1%    ~     (p=0.529 n=10+10)
Decoder_DecodeAllFiles/pi.txt/default-8                      390MB/s ± 0%   403MB/s ± 0%  +3.48%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/pi.txt/better-8                       439MB/s ± 0%   451MB/s ± 0%  +2.65%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/pi.txt/best-8                         500MB/s ± 0%   499MB/s ± 0%  -0.27%  (p=0.009 n=7+10)
Decoder_DecodeAllFiles/pngdata.bin/fastest-8                1.70GB/s ± 1%  1.69GB/s ± 1%  -0.63%  (p=0.013 n=10+9)
Decoder_DecodeAllFiles/pngdata.bin/default-8                1.52GB/s ± 1%  1.51GB/s ± 0%  -0.75%  (p=0.000 n=10+9)
Decoder_DecodeAllFiles/pngdata.bin/better-8                 1.92GB/s ± 0%  1.90GB/s ± 0%  -1.02%  (p=0.000 n=10+10)
Decoder_DecodeAllFiles/pngdata.bin/best-8                   1.47GB/s ± 0%  1.46GB/s ± 0%  -0.88%  (p=0.000 n=10+9)
Decoder_DecodeAllFiles/sharnd.out/fastest-8                 9.60GB/s ± 1%  9.67GB/s ± 1%  +0.67%  (p=0.029 n=10+10)
Decoder_DecodeAllFiles/sharnd.out/default-8                 9.65GB/s ± 2%  9.71GB/s ± 1%    ~     (p=0.353 n=10+10)
Decoder_DecodeAllFiles/sharnd.out/better-8                  9.67GB/s ± 1%  9.66GB/s ± 0%    ~     (p=0.549 n=10+9)
Decoder_DecodeAllFiles/sharnd.out/best-8                    9.70GB/s ± 1%  9.61GB/s ± 0%  -0.91%  (p=0.010 n=10+9)
[Geo mean]                                                   935MB/s        940MB/s       +0.57%
```
  • Loading branch information
greatroar committed Jul 12, 2022
1 parent 08efe28 commit 9a048c1
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 152 deletions.
59 changes: 15 additions & 44 deletions zstd/_generate/gen.go
Expand Up @@ -298,7 +298,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
MOVBQZX(total.As8(), total) // total = llState.As8() + mlState.As8() + ofState.As8()

// Read `total` bits
bits := o.getBitsValue(name+"_getBits", total, brValue, brBitsRead)
bits := o.getBits(total, brValue, brBitsRead)

// Update states
Comment("Update Offset State")
Expand Down Expand Up @@ -632,14 +632,13 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
}

{
lowBits := o.getBits(name+"_getBits", AX, brValue, brBitsRead, LabelRef(name+"_skip_zero"))
lowBits := o.getBits(AX, brValue, brBitsRead)
// Check if below tablelog
assert(func(ok LabelRef) {
CMPQ(lowBits, U32(512))
JB(ok)
})
ADDQ(lowBits, DX)
Label(name + "_skip_zero")
}

// Load table pointer
Expand Down Expand Up @@ -695,53 +694,25 @@ func (o options) nextState(name string, state, lowBits reg.GPVirtual, table stri
}

// getBits will return nbits bits from brValue.
// If nbits == 0 it *may* jump to jmpZero, otherwise 0 is returned.
func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual, jmpZero LabelRef) reg.GPVirtual {
func (o options) getBits(nBits, brValue, brBitsRead reg.GPVirtual) reg.GPVirtual {
BX := GP64()
CX := reg.CL
if o.bmi2 {
LEAQ(Mem{Base: brBitsRead, Index: nBits, Scale: 1}, CX.As64())
MOVQ(brValue, BX)
MOVQ(CX.As64(), brBitsRead)
ROLQ(CX, BX)
BZHIQ(nBits, BX, BX)
} else {
CMPQ(nBits, U8(0))
JZ(jmpZero)
MOVQ(brBitsRead, CX.As64())
ADDQ(nBits, brBitsRead)
MOVQ(brValue, BX)
SHLQ(CX, BX)
MOVQ(nBits, CX.As64())
NEGQ(CX.As64())
SHRQ(CX, BX)
}
return BX
}

// getBits will return nbits bits from brValue.
// If nbits == 0 then 0 is returned.
func (o options) getBitsValue(name string, nBits, brValue, brBitsRead reg.GPVirtual) reg.GPVirtual {
BX := GP64()
CX := reg.CL
LEAQ(Mem{Base: brBitsRead, Index: nBits, Scale: 1}, CX.As64())
MOVQ(brValue, BX)
MOVQ(CX.As64(), brBitsRead)
ROLQ(CX, BX)

// BX &= (1<<nBits) - 1
if o.bmi2 {
LEAQ(Mem{Base: brBitsRead, Index: nBits, Scale: 1}, CX.As64())
MOVQ(brValue, BX)
MOVQ(CX.As64(), brBitsRead)
ROLQ(CX, BX)
BZHIQ(nBits, BX, BX)
} else {
XORQ(BX, BX)
CMPQ(nBits, U8(0))
JZ(LabelRef(name + "_get_bits_value_zero"))
MOVQ(brBitsRead, CX.As64())
ADDQ(nBits, brBitsRead)
MOVQ(brValue, BX)
SHLQ(CX, BX)
MOVQ(nBits, CX.As64())
NEGQ(CX.As64())
SHRQ(CX, BX)
Label(name + "_get_bits_value_zero")
mask := GP32()
MOVL(U32(1), mask)
MOVB(nBits.As8(), CX)
SHLL(CX, mask)
DECL(mask)
ANDQ(mask.As64(), BX)
}
return BX
}
Expand Down

0 comments on commit 9a048c1

Please sign in to comment.