Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
huff0: decompress directly into output (#577)
``` benchmark old ns/op new ns/op delta BenchmarkDecompress4XNoTable/digits/100-32 334 336 +0.66% BenchmarkDecompress4XNoTable/digits/10000-32 10835 9562 -11.75% BenchmarkDecompress4XNoTable/digits/262143-32 303585 270811 -10.80% BenchmarkDecompress4XNoTable/gettysburg/100-32 285 286 +0.56% BenchmarkDecompress4XNoTable/gettysburg/10000-32 11393 10268 -9.87% BenchmarkDecompress4XNoTable/gettysburg/262143-32 327973 289561 -11.71% BenchmarkDecompress4XNoTable/twain/100-32 331 330 -0.27% BenchmarkDecompress4XNoTable/twain/10000-32 11458 10235 -10.67% BenchmarkDecompress4XNoTable/twain/262143-32 374970 345400 -7.89% BenchmarkDecompress4XNoTable/low-ent.10k/100-32 367 371 +1.01% BenchmarkDecompress4XNoTable/low-ent.10k/10000-32 10812 9398 -13.08% BenchmarkDecompress4XNoTable/low-ent.10k/262143-32 256684 221666 -13.64% BenchmarkDecompress4XNoTable/superlow-ent-10k/262143-32 256839 221322 -13.83% BenchmarkDecompress4XNoTable/case1/100-32 318 322 +1.23% BenchmarkDecompress4XNoTable/case1/10000-32 10803 9562 -11.49% BenchmarkDecompress4XNoTable/case1/262143-32 277377 242147 -12.70% BenchmarkDecompress4XNoTable/case2/100-32 345 340 -1.62% BenchmarkDecompress4XNoTable/case2/10000-32 10659 9473 -11.13% BenchmarkDecompress4XNoTable/case2/262143-32 268723 236376 -12.04% BenchmarkDecompress4XNoTable/case3/100-32 333 336 +0.99% BenchmarkDecompress4XNoTable/case3/10000-32 10737 9357 -12.85% BenchmarkDecompress4XNoTable/case3/262143-32 272268 239011 -12.21% BenchmarkDecompress4XNoTable/pngdata.001/100-32 361 350 -2.99% BenchmarkDecompress4XNoTable/pngdata.001/10000-32 11583 10740 -7.28% BenchmarkDecompress4XNoTable/pngdata.001/262143-32 306257 279850 -8.62% BenchmarkDecompress4XNoTable/normcount2/100-32 287 289 +0.80% BenchmarkDecompress4XNoTable/normcount2/10000-32 10832 9696 -10.49% BenchmarkDecompress4XNoTable/normcount2/262143-32 279908 247688 -11.51% BenchmarkDecompress4XNoTableTableLog8/digits-32 107990 96969 -10.21% ```
- Loading branch information
1 parent
20d0f94
commit e7c028f
Showing
10 changed files
with
952 additions
and
1,440 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,310 @@ | ||
package main | ||
|
||
//go:generate go run gen.go -out ../decompress_amd64.s -pkg=huff0 | ||
|
||
import ( | ||
"flag" | ||
"fmt" | ||
"strconv" | ||
|
||
_ "github.com/klauspost/compress" | ||
|
||
. "github.com/mmcloughlin/avo/build" | ||
. "github.com/mmcloughlin/avo/operand" | ||
"github.com/mmcloughlin/avo/reg" | ||
) | ||
|
||
func main() { | ||
flag.Parse() | ||
|
||
ConstraintExpr("amd64,!appengine,!noasm,gc") | ||
|
||
decompress := decompress4x{} | ||
decompress.generateProcedure("decompress4x_main_loop_amd64") | ||
decompress.generateProcedure4x8bit("decompress4x_8b_main_loop_amd64") | ||
|
||
Generate() | ||
} | ||
|
||
// decompress4x is a stateless receiver that groups the generator methods for
// the four-stream decompression loops.
type decompress4x struct{}
|
||
func (d decompress4x) generateProcedure(name string) { | ||
Package("github.com/klauspost/compress/huff0") | ||
TEXT(name, 0, "func(ctx* decompress4xContext)") | ||
Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") | ||
Pragma("noescape") | ||
|
||
exhausted := GP64() | ||
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false | ||
|
||
limitPtr := AllocLocal(8) | ||
|
||
bufferOrigin := GP64() | ||
peekBits := GP64() | ||
buffer := GP64() | ||
dstEvery := GP64() | ||
table := GP64() | ||
|
||
br0 := GP64() | ||
br1 := GP64() | ||
br2 := GP64() | ||
br3 := GP64() | ||
|
||
Comment("Preload values") | ||
{ | ||
ctx := Dereference(Param("ctx")) | ||
Load(ctx.Field("peekBits"), peekBits) | ||
Load(ctx.Field("out"), buffer) | ||
MOVQ(buffer, bufferOrigin) | ||
limit := Load(ctx.Field("limit"), GP64()) | ||
MOVQ(limit, limitPtr) | ||
Load(ctx.Field("dstEvery"), dstEvery) | ||
Load(ctx.Field("tbl"), table) | ||
Load(ctx.Field("pbr0"), br0) | ||
Load(ctx.Field("pbr1"), br1) | ||
Load(ctx.Field("pbr2"), br2) | ||
Load(ctx.Field("pbr3"), br3) | ||
} | ||
|
||
Comment("Main loop") | ||
Label("main_loop") | ||
|
||
MOVQ(bufferOrigin, buffer) | ||
// Check if we have space | ||
CMPQ(buffer, limitPtr) | ||
SETGE(exhausted.As8()) | ||
d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted) | ||
|
||
ADDQ(U8(2), bufferOrigin) // off += 2 | ||
|
||
TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? | ||
JZ(LabelRef("main_loop")) | ||
|
||
{ | ||
ctx := Dereference(Param("ctx")) | ||
tmp := Load(ctx.Field("out"), GP64()) | ||
decoded := GP64() | ||
MOVQ(bufferOrigin, decoded) | ||
SUBQ(tmp, decoded) | ||
SHLQ(U8(2), decoded) // decoded *= 4 | ||
|
||
Store(decoded, ctx.Field("decoded")) | ||
} | ||
|
||
RET() | ||
} | ||
|
||
// Byte offsets used by the generated assembly to address the fields of a bit
// reader through a raw pointer.
// NOTE(review): these presumably mirror the field layout of the bitReader
// struct in the huff0 package, which is not visible here - keep them in sync.
//
// TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal
// with arbitrary pointers to a given type
const (
	// bitReader_in is the offset of the input slice header.
	bitReader_in = 0
	// bitReader_off follows the three-word slice header {ptr, len, cap}.
	bitReader_off = bitReader_in + 3*8
	// bitReader_value follows the 8-byte off field.
	bitReader_value = bitReader_off + 8
	// bitReader_bitsRead follows the 8-byte value field.
	bitReader_bitsRead = bitReader_value + 8
)
|
||
func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { | ||
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) | ||
|
||
val := GP64() | ||
Commentf("val0 := br%d.peekTopBits(peekBits)", id) | ||
CX := reg.CL | ||
MOVQ(brValue, val.As64()) | ||
MOVQ(peekBits, CX.As64()) | ||
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask | ||
|
||
Comment("v0 := table[val0&mask]") | ||
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) | ||
|
||
Commentf("br%d.advance(uint8(v0.entry)", id) | ||
out := reg.RAX // Fixed since we need 8H | ||
MOVB(CX.As8H(), out.As8()) // AL = uint8(v0.entry >> 8) | ||
|
||
SHLQ(CX, brValue) // value <<= n | ||
ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n | ||
|
||
Commentf("val1 := br%d.peekTopBits(peekBits)", id) | ||
MOVQ(peekBits, CX.As64()) | ||
MOVQ(brValue, val.As64()) | ||
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask | ||
|
||
Comment("v1 := table[val1&mask]") | ||
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) // tmp - v1 | ||
|
||
Commentf("br%d.advance(uint8(v1.entry))", id) | ||
MOVB(CX.As8H(), out.As8H()) // AH = uint8(v0.entry >> 8) | ||
SHLQ(CX, brValue) // value <<= n | ||
ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n | ||
|
||
Comment("these two writes get coalesced") | ||
Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") | ||
Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") | ||
MOVW(out.As16(), Mem{Base: buffer}) | ||
|
||
Comment("update the bitrader reader structure") | ||
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) | ||
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) | ||
} | ||
|
||
func (d decompress4x) generateProcedure4x8bit(name string) { | ||
Package("github.com/klauspost/compress/huff0") | ||
TEXT(name, 0, "func(ctx* decompress4xContext)") | ||
Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") | ||
Pragma("noescape") | ||
|
||
exhausted := GP64() // Fixed since we need 8H | ||
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false | ||
|
||
bufferOrigin := AllocLocal(8) | ||
limitPtr := AllocLocal(8) | ||
|
||
peekBits := GP64() | ||
buffer := GP64() | ||
dstEvery := GP64() | ||
table := GP64() | ||
|
||
br0 := GP64() | ||
br1 := GP64() | ||
br2 := GP64() | ||
br3 := GP64() | ||
|
||
Comment("Preload values") | ||
{ | ||
ctx := Dereference(Param("ctx")) | ||
Load(ctx.Field("peekBits"), peekBits) | ||
Load(ctx.Field("out"), buffer) | ||
MOVQ(buffer, bufferOrigin) | ||
limit := Load(ctx.Field("limit"), GP64()) | ||
MOVQ(limit, limitPtr) | ||
Load(ctx.Field("dstEvery"), dstEvery) | ||
Load(ctx.Field("tbl"), table) | ||
Load(ctx.Field("pbr0"), br0) | ||
Load(ctx.Field("pbr1"), br1) | ||
Load(ctx.Field("pbr2"), br2) | ||
Load(ctx.Field("pbr3"), br3) | ||
} | ||
|
||
Comment("Main loop") | ||
Label("main_loop") | ||
|
||
MOVQ(bufferOrigin, buffer) | ||
// Check if we have space | ||
CMPQ(buffer, limitPtr) | ||
SETGE(exhausted.As8()) | ||
d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted) | ||
ADDQ(dstEvery, buffer) | ||
d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted) | ||
|
||
ADDQ(U8(4), bufferOrigin) // off += 4 | ||
|
||
TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? | ||
JZ(LabelRef("main_loop")) | ||
|
||
{ | ||
ctx := Dereference(Param("ctx")) | ||
tmp := Load(ctx.Field("out"), GP64()) | ||
decoded := GP64() | ||
MOVQ(bufferOrigin, decoded) | ||
SUBQ(tmp, decoded) | ||
SHLQ(U8(2), decoded) // decoded *= 4 | ||
|
||
Store(decoded, ctx.Field("decoded")) | ||
} | ||
RET() | ||
} | ||
|
||
func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { | ||
brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted) | ||
|
||
decompress := func(valID int, outByte reg.Register) { | ||
CX := reg.CL | ||
val := GP64() | ||
Commentf("val%d := br%d.peekTopBits(peekBits)", valID, id) | ||
MOVQ(brValue, val.As64()) | ||
MOVQ(peekBits, CX.As64()) | ||
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask | ||
|
||
Commentf("v%d := table[val0&mask]", valID) | ||
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) | ||
|
||
Commentf("br%d.advance(uint8(v%d.entry)", id, valID) | ||
MOVB(CX.As8H(), outByte) // outByte = uint8(v0.entry >> 8) | ||
|
||
SHLQ(CX, brValue) // value <<= n | ||
ADDB(CX, brBitsRead.As8()) // bits_read += n | ||
} | ||
|
||
out := reg.RAX // Fixed since we need 8H | ||
decompress(0, out.As8L()) | ||
decompress(1, out.As8H()) | ||
BSWAPL(out.As32()) | ||
decompress(2, out.As8H()) | ||
decompress(3, out.As8L()) | ||
BSWAPL(out.As32()) | ||
|
||
Comment("these four writes get coalesced") | ||
Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") | ||
Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") | ||
Comment("out[id * dstEvery + 3] = uint8(v2.entry >> 8)") | ||
Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") | ||
MOVL(out.As32(), Mem{Base: buffer}) | ||
|
||
Comment("update the bitreader reader structure") | ||
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) | ||
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) | ||
} | ||
|
||
func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { | ||
if atLeast > 32 { | ||
panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast)) | ||
} | ||
Commentf("br%d.fillFast32()", id) | ||
brValue = GP64() | ||
brBitsRead = GP64() | ||
MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue) | ||
MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead) | ||
|
||
// We must have at least 2 * max tablelog left | ||
CMPQ(brBitsRead, U8(64-atLeast)) | ||
JBE(LabelRef("skip_fill" + strconv.Itoa(id))) | ||
brOffset := GP64() | ||
MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset) | ||
|
||
SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32 | ||
SUBQ(U8(4), brOffset) // b.off -= 4 | ||
|
||
// v := b.in[b.off-4 : b.off] | ||
// v = v[:4] | ||
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) | ||
tmp := GP64() | ||
MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp) | ||
|
||
Comment("b.value |= uint64(low) << (b.bitsRead & 63)") | ||
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1} | ||
CX := reg.CL | ||
MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4]) | ||
MOVQ(brBitsRead, CX.As64()) | ||
SHLQ(CX, tmp.As64()) | ||
|
||
MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off}) | ||
ORQ(tmp.As64(), brValue) | ||
{ | ||
Commentf("exhausted = exhausted || (br%d.off < 4)", id) | ||
CMPQ(brOffset, U8(4)) | ||
tmp = GP64() | ||
SETLT(tmp.As8()) | ||
ORB(tmp.As8(), exhausted.As8()) | ||
} | ||
|
||
Label("skip_fill" + strconv.Itoa(id)) | ||
return | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
module github.com/klauspost/compress/s2/_generate | ||
|
||
go 1.15 | ||
|
||
require ( | ||
github.com/klauspost/compress v1.15.1 | ||
github.com/mmcloughlin/avo v0.4.0 | ||
) | ||
|
||
replace github.com/klauspost/compress => ../.. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= | ||
github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= | ||
github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= | ||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= | ||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= | ||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= | ||
golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= | ||
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= | ||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= | ||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= | ||
golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= | ||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= | ||
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= | ||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= | ||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk= | ||
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= | ||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= | ||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= | ||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= | ||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= | ||
golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ= | ||
golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= | ||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= | ||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= | ||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= | ||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= | ||
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.