Skip to content

Commit

Permalink
huff0: decompress directly into output (#577)
Browse files Browse the repository at this point in the history
```
benchmark                                                   old ns/op     new ns/op     delta
BenchmarkDecompress4XNoTable/digits/100-32                  334           336           +0.66%
BenchmarkDecompress4XNoTable/digits/10000-32                10835         9562          -11.75%
BenchmarkDecompress4XNoTable/digits/262143-32               303585        270811        -10.80%
BenchmarkDecompress4XNoTable/gettysburg/100-32              285           286           +0.56%
BenchmarkDecompress4XNoTable/gettysburg/10000-32            11393         10268         -9.87%
BenchmarkDecompress4XNoTable/gettysburg/262143-32           327973        289561        -11.71%
BenchmarkDecompress4XNoTable/twain/100-32                   331           330           -0.27%
BenchmarkDecompress4XNoTable/twain/10000-32                 11458         10235         -10.67%
BenchmarkDecompress4XNoTable/twain/262143-32                374970        345400        -7.89%
BenchmarkDecompress4XNoTable/low-ent.10k/100-32             367           371           +1.01%
BenchmarkDecompress4XNoTable/low-ent.10k/10000-32           10812         9398          -13.08%
BenchmarkDecompress4XNoTable/low-ent.10k/262143-32          256684        221666        -13.64%
BenchmarkDecompress4XNoTable/superlow-ent-10k/262143-32     256839        221322        -13.83%
BenchmarkDecompress4XNoTable/case1/100-32                   318           322           +1.23%
BenchmarkDecompress4XNoTable/case1/10000-32                 10803         9562          -11.49%
BenchmarkDecompress4XNoTable/case1/262143-32                277377        242147        -12.70%
BenchmarkDecompress4XNoTable/case2/100-32                   345           340           -1.62%
BenchmarkDecompress4XNoTable/case2/10000-32                 10659         9473          -11.13%
BenchmarkDecompress4XNoTable/case2/262143-32                268723        236376        -12.04%
BenchmarkDecompress4XNoTable/case3/100-32                   333           336           +0.99%
BenchmarkDecompress4XNoTable/case3/10000-32                 10737         9357          -12.85%
BenchmarkDecompress4XNoTable/case3/262143-32                272268        239011        -12.21%
BenchmarkDecompress4XNoTable/pngdata.001/100-32             361           350           -2.99%
BenchmarkDecompress4XNoTable/pngdata.001/10000-32           11583         10740         -7.28%
BenchmarkDecompress4XNoTable/pngdata.001/262143-32          306257        279850        -8.62%
BenchmarkDecompress4XNoTable/normcount2/100-32              287           289           +0.80%
BenchmarkDecompress4XNoTable/normcount2/10000-32            10832         9696          -10.49%
BenchmarkDecompress4XNoTable/normcount2/262143-32           279908        247688        -11.51%
BenchmarkDecompress4XNoTableTableLog8/digits-32             107990        96969         -10.21%
```
  • Loading branch information
WojciechMula committed May 7, 2022
1 parent 20d0f94 commit e7c028f
Show file tree
Hide file tree
Showing 10 changed files with 952 additions and 1,440 deletions.
310 changes: 310 additions & 0 deletions huff0/_generate/gen.go
@@ -0,0 +1,310 @@
package main

//go:generate go run gen.go -out ../decompress_amd64.s -pkg=huff0

import (
"flag"
"fmt"
"strconv"

_ "github.com/klauspost/compress"

. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)

// main drives the generator: it registers the build constraint for the
// output file, emits both decoder main-loop procedures and finally asks avo
// to write the result (the go:generate directive in this file passes the
// output path and package name as flags).
func main() {
	flag.Parse()

	// Build constraint placed on the generated file.
	ConstraintExpr("amd64,!appengine,!noasm,gc")

	var gen decompress4x
	gen.generateProcedure("decompress4x_main_loop_amd64")
	gen.generateProcedure4x8bit("decompress4x_8b_main_loop_amd64")

	Generate()
}

// decompress4x is a stateless receiver that groups the avo routines which
// generate the 4-stream decoder loops.
type decompress4x struct{}

// generateProcedure emits the assembly function called name: a decoder main
// loop that, on every iteration, decodes two values from each of the four bit
// readers (see decodeTwoValues) and writes them to four output regions that
// are dstEvery bytes apart.
//
// The generated function receives a *decompress4xContext. It reads the fields
// peekBits, out, limit, dstEvery, tbl and pbr0..pbr3 from it, and before
// returning stores the total number of bytes written into the field decoded.
func (d decompress4x) generateProcedure(name string) {
	Package("github.com/klauspost/compress/huff0")
	TEXT(name, 0, "func(ctx* decompress4xContext)")
	Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.", "")
	Pragma("noescape")

	// Loop-exit flag. Its low byte becomes non-zero when the output pointer
	// reaches limit or when one of the bit readers reports off < 4.
	exhausted := GP64()
	XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

	// The output limit is kept in a stack slot rather than in a register.
	limitPtr := AllocLocal(8)

	bufferOrigin := GP64() // output position of stream 0 for the current iteration
	peekBits := GP64()
	buffer := GP64() // output position of the stream currently being decoded
	dstEvery := GP64()
	table := GP64()

	// Addresses of the four bit reader structures.
	br0 := GP64()
	br1 := GP64()
	br2 := GP64()
	br3 := GP64()

	Comment("Preload values")
	{
		ctx := Dereference(Param("ctx"))
		Load(ctx.Field("peekBits"), peekBits)
		Load(ctx.Field("out"), buffer)
		MOVQ(buffer, bufferOrigin)
		limit := Load(ctx.Field("limit"), GP64())
		MOVQ(limit, limitPtr)
		Load(ctx.Field("dstEvery"), dstEvery)
		Load(ctx.Field("tbl"), table)
		Load(ctx.Field("pbr0"), br0)
		Load(ctx.Field("pbr1"), br1)
		Load(ctx.Field("pbr2"), br2)
		Load(ctx.Field("pbr3"), br3)
	}

	Comment("Main loop")
	Label("main_loop")

	MOVQ(bufferOrigin, buffer)
	// Check if we have space: exhausted = buffer >= limit.
	CMPQ(buffer, limitPtr)
	SETGE(exhausted.As8())
	// Decode the four streams in turn; each stream's output lies dstEvery
	// bytes after the previous one.
	d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)

	ADDQ(U8(2), bufferOrigin) // off += 2

	// Continue while the output has room and no reader reported off < 4.
	TESTB(exhausted.As8(), exhausted.As8())
	JZ(LabelRef("main_loop"))

	{
		// decoded = (bufferOrigin - ctx.out) * 4: bytes written per stream
		// multiplied by the number of streams.
		ctx := Dereference(Param("ctx"))
		tmp := Load(ctx.Field("out"), GP64())
		decoded := GP64()
		MOVQ(bufferOrigin, decoded)
		SUBQ(tmp, decoded)
		SHLQ(U8(2), decoded) // decoded *= 4

		Store(decoded, ctx.Field("decoded"))
	}

	RET()
}

// Byte offsets of the bit reader fields that the generated assembly accesses
// through a raw pointer (used as displacements on the br registers).
// NOTE(review): these are hard-coded and presumably must track the layout of
// the huff0 bit reader struct - verify when that struct changes.
//
// TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal
// with arbitrary pointers to a given type
const (
	bitReader_in       = 0
	bitReader_off      = bitReader_in + 3*8 // skips the {ptr, len, cap} of in
	bitReader_value    = bitReader_off + 8
	bitReader_bitsRead = bitReader_value + 8
)

// decodeTwoValues emits the code that decodes two values from bit reader
// number id and stores them with a single 16-bit write at [buffer].
//
// br holds the address of the bit reader structure, peekBits the shift count
// applied to the 64-bit bit buffer to obtain a table index, and table the base
// address of the decoding table, which is read as 2-byte entries: the low byte
// is the number of bits to consume, the high byte is the decoded output byte.
// exhausted is forwarded to fillFast32, which may set it.
//
// The emitted code uses the physical registers RCX (shift counts and the
// current table entry) and RAX (the two output bytes), and writes the updated
// value and bitsRead back into the bit reader structure.
func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
	brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)

	val := GP64()
	Commentf("val0 := br%d.peekTopBits(peekBits)", id)
	cx := reg.CL // variable shift counts must be in CL
	MOVQ(brValue, val.As64())
	MOVQ(peekBits, cx.As64())
	SHRQ(cx, val.As64()) // val = value >> peekBits (no mask needed: only the top bits remain)

	Comment("v0 := table[val0&mask]")
	MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, cx.As16()) // CL = bit count, CH = output byte

	Commentf("br%d.advance(uint8(v0.entry))", id)
	out := reg.RAX               // Fixed since we need 8H
	MOVB(cx.As8H(), out.As8()) // AL = uint8(v0.entry >> 8)

	SHLQ(cx, brValue)                // value <<= n
	ADDB(cx.As8(), brBitsRead.As8()) // bits_read += n

	Commentf("val1 := br%d.peekTopBits(peekBits)", id)
	MOVQ(peekBits, cx.As64())
	MOVQ(brValue, val.As64())
	SHRQ(cx, val.As64()) // val = value >> peekBits

	Comment("v1 := table[val1&mask]")
	MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, cx.As16()) // CL = bit count, CH = output byte

	Commentf("br%d.advance(uint8(v1.entry))", id)
	MOVB(cx.As8H(), out.As8H())      // AH = uint8(v1.entry >> 8)
	SHLQ(cx, brValue)                // value <<= n
	ADDB(cx.As8(), brBitsRead.As8()) // bits_read += n

	Comment("these two writes get coalesced")
	Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)")
	Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
	MOVW(out.As16(), Mem{Base: buffer})

	Comment("update the bitreader structure")
	MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
	MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
}

// generateProcedure4x8bit emits the assembly function called name: a decoder
// main loop that, on every iteration, decodes four values from each of the
// four bit readers (see decodeFourValues) and writes them to four output
// regions that are dstEvery bytes apart.
//
// The generated function receives a *decompress4xContext. It reads the fields
// peekBits, out, limit, dstEvery, tbl and pbr0..pbr3 from it, and before
// returning stores the total number of bytes written into the field decoded.
func (d decompress4x) generateProcedure4x8bit(name string) {
	Package("github.com/klauspost/compress/huff0")
	TEXT(name, 0, "func(ctx* decompress4xContext)")
	Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog <= 8.", "")
	Pragma("noescape")

	// Loop-exit flag. Its low byte becomes non-zero when the output pointer
	// reaches limit or when one of the bit readers reports off < 4.
	exhausted := GP64()
	XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

	// In this variant both the stream-0 output position and the output limit
	// are kept in stack slots rather than in registers.
	bufferOrigin := AllocLocal(8)
	limitPtr := AllocLocal(8)

	peekBits := GP64()
	buffer := GP64() // output position of the stream currently being decoded
	dstEvery := GP64()
	table := GP64()

	// Addresses of the four bit reader structures.
	br0 := GP64()
	br1 := GP64()
	br2 := GP64()
	br3 := GP64()

	Comment("Preload values")
	{
		ctx := Dereference(Param("ctx"))
		Load(ctx.Field("peekBits"), peekBits)
		Load(ctx.Field("out"), buffer)
		MOVQ(buffer, bufferOrigin)
		limit := Load(ctx.Field("limit"), GP64())
		MOVQ(limit, limitPtr)
		Load(ctx.Field("dstEvery"), dstEvery)
		Load(ctx.Field("tbl"), table)
		Load(ctx.Field("pbr0"), br0)
		Load(ctx.Field("pbr1"), br1)
		Load(ctx.Field("pbr2"), br2)
		Load(ctx.Field("pbr3"), br3)
	}

	Comment("Main loop")
	Label("main_loop")

	MOVQ(bufferOrigin, buffer)
	// Check if we have space: exhausted = buffer >= limit.
	CMPQ(buffer, limitPtr)
	SETGE(exhausted.As8())
	// Decode the four streams in turn; each stream's output lies dstEvery
	// bytes after the previous one.
	d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
	ADDQ(dstEvery, buffer)
	d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)

	ADDQ(U8(4), bufferOrigin) // off += 4

	// Continue while the output has room and no reader reported off < 4.
	TESTB(exhausted.As8(), exhausted.As8())
	JZ(LabelRef("main_loop"))

	{
		// decoded = (bufferOrigin - ctx.out) * 4: bytes written per stream
		// multiplied by the number of streams.
		ctx := Dereference(Param("ctx"))
		tmp := Load(ctx.Field("out"), GP64())
		decoded := GP64()
		MOVQ(bufferOrigin, decoded)
		SUBQ(tmp, decoded)
		SHLQ(U8(2), decoded) // decoded *= 4

		Store(decoded, ctx.Field("decoded"))
	}
	RET()
}

// decodeFourValues emits the code that decodes four values from bit reader
// number id and stores them with a single 32-bit write at [buffer].
//
// br holds the address of the bit reader structure, peekBits the shift count
// applied to the 64-bit bit buffer to obtain a table index, and table the base
// address of the decoding table, which is read as 2-byte entries: the low byte
// is the number of bits to consume, the high byte is the decoded output byte.
// exhausted is forwarded to fillFast32, which may set it.
//
// The emitted code uses the physical registers RCX (shift counts and the
// current table entry) and RAX (the four output bytes), and writes the updated
// value and bitsRead back into the bit reader structure.
func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
	// The id is offset by 1000 only for the fill comment and skip label that
	// fillFast32 derives from it.
	brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted)

	// decompress emits the decoding of one value and places the decoded byte
	// in outByte.
	decompress := func(valID int, outByte reg.Register) {
		cx := reg.CL // variable shift counts must be in CL
		val := GP64()
		Commentf("val%d := br%d.peekTopBits(peekBits)", valID, id)
		MOVQ(brValue, val.As64())
		MOVQ(peekBits, cx.As64())
		SHRQ(cx, val.As64()) // val = value >> peekBits (no mask needed: only the top bits remain)

		Commentf("v%d := table[val%d&mask]", valID, valID)
		MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, cx.As16()) // CL = bit count, CH = output byte

		Commentf("br%d.advance(uint8(v%d.entry))", id, valID)
		MOVB(cx.As8H(), outByte) // outByte = uint8(v.entry >> 8)

		SHLQ(cx, brValue)         // value <<= n
		ADDB(cx, brBitsRead.As8()) // bits_read += n
	}

	// Only AL and AH are byte-addressable, so the first two bytes are moved
	// to the top half of EAX with BSWAPL, the next two are placed in AH/AL in
	// reverse order, and a second BSWAPL restores the order v0, v1, v2, v3.
	out := reg.RAX // Fixed since we need 8H
	decompress(0, out.As8L())
	decompress(1, out.As8H())
	BSWAPL(out.As32())
	decompress(2, out.As8H())
	decompress(3, out.As8L())
	BSWAPL(out.As32())

	Comment("these four writes get coalesced")
	Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)")
	Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
	Comment("out[id * dstEvery + 2] = uint8(v2.entry >> 8)")
	Comment("out[id * dstEvery + 3] = uint8(v3.entry >> 8)")
	MOVL(out.As32(), Mem{Base: buffer})

	Comment("update the bitreader structure")
	MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
	MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
}

// fillFast32 emits the code that loads the state of the bit reader addressed
// by br and, when more than 64-atLeast bits have already been consumed,
// refills its bit buffer with the 32 bits found at in[off-4:off].
//
// id is used only to build the emitted comment and the unique skip label.
// atLeast must not exceed 32; the function panics otherwise. When a refill
// happens the new offset is stored back into the bit reader and the low byte
// of exhausted is OR-ed with (off < 4).
//
// It returns the virtual registers holding the reader's value and bitsRead.
// Neither is written back to memory here; that is left to the caller.
func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) {
	if atLeast > 32 {
		panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast))
	}

	skipLabel := "skip_fill" + strconv.Itoa(id)
	offField := Mem{Base: br, Disp: bitReader_off}

	Commentf("br%d.fillFast32()", id)
	brValue, brBitsRead = GP64(), GP64()
	MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
	MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)

	// We must have at least 2 * max tablelog left
	CMPQ(brBitsRead, U8(64-atLeast))
	JBE(LabelRef(skipLabel))

	offset := GP64()
	MOVQ(offField, offset)

	SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
	SUBQ(U8(4), offset)      // b.off -= 4

	// v := b.in[b.off-4 : b.off]
	// v = v[:4]
	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
	low := GP64()
	MOVQ(Mem{Base: br, Disp: bitReader_in}, low) // base pointer of b.in

	Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
	src := Mem{Base: offset, Index: low.As64(), Scale: 1}
	shiftCount := reg.CL
	MOVL(src, low.As32()) // low = uint32(b.in[b.off:b.off+4])
	MOVQ(brBitsRead, shiftCount.As64())
	SHLQ(shiftCount, low.As64())

	MOVQ(offset, offField)
	ORQ(low.As64(), brValue)

	Commentf("exhausted = exhausted || (br%d.off < 4)", id)
	CMPQ(offset, U8(4))
	nearStart := GP64()
	SETLT(nearStart.As8())
	ORB(nearStart.As8(), exhausted.As8())

	Label(skipLabel)
	return brValue, brBitsRead
}
10 changes: 10 additions & 0 deletions huff0/_generate/go.mod
@@ -0,0 +1,10 @@
module github.com/klauspost/compress/s2/_generate

go 1.15

require (
github.com/klauspost/compress v1.15.1
github.com/mmcloughlin/avo v0.4.0
)

replace github.com/klauspost/compress => ../..
32 changes: 32 additions & 0 deletions huff0/_generate/go.sum
@@ -0,0 +1,32 @@
github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU=
github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s=
github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk=
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ=
golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
5 changes: 0 additions & 5 deletions huff0/autogen.go

This file was deleted.

0 comments on commit e7c028f

Please sign in to comment.