Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

huff0: decompress directly into output #577

Merged
347 changes: 347 additions & 0 deletions huff0/_generate/gen.go
@@ -0,0 +1,347 @@
package main

//go:generate go run gen.go -out ../decompress_amd64.s -pkg=huff0

import (
"flag"
"fmt"
"strconv"

_ "github.com/klauspost/compress"

. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)

func main() {
flag.Parse()

ConstraintExpr("amd64,!appengine,!noasm,gc")

decompress := decompress4x{}
decompress.generateProcedure("decompress4x_main_loop_amd64")
decompress.generateProcedure4x8bit("decompress4x_8b_main_loop_amd64")

Generate()
}

type decompress4x struct {
bmi2 bool
}

func (d decompress4x) generateProcedure(name string) {
Package("github.com/klauspost/compress/huff0")
TEXT(name, 0, "func(ctx* decompress4xContext)")
Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "")
Pragma("noescape")

exhausted := GP64()
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

limitPtr := AllocLocal(8)

bufferOrigin := GP64()
peekBits := GP64()
buffer := GP64()
dstEvery := GP64()
table := GP64()

br0 := GP64()
br1 := GP64()
br2 := GP64()
br3 := GP64()

Comment("Preload values")
{
ctx := Dereference(Param("ctx"))
Load(ctx.Field("peekBits"), peekBits)
Load(ctx.Field("out"), buffer)
MOVQ(buffer, bufferOrigin)
limit := Load(ctx.Field("limit"), GP64())
MOVQ(limit, limitPtr)
Load(ctx.Field("dstEvery"), dstEvery)
Load(ctx.Field("tbl"), table)
Load(ctx.Field("pbr0"), br0)
Load(ctx.Field("pbr1"), br1)
Load(ctx.Field("pbr2"), br2)
Load(ctx.Field("pbr3"), br3)
}

Comment("Main loop")
Label("main_loop")

MOVQ(bufferOrigin, buffer)
// Check if we have space
CMPQ(buffer, limitPtr)
SETGE(exhausted.As8())
d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)

ADDQ(U8(2), bufferOrigin) // off += 2

TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4?
JZ(LabelRef("main_loop"))

{
ctx := Dereference(Param("ctx"))
tmp := Load(ctx.Field("out"), GP64())
decoded := GP64()
MOVQ(bufferOrigin, decoded)
SUBQ(tmp, decoded)
SHLQ(U8(2), decoded) // decoded *= 4

Store(decoded, ctx.Field("decoded"))
}

RET()
}

// TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal
// with arbitrary pointers to a given type
const bitReader_in = 0
const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap}
const bitReader_value = bitReader_off + 8
const bitReader_bitsRead = bitReader_value + 8

func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)

val := GP64()
Commentf("val0 := br%d.peekTopBits(peekBits)", id)
CX := reg.CL
if d.bmi2 {
SHRXQ(peekBits, brValue, val.As64()) // val = (value >> peek_bits) & mask
} else {
MOVQ(brValue, val.As64())
MOVQ(peekBits, CX.As64())
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask
}

Comment("v0 := table[val0&mask]")
v := reg.RDX
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, v.As16())

Commentf("br%d.advance(uint8(v0.entry)", id)
out := reg.RAX // Fixed since we need 8H
MOVB(v.As8H(), out.As8()) // BL = uint8(v0.entry >> 8)

MOVBQZX(v.As8(), CX.As64())
if d.bmi2 {
SHLXQ(v.As64(), brValue, brValue) // value <<= n
} else {
SHLQ(CX, brValue) // value <<= n
}

ADDQ(CX.As64(), brBitsRead) // bits_read += n
WojciechMula marked this conversation as resolved.
Show resolved Hide resolved

Commentf("val1 := br%d.peekTopBits(peekBits)", id)
if d.bmi2 {
SHRXQ(peekBits, brValue, val.As64()) // val = (value >> peek_bits) & mask
} else {
MOVQ(peekBits, CX.As64())
MOVQ(brValue, val.As64())
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask
}

Comment("v1 := table[val1&mask]")
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, v.As16()) // tmp - v1

Commentf("br%d.advance(uint8(v1.entry))", id)
MOVB(v.As8H(), out.As8H()) // BH = uint8(v0.entry >> 8)

MOVBQZX(v.As8(), CX.As64())
if d.bmi2 {
SHLXQ(v.As64(), brValue, brValue) // value <<= n
} else {
SHLQ(CX, brValue) // value <<= n
}

ADDQ(CX.As64(), brBitsRead) // bits_read += n
WojciechMula marked this conversation as resolved.
Show resolved Hide resolved

Comment("these two writes get coalesced")
Comment("out[stream][off] = uint8(v0.entry >> 8)")
Comment("out[stream][off+1] = uint8(v1.entry >> 8)")
MOVW(out.As16(), Mem{Base: buffer})

Comment("update the bitrader reader structure")
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
}

func (d decompress4x) generateProcedure4x8bit(name string) {
Package("github.com/klauspost/compress/huff0")
TEXT(name, 0, "func(ctx* decompress4xContext)")
Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "")
Pragma("noescape")

exhausted := GP64() // Fixed since we need 8H
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

bufferOrigin := AllocLocal(8)
limitPtr := AllocLocal(8)

peekBits := GP64()
buffer := GP64()
dstEvery := GP64()
table := GP64()

br0 := GP64()
br1 := GP64()
br2 := GP64()
br3 := GP64()

Comment("Preload values")
{
ctx := Dereference(Param("ctx"))
Load(ctx.Field("peekBits"), peekBits)
Load(ctx.Field("out"), buffer)
MOVQ(buffer, bufferOrigin)
limit := Load(ctx.Field("limit"), GP64())
MOVQ(limit, limitPtr)
Load(ctx.Field("dstEvery"), dstEvery)
Load(ctx.Field("tbl"), table)
Load(ctx.Field("pbr0"), br0)
Load(ctx.Field("pbr1"), br1)
Load(ctx.Field("pbr2"), br2)
Load(ctx.Field("pbr3"), br3)
}

Comment("Main loop")
Label("main_loop")

MOVQ(bufferOrigin, buffer)
// Check if we have space
CMPQ(buffer, limitPtr)
SETGE(exhausted.As8())
d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)

ADDQ(U8(4), bufferOrigin) // off += 4

TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4?
JZ(LabelRef("main_loop"))

{
ctx := Dereference(Param("ctx"))
tmp := Load(ctx.Field("out"), GP64())
decoded := GP64()
MOVQ(bufferOrigin, decoded)
SUBQ(tmp, decoded)
SHLQ(U8(2), decoded) // decoded *= 4

Store(decoded, ctx.Field("decoded"))
}
RET()
}

func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted)

decompress := func(valID int, outByte reg.Register) {
CX := reg.CL
val := GP64()
Commentf("val%d := br%d.peekTopBits(peekBits)", valID, id)
if d.bmi2 {
SHRXQ(peekBits, brValue, val.As64()) // val = (value >> peek_bits) & mask
} else {
MOVQ(brValue, val.As64())
MOVQ(peekBits, CX.As64())
SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask
}

Commentf("v%d := table[val0&mask]", valID)
MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16())

Commentf("br%d.advance(uint8(v%d.entry)", id, valID)
MOVB(CX.As8H(), outByte) // BL = uint8(v0.entry >> 8)

MOVBQZX(CX.As8(), CX.As64())
if d.bmi2 {
SHLXQ(CX.As64(), brValue, brValue) // value <<= n
} else {
SHLQ(CX, brValue) // value <<= n
}

ADDQ(CX.As64(), brBitsRead) // bits_read += n
WojciechMula marked this conversation as resolved.
Show resolved Hide resolved
}

out := reg.RAX // Fixed since we need 8H
decompress(0, out.As8L())
decompress(1, out.As8H())
BSWAPL(out.As32())
decompress(2, out.As8H())
decompress(3, out.As8L())
BSWAPL(out.As32())

Comment("these four writes get coalesced")
Comment("buf[stream][off] = uint8(v0.entry >> 8)")
Comment("buf[stream][off+1] = uint8(v1.entry >> 8)")
Comment("buf[stream][off+2] = uint8(v2.entry >> 8)")
Comment("buf[stream][off+3] = uint8(v3.entry >> 8)")
MOVL(out.As32(), Mem{Base: buffer})

Comment("update the bitreader reader structure")
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
}

func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) {
if atLeast > 32 {
panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast))
}
Commentf("br%d.fillFast32()", id)
brValue = GP64()
brBitsRead = GP64()
MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)

// We must have at least 2 * max tablelog left
CMPQ(brBitsRead, U8(64-atLeast))
JBE(LabelRef("skip_fill" + strconv.Itoa(id)))
brOffset := GP64()
MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset)

SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
SUBQ(U8(4), brOffset) // b.off -= 4

// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
tmp := GP64()
MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp)

Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}
if d.bmi2 {
SHLXQ(brBitsRead, addr, tmp.As64()) // tmp = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
} else {
CX := reg.CL
MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4])
MOVQ(brBitsRead, CX.As64())
SHLQ(CX, tmp.As64())
}

MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off})
ORQ(tmp.As64(), brValue)
{
Commentf("exhausted = exhausted || (br%d.off < 4)", id)
CMPQ(brOffset, U8(4))
tmp = GP64()
SETLT(tmp.As8())
ORB(tmp.As8(), exhausted.As8())
}

Label("skip_fill" + strconv.Itoa(id))
return
}
10 changes: 10 additions & 0 deletions huff0/_generate/go.mod
@@ -0,0 +1,10 @@
module github.com/klauspost/compress/s2/_generate

go 1.15

require (
github.com/klauspost/compress v1.15.1
github.com/mmcloughlin/avo v0.4.0
)

replace github.com/klauspost/compress => ../..
32 changes: 32 additions & 0 deletions huff0/_generate/go.sum
@@ -0,0 +1,32 @@
github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU=
github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s=
github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk=
golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ=
golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
5 changes: 0 additions & 5 deletions huff0/autogen.go

This file was deleted.