diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go new file mode 100644 index 0000000000..04870d91dc --- /dev/null +++ b/huff0/_generate/gen.go @@ -0,0 +1,310 @@ +package main + +//go:generate go run gen.go -out ../decompress_amd64.s -pkg=huff0 + +import ( + "flag" + "fmt" + "strconv" + + _ "github.com/klauspost/compress" + + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + "github.com/mmcloughlin/avo/reg" +) + +func main() { + flag.Parse() + + ConstraintExpr("amd64,!appengine,!noasm,gc") + + decompress := decompress4x{} + decompress.generateProcedure("decompress4x_main_loop_amd64") + decompress.generateProcedure4x8bit("decompress4x_8b_main_loop_amd64") + + Generate() +} + +type decompress4x struct { +} + +func (d decompress4x) generateProcedure(name string) { + Package("github.com/klauspost/compress/huff0") + TEXT(name, 0, "func(ctx* decompress4xContext)") + Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") + Pragma("noescape") + + exhausted := GP64() + XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false + + limitPtr := AllocLocal(8) + + bufferOrigin := GP64() + peekBits := GP64() + buffer := GP64() + dstEvery := GP64() + table := GP64() + + br0 := GP64() + br1 := GP64() + br2 := GP64() + br3 := GP64() + + Comment("Preload values") + { + ctx := Dereference(Param("ctx")) + Load(ctx.Field("peekBits"), peekBits) + Load(ctx.Field("out"), buffer) + MOVQ(buffer, bufferOrigin) + limit := Load(ctx.Field("limit"), GP64()) + MOVQ(limit, limitPtr) + Load(ctx.Field("dstEvery"), dstEvery) + Load(ctx.Field("tbl"), table) + Load(ctx.Field("pbr0"), br0) + Load(ctx.Field("pbr1"), br1) + Load(ctx.Field("pbr2"), br2) + Load(ctx.Field("pbr3"), br3) + } + + Comment("Main loop") + Label("main_loop") + + MOVQ(bufferOrigin, buffer) + // Check if we have space + CMPQ(buffer, limitPtr) + SETGE(exhausted.As8()) + d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted) + + ADDQ(U8(2), bufferOrigin) // off += 2 + + TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + JZ(LabelRef("main_loop")) + + { + ctx := Dereference(Param("ctx")) + tmp := Load(ctx.Field("out"), GP64()) + decoded := GP64() + MOVQ(bufferOrigin, decoded) + SUBQ(tmp, decoded) + SHLQ(U8(2), decoded) // decoded *= 4 + + Store(decoded, ctx.Field("decoded")) + } + + RET() +} + +// TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal +// with arbitrary pointers to a given type +const bitReader_in = 0 +const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap} +const bitReader_value = bitReader_off + 8 +const bitReader_bitsRead = bitReader_value + 8 + +func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { + brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) + + val := GP64() + Commentf("val0 := br%d.peekTopBits(peekBits)", id) + CX := reg.CL + MOVQ(brValue, val.As64()) + MOVQ(peekBits, CX.As64()) + SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask + + Comment("v0 := table[val0&mask]") + MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) + + Commentf("br%d.advance(uint8(v0.entry)", id) + out := reg.RAX // Fixed since we need 8H + MOVB(CX.As8H(), out.As8()) // AL = uint8(v0.entry >> 8) + + SHLQ(CX, brValue) // value <<= n + ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n + + Commentf("val1 := br%d.peekTopBits(peekBits)", id) + MOVQ(peekBits, CX.As64()) + MOVQ(brValue, val.As64()) + SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask + + Comment("v1 := table[val1&mask]") + MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) // tmp - v1 + + Commentf("br%d.advance(uint8(v1.entry))", id) + MOVB(CX.As8H(), out.As8H()) // AH = uint8(v0.entry >> 8) + SHLQ(CX, brValue) // value <<= n + ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n + + Comment("these two writes get coalesced") + Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") + Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") + MOVW(out.As16(), Mem{Base: buffer}) + + Comment("update the bitrader reader structure") + MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) +} + +func (d decompress4x) generateProcedure4x8bit(name string) { + Package("github.com/klauspost/compress/huff0") + TEXT(name, 0, "func(ctx* decompress4xContext)") + Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") + Pragma("noescape") + + exhausted := GP64() // Fixed since we need 8H + XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false + + bufferOrigin := AllocLocal(8) + limitPtr := AllocLocal(8) + + peekBits := GP64() + buffer := GP64() + dstEvery := GP64() + table := GP64() + + br0 := GP64() + br1 := GP64() + br2 := GP64() + br3 := GP64() + + Comment("Preload values") + { + ctx := Dereference(Param("ctx")) + Load(ctx.Field("peekBits"), peekBits) + Load(ctx.Field("out"), buffer) + MOVQ(buffer, bufferOrigin) + limit := Load(ctx.Field("limit"), GP64()) + MOVQ(limit, limitPtr) + Load(ctx.Field("dstEvery"), dstEvery) + Load(ctx.Field("tbl"), table) + Load(ctx.Field("pbr0"), br0) + Load(ctx.Field("pbr1"), br1) + Load(ctx.Field("pbr2"), br2) + Load(ctx.Field("pbr3"), br3) + } + + Comment("Main loop") + Label("main_loop") + + MOVQ(bufferOrigin, buffer) + // Check if we have space + CMPQ(buffer, limitPtr) + SETGE(exhausted.As8()) + d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted) + ADDQ(dstEvery, buffer) + d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted) + + ADDQ(U8(4), bufferOrigin) // off += 4 + + TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + JZ(LabelRef("main_loop")) + + { + ctx := Dereference(Param("ctx")) + tmp := Load(ctx.Field("out"), GP64()) + decoded := GP64() + MOVQ(bufferOrigin, decoded) + SUBQ(tmp, decoded) + SHLQ(U8(2), decoded) // decoded *= 4 + + Store(decoded, ctx.Field("decoded")) + } + RET() +} + +func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { + brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted) + + decompress := func(valID int, outByte reg.Register) { + CX := reg.CL + val := GP64() + Commentf("val%d := br%d.peekTopBits(peekBits)", valID, id) + MOVQ(brValue, val.As64()) + MOVQ(peekBits, CX.As64()) + SHRQ(CX, val.As64()) // val = (value >> peek_bits) & mask + + Commentf("v%d := table[val0&mask]", valID) + MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) + + Commentf("br%d.advance(uint8(v%d.entry)", id, valID) + MOVB(CX.As8H(), outByte) // outByte = uint8(v0.entry >> 8) + + SHLQ(CX, brValue) // value <<= n + ADDB(CX, brBitsRead.As8()) // bits_read += n + } + + out := reg.RAX // Fixed since we need 8H + decompress(0, out.As8L()) + decompress(1, out.As8H()) + BSWAPL(out.As32()) + decompress(2, out.As8H()) + decompress(3, out.As8L()) + BSWAPL(out.As32()) + + Comment("these four writes get coalesced") + Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") + Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") + Comment("out[id * dstEvery + 3] = uint8(v2.entry >> 8)") + Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") + MOVL(out.As32(), Mem{Base: buffer}) + + Comment("update the bitreader reader structure") + MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) +} + +func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { + if atLeast > 32 { + panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast)) + } + Commentf("br%d.fillFast32()", id) + brValue = GP64() + brBitsRead = GP64() + MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue) + MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead) + + // We must have at least 2 * max tablelog left + CMPQ(brBitsRead, U8(64-atLeast)) + JBE(LabelRef("skip_fill" + strconv.Itoa(id))) + brOffset := GP64() + MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset) + + SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32 + SUBQ(U8(4), brOffset) // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + tmp := GP64() + MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp) + + Comment("b.value |= uint64(low) << (b.bitsRead & 63)") + addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1} + CX := reg.CL + MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4]) + MOVQ(brBitsRead, CX.As64()) + SHLQ(CX, tmp.As64()) + + MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off}) + ORQ(tmp.As64(), brValue) + { + Commentf("exhausted = exhausted || (br%d.off < 4)", id) + CMPQ(brOffset, U8(4)) + tmp = GP64() + SETLT(tmp.As8()) + ORB(tmp.As8(), exhausted.As8()) + } + + Label("skip_fill" + strconv.Itoa(id)) + return +} diff --git a/huff0/_generate/go.mod b/huff0/_generate/go.mod new file mode 100644 index 0000000000..41c4458869 --- /dev/null +++ b/huff0/_generate/go.mod @@ -0,0 +1,10 @@ +module github.com/klauspost/compress/s2/_generate + +go 1.15 + +require ( + github.com/klauspost/compress v1.15.1 + github.com/mmcloughlin/avo v0.4.0 +) + +replace github.com/klauspost/compress => ../.. diff --git a/huff0/_generate/go.sum b/huff0/_generate/go.sum new file mode 100644 index 0000000000..b4b59140f0 --- /dev/null +++ b/huff0/_generate/go.sum @@ -0,0 +1,32 @@ +github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= +github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= +github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ= +golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/huff0/autogen.go b/huff0/autogen.go deleted file mode 100644 index ff2c69d60c..0000000000 --- a/huff0/autogen.go +++ /dev/null @@ -1,5 +0,0 @@ -package huff0 - -//go:generate go run generate.go -//go:generate asmfmt -w decompress_amd64.s -//go:generate asmfmt -w decompress_8b_amd64.s diff --git a/huff0/decompress_8b_amd64.s b/huff0/decompress_8b_amd64.s deleted file mode 100644 index 0d6cb1a962..0000000000 --- a/huff0/decompress_8b_amd64.s +++ /dev/null @@ -1,488 +0,0 @@ -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" -#include "funcdata.h" -#include "go_asm.h" - -#define bufoff 256 // see decompress.go, we're using [4][256]byte table - -// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, -// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) -TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8 -#define off R8 -#define buffer DI -#define table SI - -#define br_bits_read R9 -#define br_value R10 -#define br_offset R11 -#define peek_bits R12 -#define exhausted DX - -#define br0 R13 -#define br1 R14 -#define br2 R15 -#define br3 BP - - MOVQ BP, 0(SP) - - XORQ exhausted, exhausted // exhausted = false - XORQ off, off // off = 0 - - MOVBQZX peekBits+32(FP), peek_bits - MOVQ buf+40(FP), buffer - MOVQ tbl+48(FP), table - - MOVQ pbr0+0(FP), br0 - MOVQ pbr1+8(FP), br1 - MOVQ pbr2+16(FP), br2 - MOVQ pbr3+24(FP), br3 - -main_loop: - - // const stream = 0 - // br0.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read - MOVQ bitReaderShifted_value(br0), br_value - MOVQ bitReaderShifted_off(br0), br_offset - - // if b.bitsRead >= 32 { - CMPQ br_bits_read, $32 - JB skip_fill0 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br0), AX - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - - // b.value |= uint64(low) << (b.bitsRead & 63) - MOVQ br_bits_read, CX - SHLQ CL, AX - ORQ AX, br_value - - // exhausted = exhausted || (br0.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - - // } -skip_fill0: - - // val0 := br0.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br0.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val1 := br0.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br0.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 0(buffer)(off*1) - - // SECOND PART: - // val2 := br0.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v2 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br0.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val3 := br0.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v3 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br0.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off+2] = uint8(v2.entry >> 8) - // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVW BX, 0+2(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br0) - MOVQ br_value, bitReaderShifted_value(br0) - MOVQ br_offset, bitReaderShifted_off(br0) - - // const stream = 1 - // br1.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read - MOVQ bitReaderShifted_value(br1), br_value - MOVQ bitReaderShifted_off(br1), br_offset - - // if b.bitsRead >= 32 { - CMPQ br_bits_read, $32 - JB skip_fill1 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br1), AX - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - - // b.value |= uint64(low) << (b.bitsRead & 63) - MOVQ br_bits_read, CX - SHLQ CL, AX - ORQ AX, br_value - - // exhausted = exhausted || (br1.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - - // } -skip_fill1: - - // val0 := br1.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br1.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val1 := br1.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br1.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 256(buffer)(off*1) - - // SECOND PART: - // val2 := br1.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v2 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br1.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val3 := br1.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v3 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br1.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off+2] = uint8(v2.entry >> 8) - // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVW BX, 256+2(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br1) - MOVQ br_value, bitReaderShifted_value(br1) - MOVQ br_offset, bitReaderShifted_off(br1) - - // const stream = 2 - // br2.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read - MOVQ bitReaderShifted_value(br2), br_value - MOVQ bitReaderShifted_off(br2), br_offset - - // if b.bitsRead >= 32 { - CMPQ br_bits_read, $32 - JB skip_fill2 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br2), AX - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - - // b.value |= uint64(low) << (b.bitsRead & 63) - MOVQ br_bits_read, CX - SHLQ CL, AX - ORQ AX, br_value - - // exhausted = exhausted || (br2.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - - // } -skip_fill2: - - // val0 := br2.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br2.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val1 := br2.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br2.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 512(buffer)(off*1) - - // SECOND PART: - // val2 := br2.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v2 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br2.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val3 := br2.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v3 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br2.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off+2] = uint8(v2.entry >> 8) - // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVW BX, 512+2(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br2) - MOVQ br_value, bitReaderShifted_value(br2) - MOVQ br_offset, bitReaderShifted_off(br2) - - // const stream = 3 - // br3.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read - MOVQ bitReaderShifted_value(br3), br_value - MOVQ bitReaderShifted_off(br3), br_offset - - // if b.bitsRead >= 32 { - CMPQ br_bits_read, $32 - JB skip_fill3 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br3), AX - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - - // b.value |= uint64(low) << (b.bitsRead & 63) - MOVQ br_bits_read, CX - SHLQ CL, AX - ORQ AX, br_value - - // exhausted = exhausted || (br3.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - - // } -skip_fill3: - - // val0 := br3.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br3.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val1 := br3.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br3.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 768(buffer)(off*1) - - // SECOND PART: - // val2 := br3.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v2 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br3.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val3 := br3.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v3 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br3.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // these two writes get coalesced - // buf[stream][off+2] = uint8(v2.entry >> 8) - // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVW BX, 768+2(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br3) - MOVQ br_value, bitReaderShifted_value(br3) - MOVQ br_offset, bitReaderShifted_off(br3) - - ADDQ $4, off // off += 2 - - TESTB DH, DH // any br[i].ofs < 4? - JNZ end - - CMPQ off, $bufoff - JL main_loop - -end: - MOVQ 0(SP), BP - - MOVB off, ret+56(FP) - RET - -#undef off -#undef buffer -#undef table - -#undef br_bits_read -#undef br_value -#undef br_offset -#undef peek_bits -#undef exhausted - -#undef br0 -#undef br1 -#undef br2 -#undef br3 diff --git a/huff0/decompress_8b_amd64.s.in b/huff0/decompress_8b_amd64.s.in deleted file mode 100644 index 6d477a2c11..0000000000 --- a/huff0/decompress_8b_amd64.s.in +++ /dev/null @@ -1,197 +0,0 @@ -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" -#include "funcdata.h" -#include "go_asm.h" - - -#define bufoff 256 // see decompress.go, we're using [4][256]byte table - -//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, -// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) -TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8 -#define off R8 -#define buffer DI -#define table SI - -#define br_bits_read R9 -#define br_value R10 -#define br_offset R11 -#define peek_bits R12 -#define exhausted DX - -#define br0 R13 -#define br1 R14 -#define br2 R15 -#define br3 BP - - MOVQ BP, 0(SP) - - XORQ exhausted, exhausted // exhausted = false - XORQ off, off // off = 0 - - MOVBQZX peekBits+32(FP), peek_bits - MOVQ buf+40(FP), buffer - MOVQ tbl+48(FP), table - - MOVQ pbr0+0(FP), br0 - MOVQ pbr1+8(FP), br1 - MOVQ pbr2+16(FP), br2 - MOVQ pbr3+24(FP), br3 - -main_loop: -{{ define "decode_2_values_x86" }} - // const stream = {{ var "id" }} - // br{{ var "id"}}.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read - MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value - MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset - - // if b.bitsRead >= 32 { - CMPQ br_bits_read, $32 - JB skip_fill{{ var "id" }} - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br{{ var "id" }}), AX - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - - // b.value |= uint64(low) << (b.bitsRead & 63) - MOVQ br_bits_read, CX - SHLQ CL, AX - ORQ AX, br_value - - // exhausted = exhausted || (br{{ var "id"}}.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - // } -skip_fill{{ var "id" }}: - - // val0 := br{{ var "id"}}.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br{{ var "id"}}.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val1 := br{{ var "id"}}.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br{{ var "id"}}.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, {{ var "bufofs" }}(buffer)(off*1) - - // SECOND PART: - // val2 := br{{ var "id"}}.peekTopBits(peekBits) - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v2 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br{{ var "id"}}.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - // val3 := br{{ var "id"}}.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - - // v3 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br{{ var "id"}}.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - MOVBQZX AL, CX - SHLQ CX, br_value // value <<= n - ADDQ CX, br_bits_read // bits_read += n - - - // these two writes get coalesced - // buf[stream][off+2] = uint8(v2.entry >> 8) - // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }}) - MOVQ br_value, bitReaderShifted_value(br{{ var "id" }}) - MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }}) -{{ end }} - - {{ set "id" "0" }} - {{ set "ofs" "0" }} - {{ set "bufofs" "0" }} {{/* id * bufoff */}} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "1" }} - {{ set "ofs" "8" }} - {{ set "bufofs" "256" }} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "2" }} - {{ set "ofs" "16" }} - {{ set "bufofs" "512" }} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "3" }} - {{ set "ofs" "24" }} - {{ set "bufofs" "768" }} - {{ template "decode_2_values_x86" . }} - - ADDQ $4, off // off += 2 - - TESTB DH, DH // any br[i].ofs < 4? - JNZ end - - CMPQ off, $bufoff - JL main_loop -end: - MOVQ 0(SP), BP - - MOVB off, ret+56(FP) - RET -#undef off -#undef buffer -#undef table - -#undef br_bits_read -#undef br_value -#undef br_offset -#undef peek_bits -#undef exhausted - -#undef br0 -#undef br1 -#undef br2 -#undef br3 diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go index ce8e93bcd0..3415e5da22 100644 --- a/huff0/decompress_amd64.go +++ b/huff0/decompress_amd64.go @@ -13,19 +13,30 @@ import ( // decompress4x_main_loop_x86 is an x86 assembler implementation // of Decompress4X when tablelog > 8. //go:noescape -func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, - peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 +func decompress4x_main_loop_amd64(ctx *decompress4xContext) // decompress4x_8b_loop_x86 is an x86 assembler implementation // of Decompress4X when tablelog <= 8 which decodes 4 entries // per loop. //go:noescape -func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, - peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 +func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) // fallback8BitSize is the size where using Go version is faster. const fallback8BitSize = 800 +type decompress4xContext struct { + pbr0 *bitReaderShifted + pbr1 *bitReaderShifted + pbr2 *bitReaderShifted + pbr3 *bitReaderShifted + peekBits uint8 + out *byte + dstEvery int + tbl *dEntrySingle + decoded int + limit *byte +} + // Decompress4X will decompress a 4X encoded stream. // The length of the supplied input must match the end of a block exactly. // The *capacity* of the dst slice must match the destination size of @@ -42,6 +53,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if cap(dst) < fallback8BitSize && use8BitTables { return d.decompress4X8bit(dst, src) } + var br [4]bitReaderShifted // Decode "jump table" start := 6 @@ -71,70 +83,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { const tlMask = tlSize - 1 single := d.dt.single[:tlSize] - // Use temp table to avoid bound checks/append penalty. - buf := d.buffer() - var off uint8 var decoded int - const debug = false - - // see: bitReaderShifted.peekBitsFast() - peekBits := uint8((64 - d.actualTableLog) & 63) - - // Decode 2 values from each decoder/loop. - const bufoff = 256 - for { - if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { - break + if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { + ctx := decompress4xContext{ + pbr0: &br[0], + pbr1: &br[1], + pbr2: &br[2], + pbr3: &br[3], + peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() + out: &out[0], + dstEvery: dstEvery, + tbl: &single[0], + limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last. } - if use8BitTables { - off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0]) + decompress4x_8b_main_loop_amd64(&ctx) } else { - off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0]) - } - if debug { - fmt.Print("DEBUG: ") - fmt.Printf("off=%d,", off) - for i := 0; i < 4; i++ { - fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}", - i, br[i].bitsRead, br[i].value, br[i].off) - } - fmt.Println("") - } - - if off != 0 { - break + decompress4x_main_loop_amd64(&ctx) } - if bufoff > dstEvery { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 1") - } - copy(out, buf[0][:]) - copy(out[dstEvery:], buf[1][:]) - copy(out[dstEvery*2:], buf[2][:]) - copy(out[dstEvery*3:], buf[3][:]) - out = out[bufoff:] - decoded += bufoff * 4 - // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 2") - } - } - if off > 0 { - ioff := int(off) - if len(out) < dstEvery*3+ioff { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 3") - } - copy(out, buf[0][:off]) - copy(out[dstEvery:], buf[1][:off]) - copy(out[dstEvery*2:], buf[2][:off]) - copy(out[dstEvery*3:], buf[3][:off]) - decoded += int(off) * 4 - out = out[off:] + decoded = ctx.decoded + out = out[decoded/4:] } // Decode remaining. @@ -150,7 +120,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { for bitsLeft > 0 { br.fill() if offset >= endsAt { - d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -164,7 +133,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { offset++ } if offset != endsAt { - d.bufs.Put(buf) return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) } decoded += offset - dstEvery*i @@ -173,7 +141,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { return nil, err } } - d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 2edad3ea5a..06287f5685 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -1,506 +1,662 @@ -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" -#include "funcdata.h" -#include "go_asm.h" - -#ifdef GOAMD64_v4 -#ifndef GOAMD64_v3 -#define GOAMD64_v3 -#endif -#endif - -#define bufoff 256 // see decompress.go, we're using [4][256]byte table - -// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, -// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) -TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 -#define off R8 -#define buffer DI -#define table SI - -#define br_bits_read R9 -#define br_value R10 -#define br_offset R11 -#define peek_bits R12 -#define exhausted DX - -#define br0 R13 -#define br1 R14 -#define br2 R15 -#define br3 BP - - MOVQ BP, 0(SP) - - XORQ exhausted, exhausted // exhausted = false - XORQ off, off // off = 0 - - MOVBQZX peekBits+32(FP), peek_bits - MOVQ buf+40(FP), buffer - MOVQ tbl+48(FP), table - - MOVQ pbr0+0(FP), br0 - MOVQ pbr1+8(FP), br1 - MOVQ pbr2+16(FP), br2 - MOVQ pbr3+24(FP), br3 - +// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. + +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// func decompress4x_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_main_loop_amd64(SB), $8-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), AX + MOVBQZX 32(AX), SI + MOVQ 40(AX), DI + MOVQ DI, BX + MOVQ 72(AX), CX + MOVQ CX, (SP) + MOVQ 48(AX), R8 + MOVQ 56(AX), R9 + MOVQ (AX), R10 + MOVQ 8(AX), R11 + MOVQ 16(AX), R12 + MOVQ 24(AX), R13 + + // Main loop main_loop: - - // const stream = 0 - // br0.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read - MOVQ bitReaderShifted_value(br0), br_value - MOVQ bitReaderShifted_off(br0), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill0 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br0), AX + MOVQ BX, DI + CMPQ DI, (SP) + SETGE DL + + // br0.fillFast32() + MOVQ 32(R10), R14 + MOVBQZX 40(R10), R15 + CMPQ R15, $0x20 + JBE skip_fill0 + MOVQ 24(R10), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R10), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R10) + ORQ BP, R14 // exhausted = exhausted || (br0.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill0: - // val0 := br0.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br0.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n + MOVW (R9)(BP*2), CX -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 -#endif - - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br0.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br0.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 0(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br0) - MOVQ br_value, bitReaderShifted_value(br0) - MOVQ br_offset, bitReaderShifted_off(br0) - - // const stream = 1 - // br1.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read - MOVQ bitReaderShifted_value(br1), br_value - MOVQ bitReaderShifted_off(br1), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill1 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br1), AX + MOVQ R14, 32(R10) + MOVB R15, 40(R10) + ADDQ R8, DI + + // br1.fillFast32() + MOVQ 32(R11), R14 + MOVBQZX 40(R11), R15 + CMPQ R15, $0x20 + JBE skip_fill1 + MOVQ 24(R11), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R11), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R11) + ORQ BP, R14 // exhausted = exhausted || (br1.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill1: - // val0 := br1.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br1.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n + MOVW (R9)(BP*2), CX -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 -#endif - - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br1.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br1.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 256(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br1) - MOVQ br_value, bitReaderShifted_value(br1) - MOVQ br_offset, bitReaderShifted_off(br1) - - // const stream = 2 - // br2.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read - MOVQ bitReaderShifted_value(br2), br_value - MOVQ bitReaderShifted_off(br2), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill2 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br2), AX + MOVQ R14, 32(R11) + MOVB R15, 40(R11) + ADDQ R8, DI + + // br2.fillFast32() + MOVQ 32(R12), R14 + MOVBQZX 40(R12), R15 + CMPQ R15, $0x20 + JBE skip_fill2 + MOVQ 24(R12), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R12), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R12) + ORQ BP, R14 // exhausted = exhausted || (br2.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill2: - // val0 := br2.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 + MOVW (R9)(BP*2), CX - // br2.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else // val1 := br2.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + MOVW (R9)(BP*2), CX // br2.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n - -#endif - - ADDQ CX, br_bits_read // bits_read += n + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 512(buffer)(off*1) + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br2) - MOVQ br_value, bitReaderShifted_value(br2) - MOVQ br_offset, bitReaderShifted_off(br2) - - // const stream = 3 - // br3.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read - MOVQ bitReaderShifted_value(br3), br_value - MOVQ bitReaderShifted_off(br3), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill3 - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br3), AX + MOVQ R14, 32(R12) + MOVB R15, 40(R12) + ADDQ R8, DI + + // br3.fillFast32() + MOVQ 32(R13), R14 + MOVBQZX 40(R13), R15 + CMPQ R15, $0x20 + JBE skip_fill3 + MOVQ 24(R13), AX + SUBQ $0x20, R15 + SUBQ $0x04, AX + MOVQ (R13), BP // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) - -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX - -#endif - - ORQ AX, br_value + MOVL (AX)(BP*1), BP + MOVQ R15, CX + SHLQ CL, BP + MOVQ AX, 24(R13) + ORQ BP, R14 // exhausted = exhausted || (br3.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL - // } skip_fill3: - // val0 := br3.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask - -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask - -#endif + MOVQ R14, BP + MOVQ SI, CX + SHRQ CL, BP // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 + MOVW (R9)(BP*2), CX - // br3.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R14 + ADDB CL, R15 -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n - -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // val1 := br3.peekTopBits(peekBits) + MOVQ SI, CX + MOVQ R14, BP + SHRQ CL, BP -#endif + // v1 := table[val1&mask] + MOVW (R9)(BP*2), CX - ADDQ CX, br_bits_read // bits_read += n + // br3.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R14 + ADDB CL, R15 -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (DI) -#else - // val1 := br3.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask + // update the bitrader reader structure + MOVQ R14, 32(R13) + MOVB R15, 40(R13) + ADDQ $0x02, BX + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + MOVQ 40(AX), CX + MOVQ BX, DX + SUBQ CX, DX + SHLQ $0x02, DX + MOVQ DX, 64(AX) + RET -#endif +// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), CX + MOVBQZX 32(CX), BX + MOVQ 40(CX), SI + MOVQ SI, (SP) + MOVQ 72(CX), DX + MOVQ DX, 8(SP) + MOVQ 48(CX), DI + MOVQ 56(CX), R8 + MOVQ (CX), R9 + MOVQ 8(CX), R10 + MOVQ 16(CX), R11 + MOVQ 24(CX), R12 + + // Main loop +main_loop: + MOVQ (SP), SI + CMPQ SI, 8(SP) + SETGE DL + + // br1000.fillFast32() + MOVQ 32(R9), R13 + MOVBQZX 40(R9), R14 + CMPQ R14, $0x20 + JBE skip_fill1000 + MOVQ 24(R9), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R9), BP - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R9) + ORQ BP, R13 + + // exhausted = exhausted || (br1000.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1000: + // val0 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 - // br3.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n + // val1 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br0.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br0.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R9) + MOVB R14, 40(R9) + ADDQ DI, SI + + // br1001.fillFast32() + MOVQ 32(R10), R13 + MOVBQZX 40(R10), R14 + CMPQ R14, $0x20 + JBE skip_fill1001 + MOVQ 24(R10), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R10), BP -#endif + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R10) + ORQ BP, R13 + + // exhausted = exhausted || (br1001.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1001: + // val0 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 - ADDQ CX, br_bits_read // bits_read += n + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, 768(buffer)(off*1) + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br3) - MOVQ br_value, bitReaderShifted_value(br3) - MOVQ br_offset, bitReaderShifted_off(br3) + // val1 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br1.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br1.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R10) + MOVB R14, 40(R10) + ADDQ DI, SI + + // br1002.fillFast32() + MOVQ 32(R11), R13 + MOVBQZX 40(R11), R14 + CMPQ R14, $0x20 + JBE skip_fill1002 + MOVQ 24(R11), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R11), BP - ADDQ $2, off // off += 2 + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R11) + ORQ BP, R13 + + // exhausted = exhausted || (br1002.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1002: + // val0 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 - TESTB DH, DH // any br[i].ofs < 4? - JNZ end + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX - CMPQ off, $bufoff - JL main_loop + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 -end: - MOVQ 0(SP), BP + // val1 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br2.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br2.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R11) + MOVB R14, 40(R11) + ADDQ DI, SI + + // br1003.fillFast32() + MOVQ 32(R12), R13 + MOVBQZX 40(R12), R14 + CMPQ R14, $0x20 + JBE skip_fill1003 + MOVQ 24(R12), R15 + SUBQ $0x20, R14 + SUBQ $0x04, R15 + MOVQ (R12), BP - MOVB off, ret+56(FP) - RET + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R15)(BP*1), BP + MOVQ R14, CX + SHLQ CL, BP + MOVQ R15, 24(R12) + ORQ BP, R13 + + // exhausted = exhausted || (br1003.off < 4) + CMPQ R15, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1003: + // val0 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 -#undef off -#undef buffer -#undef table + // v0 := table[val0&mask] + MOVW (R8)(R15*2), CX -#undef br_bits_read -#undef br_value -#undef br_offset -#undef peek_bits -#undef exhausted + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 -#undef br0 -#undef br1 -#undef br2 -#undef br3 + // val1 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v1 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // val2 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v2 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R13 + ADDB CL, R14 + + // val3 := br3.peekTopBits(peekBits) + MOVQ R13, R15 + MOVQ BX, CX + SHRQ CL, R15 + + // v3 := table[val0&mask] + MOVW (R8)(R15*2), CX + + // br3.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R13 + ADDB CL, R14 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (SI) + + // update the bitreader reader structure + MOVQ R13, 32(R12) + MOVB R14, 40(R12) + ADDQ $0x04, (SP) + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + MOVQ 40(AX), CX + MOVQ (SP), DX + SUBQ CX, DX + SHLQ $0x02, DX + MOVQ DX, 64(AX) + RET diff --git a/huff0/decompress_amd64.s.in b/huff0/decompress_amd64.s.in deleted file mode 100644 index 330d86ae15..0000000000 --- a/huff0/decompress_amd64.s.in +++ /dev/null @@ -1,195 +0,0 @@ -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" -#include "funcdata.h" -#include "go_asm.h" - -#ifdef GOAMD64_v4 -#ifndef GOAMD64_v3 -#define GOAMD64_v3 -#endif -#endif - -#define bufoff 256 // see decompress.go, we're using [4][256]byte table - -//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, -// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) -TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 -#define off R8 -#define buffer DI -#define table SI - -#define br_bits_read R9 -#define br_value R10 -#define br_offset R11 -#define peek_bits R12 -#define exhausted DX - -#define br0 R13 -#define br1 R14 -#define br2 R15 -#define br3 BP - - MOVQ BP, 0(SP) - - XORQ exhausted, exhausted // exhausted = false - XORQ off, off // off = 0 - - MOVBQZX peekBits+32(FP), peek_bits - MOVQ buf+40(FP), buffer - MOVQ tbl+48(FP), table - - MOVQ pbr0+0(FP), br0 - MOVQ pbr1+8(FP), br1 - MOVQ pbr2+16(FP), br2 - MOVQ pbr3+24(FP), br3 - -main_loop: -{{ define "decode_2_values_x86" }} - // const stream = {{ var "id" }} - // br{{ var "id"}}.fillFast() - MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read - MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value - MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset - - // We must have at least 2 * max tablelog left - CMPQ br_bits_read, $64-22 - JBE skip_fill{{ var "id" }} - - SUBQ $32, br_bits_read // b.bitsRead -= 32 - SUBQ $4, br_offset // b.off -= 4 - - // v := b.in[b.off-4 : b.off] - // v = v[:4] - // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - MOVQ bitReaderShifted_in(br{{ var "id" }}), AX - - // b.value |= uint64(low) << (b.bitsRead & 63) -#ifdef GOAMD64_v3 - SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) -#else - MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) - MOVQ br_bits_read, CX - SHLQ CL, AX -#endif - - ORQ AX, br_value - - // exhausted = exhausted || (br{{ var "id"}}.off < 4) - CMPQ br_offset, $4 - SETLT DL - ORB DL, DH - // } -skip_fill{{ var "id" }}: - - // val0 := br{{ var "id"}}.peekTopBits(peekBits) -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask -#else - MOVQ br_value, AX - MOVQ peek_bits, CX - SHRQ CL, AX // AX = (value >> peek_bits) & mask -#endif - - // v0 := table[val0&mask] - MOVW 0(table)(AX*2), AX // AX - v0 - - // br{{ var "id"}}.advance(uint8(v0.entry)) - MOVB AH, BL // BL = uint8(v0.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n -#endif - - ADDQ CX, br_bits_read // bits_read += n - - -#ifdef GOAMD64_v3 - SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask -#else - // val1 := br{{ var "id"}}.peekTopBits(peekBits) - MOVQ peek_bits, CX - MOVQ br_value, AX - SHRQ CL, AX // AX = (value >> peek_bits) & mask -#endif - - // v1 := table[val1&mask] - MOVW 0(table)(AX*2), AX // AX - v1 - - // br{{ var "id"}}.advance(uint8(v1.entry)) - MOVB AH, BH // BH = uint8(v1.entry >> 8) - -#ifdef GOAMD64_v3 - MOVBQZX AL, CX - SHLXQ AX, br_value, br_value // value <<= n -#else - MOVBQZX AL, CX - SHLQ CL, br_value // value <<= n -#endif - - ADDQ CX, br_bits_read // bits_read += n - - - // these two writes get coalesced - // buf[stream][off] = uint8(v0.entry >> 8) - // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVW BX, {{ var "bufofs" }}(buffer)(off*1) - - // update the bitrader reader structure - MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }}) - MOVQ br_value, bitReaderShifted_value(br{{ var "id" }}) - MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }}) -{{ end }} - - {{ set "id" "0" }} - {{ set "ofs" "0" }} - {{ set "bufofs" "0" }} {{/* id * bufoff */}} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "1" }} - {{ set "ofs" "8" }} - {{ set "bufofs" "256" }} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "2" }} - {{ set "ofs" "16" }} - {{ set "bufofs" "512" }} - {{ template "decode_2_values_x86" . }} - - {{ set "id" "3" }} - {{ set "ofs" "24" }} - {{ set "bufofs" "768" }} - {{ template "decode_2_values_x86" . }} - - ADDQ $2, off // off += 2 - - TESTB DH, DH // any br[i].ofs < 4? - JNZ end - - CMPQ off, $bufoff - JL main_loop -end: - MOVQ 0(SP), BP - - MOVB off, ret+56(FP) - RET -#undef off -#undef buffer -#undef table - -#undef br_bits_read -#undef br_value -#undef br_offset -#undef peek_bits -#undef exhausted - -#undef br0 -#undef br1 -#undef br2 -#undef br3 diff --git a/huff0/generate.go b/huff0/generate.go deleted file mode 100644 index 95b082fd34..0000000000 --- a/huff0/generate.go +++ /dev/null @@ -1,78 +0,0 @@ -//go:build ignore -// +build ignore - -package main - -import ( - "log" - "os" - "path" - "text/template" -) - -func main() { - mapping := []struct { - template string - output string - }{{ - template: "decompress_amd64.s.in", - output: "decompress_amd64.s", - }, - { - template: "decompress_8b_amd64.s.in", - output: "decompress_8b_amd64.s", - }, - } - - for i := range mapping { - - state := make(map[string]string) - - funcMap := template.FuncMap{ - "var": func(name string) string { return state[name] }, - "set": func(name, value string) string { - state[name] = value - return "" - }, - } - - input := mapping[i].template - output := mapping[i].output - if !shouldRegenerate(input, output) { - log.Printf("%q is up to date", output) - continue - } - - tmpl, err := template.New(path.Base(input)).Funcs(funcMap).ParseFiles(input) - die(err) - - f, err := os.Create(output) - die(err) - defer f.Close() - - log.Printf("Generating %q from %q", output, input) - err = tmpl.Execute(f, nil) - die(err) - } -} - -func die(err error) { - if err != nil { - log.Fatal(err) - os.Exit(1) - } -} - -func shouldRegenerate(srcpath, dstpath string) bool { - src, err1 := os.Stat(srcpath) - if err1 != nil { - return true // I/O errors will be rediscovered later - } - - dst, err2 := os.Stat(dstpath) - if err2 != nil { - return true - } - - return src.ModTime().After(dst.ModTime()) -}