diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go index 485eed6472..6eab53ff4f 100644 --- a/huff0/_generate/gen.go +++ b/huff0/_generate/gen.go @@ -49,7 +49,7 @@ func (d decompress4x) generateProcedure(name string) { exhausted := GP64() XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - limitPtr := AllocLocal(8) + limit := GP64() bufferOrigin := GP64() peekBits := GP64() @@ -57,25 +57,17 @@ func (d decompress4x) generateProcedure(name string) { dstEvery := GP64() table := GP64() - br0 := GP64() - br1 := GP64() - br2 := GP64() - br3 := GP64() + br := GP64() Comment("Preload values") { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), buffer) - MOVQ(buffer, bufferOrigin) - limit := Load(ctx.Field("limit"), GP64()) - MOVQ(limit, limitPtr) + Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) - Load(ctx.Field("pbr0"), br0) - Load(ctx.Field("pbr1"), br1) - Load(ctx.Field("pbr2"), br2) - Load(ctx.Field("pbr3"), br3) + Load(ctx.Field("pbr"), br) } Comment("Main loop") @@ -83,15 +75,15 @@ func (d decompress4x) generateProcedure(name string) { MOVQ(bufferOrigin, buffer) // Check if we have space - CMPQ(buffer, limitPtr) + CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted) + d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted) + d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted) + d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted) + d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted) ADDQ(U8(2), bufferOrigin) // off += 2 @@ -100,10 +92,9 @@ func (d decompress4x) generateProcedure(name string) { { ctx := Dereference(Param("ctx")) - tmp := Load(ctx.Field("out"), GP64()) - decoded := GP64() - MOVQ(bufferOrigin, decoded) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := bufferOrigin + SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 Store(decoded, ctx.Field("decoded")) @@ -118,6 +109,7 @@ const bitReader_in = 0 const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap} const bitReader_value = bitReader_off + 8 const bitReader_bitsRead = bitReader_value + 8 +const bitReader__size = bitReader_bitsRead + 8 func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) @@ -157,9 +149,10 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") MOVW(out.As16(), Mem{Base: buffer}) - Comment("update the bitrader reader structure") - MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) - MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) + Comment("update the bitreader structure") + offset := id * bitReader__size + MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } func (d decompress4x) generateProcedure4x8bit(name string) { @@ -171,33 +164,25 @@ func (d decompress4x) generateProcedure4x8bit(name string) { exhausted := GP64() // Fixed since we need 8H XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - bufferOrigin := AllocLocal(8) - limitPtr := AllocLocal(8) + bufferOrigin := GP64() + limit := GP64() peekBits := GP64() buffer := GP64() dstEvery := GP64() table := GP64() - br0 := GP64() - br1 := GP64() - br2 := GP64() - br3 := GP64() + br := GP64() Comment("Preload values") { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), buffer) - MOVQ(buffer, bufferOrigin) - limit := Load(ctx.Field("limit"), GP64()) - MOVQ(limit, limitPtr) + Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) - Load(ctx.Field("pbr0"), br0) - Load(ctx.Field("pbr1"), br1) - Load(ctx.Field("pbr2"), br2) - Load(ctx.Field("pbr3"), br3) + Load(ctx.Field("pbr"), br) } Comment("Main loop") @@ -205,15 +190,15 @@ func (d decompress4x) generateProcedure4x8bit(name string) { MOVQ(bufferOrigin, buffer) // Check if we have space - CMPQ(buffer, limitPtr) + CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted) + d.decodeFourValues(0, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted) + d.decodeFourValues(1, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted) + d.decodeFourValues(2, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted) + d.decodeFourValues(3, br, peekBits, table, buffer, exhausted) ADDQ(U8(4), bufferOrigin) // off += 4 @@ -222,10 +207,9 @@ func (d decompress4x) generateProcedure4x8bit(name string) { { ctx := Dereference(Param("ctx")) - tmp := Load(ctx.Field("out"), GP64()) - decoded := GP64() - MOVQ(bufferOrigin, decoded) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := bufferOrigin + SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 Store(decoded, ctx.Field("decoded")) @@ -234,7 +218,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { } func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { - brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted) + brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) decompress := func(valID int, outByte reg.Register) { CX := reg.CL @@ -269,9 +253,10 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") MOVL(out.As32(), Mem{Base: buffer}) - Comment("update the bitreader reader structure") - MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) - MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) + Comment("update the bitreader structure") + offset := id * bitReader__size + MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { @@ -281,14 +266,15 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( Commentf("br%d.fillFast32()", id) brValue = GP64() brBitsRead = GP64() - MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue) - MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead) + offset := bitReader__size * id + MOVQ(Mem{Base: br, Disp: offset + bitReader_value}, brValue) + MOVBQZX(Mem{Base: br, Disp: offset + bitReader_bitsRead}, brBitsRead) // We must have at least 2 * max tablelog left CMPQ(brBitsRead, U8(64-atLeast)) JBE(LabelRef("skip_fill" + strconv.Itoa(id))) brOffset := GP64() - MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset) + MOVQ(Mem{Base: br, Disp: offset + bitReader_off}, brOffset) SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32 SUBQ(U8(4), brOffset) // b.off -= 4 @@ -297,7 +283,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( // v = v[:4] // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) tmp := GP64() - MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp) + MOVQ(Mem{Base: br, Disp: offset + bitReader_in}, tmp) Comment("b.value |= uint64(low) << (b.bitsRead & 63)") addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1} @@ -306,7 +292,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( MOVQ(brBitsRead, CX.As64()) SHLQ(CX, tmp.As64()) - MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off}) + MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off}) ORQ(tmp.As64(), brValue) { Commentf("exhausted = exhausted || (br%d.off < 4)", id) @@ -474,11 +460,9 @@ func (d decompress1x) generateProcedure(name string) { { // calculate decoded as current `out` - initial `out` ctx := Dereference(Param("ctx")) - decoded := GP64() - tmp := GP64() - MOVQ(buffer, decoded) - Load(ctx.Field("out"), tmp) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := buffer + SUBQ(ctxout.Addr, decoded) Store(decoded, ctx.Field("decoded")) pbr := Dereference(ctx.Field("pbr")) diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go index 671e630a84..9f3e9f79e2 100644 --- a/huff0/decompress_amd64.go +++ b/huff0/decompress_amd64.go @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) const fallback8BitSize = 800 type decompress4xContext struct { - pbr0 *bitReaderShifted - pbr1 *bitReaderShifted - pbr2 *bitReaderShifted - pbr3 *bitReaderShifted + pbr *[4]bitReaderShifted peekBits uint8 out *byte dstEvery int @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ - pbr0: &br[0], - pbr1: &br[1], - pbr2: &br[2], - pbr3: &br[3], + pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 6c65c6e2b2..dd1a5aecd6 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -4,45 +4,40 @@ // +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_main_loop_amd64(SB), $8-8 +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), AX - MOVBQZX 32(AX), SI - MOVQ 40(AX), DI - MOVQ DI, BX - MOVQ 72(AX), CX - MOVQ CX, (SP) - MOVQ 48(AX), R8 - MOVQ 56(AX), R9 - MOVQ (AX), R10 - MOVQ 8(AX), R11 - MOVQ 16(AX), R12 - MOVQ 24(AX), R13 + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 // Main loop main_loop: - MOVQ BX, DI - CMPQ DI, (SP) + MOVQ SI, R8 + CMPQ R8, BX SETGE DL // br0.fillFast32() - MOVQ 32(R10), R14 - MOVBQZX 40(R10), R15 - CMPQ R15, $0x20 + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 JBE skip_fill0 - MOVQ 24(R10), AX - SUBQ $0x20, R15 + MOVQ 24(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R10), BP + MOVQ (R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R10) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 // exhausted = exhausted || (br0.off < 4) CMPQ AX, $0x04 @@ -51,57 +46,57 @@ main_loop: skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R10) - MOVB R15, 40(R10) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 // br1.fillFast32() - MOVQ 32(R11), R14 - MOVBQZX 40(R11), R15 - CMPQ R15, $0x20 + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 JBE skip_fill1 - MOVQ 24(R11), AX - SUBQ $0x20, R15 + MOVQ 72(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R11), BP + MOVQ 48(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R11) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 // exhausted = exhausted || (br1.off < 4) CMPQ AX, $0x04 @@ -110,57 +105,57 @@ skip_fill0: skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R11) - MOVB R15, 40(R11) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 // br2.fillFast32() - MOVQ 32(R12), R14 - MOVBQZX 40(R12), R15 - CMPQ R15, $0x20 + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 JBE skip_fill2 - MOVQ 24(R12), AX - SUBQ $0x20, R15 + MOVQ 120(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R12), BP + MOVQ 96(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R12) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 // exhausted = exhausted || (br2.off < 4) CMPQ AX, $0x04 @@ -169,57 +164,57 @@ skip_fill1: skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R12) - MOVB R15, 40(R12) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 // br3.fillFast32() - MOVQ 32(R13), R14 - MOVBQZX 40(R13), R15 - CMPQ R15, $0x20 + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 JBE skip_fill3 - MOVQ 24(R13), AX - SUBQ $0x20, R15 + MOVQ 168(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R13), BP + MOVQ 144(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R13) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 // exhausted = exhausted || (br3.off < 4) CMPQ AX, $0x04 @@ -228,149 +223,142 @@ skip_fill2: skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R13) - MOVB R15, 40(R13) - ADDQ $0x02, BX + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ BX, DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), CX - MOVBQZX 32(CX), BX - MOVQ 40(CX), SI - MOVQ SI, (SP) - MOVQ 72(CX), DX - MOVQ DX, 8(SP) - MOVQ 48(CX), DI - MOVQ 56(CX), R8 - MOVQ (CX), R9 - MOVQ 8(CX), R10 - MOVQ 16(CX), R11 - MOVQ 24(CX), R12 + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 // Main loop main_loop: - MOVQ (SP), SI - CMPQ SI, 8(SP) + MOVQ BX, R8 + CMPQ R8, SI SETGE DL - // br1000.fillFast32() - MOVQ 32(R9), R13 - MOVBQZX 40(R9), R14 - CMPQ R14, $0x20 - JBE skip_fill1000 - MOVQ 24(R9), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R9), BP + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R9) - ORQ BP, R13 - - // exhausted = exhausted || (br1000.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1000: +skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -378,88 +366,88 @@ skip_fill1000: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R9) - MOVB R14, 40(R9) - ADDQ DI, SI - - // br1001.fillFast32() - MOVQ 32(R10), R13 - MOVBQZX 40(R10), R14 - CMPQ R14, $0x20 - JBE skip_fill1001 - MOVQ 24(R10), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R10), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R10) - ORQ BP, R13 - - // exhausted = exhausted || (br1001.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1001: +skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -467,88 +455,88 @@ skip_fill1001: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R10) - MOVB R14, 40(R10) - ADDQ DI, SI - - // br1002.fillFast32() - MOVQ 32(R11), R13 - MOVBQZX 40(R11), R14 - CMPQ R14, $0x20 - JBE skip_fill1002 - MOVQ 24(R11), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R11), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R11) - ORQ BP, R13 - - // exhausted = exhausted || (br1002.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1002: +skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -556,88 +544,88 @@ skip_fill1002: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R11) - MOVB R14, 40(R11) - ADDQ DI, SI - - // br1003.fillFast32() - MOVQ 32(R12), R13 - MOVBQZX 40(R12), R14 - CMPQ R14, $0x20 - JBE skip_fill1003 - MOVQ 24(R12), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R12), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R12) - ORQ BP, R13 - - // exhausted = exhausted || (br1003.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1003: +skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -645,20 +633,18 @@ skip_fill1003: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) + MOVL AX, (R8) - // update the bitreader reader structure - MOVQ R13, 32(R12) - MOVB R14, 40(R12) - ADDQ $0x04, (SP) + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ (SP), DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress1x_main_loop_amd64(ctx *decompress1xContext) @@ -750,10 +736,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) @@ -847,10 +831,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX)