From 1b8091b70e226a33954f90aff96dfc15d1ef363e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Mu=C5=82a?= Date: Thu, 31 Mar 2022 08:31:40 +0200 Subject: [PATCH] Keep `off` counter in a register --- huff0/_generate/gen.go | 70 ++- huff0/decompress_amd64.s | 1000 +++++++++++++++++++------------------- 2 files changed, 524 insertions(+), 546 deletions(-) diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go index 3f236ac04..da711be2a 100644 --- a/huff0/_generate/gen.go +++ b/huff0/_generate/gen.go @@ -69,16 +69,10 @@ func (d decompress4x) generateProcedure(name string) { Pragma("noescape") out := reg.RAX // Fixed since we need 8H - offsetComp, err := ReturnIndex(0).Resolve() - if err != nil { - panic(err) - } - offP := offsetComp.Addr - { - off := GP8() - XORB(off, off) // off = 0 - MOVB(off, offP) - } + + off := GP64() + XORQ(off, off) + exhausted := reg.RBX // Fixed since we need 8H XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false @@ -97,27 +91,32 @@ func (d decompress4x) generateProcedure(name string) { Label("main_loop") br0 := Dereference(Param("pbr0")) - d.decodeTwoValues(0, br0, peekBits, table, buffer, out, exhausted, offP) + d.decodeTwoValues(0, br0, peekBits, table, buffer, off, out, exhausted) br1 := Dereference(Param("pbr1")) - d.decodeTwoValues(1, br1, peekBits, table, buffer, out, exhausted, offP) + d.decodeTwoValues(1, br1, peekBits, table, buffer, off, out, exhausted) br2 := Dereference(Param("pbr2")) - d.decodeTwoValues(2, br2, peekBits, table, buffer, out, exhausted, offP) + d.decodeTwoValues(2, br2, peekBits, table, buffer, off, out, exhausted) br3 := Dereference(Param("pbr3")) - d.decodeTwoValues(3, br3, peekBits, table, buffer, out, exhausted, offP) + d.decodeTwoValues(3, br3, peekBits, table, buffer, off, out, exhausted) - ADDB(U8(2), offP) // off += 2 + ADDB(U8(2), off.As8()) // off += 2 TESTB(exhausted.As8H(), exhausted.As8H()) // any br[i].ofs < 4? JNZ(LabelRef("done")) - CMPB(offP, U8(0)) + CMPB(off.As8(), U8(0)) JNZ(LabelRef("main_loop")) Label("done") + offsetComp, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVB(off.As8(), offsetComp.Addr) RET() } -func (d decompress4x) decodeTwoValues(id int, br gotypes.Component, peekBits, table, buffer reg.GPVirtual, out, exhausted reg.GPPhysical, offP Mem) { +func (d decompress4x) decodeTwoValues(id int, br gotypes.Component, peekBits, table, buffer, off reg.GPVirtual, out, exhausted reg.GPPhysical) { Commentf("br%d.fillFast()", id) brOffset := GP64() brBitsRead := GP64() @@ -211,8 +210,6 @@ func (d decompress4x) decodeTwoValues(id int, br gotypes.Component, peekBits, ta Comment("these two writes get coalesced") Comment("buf[stream][off] = uint8(v0.entry >> 8)") Comment("buf[stream][off+1] = uint8(v1.entry >> 8)") - off := GP64() - MOVBQZX(offP, off) MOVW(out.As16(), Mem{Base: buffer, Index: off, Scale: 1, Disp: id * buffoff}) Comment("update the bitrader reader structure") @@ -232,16 +229,10 @@ func (d decompress4x8bit) generateProcedure(name string) { Pragma("noescape") out := reg.RAX // Fixed since we need 8H - offsetComp, err := ReturnIndex(0).Resolve() - if err != nil { - panic(err) - } - offP := offsetComp.Addr - { - off := GP8() - XORB(off, off) // off = 0 - MOVB(off, offP) - } + + off := GP64() + XORQ(off, off) + exhausted := reg.RBX // Fixed since we need 8H XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false @@ -260,27 +251,32 @@ func (d decompress4x8bit) generateProcedure(name string) { Label("main_loop") br0 := Dereference(Param("pbr0")) - d.decodeFourValues(0, br0, peekBits, table, buffer, out, exhausted, offP) + d.decodeFourValues(0, br0, peekBits, table, buffer, off, out, exhausted) br1 := Dereference(Param("pbr1")) - d.decodeFourValues(1, br1, peekBits, table, buffer, out, exhausted, offP) + d.decodeFourValues(1, br1, peekBits, table, buffer, off, out, exhausted) br2 := Dereference(Param("pbr2")) - d.decodeFourValues(2, br2, peekBits, table, buffer, out, exhausted, offP) + d.decodeFourValues(2, br2, peekBits, table, buffer, off, out, exhausted) br3 := Dereference(Param("pbr3")) - d.decodeFourValues(3, br3, peekBits, table, buffer, out, exhausted, offP) + d.decodeFourValues(3, br3, peekBits, table, buffer, off, out, exhausted) - ADDB(U8(4), offP) // off += 4 + ADDB(U8(4), off.As8()) // off += 4 TESTB(exhausted.As8H(), exhausted.As8H()) // any br[i].ofs < 4? JNZ(LabelRef("done")) - CMPB(offP, U8(0)) + CMPB(off.As8(), U8(0)) JNZ(LabelRef("main_loop")) Label("done") + offsetComp, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVB(off.As8(), offsetComp.Addr) RET() } -func (d decompress4x8bit) decodeFourValues(id int, br gotypes.Component, peekBits, table, buffer reg.GPVirtual, out, exhausted reg.GPPhysical, offP Mem) { +func (d decompress4x8bit) decodeFourValues(id int, br gotypes.Component, peekBits, table, buffer, off reg.GPVirtual, out, exhausted reg.GPPhysical) { Commentf("br%d.fillFast()", id) brOffset := GP64() brBitsRead := GP64() @@ -361,8 +357,6 @@ func (d decompress4x8bit) decodeFourValues(id int, br gotypes.Component, peekBit Comment("buf[stream][off+1] = uint8(v1.entry >> 8)") Comment("buf[stream][off+2] = uint8(v2.entry >> 8)") Comment("buf[stream][off+3] = uint8(v3.entry >> 8)") - off := GP64() - MOVBQZX(offP, off) MOVL(out.As32(), Mem{Base: buffer, Index: off, Scale: 1, Disp: id * buffoff}) Comment("update the bitrader reader structure") diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 63581da7e..767f7380f 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -5,614 +5,605 @@ // func decompress4x_main_loop_amd64(pbr0 *bitReaderShifted, pbr1 *bitReaderShifted, pbr2 *bitReaderShifted, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 TEXT ·decompress4x_main_loop_amd64(SB), $0-57 - XORB AL, AL - MOVB AL, ret+56(FP) + XORQ SI, SI XORQ BX, BX // Preload values - MOVBQZX peekBits+32(FP), SI - MOVQ buf+40(FP), DI - MOVQ tbl+48(FP), R8 + MOVBQZX peekBits+32(FP), DI + MOVQ buf+40(FP), R8 + MOVQ tbl+48(FP), R9 // Main loop main_loop: - MOVQ pbr0+0(FP), R9 + MOVQ pbr0+0(FP), R10 // br0.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill0 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(AX*1), AX - MOVQ R11, CX + MOVL (R11)(AX*1), AX + MOVQ R12, CX SHLQ CL, AX - ORQ AX, R12 + ORQ AX, R13 // exhausted = exhausted || (br0.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R12, DX + MOVQ DI, CX + MOVQ R13, DX SHRQ CL, DX // v1 := table[val1&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, (DI)(CX*1) + MOVW AX, (R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr1+8(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr1+8(FP), R10 // br1.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill1 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(AX*1), AX - MOVQ R11, CX + MOVL (R11)(AX*1), AX + MOVQ R12, CX SHLQ CL, AX - ORQ AX, R12 + ORQ AX, R13 // exhausted = exhausted || (br1.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R12, DX + MOVQ DI, CX + MOVQ R13, DX SHRQ CL, DX // v1 := table[val1&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 256(DI)(CX*1) + MOVW AX, 256(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr2+16(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr2+16(FP), R10 // br2.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill2 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(AX*1), AX - MOVQ R11, CX + MOVL (R11)(AX*1), AX + MOVQ R12, CX SHLQ CL, AX - ORQ AX, R12 + ORQ AX, R13 // exhausted = exhausted || (br2.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R12, DX + MOVQ DI, CX + MOVQ R13, DX SHRQ CL, DX // v1 := table[val1&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 512(DI)(CX*1) + MOVW AX, 512(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr3+24(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr3+24(FP), R10 // br3.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill3 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(AX*1), AX - MOVQ R11, CX + MOVL (R11)(AX*1), AX + MOVQ R12, CX SHLQ CL, AX - ORQ AX, R12 + ORQ AX, R13 // exhausted = exhausted || (br3.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R12, DX + MOVQ DI, CX + MOVQ R13, DX SHRQ CL, DX // v1 := table[val1&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 768(DI)(CX*1) + MOVW AX, 768(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - ADDB $0x02, ret+56(FP) + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + ADDB $0x02, SI TESTB BH, BH JNZ done - CMPB ret+56(FP), $0x00 + CMPB SI, $0x00 JNZ main_loop done: + MOVB SI, ret+56(FP) RET // func decompress4x_main_loop_bmi2(pbr0 *bitReaderShifted, pbr1 *bitReaderShifted, pbr2 *bitReaderShifted, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 // Requires: BMI2 TEXT ·decompress4x_main_loop_bmi2(SB), $0-57 - XORB AL, AL - MOVB AL, ret+56(FP) + XORQ SI, SI XORQ BX, BX // Preload values - MOVBQZX peekBits+32(FP), SI - MOVQ buf+40(FP), DI - MOVQ tbl+48(FP), R8 + MOVBQZX peekBits+32(FP), DI + MOVQ buf+40(FP), R8 + MOVQ tbl+48(FP), R9 // Main loop main_loop: - MOVQ pbr0+0(FP), R9 + MOVQ pbr0+0(FP), R10 // br0.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill0 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(AX*1), AX - ORQ AX, R12 + SHLXQ R12, (R11)(AX*1), AX + ORQ AX, R13 // exhausted = exhausted || (br0.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill0: // val0 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val1&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, (DI)(CX*1) + MOVW AX, (R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr1+8(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr1+8(FP), R10 // br1.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill1 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(AX*1), AX - ORQ AX, R12 + SHLXQ R12, (R11)(AX*1), AX + ORQ AX, R13 // exhausted = exhausted || (br1.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill1: // val0 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val1&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 256(DI)(CX*1) + MOVW AX, 256(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr2+16(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr2+16(FP), R10 // br2.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill2 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(AX*1), AX - ORQ AX, R12 + SHLXQ R12, (R11)(AX*1), AX + ORQ AX, R13 // exhausted = exhausted || (br2.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill2: // val0 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val1&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 512(DI)(CX*1) + MOVW AX, 512(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr3+24(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr3+24(FP), R10 // br3.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x2a + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x2a JBE skip_fill3 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), AX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), AX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(AX*1), AX - ORQ AX, R12 + SHLXQ R12, (R11)(AX*1), AX + ORQ AX, R13 // exhausted = exhausted || (br3.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill3: // val0 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val1&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v1.entry)) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // these two writes get coalesced // buf[stream][off] = uint8(v0.entry >> 8) // buf[stream][off+1] = uint8(v1.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVW AX, 768(DI)(CX*1) + MOVW AX, 768(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - ADDB $0x02, ret+56(FP) + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + ADDB $0x02, SI TESTB BH, BH JNZ done - CMPB ret+56(FP), $0x00 + CMPB SI, $0x00 JNZ main_loop done: + MOVB SI, ret+56(FP) RET // func decompress4x_8b_main_loop_amd64(pbr0 *bitReaderShifted, pbr1 *bitReaderShifted, pbr2 *bitReaderShifted, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-57 - XORB AL, AL - MOVB AL, ret+56(FP) + XORQ SI, SI XORQ BX, BX // Preload values - MOVBQZX peekBits+32(FP), SI - MOVQ buf+40(FP), DI - MOVQ tbl+48(FP), R8 + MOVBQZX peekBits+32(FP), DI + MOVQ buf+40(FP), R8 + MOVQ tbl+48(FP), R9 // Main loop main_loop: - MOVQ pbr0+0(FP), R9 + MOVQ pbr0+0(FP), R10 // br0.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill0 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), DX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), DX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(DX*1), DX - MOVQ R11, CX + MOVL (R11)(DX*1), DX + MOVQ R12, CX SHLQ CL, DX - ORQ DX, R12 + ORQ DX, R13 // exhausted = exhausted || (br0.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br0.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v1 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v2 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val3 := br0.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v3 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br0.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -620,93 +611,92 @@ skip_fill0: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, (DI)(CX*1) + MOVL AX, (R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr1+8(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr1+8(FP), R10 // br1.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill1 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), DX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), DX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(DX*1), DX - MOVQ R11, CX + MOVL (R11)(DX*1), DX + MOVQ R12, CX SHLQ CL, DX - ORQ DX, R12 + ORQ DX, R13 // exhausted = exhausted || (br1.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br1.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v1 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v2 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val3 := br1.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v3 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br1.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -714,93 +704,92 @@ skip_fill1: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 256(DI)(CX*1) + MOVL AX, 256(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr2+16(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr2+16(FP), R10 // br2.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill2 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), DX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), DX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(DX*1), DX - MOVQ R11, CX + MOVL (R11)(DX*1), DX + MOVQ R12, CX SHLQ CL, DX - ORQ DX, R12 + ORQ DX, R13 // exhausted = exhausted || (br2.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br2.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v1 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v2 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val3 := br2.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v3 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br2.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -808,93 +797,92 @@ skip_fill2: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 512(DI)(CX*1) + MOVL AX, 512(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr3+24(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr3+24(FP), R10 // br3.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill3 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), DX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), DX // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R10)(DX*1), DX - MOVQ R11, CX + MOVL (R11)(DX*1), DX + MOVQ R12, CX SHLQ CL, DX - ORQ DX, R12 + ORQ DX, R13 // exhausted = exhausted || (br3.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v0 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val1 := br3.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v1 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v2 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 // val3 := br3.peekTopBits(peekBits) - MOVQ R12, DX - MOVQ SI, CX + MOVQ R13, DX + MOVQ DI, CX SHRQ CL, DX // v3 := table[val0&mask] - MOVW (R8)(DX*2), DX + MOVW (R9)(DX*2), DX // br3.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLQ CL, R12 - ADDQ CX, R11 + SHLQ CL, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -902,106 +890,105 @@ skip_fill3: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 768(DI)(CX*1) + MOVL AX, 768(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - ADDB $0x04, ret+56(FP) + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + ADDB $0x04, SI TESTB BH, BH JNZ done - CMPB ret+56(FP), $0x00 + CMPB SI, $0x00 JNZ main_loop done: + MOVB SI, ret+56(FP) RET // func decompress4x_8b_main_loop_bmi2(pbr0 *bitReaderShifted, pbr1 *bitReaderShifted, pbr2 *bitReaderShifted, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 // Requires: BMI2 TEXT ·decompress4x_8b_main_loop_bmi2(SB), $0-57 - XORB AL, AL - MOVB AL, ret+56(FP) + XORQ SI, SI XORQ BX, BX // Preload values - MOVBQZX peekBits+32(FP), SI - MOVQ buf+40(FP), DI - MOVQ tbl+48(FP), R8 + MOVBQZX peekBits+32(FP), DI + MOVQ buf+40(FP), R8 + MOVQ tbl+48(FP), R9 // Main loop main_loop: - MOVQ pbr0+0(FP), R9 + MOVQ pbr0+0(FP), R10 // br0.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill0 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), CX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), CX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(CX*1), CX - ORQ CX, R12 + SHLXQ R12, (R11)(CX*1), CX + ORQ CX, R13 // exhausted = exhausted || (br0.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill0: // val0 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v2 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val3 := br0.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v3 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br0.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -1009,83 +996,82 @@ skip_fill0: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, (DI)(CX*1) + MOVL AX, (R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr1+8(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr1+8(FP), R10 // br1.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill1 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), CX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), CX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(CX*1), CX - ORQ CX, R12 + SHLXQ R12, (R11)(CX*1), CX + ORQ CX, R13 // exhausted = exhausted || (br1.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill1: // val0 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v2 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val3 := br1.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v3 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br1.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -1093,83 +1079,82 @@ skip_fill1: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 256(DI)(CX*1) + MOVL AX, 256(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr2+16(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr2+16(FP), R10 // br2.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill2 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), CX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), CX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(CX*1), CX - ORQ CX, R12 + SHLXQ R12, (R11)(CX*1), CX + ORQ CX, R13 // exhausted = exhausted || (br2.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill2: // val0 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v2 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val3 := br2.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v3 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br2.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -1177,83 +1162,82 @@ skip_fill2: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 512(DI)(CX*1) + MOVL AX, 512(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - MOVQ pbr3+24(FP), R9 + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + MOVQ pbr3+24(FP), R10 // br3.fillFast() - MOVBQZX 40(R9), R11 - MOVQ 24(R9), R10 - MOVQ 32(R9), R12 - CMPQ R11, $0x20 + MOVBQZX 40(R10), R12 + MOVQ 24(R10), R11 + MOVQ 32(R10), R13 + CMPQ R12, $0x20 JBE skip_fill3 - SUBQ $0x20, R11 - SUBQ $0x04, R10 - MOVQ (R9), CX + SUBQ $0x20, R12 + SUBQ $0x04, R11 + MOVQ (R10), CX // b.value |= uint64(low) << (b.bitsRead & 63) - SHLXQ R11, (R10)(CX*1), CX - ORQ CX, R12 + SHLXQ R12, (R11)(CX*1), CX + ORQ CX, R13 // exhausted = exhausted || (br3.off < 4) - CMPQ R10, $0x04 + CMPQ R11, $0x04 SETLT BL ORB BL, BH skip_fill3: // val0 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v0 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v0.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val1 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v1 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v1.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v2 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v2.entry) MOVB DH, AH MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 // val3 := br3.peekTopBits(peekBits) - SHRXQ SI, R12, CX + SHRXQ DI, R13, CX // v3 := table[val0&mask] - MOVW (R8)(CX*2), DX + MOVW (R9)(CX*2), DX // br3.advance(uint8(v3.entry) MOVB DH, AL MOVBQZX DL, CX - SHLXQ DX, R12, R12 - ADDQ CX, R11 + SHLXQ DX, R13, R13 + ADDQ CX, R12 BSWAPL AX // these four writes get coalesced @@ -1261,18 +1245,18 @@ skip_fill3: // buf[stream][off+1] = uint8(v1.entry >> 8) // buf[stream][off+2] = uint8(v2.entry >> 8) // buf[stream][off+3] = uint8(v3.entry >> 8) - MOVBQZX ret+56(FP), CX - MOVL AX, 768(DI)(CX*1) + MOVL AX, 768(R8)(SI*1) // update the bitrader reader structure - MOVB R11, 40(R9) - MOVQ R12, 32(R9) - MOVQ R10, 24(R9) - ADDB $0x04, ret+56(FP) + MOVB R12, 40(R10) + MOVQ R13, 32(R10) + MOVQ R11, 24(R10) + ADDB $0x04, SI TESTB BH, BH JNZ done - CMPB ret+56(FP), $0x00 + CMPB SI, $0x00 JNZ main_loop done: + MOVB SI, ret+56(FP) RET