diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index ff0a1a9b72..23a26b21d4 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -302,28 +302,25 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute // Update states Comment("Update Offset State") { - nBits := GP64() + nBits := ofState // Note: SHRXQ uses lower 6 bits of shift amount and BZHIQ lower 8 bits of count lowBits := GP64() - MOVBQZX(ofState.As8(), nBits) BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) - SHRXQ(nBits, bits, bits) // bits >= nBits + SHRXQ(nBits, bits, bits) // bits >>= nBits o.nextState(name+"_ofState", ofState, lowBits, "ofTable") } Comment("Update Match Length State") { - nBits := GP64() + nBits := mlState lowBits := GP64() - MOVBQZX(mlState.As8(), nBits) - BZHIQ(nBits, bits, lowBits) // lowBits = lowBits & ((1 << nBits) - 1)) - SHRXQ(nBits, bits, bits) // lowBits >= nBits + BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) + SHRXQ(nBits, bits, bits) // bits >>= nBits o.nextState(name+"_mlState", mlState, lowBits, "mlTable") } Comment("Update Literal Length State") { - nBits := GP64() + nBits := llState lowBits := GP64() - MOVBQZX(llState.As8(), nBits) - BZHIQ(nBits, bits, lowBits) // lowBits = lowBits & ((1 << nBits) - 1)) + BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) o.nextState(name+"_llState", llState, lowBits, "llTable") } } else { diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index e7ad2a0ca2..212c6cac30 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -721,37 +721,34 @@ sequenceDecs_decode_bmi2_fill_2_end: BZHIQ R14, R15, R15 // Update Offset State - MOVBQZX R8, CX - BZHIQ CX, R15, R14 - SHRXQ CX, R15, R15 - MOVQ $0x00001010, CX - BEXTRQ CX, R8, CX - ADDQ R14, CX + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, R8, R8 + ADDQ CX, R8 // Load ctx.ofTable - MOVQ ctx+16(FP), R8 - MOVQ 48(R8), R8 - MOVQ (R8)(CX*8), R8 + 
MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 // Update Match Length State - MOVBQZX DI, CX - BZHIQ CX, R15, R14 - SHRXQ CX, R15, R15 - MOVQ $0x00001010, CX - BEXTRQ CX, DI, CX - ADDQ R14, CX + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, DI, DI + ADDQ CX, DI // Load ctx.mlTable - MOVQ ctx+16(FP), DI - MOVQ 24(DI), DI - MOVQ (DI)(CX*8), DI + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI // Update Literal Length State - MOVBQZX SI, CX - BZHIQ CX, R15, CX - MOVQ $0x00001010, R14 - BEXTRQ R14, SI, SI - ADDQ CX, SI + BZHIQ SI, R15, CX + MOVQ $0x00001010, R14 + BEXTRQ R14, SI, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -979,37 +976,34 @@ sequenceDecs_decode_56_bmi2_fill_end: BZHIQ R14, R15, R15 // Update Offset State - MOVBQZX R8, CX - BZHIQ CX, R15, R14 - SHRXQ CX, R15, R15 - MOVQ $0x00001010, CX - BEXTRQ CX, R8, CX - ADDQ R14, CX + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, R8, R8 + ADDQ CX, R8 // Load ctx.ofTable - MOVQ ctx+16(FP), R8 - MOVQ 48(R8), R8 - MOVQ (R8)(CX*8), R8 + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 // Update Match Length State - MOVBQZX DI, CX - BZHIQ CX, R15, R14 - SHRXQ CX, R15, R15 - MOVQ $0x00001010, CX - BEXTRQ CX, DI, CX - ADDQ R14, CX + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, DI, DI + ADDQ CX, DI // Load ctx.mlTable - MOVQ ctx+16(FP), DI - MOVQ 24(DI), DI - MOVQ (DI)(CX*8), DI + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI // Update Literal Length State - MOVBQZX SI, CX - BZHIQ CX, R15, CX - MOVQ $0x00001010, R14 - BEXTRQ R14, SI, SI - ADDQ CX, SI + BZHIQ SI, R15, CX + MOVQ $0x00001010, R14 + BEXTRQ R14, SI, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -2277,37 +2271,34 @@ sequenceDecs_decodeSync_bmi2_fill_2_end: BZHIQ R13, R14, R14 // Update Offset State - MOVBQZX R8, CX - BZHIQ CX, R14, R13 - SHRXQ CX, R14, R14 - MOVQ $0x00001010, CX - BEXTRQ CX, 
R8, CX - ADDQ R13, CX + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, R8, R8 + ADDQ CX, R8 // Load ctx.ofTable - MOVQ ctx+16(FP), R8 - MOVQ 48(R8), R8 - MOVQ (R8)(CX*8), R8 + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 // Update Match Length State - MOVBQZX DI, CX - BZHIQ CX, R14, R13 - SHRXQ CX, R14, R14 - MOVQ $0x00001010, CX - BEXTRQ CX, DI, CX - ADDQ R13, CX + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, DI, DI + ADDQ CX, DI // Load ctx.mlTable - MOVQ ctx+16(FP), DI - MOVQ 24(DI), DI - MOVQ (DI)(CX*8), DI + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI // Update Literal Length State - MOVBQZX SI, CX - BZHIQ CX, R14, CX - MOVQ $0x00001010, R13 - BEXTRQ R13, SI, SI - ADDQ CX, SI + BZHIQ SI, R14, CX + MOVQ $0x00001010, R13 + BEXTRQ R13, SI, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -3310,37 +3301,34 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end: BZHIQ R13, R14, R14 // Update Offset State - MOVBQZX R8, CX - BZHIQ CX, R14, R13 - SHRXQ CX, R14, R14 - MOVQ $0x00001010, CX - BEXTRQ CX, R8, CX - ADDQ R13, CX + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, R8, R8 + ADDQ CX, R8 // Load ctx.ofTable - MOVQ ctx+16(FP), R8 - MOVQ 48(R8), R8 - MOVQ (R8)(CX*8), R8 + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 // Update Match Length State - MOVBQZX DI, CX - BZHIQ CX, R14, R13 - SHRXQ CX, R14, R14 - MOVQ $0x00001010, CX - BEXTRQ CX, DI, CX - ADDQ R13, CX + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, DI, DI + ADDQ CX, DI // Load ctx.mlTable - MOVQ ctx+16(FP), DI - MOVQ 24(DI), DI - MOVQ (DI)(CX*8), DI + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI // Update Literal Length State - MOVBQZX SI, CX - BZHIQ CX, R14, CX - MOVQ $0x00001010, R13 - BEXTRQ R13, SI, SI - ADDQ CX, SI + BZHIQ SI, R14, CX + MOVQ $0x00001010, R13 + BEXTRQ R13, SI, SI + ADDQ CX, SI // Load ctx.llTable MOVQ 
ctx+16(FP), CX