diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s
index 7ba1c8a03..c68419e0c 100644
--- a/zstd/seqdec_amd64.s
+++ b/zstd/seqdec_amd64.s
@@ -25,34 +25,48 @@ TEXT ·sequenceDecs_decode_amd64(SB), NOSPLIT, $8
 /* This procedure implements the following sequence:
 
-	// s.next()
-	br.fill()
-	mo, moB := ofState.final()
-	mo += br.getBits(moB)
-
-	br.fill()
-	ml, mlB := mlState.final()
-	ml += br.getBits(mlB)
-
-	ll, llB := llState.final()
-	ll += br.getBits(llB)
-
-	br.fill()
-	if i != 0 {
-		nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
-		bits := br.get32BitsFast(nBits)
-		lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-		llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
-		lowBits = uint16(bits >> (ofState.nbBits() & 31))
-		lowBits &= bitMask[mlState.nbBits()&15]
-		mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
-		lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-		ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+	for ctx.iteration >= 0 {
+		// s.next()
+		br.fill()
+		mo, moB := ofState.final()
+		mo += br.getBits(moB)
+
+		br.fill()
+		ml, mlB := mlState.final()
+		ml += br.getBits(mlB)
+
+		ll, llB := llState.final()
+		ll += br.getBits(llB)
+
+		br.fill()
+		if ctx.iteration != 0 {
+			nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+
+		mo = s.adjustOffset(mo, ll, moB)
+
+		if ml > maxMatchLen {
+			return errorMatchLenTooBig
+		}
+		if mo == 0 && ml > 0 {
+			return errorMatchLenOfsMismatch
+		}
+
+		ctx.iteration -= 1
 	}
-	mo = s.adjustOffset(mo, ll, moB)
+
+	return 0
+
 */
 
 #define br_value R8 // br.value
 #define br_bits_read R9 // br.bitsRead
@@ -288,7 +302,7 @@ br_fill_byte_by_byte_3:
 br_fill_end_3:
 	// bitreader_fill end
 
-	// if i != 0 {
+	// if ctx.iteration != 0 {
 	// nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
 	// bits := br.get32BitsFast(nBits)
 	// lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
@@ -545,15 +559,16 @@ check_triple:
 		return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 	}
 	*/
+	XORQ AX, AX
 	TESTQ BX, BX
 	SETEQ DL
 	CMPQ CX, $0
 	SETHI AL
 	ANDQ DX, AX
-	TESTB AL, AL
+	TESTQ AX, AX
 	JNZ error_match_len_ofs_mismatch
 
-	ADDQ $24, seqs
+	ADDQ $24, seqs // sizeof(seqVals) == 3*8
 	DECQ decodeAsmContext_iteration(DI)
 	JNS main_loop
diff --git a/zstd/seqdec_amd64.s.in b/zstd/seqdec_amd64.s.in
index 0c2d5fd9e..f2ff24e70 100644
--- a/zstd/seqdec_amd64.s.in
+++ b/zstd/seqdec_amd64.s.in
@@ -185,6 +185,16 @@ br_fill_end{{.}}:
 #endif
 {{end}}
 
+{{/*
+Input:
+ AX - number of bits
+
+Output:
+ BX - value
+
+Clobbers:
+ AX, BX, CX
+*/}}
 {{define "get_bits"}}
 #ifdef GOAMD64_v3
 	LEAQ (br_bits_read)(AX*1), CX
@@ -218,34 +228,48 @@ TEXT ·sequenceDecs_decode_amd64(SB), NOSPLIT, $8
 /* This procedure implements the following sequence:
 
-	// s.next()
-	br.fill()
-	mo, moB := ofState.final()
-	mo += br.getBits(moB)
+	for ctx.iteration >= 0 {
+		// s.next()
+		br.fill()
+		mo, moB := ofState.final()
+		mo += br.getBits(moB)
+
+		br.fill()
+		ml, mlB := mlState.final()
+		ml += br.getBits(mlB)
 
-	br.fill()
-	ml, mlB := mlState.final()
-	ml += br.getBits(mlB)
+		ll, llB := llState.final()
+		ll += br.getBits(llB)
 
-	ll, llB := llState.final()
-	ll += br.getBits(llB)
+		br.fill()
+		if ctx.iteration != 0 {
+			nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 
-	br.fill()
-	if i != 0 {
-		nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
-		bits := br.get32BitsFast(nBits)
-		lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-		llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
 
-		lowBits = uint16(bits >> (ofState.nbBits() & 31))
-		lowBits &= bitMask[mlState.nbBits()&15]
-		mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
 
-		lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-		ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		mo = s.adjustOffset(mo, ll, moB)
+
+		if ml > maxMatchLen {
+			return errorMatchLenTooBig
+		}
+		if mo == 0 && ml > 0 {
+			return errorMatchLenOfsMismatch
+		}
+
+		ctx.iteration -= 1
 	}
-	mo = s.adjustOffset(mo, ll, moB)
+
+	return 0
+
 */
 
 #define br_value R8 // br.value
 #define br_bits_read R9 // br.bitsRead
@@ -316,7 +340,7 @@ main_loop:
 	{{template "bitreader_fill" .}}
 {{end}}
 
-	// if i != 0 {
+	// if ctx.iteration != 0 {
 	// nBits := ctx.llState.nbBits() + ctx.mlState.nbBits() + ctx.ofState.nbBits()
 	// bits := br.get32BitsFast(nBits)
 	// lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
@@ -420,20 +444,22 @@ check_triple:
 		return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 	}
 	*/
+	XORQ AX, AX
 	TESTQ BX, BX
 	SETEQ DL
 	CMPQ CX, $0
 	SETHI AL
 	ANDQ DX, AX
-	TESTB AL, AL
+	TESTQ AX, AX
 	JNZ error_match_len_ofs_mismatch
 
-	ADDQ $24, seqs
+	ADDQ $24, seqs // sizeof(seqVals) == 3*8
 	DECQ decodeAsmContext_iteration(DI)
 	JNS main_loop
 
 	XORQ AX, AX
+end:
 	MOVQ 0(SP), BP
 	MOVQ AX, ret+24(FP)
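Note on the check_triple change in both files: SETEQ and SETHI write only the low byte of their destination, so once the final test is widened from TESTB AL, AL to TESTQ AX, AX the full register must be known to be zero first, which is what the new XORQ AX, AX provides. The sketch below spells out, in Go, the condition that the XORQ/SETEQ/SETHI/ANDQ/TESTQ sequence evaluates branchlessly, namely mo == 0 && ml > 0 from the quoted pseudocode; the mapping of mo to BX and ml to CX is an assumption inferred from that pseudocode, not something the hunk states.

```go
package main

import "fmt"

// matchLenOfsMismatch evaluates "mo == 0 && ml > 0" the way the assembly
// does: two SETcc-style flags combined with AND, then a single test,
// instead of two conditional branches. The mo->BX, ml->CX register
// assignment is an assumption based on the quoted pseudocode.
func matchLenOfsMismatch(mo, ml uint64) bool {
	var moZero, mlPos uint64
	if mo == 0 { // TESTQ BX, BX; SETEQ DL
		moZero = 1
	}
	if ml > 0 { // CMPQ CX, $0; SETHI AL
		mlPos = 1
	}
	// ANDQ DX, AX; TESTQ AX, AX; JNZ error_match_len_ofs_mismatch
	return moZero&mlPos != 0
}

func main() {
	fmt.Println(matchLenOfsMismatch(0, 5)) // true: zero offset with a non-zero match length
	fmt.Println(matchLenOfsMismatch(3, 5)) // false: valid pair
}
```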