diff --git a/internal/lz4block/decode_amd64.s b/internal/lz4block/decode_amd64.s index d86e981f..5f0332cb 100644 --- a/internal/lz4block/decode_amd64.s +++ b/internal/lz4block/decode_amd64.s @@ -157,24 +157,28 @@ copy_literal: CMPQ BX, R8 JA err_short_buf - // whats a good cut off to call memmove? - CMPQ CX, $16 + // Copy literals of <= 48 bytes through the XMM registers. + CMPQ CX, $48 JGT memmove_lit - // if len(dst[di:]) < 16 + // if len(dst[di:]) < 48 MOVQ R8, AX SUBQ DI, AX - CMPQ AX, $16 + CMPQ AX, $48 JLT memmove_lit - // if len(src[si:]) < 16 - MOVQ R9, AX - SUBQ SI, AX - CMPQ AX, $16 + // if len(src[si:]) < 48 + MOVQ R9, BX + SUBQ SI, BX + CMPQ BX, $48 JLT memmove_lit MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) ADDQ CX, SI ADDQ CX, DI