internal/lz4block: Copy literals of <=48 bytes through XMM

name                 old speed      new speed      delta
UncompressPg1661-8   1.15GB/s ± 1%  1.19GB/s ± 1%   +3.39%  (p=0.000 n=10+10)
UncompressDigits-8   1.89GB/s ± 0%  2.33GB/s ± 1%  +23.46%  (p=0.000 n=9+10)
UncompressTwain-8    1.19GB/s ± 1%  1.23GB/s ± 0%   +3.43%  (p=0.000 n=10+10)
UncompressRand-8     3.93GB/s ± 2%  3.96GB/s ± 1%      ~    (p=0.105 n=10+10)

The effect is most pronounced on Digits because 37.4% of its literals have lengths 17-48. In Twain and Pg1661, this is <4.1%. A 48-byte cutoff is faster than a 32-byte cutoff. With a 64-byte cutoff, Digits gets faster still while Twain and Pg1661 get slightly slower.
pierrec · Jan 30, 2022 · 04f2583 · 04f2583
1 parent 677f6a5
commit 04f2583
Showing 1 changed file with 12 additions and 8 deletions.
diff --git a/internal/lz4block/decode_amd64.s b/internal/lz4block/decode_amd64.s
@@ -157,24 +157,28 @@ copy_literal:
 	CMPQ BX, R8
 	JA err_short_buf
 
-	// whats a good cut off to call memmove?
-	CMPQ CX, $16
+	// Copy literals of <= 48 bytes through the XMM registers.
+	CMPQ CX, $48
 	JGT memmove_lit
 
-	// if len(dst[di:]) < 16
+	// if len(dst[di:]) < 48
 	MOVQ R8, AX
 	SUBQ DI, AX
-	CMPQ AX, $16
+	CMPQ AX, $48
 	JLT memmove_lit
 
-	// if len(src[si:]) < 16
-	MOVQ R9, AX
-	SUBQ SI, AX
-	CMPQ AX, $16
+	// if len(src[si:]) < 48
+	MOVQ R9, BX
+	SUBQ SI, BX
+	CMPQ BX, $48
 	JLT memmove_lit
 
 	MOVOU (SI), X0
+	MOVOU 16(SI), X1
+	MOVOU 32(SI), X2
 	MOVOU X0, (DI)
+	MOVOU X1, 16(DI)
+	MOVOU X2, 32(DI)
 
 	ADDQ CX, SI
 	ADDQ CX, DI