From 257c664d416bd8deace75610eedccf6504312315 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sun, 30 Jan 2022 18:45:52 +0100
Subject: [PATCH] internal/lz4block: Copy literals of <=48 bytes through XMM regs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

name                old speed      new speed      delta
UncompressPg1661-8  1.15GB/s ± 1%  1.19GB/s ± 1%   +3.39%  (p=0.000 n=10+10)
UncompressDigits-8  1.89GB/s ± 0%  2.33GB/s ± 1%  +23.46%  (p=0.000 n=9+10)
UncompressTwain-8   1.19GB/s ± 1%  1.23GB/s ± 0%   +3.43%  (p=0.000 n=10+10)
UncompressRand-8    3.93GB/s ± 2%  3.96GB/s ± 1%      ~    (p=0.105 n=10+10)

The effect is most pronounced on Digits because 37.4% of its literals
have lengths 17-48; in Twain and Pg1661, fewer than 4.1% do. Copying 48
bytes is faster than copying 32. At 64 bytes, Digits gets faster still
while Twain and Pg1661 get slightly slower.
---
 internal/lz4block/decode_amd64.s | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/internal/lz4block/decode_amd64.s b/internal/lz4block/decode_amd64.s
index d86e981f..484b4b8d 100644
--- a/internal/lz4block/decode_amd64.s
+++ b/internal/lz4block/decode_amd64.s
@@ -157,24 +157,28 @@ copy_literal:
 	CMPQ BX, R8
 	JA err_short_buf
 
-	// whats a good cut off to call memmove?
-	CMPQ CX, $16
+	// Copy literals of <=48 bytes through the XMM registers.
+	CMPQ CX, $48
 	JGT memmove_lit
 
-	// if len(dst[di:]) < 16
+	// if len(dst[di:]) < 48
 	MOVQ R8, AX
 	SUBQ DI, AX
-	CMPQ AX, $16
+	CMPQ AX, $48
 	JLT memmove_lit
 
-	// if len(src[si:]) < 16
-	MOVQ R9, AX
-	SUBQ SI, AX
-	CMPQ AX, $16
+	// if len(src[si:]) < 48
+	MOVQ R9, BX
+	SUBQ SI, BX
+	CMPQ BX, $48
 	JLT memmove_lit
 
 	MOVOU (SI), X0
+	MOVOU 16(SI), X1
+	MOVOU 32(SI), X2
 	MOVOU X0, (DI)
+	MOVOU X1, 16(DI)
+	MOVOU X2, 32(DI)
 	ADDQ CX, SI
 	ADDQ CX, DI