From cc3f1104bbea757cad6cd1f2b1cd7e9bd62199c7 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sat, 2 Jul 2022 10:13:01 +0200
Subject: [PATCH] zstd: Optimize seqdec amd64 asm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

copyMemoryPrecise now generates a loop over 16-byte blocks with a single
branchless 16-byte fixup after it.

This is a tiny bit faster on the whole and quite a bit faster for some
inputs. Benchmark results on Intel Core i7-3770K:

	name                                                         old speed      new speed      delta
	Decoder_DecoderSmall/kppkn.gtb.zst-8                          369MB/s ± 0%   374MB/s ± 1%  +1.56%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/geo.protodata.zst-8                      977MB/s ± 0%  1056MB/s ± 1%  +8.17%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/plrabn12.txt.zst-8                       291MB/s ± 0%   289MB/s ± 0%  -0.74%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/lcet10.txt.zst-8                         329MB/s ± 1%   333MB/s ± 0%  +1.23%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/asyoulik.txt.zst-8                       310MB/s ± 0%   310MB/s ± 1%    ~     (p=1.000 n=5+5)
	Decoder_DecoderSmall/alice29.txt.zst-8                        291MB/s ± 0%   291MB/s ± 1%    ~     (p=0.421 n=5+5)
	Decoder_DecoderSmall/html_x_4.zst-8                          2.07GB/s ± 0%  2.15GB/s ± 2%  +4.05%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/paper-100k.pdf.zst-8                    3.58GB/s ± 3%  3.74GB/s ± 1%  +4.31%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/fireworks.jpeg.zst-8                    8.57GB/s ± 0%  8.60GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecoderSmall/urls.10K.zst-8                           474MB/s ± 1%   507MB/s ± 1%  +6.80%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/html.zst-8                               745MB/s ± 0%   803MB/s ± 0%  +7.68%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/comp-data.bin.zst-8                      399MB/s ± 1%   400MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAll/kppkn.gtb.zst-8                             521MB/s ± 0%   521MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAll/geo.protodata.zst-8                        1.27GB/s ± 1%  1.29GB/s ± 0%  +1.19%  (p=0.008 n=5+5)
	Decoder_DecodeAll/plrabn12.txt.zst-8                          429MB/s ± 0%   427MB/s ± 0%  -0.51%  (p=0.032 n=5+5)
	Decoder_DecodeAll/lcet10.txt.zst-8                            435MB/s ± 0%   439MB/s ± 0%  +0.94%  (p=0.008 n=5+5)
	Decoder_DecodeAll/asyoulik.txt.zst-8                          438MB/s ± 0%   436MB/s ± 0%  -0.39%  (p=0.008 n=5+5)
	Decoder_DecodeAll/alice29.txt.zst-8                           423MB/s ± 0%   420MB/s ± 1%  -0.72%  (p=0.008 n=5+5)
	Decoder_DecodeAll/html_x_4.zst-8                             1.59GB/s ± 0%  1.59GB/s ± 1%  +0.54%  (p=0.032 n=5+5)
	Decoder_DecodeAll/paper-100k.pdf.zst-8                       4.53GB/s ± 1%  4.54GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAll/fireworks.jpeg.zst-8                       9.64GB/s ± 1%  9.57GB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAll/urls.10K.zst-8                              683MB/s ± 0%   681MB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAll/html.zst-8                                 1.04GB/s ± 1%  1.06GB/s ± 0%  +1.77%  (p=0.008 n=5+5)
	Decoder_DecodeAll/comp-data.bin.zst-8                         398MB/s ± 1%   399MB/s ± 1%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8    439MB/s ± 0%   437MB/s ± 0%  -0.39%  (p=0.016 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8    448MB/s ± 0%   448MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8     478MB/s ± 0%   477MB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8       463MB/s ± 0%   460MB/s ± 0%  -0.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/e.txt/fastest-8                       9.62GB/s ± 3%  9.66GB/s ± 1%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/e.txt/default-8                        394MB/s ± 0%   395MB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/e.txt/better-8                         438MB/s ± 0%   442MB/s ± 0%  +0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/e.txt/best-8                           501MB/s ± 0%   506MB/s ± 0%  +1.07%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8           1.04GB/s ± 0%  1.05GB/s ± 1%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/default-8           1.20GB/s ± 1%  1.20GB/s ± 1%    ~     (p=0.095 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/better-8            1.01GB/s ± 0%  1.00GB/s ± 1%  -0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/best-8               386MB/s ± 0%   383MB/s ± 0%  -0.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/fastest-8               271MB/s ± 1%   275MB/s ± 1%  +1.59%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/default-8               224MB/s ± 1%   223MB/s ± 1%    ~     (p=0.222 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/better-8                228MB/s ± 0%   226MB/s ± 0%  -0.89%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/best-8                  223MB/s ± 1%   221MB/s ± 1%  -1.03%  (p=0.016 n=5+5)
	Decoder_DecodeAllFiles/html.txt/fastest-8                     592MB/s ± 1%   611MB/s ± 0%  +3.20%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/default-8                     597MB/s ± 0%   607MB/s ± 0%  +1.71%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/better-8                      623MB/s ± 0%   633MB/s ± 0%  +1.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/best-8                        603MB/s ± 0%   610MB/s ± 0%  +1.25%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/fastest-8                      9.59GB/s ± 1%  9.70GB/s ± 1%  +1.16%  (p=0.032 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/default-8                       391MB/s ± 0%   393MB/s ± 0%  +0.62%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/better-8                        437MB/s ± 1%   441MB/s ± 2%    ~     (p=0.087 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/best-8                          501MB/s ± 0%   507MB/s ± 0%  +1.22%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/fastest-8                 1.66GB/s ± 1%  1.70GB/s ± 0%  +2.49%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/default-8                 1.49GB/s ± 0%  1.51GB/s ± 0%  +1.18%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/better-8                  1.87GB/s ± 0%  1.90GB/s ± 1%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/best-8                    1.44GB/s ± 1%  1.46GB/s ± 0%  +1.75%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/fastest-8                  9.64GB/s ± 1%  9.66GB/s ± 1%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/default-8                  9.70GB/s ± 1%  9.70GB/s ± 2%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/better-8                   9.71GB/s ± 1%  9.79GB/s ± 1%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/best-8                     9.76GB/s ± 0%  9.80GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-8  1.85GB/s ± 0%  1.85GB/s ± 0%  -0.31%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-8  1.86GB/s ± 0%  1.85GB/s ± 0%  -0.47%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-8   2.00GB/s ± 0%  2.00GB/s ± 0%  -0.32%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-8     1.93GB/s ± 0%  1.93GB/s ± 0%  -0.22%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/fastest-8                      37.7GB/s ± 0%  37.5GB/s ± 0%  -0.38%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/default-8                      1.68GB/s ± 0%  1.69GB/s ± 0%  +0.55%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/better-8                       1.91GB/s ± 0%  1.92GB/s ± 0%  +0.96%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/best-8                         2.22GB/s ± 0%  2.25GB/s ± 0%  +1.50%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/fastest-8          5.18GB/s ± 0%  5.05GB/s ± 2%  -2.50%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/default-8          5.50GB/s ± 1%  5.34GB/s ± 1%  -2.86%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/better-8           5.11GB/s ± 0%  5.14GB/s ± 0%  +0.57%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/best-8             2.36GB/s ± 0%  2.37GB/s ± 0%  +0.20%  (p=0.032 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/fastest-8             1.16GB/s ± 0%  1.16GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/default-8             1.09GB/s ± 0%  1.08GB/s ± 0%  -1.19%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/better-8              1.09GB/s ± 0%  1.08GB/s ± 1%  -0.96%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/best-8                1.03GB/s ± 3%  1.02GB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/fastest-8                   2.50GB/s ± 1%  2.56GB/s ± 0%  +2.39%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/default-8                   2.51GB/s ± 0%  2.55GB/s ± 0%  +1.69%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/better-8                    2.61GB/s ± 0%  2.66GB/s ± 0%  +1.93%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/best-8                      2.53GB/s ± 0%  2.56GB/s ± 0%  +1.13%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/fastest-8                     37.8GB/s ± 0%  37.6GB/s ± 0%  -0.44%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/default-8                     1.67GB/s ± 0%  1.68GB/s ± 0%  +0.61%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/better-8                      1.91GB/s ± 0%  1.93GB/s ± 0%  +0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/best-8                        2.23GB/s ± 0%  2.26GB/s ± 0%  +1.35%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/fastest-8                6.99GB/s ± 0%  7.00GB/s ± 0%    ~     (p=0.690 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/default-8                6.88GB/s ± 0%  6.87GB/s ± 0%    ~     (p=0.222 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/better-8                 8.49GB/s ± 0%  8.44GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/best-8                   6.59GB/s ± 1%  6.53GB/s ± 1%  -0.96%  (p=0.032 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/fastest-8                 37.8GB/s ± 0%  37.5GB/s ± 0%  -0.86%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/default-8                 37.9GB/s ± 1%  38.0GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/better-8                  37.9GB/s ± 0%  37.8GB/s ± 2%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/best-8                    37.8GB/s ± 0%  38.0GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllParallel/kppkn.gtb.zst-8                    2.20GB/s ± 0%  2.20GB/s ± 0%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllParallel/geo.protodata.zst-8                5.37GB/s ± 0%  5.39GB/s ± 0%  +0.35%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/plrabn12.txt.zst-8                 1.77GB/s ± 0%  1.76GB/s ± 0%  -0.19%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/lcet10.txt.zst-8                   1.90GB/s ± 0%  1.92GB/s ± 0%  +0.80%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/asyoulik.txt.zst-8                 1.83GB/s ± 0%  1.83GB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllParallel/alice29.txt.zst-8                  1.74GB/s ± 0%  1.74GB/s ± 0%    ~     (p=0.548 n=5+5)
	Decoder_DecodeAllParallel/html_x_4.zst-8                     6.55GB/s ± 0%  6.49GB/s ± 0%  -0.97%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/paper-100k.pdf.zst-8               18.3GB/s ± 0%  18.3GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllParallel/fireworks.jpeg.zst-8               37.4GB/s ± 0%  37.2GB/s ± 1%  -0.57%  (p=0.016 n=4+5)
	Decoder_DecodeAllParallel/urls.10K.zst-8                     2.97GB/s ± 0%  2.96GB/s ± 0%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllParallel/html.zst-8                         4.42GB/s ± 1%  4.43GB/s ± 0%    ~     (p=0.556 n=5+4)
	Decoder_DecodeAllParallel/comp-data.bin.zst-8                1.69GB/s ± 1%  1.70GB/s ± 0%  +0.84%  (p=0.008 n=5+5)
	[Geo mean]                                                   1.77GB/s       1.78GB/s       +0.57%
---
 zstd/_generate/gen.go |  62 ++--
 zstd/seqdec_amd64.s   | 693 ++++++++++++++++++++++++++++--------------
 2 files changed, 506 insertions(+), 249 deletions(-)

diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go
index c7fe02b305..96671d414d 100644
--- a/zstd/_generate/gen.go
+++ b/zstd/_generate/gen.go
@@ -1135,9 +1135,9 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
 			e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
 		} else {
 			e.copyMemoryND("1", c.literals, c.outBase, ll)
+			ADDQ(ll, c.literals)
+			ADDQ(ll, c.outBase)
 		}
-		ADDQ(ll, c.literals)
-		ADDQ(ll, c.outBase)
 		ADDQ(ll, c.outPosition)
 	}
 
@@ -1203,7 +1203,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
 		*/
 		e.copyMemoryPrecise("4", ptr, c.outBase, ml)
 		ADDQ(ml, c.outPosition)
-		ADDQ(ml, c.outBase)
 		// Note: for the current go tests this branch is taken in 99.53% cases,
 		//       this is why we repeat a little code here.
 		handleLoop()
@@ -1219,7 +1218,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
 		    }
 		*/
 		e.copyMemoryPrecise("5", ptr, c.outBase, v)
-		ADDQ(v, c.outBase)
 		ADDQ(v, c.outPosition)
 		SUBQ(v, ml)
 		// fallback to the next block
@@ -1254,7 +1252,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
 			ADDQ(ml, c.outPosition)
 			if e.safeMem {
 				e.copyMemoryPrecise("2", src, c.outBase, ml)
-				ADDQ(ml, c.outBase)
 			} else {
 				dst := GP64()
 				MOVQ(c.outBase, dst)
@@ -1312,9 +1309,43 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
 }
 
 // copyMemoryPrecise will copy memory in blocks of 16 bytes,
-// without overwriting nor overreading.
+// without overreading. It adds length to src and dst,
+// preserving length.
 func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
-	label := "copy_" + suffix
+	n := GP64()
+	MOVQ(length, n)
+	SUBQ(U8(16), n)
+	JB(LabelRef("copy_" + suffix + "_small"))
+
+	// If length >= 16, copy blocks of 16 bytes and handle any remainder
+	// by a block copy that overlaps with the last full block.
+	{
+		t := XMM()
+
+		loop := "copy_" + suffix + "_loop"
+		Label(loop)
+		{
+			MOVUPS(Mem{Base: src}, t)
+			MOVUPS(t, Mem{Base: dst})
+			ADDQ(U8(16), src)
+			ADDQ(U8(16), dst)
+			SUBQ(U8(16), n)
+			JAE(LabelRef(loop))
+		}
+
+		// n is now in the range [-16,-1].
+		// -16 means we copy the entire last block again.
+		// That should happen about 1/16th of the time,
+		// so we don't bother to check for it.
+		LEAQ(Mem{Base: src, Index: n, Disp: 16, Scale: 1}, src)
+		LEAQ(Mem{Base: dst, Index: n, Disp: 16, Scale: 1}, dst)
+		MOVUPS(Mem{Base: src, Disp: -16}, t)
+		MOVUPS(t, Mem{Base: dst, Disp: -16})
+
+		JMP(LabelRef("copy_" + suffix + "_end"))
+	}
+
+	Label("copy_" + suffix + "_small")
 	ofs := GP64()
 	s := Mem{Base: src, Index: ofs, Scale: 1}
 	d := Mem{Base: dst, Index: ofs, Scale: 1}
@@ -1351,23 +1382,18 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV
 
 	Label("copy_" + suffix + "_qword")
 	TESTQ(U32(0x8), length)
-	JZ(LabelRef("copy_" + suffix + "_test"))
+	JZ(LabelRef("copy_" + suffix + "_add"))
 
 	// copy eight bytes if length & 0x08 != 0
 	MOVQ(s, tmp)
 	MOVQ(tmp, d)
 	ADDQ(U8(8), ofs)
-	JMP(LabelRef("copy_" + suffix + "_test"))
 
-	// copy in 16-byte chunks
-	Label(label)
-	t := XMM()
-	MOVUPS(s, t)
-	MOVUPS(t, d)
-	ADDQ(U8(16), ofs)
-	Label("copy_" + suffix + "_test")
-	CMPQ(ofs, length)
-	JB(LabelRef(label))
+	Label("copy_" + suffix + "_add")
+	ADDQ(length, dst)
+	ADDQ(length, src)
+
+	Label("copy_" + suffix + "_end")
 }
 
 // copyOverlappedMemory will copy one byte at the time from src to dst.
diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s
index 212c6cac30..9d76f0580f 100644
--- a/zstd/seqdec_amd64.s
+++ b/zstd/seqdec_amd64.s
@@ -1181,13 +1181,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  R12, R11
-	SUBQ  DI, R11
-	JLS   copy_match
-	MOVQ  R9, R14
-	SUBQ  R11, R14
-	CMPQ  R13, R11
-	JGE   copy_all_from_history
+	MOVQ R12, R11
+	SUBQ DI, R11
+	JLS  copy_match
+	MOVQ R9, R14
+	SUBQ R11, R14
+	CMPQ R13, R11
+	JGE  copy_all_from_history
+	MOVQ R13, R11
+	SUBQ $0x10, R11
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R11
+	JAE    copy_4_loop
+	LEAQ   16(R14)(R11*1), R14
+	LEAQ   16(BX)(R11*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  R11, R11
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -1211,22 +1229,17 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(R11*1), R12
 	MOVQ  R12, (BX)(R11*1)
 	ADDQ  $0x08, R11
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(R11*1), X0
-	MOVUPS X0, (BX)(R11*1)
-	ADDQ   $0x10, R11
+copy_4_add:
+	ADDQ R13, BX
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ R11, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, DI
-	ADDQ R13, BX
 	ADDQ $0x18, AX
 	INCQ DX
 	CMPQ DX, CX
@@ -1234,6 +1247,24 @@ copy_4_test:
 	JMP  loop_finished
 
 copy_all_from_history:
+	MOVQ R11, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(BX)(R15*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, R11
 	JZ    copy_5_word
@@ -1257,21 +1288,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, R11
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (BX)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
-
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (BX)(R15*1)
-	ADDQ   $0x10, R15
 
-copy_5_test:
-	CMPQ R15, R11
-	JB   copy_5
+copy_5_add:
 	ADDQ R11, BX
+	ADDQ R11, R14
+
+copy_5_end:
 	ADDQ R11, DI
 	SUBQ R11, R13
 
@@ -1382,6 +1408,24 @@ main_loop:
 	// Copy literals
 	TESTQ R11, R11
 	JZ    check_offset
+	MOVQ  R11, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (SI), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, SI
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(SI)(R14*1), SI
+	LEAQ   16(BX)(R14*1), BX
+	MOVUPS -16(SI), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_1_end
+
+copy_1_small:
 	XORQ  R14, R14
 	TESTQ $0x00000001, R11
 	JZ    copy_1_word
@@ -1405,22 +1449,16 @@ copy_1_dword:
 
 copy_1_qword:
 	TESTQ $0x00000008, R11
-	JZ    copy_1_test
+	JZ    copy_1_add
 	MOVQ  (SI)(R14*1), R15
 	MOVQ  R15, (BX)(R14*1)
 	ADDQ  $0x08, R14
-	JMP   copy_1_test
 
-copy_1:
-	MOVUPS (SI)(R14*1), X0
-	MOVUPS X0, (BX)(R14*1)
-	ADDQ   $0x10, R14
-
-copy_1_test:
-	CMPQ R14, R11
-	JB   copy_1
-	ADDQ R11, SI
+copy_1_add:
 	ADDQ R11, BX
+	ADDQ R11, SI
+
+copy_1_end:
 	ADDQ R11, DI
 
 	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -1432,13 +1470,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  R12, R11
-	SUBQ  DI, R11
-	JLS   copy_match
-	MOVQ  R9, R14
-	SUBQ  R11, R14
-	CMPQ  R13, R11
-	JGE   copy_all_from_history
+	MOVQ R12, R11
+	SUBQ DI, R11
+	JLS  copy_match
+	MOVQ R9, R14
+	SUBQ R11, R14
+	CMPQ R13, R11
+	JGE  copy_all_from_history
+	MOVQ R13, R11
+	SUBQ $0x10, R11
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R11
+	JAE    copy_4_loop
+	LEAQ   16(R14)(R11*1), R14
+	LEAQ   16(BX)(R11*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  R11, R11
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -1462,22 +1518,17 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(R11*1), R12
 	MOVQ  R12, (BX)(R11*1)
 	ADDQ  $0x08, R11
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(R11*1), X0
-	MOVUPS X0, (BX)(R11*1)
-	ADDQ   $0x10, R11
+copy_4_add:
+	ADDQ R13, BX
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ R11, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, DI
-	ADDQ R13, BX
 	ADDQ $0x18, AX
 	INCQ DX
 	CMPQ DX, CX
@@ -1485,6 +1536,24 @@ copy_4_test:
 	JMP  loop_finished
 
 copy_all_from_history:
+	MOVQ R11, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(BX)(R15*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, R11
 	JZ    copy_5_word
@@ -1508,21 +1577,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, R11
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (BX)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
-
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (BX)(R15*1)
-	ADDQ   $0x10, R15
 
-copy_5_test:
-	CMPQ R15, R11
-	JB   copy_5
+copy_5_add:
 	ADDQ R11, BX
+	ADDQ R11, R14
+
+copy_5_end:
 	ADDQ R11, DI
 	SUBQ R11, R13
 
@@ -1538,7 +1602,25 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	ADDQ  R13, DI
+	ADDQ R13, DI
+	MOVQ R13, R12
+	SUBQ $0x10, R12
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (R11), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R12
+	JAE    copy_2_loop
+	LEAQ   16(R11)(R12*1), R11
+	LEAQ   16(BX)(R12*1), BX
+	MOVUPS -16(R11), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_2_end
+
+copy_2_small:
 	XORQ  R12, R12
 	TESTQ $0x00000001, R13
 	JZ    copy_2_word
@@ -1562,22 +1644,17 @@ copy_2_dword:
 
 copy_2_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_2_test
+	JZ    copy_2_add
 	MOVQ  (R11)(R12*1), R14
 	MOVQ  R14, (BX)(R12*1)
 	ADDQ  $0x08, R12
-	JMP   copy_2_test
 
-copy_2:
-	MOVUPS (R11)(R12*1), X0
-	MOVUPS X0, (BX)(R12*1)
-	ADDQ   $0x10, R12
-
-copy_2_test:
-	CMPQ R12, R13
-	JB   copy_2
+copy_2_add:
 	ADDQ R13, BX
-	JMP  handle_loop
+	ADDQ R13, R11
+
+copy_2_end:
+	JMP handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
@@ -1934,13 +2011,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  CX, AX
-	SUBQ  R12, AX
-	JLS   copy_match
-	MOVQ  48(SP), R14
-	SUBQ  AX, R14
-	CMPQ  R13, AX
-	JGE   copy_all_from_history
+	MOVQ CX, AX
+	SUBQ R12, AX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ AX, R14
+	CMPQ R13, AX
+	JGE  copy_all_from_history
+	MOVQ R13, AX
+	SUBQ $0x10, AX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, AX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(AX*1), R14
+	LEAQ   16(R10)(AX*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  AX, AX
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -1964,26 +2059,39 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(AX*1), CX
 	MOVQ  CX, (R10)(AX*1)
 	ADDQ  $0x08, AX
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(AX*1), X0
-	MOVUPS X0, (R10)(AX*1)
-	ADDQ   $0x10, AX
+copy_4_add:
+	ADDQ R13, R10
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ AX, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, R12
-	ADDQ R13, R10
 	JMP  handle_loop
 	JMP loop_finished
 
 copy_all_from_history:
+	MOVQ AX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R10)(R15*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, AX
 	JZ    copy_5_word
@@ -2007,21 +2115,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, AX
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (R10)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
-
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (R10)(R15*1)
-	ADDQ   $0x10, R15
 
-copy_5_test:
-	CMPQ R15, AX
-	JB   copy_5
+copy_5_add:
 	ADDQ AX, R10
+	ADDQ AX, R14
+
+copy_5_end:
 	ADDQ AX, R12
 	SUBQ AX, R13
 
@@ -2407,13 +2510,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  R12, CX
-	SUBQ  R11, CX
-	JLS   copy_match
-	MOVQ  48(SP), R14
-	SUBQ  CX, R14
-	CMPQ  R13, CX
-	JGE   copy_all_from_history
+	MOVQ R12, CX
+	SUBQ R11, CX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ CX, R14
+	CMPQ R13, CX
+	JGE  copy_all_from_history
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, CX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(CX*1), R14
+	LEAQ   16(R9)(CX*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  CX, CX
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -2437,26 +2558,39 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(CX*1), R12
 	MOVQ  R12, (R9)(CX*1)
 	ADDQ  $0x08, CX
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(CX*1), X0
-	MOVUPS X0, (R9)(CX*1)
-	ADDQ   $0x10, CX
+copy_4_add:
+	ADDQ R13, R9
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ CX, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, R11
-	ADDQ R13, R9
 	JMP  handle_loop
 	JMP loop_finished
 
 copy_all_from_history:
+	MOVQ CX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R9)(R15*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, CX
 	JZ    copy_5_word
@@ -2480,21 +2614,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, CX
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (R9)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
-
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (R9)(R15*1)
-	ADDQ   $0x10, R15
 
-copy_5_test:
-	CMPQ R15, CX
-	JB   copy_5
+copy_5_add:
 	ADDQ CX, R9
+	ADDQ CX, R14
+
+copy_5_end:
 	ADDQ CX, R11
 	SUBQ CX, R13
 
@@ -2885,6 +3014,24 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
 	// Copy literals
 	TESTQ AX, AX
 	JZ    check_offset
+	MOVQ  AX, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (R11), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(R11)(R14*1), R11
+	LEAQ   16(R10)(R14*1), R10
+	MOVUPS -16(R11), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_1_end
+
+copy_1_small:
 	XORQ  R14, R14
 	TESTQ $0x00000001, AX
 	JZ    copy_1_word
@@ -2908,22 +3055,16 @@ copy_1_dword:
 
 copy_1_qword:
 	TESTQ $0x00000008, AX
-	JZ    copy_1_test
+	JZ    copy_1_add
 	MOVQ  (R11)(R14*1), R15
 	MOVQ  R15, (R10)(R14*1)
 	ADDQ  $0x08, R14
-	JMP   copy_1_test
 
-copy_1:
-	MOVUPS (R11)(R14*1), X0
-	MOVUPS X0, (R10)(R14*1)
-	ADDQ   $0x10, R14
-
-copy_1_test:
-	CMPQ R14, AX
-	JB   copy_1
-	ADDQ AX, R11
+copy_1_add:
 	ADDQ AX, R10
+	ADDQ AX, R11
+
+copy_1_end:
 	ADDQ AX, R12
 
 	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -2936,13 +3077,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  CX, AX
-	SUBQ  R12, AX
-	JLS   copy_match
-	MOVQ  48(SP), R14
-	SUBQ  AX, R14
-	CMPQ  R13, AX
-	JGE   copy_all_from_history
+	MOVQ CX, AX
+	SUBQ R12, AX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ AX, R14
+	CMPQ R13, AX
+	JGE  copy_all_from_history
+	MOVQ R13, AX
+	SUBQ $0x10, AX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, AX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(AX*1), R14
+	LEAQ   16(R10)(AX*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  AX, AX
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -2966,26 +3125,39 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(AX*1), CX
 	MOVQ  CX, (R10)(AX*1)
 	ADDQ  $0x08, AX
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(AX*1), X0
-	MOVUPS X0, (R10)(AX*1)
-	ADDQ   $0x10, AX
+copy_4_add:
+	ADDQ R13, R10
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ AX, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, R12
-	ADDQ R13, R10
 	JMP  handle_loop
 	JMP loop_finished
 
 copy_all_from_history:
+	MOVQ AX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R10)(R15*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, AX
 	JZ    copy_5_word
@@ -3009,21 +3181,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, AX
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (R10)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
 
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (R10)(R15*1)
-	ADDQ   $0x10, R15
-
-copy_5_test:
-	CMPQ R15, AX
-	JB   copy_5
+copy_5_add:
 	ADDQ AX, R10
+	ADDQ AX, R14
+
+copy_5_end:
 	ADDQ AX, R12
 	SUBQ AX, R13
 
@@ -3039,7 +3206,25 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	ADDQ  R13, R12
+	ADDQ R13, R12
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (AX), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, AX
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, CX
+	JAE    copy_2_loop
+	LEAQ   16(AX)(CX*1), AX
+	LEAQ   16(R10)(CX*1), R10
+	MOVUPS -16(AX), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_2_end
+
+copy_2_small:
 	XORQ  CX, CX
 	TESTQ $0x00000001, R13
 	JZ    copy_2_word
@@ -3063,22 +3248,17 @@ copy_2_dword:
 
 copy_2_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_2_test
+	JZ    copy_2_add
 	MOVQ  (AX)(CX*1), R14
 	MOVQ  R14, (R10)(CX*1)
 	ADDQ  $0x08, CX
-	JMP   copy_2_test
 
-copy_2:
-	MOVUPS (AX)(CX*1), X0
-	MOVUPS X0, (R10)(CX*1)
-	ADDQ   $0x10, CX
-
-copy_2_test:
-	CMPQ CX, R13
-	JB   copy_2
+copy_2_add:
 	ADDQ R13, R10
-	JMP  handle_loop
+	ADDQ R13, AX
+
+copy_2_end:
+	JMP handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match:
@@ -3415,6 +3595,24 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
 	// Copy literals
 	TESTQ CX, CX
 	JZ    check_offset
+	MOVQ  CX, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (R10), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R10
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(R10)(R14*1), R10
+	LEAQ   16(R9)(R14*1), R9
+	MOVUPS -16(R10), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_1_end
+
+copy_1_small:
 	XORQ  R14, R14
 	TESTQ $0x00000001, CX
 	JZ    copy_1_word
@@ -3438,22 +3636,16 @@ copy_1_dword:
 
 copy_1_qword:
 	TESTQ $0x00000008, CX
-	JZ    copy_1_test
+	JZ    copy_1_add
 	MOVQ  (R10)(R14*1), R15
 	MOVQ  R15, (R9)(R14*1)
 	ADDQ  $0x08, R14
-	JMP   copy_1_test
 
-copy_1:
-	MOVUPS (R10)(R14*1), X0
-	MOVUPS X0, (R9)(R14*1)
-	ADDQ   $0x10, R14
-
-copy_1_test:
-	CMPQ R14, CX
-	JB   copy_1
-	ADDQ CX, R10
+copy_1_add:
 	ADDQ CX, R9
+	ADDQ CX, R10
+
+copy_1_end:
 	ADDQ CX, R11
 
 	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
@@ -3466,13 +3658,31 @@ check_offset:
 	JG   error_match_off_too_big
 
 	// Copy match from history
-	MOVQ  R12, CX
-	SUBQ  R11, CX
-	JLS   copy_match
-	MOVQ  48(SP), R14
-	SUBQ  CX, R14
-	CMPQ  R13, CX
-	JGE   copy_all_from_history
+	MOVQ R12, CX
+	SUBQ R11, CX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ CX, R14
+	CMPQ R13, CX
+	JGE  copy_all_from_history
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, CX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(CX*1), R14
+	LEAQ   16(R9)(CX*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_4_end
+
+copy_4_small:
 	XORQ  CX, CX
 	TESTQ $0x00000001, R13
 	JZ    copy_4_word
@@ -3496,26 +3706,39 @@ copy_4_dword:
 
 copy_4_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_4_test
+	JZ    copy_4_add
 	MOVQ  (R14)(CX*1), R12
 	MOVQ  R12, (R9)(CX*1)
 	ADDQ  $0x08, CX
-	JMP   copy_4_test
 
-copy_4:
-	MOVUPS (R14)(CX*1), X0
-	MOVUPS X0, (R9)(CX*1)
-	ADDQ   $0x10, CX
+copy_4_add:
+	ADDQ R13, R9
+	ADDQ R13, R14
 
-copy_4_test:
-	CMPQ CX, R13
-	JB   copy_4
+copy_4_end:
 	ADDQ R13, R11
-	ADDQ R13, R9
 	JMP  handle_loop
 	JMP loop_finished
 
 copy_all_from_history:
+	MOVQ CX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R9)(R15*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_5_end
+
+copy_5_small:
 	XORQ  R15, R15
 	TESTQ $0x00000001, CX
 	JZ    copy_5_word
@@ -3539,21 +3762,16 @@ copy_5_dword:
 
 copy_5_qword:
 	TESTQ $0x00000008, CX
-	JZ    copy_5_test
+	JZ    copy_5_add
 	MOVQ  (R14)(R15*1), BP
 	MOVQ  BP, (R9)(R15*1)
 	ADDQ  $0x08, R15
-	JMP   copy_5_test
 
-copy_5:
-	MOVUPS (R14)(R15*1), X0
-	MOVUPS X0, (R9)(R15*1)
-	ADDQ   $0x10, R15
-
-copy_5_test:
-	CMPQ R15, CX
-	JB   copy_5
+copy_5_add:
 	ADDQ CX, R9
+	ADDQ CX, R14
+
+copy_5_end:
 	ADDQ CX, R11
 	SUBQ CX, R13
 
@@ -3569,7 +3787,25 @@ copy_match:
 	JA   copy_overlapping_match
 
 	// Copy non-overlapping match
-	ADDQ  R13, R11
+	ADDQ R13, R11
+	MOVQ R13, R12
+	SUBQ $0x10, R12
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (CX), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, CX
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R12
+	JAE    copy_2_loop
+	LEAQ   16(CX)(R12*1), CX
+	LEAQ   16(R9)(R12*1), R9
+	MOVUPS -16(CX), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_2_end
+
+copy_2_small:
 	XORQ  R12, R12
 	TESTQ $0x00000001, R13
 	JZ    copy_2_word
@@ -3593,22 +3829,17 @@ copy_2_dword:
 
 copy_2_qword:
 	TESTQ $0x00000008, R13
-	JZ    copy_2_test
+	JZ    copy_2_add
 	MOVQ  (CX)(R12*1), R14
 	MOVQ  R14, (R9)(R12*1)
 	ADDQ  $0x08, R12
-	JMP   copy_2_test
-
-copy_2:
-	MOVUPS (CX)(R12*1), X0
-	MOVUPS X0, (R9)(R12*1)
-	ADDQ   $0x10, R12
 
-copy_2_test:
-	CMPQ R12, R13
-	JB   copy_2
+copy_2_add:
 	ADDQ R13, R9
-	JMP  handle_loop
+	ADDQ R13, CX
+
+copy_2_end:
+	JMP handle_loop
 
 	// Copy overlapping match
 copy_overlapping_match: