From cc3f1104bbea757cad6cd1f2b1cd7e9bd62199c7 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sat, 2 Jul 2022 10:13:01 +0200 Subject: [PATCH] zstd: Optimize seqdec amd64 asm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit copyMemoryPrecise now generates a loop over 16-byte blocks with a single branchless 16-byte fixup after it. This is a tiny bit faster on the whole and quite a bit faster for some inputs. Benchmark results on Intel Core i7-3770K: name old speed new speed delta Decoder_DecoderSmall/kppkn.gtb.zst-8 369MB/s ± 0% 374MB/s ± 1% +1.56% (p=0.008 n=5+5) Decoder_DecoderSmall/geo.protodata.zst-8 977MB/s ± 0% 1056MB/s ± 1% +8.17% (p=0.008 n=5+5) Decoder_DecoderSmall/plrabn12.txt.zst-8 291MB/s ± 0% 289MB/s ± 0% -0.74% (p=0.008 n=5+5) Decoder_DecoderSmall/lcet10.txt.zst-8 329MB/s ± 1% 333MB/s ± 0% +1.23% (p=0.008 n=5+5) Decoder_DecoderSmall/asyoulik.txt.zst-8 310MB/s ± 0% 310MB/s ± 1% ~ (p=1.000 n=5+5) Decoder_DecoderSmall/alice29.txt.zst-8 291MB/s ± 0% 291MB/s ± 1% ~ (p=0.421 n=5+5) Decoder_DecoderSmall/html_x_4.zst-8 2.07GB/s ± 0% 2.15GB/s ± 2% +4.05% (p=0.008 n=5+5) Decoder_DecoderSmall/paper-100k.pdf.zst-8 3.58GB/s ± 3% 3.74GB/s ± 1% +4.31% (p=0.008 n=5+5) Decoder_DecoderSmall/fireworks.jpeg.zst-8 8.57GB/s ± 0% 8.60GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecoderSmall/urls.10K.zst-8 474MB/s ± 1% 507MB/s ± 1% +6.80% (p=0.008 n=5+5) Decoder_DecoderSmall/html.zst-8 745MB/s ± 0% 803MB/s ± 0% +7.68% (p=0.008 n=5+5) Decoder_DecoderSmall/comp-data.bin.zst-8 399MB/s ± 1% 400MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAll/kppkn.gtb.zst-8 521MB/s ± 0% 521MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAll/geo.protodata.zst-8 1.27GB/s ± 1% 1.29GB/s ± 0% +1.19% (p=0.008 n=5+5) Decoder_DecodeAll/plrabn12.txt.zst-8 429MB/s ± 0% 427MB/s ± 0% -0.51% (p=0.032 n=5+5) Decoder_DecodeAll/lcet10.txt.zst-8 435MB/s ± 0% 439MB/s ± 0% +0.94% (p=0.008 n=5+5) Decoder_DecodeAll/asyoulik.txt.zst-8 438MB/s ± 0% 
436MB/s ± 0% -0.39% (p=0.008 n=5+5) Decoder_DecodeAll/alice29.txt.zst-8 423MB/s ± 0% 420MB/s ± 1% -0.72% (p=0.008 n=5+5) Decoder_DecodeAll/html_x_4.zst-8 1.59GB/s ± 0% 1.59GB/s ± 1% +0.54% (p=0.032 n=5+5) Decoder_DecodeAll/paper-100k.pdf.zst-8 4.53GB/s ± 1% 4.54GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAll/fireworks.jpeg.zst-8 9.64GB/s ± 1% 9.57GB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAll/urls.10K.zst-8 683MB/s ± 0% 681MB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAll/html.zst-8 1.04GB/s ± 1% 1.06GB/s ± 0% +1.77% (p=0.008 n=5+5) Decoder_DecodeAll/comp-data.bin.zst-8 398MB/s ± 1% 399MB/s ± 1% ~ (p=1.000 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8 439MB/s ± 0% 437MB/s ± 0% -0.39% (p=0.016 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8 448MB/s ± 0% 448MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8 478MB/s ± 0% 477MB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8 463MB/s ± 0% 460MB/s ± 0% -0.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/e.txt/fastest-8 9.62GB/s ± 3% 9.66GB/s ± 1% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/e.txt/default-8 394MB/s ± 0% 395MB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/e.txt/better-8 438MB/s ± 0% 442MB/s ± 0% +0.82% (p=0.008 n=5+5) Decoder_DecodeAllFiles/e.txt/best-8 501MB/s ± 0% 506MB/s ± 0% +1.07% (p=0.008 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8 1.04GB/s ± 0% 1.05GB/s ± 1% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/default-8 1.20GB/s ± 1% 1.20GB/s ± 1% ~ (p=0.095 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/better-8 1.01GB/s ± 0% 1.00GB/s ± 1% -0.82% (p=0.008 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/best-8 386MB/s ± 0% 383MB/s ± 0% -0.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/fastest-8 271MB/s ± 1% 275MB/s ± 1% +1.59% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/default-8 224MB/s ± 1% 223MB/s ± 1% ~ (p=0.222 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/better-8 
228MB/s ± 0% 226MB/s ± 0% -0.89% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/best-8 223MB/s ± 1% 221MB/s ± 1% -1.03% (p=0.016 n=5+5) Decoder_DecodeAllFiles/html.txt/fastest-8 592MB/s ± 1% 611MB/s ± 0% +3.20% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/default-8 597MB/s ± 0% 607MB/s ± 0% +1.71% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/better-8 623MB/s ± 0% 633MB/s ± 0% +1.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/best-8 603MB/s ± 0% 610MB/s ± 0% +1.25% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pi.txt/fastest-8 9.59GB/s ± 1% 9.70GB/s ± 1% +1.16% (p=0.032 n=5+5) Decoder_DecodeAllFiles/pi.txt/default-8 391MB/s ± 0% 393MB/s ± 0% +0.62% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pi.txt/better-8 437MB/s ± 1% 441MB/s ± 2% ~ (p=0.087 n=5+5) Decoder_DecodeAllFiles/pi.txt/best-8 501MB/s ± 0% 507MB/s ± 0% +1.22% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/fastest-8 1.66GB/s ± 1% 1.70GB/s ± 0% +2.49% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/default-8 1.49GB/s ± 0% 1.51GB/s ± 0% +1.18% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/better-8 1.87GB/s ± 0% 1.90GB/s ± 1% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/best-8 1.44GB/s ± 1% 1.46GB/s ± 0% +1.75% (p=0.008 n=5+5) Decoder_DecodeAllFiles/sharnd.out/fastest-8 9.64GB/s ± 1% 9.66GB/s ± 1% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/sharnd.out/default-8 9.70GB/s ± 1% 9.70GB/s ± 2% ~ (p=1.000 n=5+5) Decoder_DecodeAllFiles/sharnd.out/better-8 9.71GB/s ± 1% 9.79GB/s ± 1% ~ (p=0.151 n=5+5) Decoder_DecodeAllFiles/sharnd.out/best-8 9.76GB/s ± 0% 9.80GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-8 1.85GB/s ± 0% 1.85GB/s ± 0% -0.31% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-8 1.86GB/s ± 0% 1.85GB/s ± 0% -0.47% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-8 2.00GB/s ± 0% 2.00GB/s ± 0% -0.32% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-8 1.93GB/s ± 0% 1.93GB/s ± 0% 
-0.22% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/e.txt/fastest-8 37.7GB/s ± 0% 37.5GB/s ± 0% -0.38% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/e.txt/default-8 1.68GB/s ± 0% 1.69GB/s ± 0% +0.55% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/e.txt/better-8 1.91GB/s ± 0% 1.92GB/s ± 0% +0.96% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/e.txt/best-8 2.22GB/s ± 0% 2.25GB/s ± 0% +1.50% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/fastest-8 5.18GB/s ± 0% 5.05GB/s ± 2% -2.50% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/default-8 5.50GB/s ± 1% 5.34GB/s ± 1% -2.86% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/better-8 5.11GB/s ± 0% 5.14GB/s ± 0% +0.57% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/best-8 2.36GB/s ± 0% 2.37GB/s ± 0% +0.20% (p=0.032 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/fastest-8 1.16GB/s ± 0% 1.16GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/default-8 1.09GB/s ± 0% 1.08GB/s ± 0% -1.19% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/better-8 1.09GB/s ± 0% 1.08GB/s ± 1% -0.96% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/best-8 1.03GB/s ± 3% 1.02GB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAllFilesP/html.txt/fastest-8 2.50GB/s ± 1% 2.56GB/s ± 0% +2.39% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/default-8 2.51GB/s ± 0% 2.55GB/s ± 0% +1.69% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/better-8 2.61GB/s ± 0% 2.66GB/s ± 0% +1.93% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/best-8 2.53GB/s ± 0% 2.56GB/s ± 0% +1.13% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/fastest-8 37.8GB/s ± 0% 37.6GB/s ± 0% -0.44% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/pi.txt/default-8 1.67GB/s ± 0% 1.68GB/s ± 0% +0.61% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/better-8 1.91GB/s ± 0% 1.93GB/s ± 0% +0.82% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/best-8 2.23GB/s ± 0% 2.26GB/s ± 0% +1.35% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/fastest-8 6.99GB/s ± 0% 7.00GB/s ± 0% ~ (p=0.690 
n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/default-8 6.88GB/s ± 0% 6.87GB/s ± 0% ~ (p=0.222 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/better-8 8.49GB/s ± 0% 8.44GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/best-8 6.59GB/s ± 1% 6.53GB/s ± 1% -0.96% (p=0.032 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/fastest-8 37.8GB/s ± 0% 37.5GB/s ± 0% -0.86% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/default-8 37.9GB/s ± 1% 38.0GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/better-8 37.9GB/s ± 0% 37.8GB/s ± 2% ~ (p=0.841 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/best-8 37.8GB/s ± 0% 38.0GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllParallel/kppkn.gtb.zst-8 2.20GB/s ± 0% 2.20GB/s ± 0% ~ (p=1.000 n=5+5) Decoder_DecodeAllParallel/geo.protodata.zst-8 5.37GB/s ± 0% 5.39GB/s ± 0% +0.35% (p=0.008 n=5+5) Decoder_DecodeAllParallel/plrabn12.txt.zst-8 1.77GB/s ± 0% 1.76GB/s ± 0% -0.19% (p=0.008 n=5+5) Decoder_DecodeAllParallel/lcet10.txt.zst-8 1.90GB/s ± 0% 1.92GB/s ± 0% +0.80% (p=0.008 n=5+5) Decoder_DecodeAllParallel/asyoulik.txt.zst-8 1.83GB/s ± 0% 1.83GB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAllParallel/alice29.txt.zst-8 1.74GB/s ± 0% 1.74GB/s ± 0% ~ (p=0.548 n=5+5) Decoder_DecodeAllParallel/html_x_4.zst-8 6.55GB/s ± 0% 6.49GB/s ± 0% -0.97% (p=0.008 n=5+5) Decoder_DecodeAllParallel/paper-100k.pdf.zst-8 18.3GB/s ± 0% 18.3GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllParallel/fireworks.jpeg.zst-8 37.4GB/s ± 0% 37.2GB/s ± 1% -0.57% (p=0.016 n=4+5) Decoder_DecodeAllParallel/urls.10K.zst-8 2.97GB/s ± 0% 2.96GB/s ± 0% ~ (p=0.310 n=5+5) Decoder_DecodeAllParallel/html.zst-8 4.42GB/s ± 1% 4.43GB/s ± 0% ~ (p=0.556 n=5+4) Decoder_DecodeAllParallel/comp-data.bin.zst-8 1.69GB/s ± 1% 1.70GB/s ± 0% +0.84% (p=0.008 n=5+5) [Geo mean] 1.77GB/s 1.78GB/s +0.57% --- zstd/_generate/gen.go | 62 ++-- zstd/seqdec_amd64.s | 693 ++++++++++++++++++++++++++++-------------- 2 files changed, 506 insertions(+), 249 deletions(-) diff --git a/zstd/_generate/gen.go 
b/zstd/_generate/gen.go index c7fe02b305..96671d414d 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -1135,9 +1135,9 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle e.copyMemoryPrecise("1", c.literals, c.outBase, ll) } else { e.copyMemoryND("1", c.literals, c.outBase, ll) + ADDQ(ll, c.literals) + ADDQ(ll, c.outBase) } - ADDQ(ll, c.literals) - ADDQ(ll, c.outBase) ADDQ(ll, c.outPosition) } @@ -1203,7 +1203,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle */ e.copyMemoryPrecise("4", ptr, c.outBase, ml) ADDQ(ml, c.outPosition) - ADDQ(ml, c.outBase) // Note: for the current go tests this branch is taken in 99.53% cases, // this is why we repeat a little code here. handleLoop() @@ -1219,7 +1218,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle } */ e.copyMemoryPrecise("5", ptr, c.outBase, v) - ADDQ(v, c.outBase) ADDQ(v, c.outPosition) SUBQ(v, ml) // fallback to the next block @@ -1254,7 +1252,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle ADDQ(ml, c.outPosition) if e.safeMem { e.copyMemoryPrecise("2", src, c.outBase, ml) - ADDQ(ml, c.outBase) } else { dst := GP64() MOVQ(c.outBase, dst) @@ -1312,9 +1309,43 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua } // copyMemoryPrecise will copy memory in blocks of 16 bytes, -// without overwriting nor overreading. +// without overreading. It adds length to src and dst, +// preserving length. func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) { - label := "copy_" + suffix + n := GP64() + MOVQ(length, n) + SUBQ(U8(16), n) + JB(LabelRef("copy_" + suffix + "_small")) + + // If length >= 16, copy blocks of 16 bytes and handle any remainder + // by a block copy that overlaps with the last full block. 
+ { + t := XMM() + + loop := "copy_" + suffix + "_loop" + Label(loop) + { + MOVUPS(Mem{Base: src}, t) + MOVUPS(t, Mem{Base: dst}) + ADDQ(U8(16), src) + ADDQ(U8(16), dst) + SUBQ(U8(16), n) + JAE(LabelRef(loop)) + } + + // n is now in the range [-16,-1]. + // -16 means we copy the entire last block again. + // That should happen about 1/16th of the time, + // so we don't bother to check for it. + LEAQ(Mem{Base: src, Index: n, Disp: 16, Scale: 1}, src) + LEAQ(Mem{Base: dst, Index: n, Disp: 16, Scale: 1}, dst) + MOVUPS(Mem{Base: src, Disp: -16}, t) + MOVUPS(t, Mem{Base: dst, Disp: -16}) + + JMP(LabelRef("copy_" + suffix + "_end")) + } + + Label("copy_" + suffix + "_small") ofs := GP64() s := Mem{Base: src, Index: ofs, Scale: 1} d := Mem{Base: dst, Index: ofs, Scale: 1} @@ -1351,23 +1382,18 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV Label("copy_" + suffix + "_qword") TESTQ(U32(0x8), length) - JZ(LabelRef("copy_" + suffix + "_test")) + JZ(LabelRef("copy_" + suffix + "_add")) // copy eight bytes if length & 0x08 != 0 MOVQ(s, tmp) MOVQ(tmp, d) ADDQ(U8(8), ofs) - JMP(LabelRef("copy_" + suffix + "_test")) - // copy in 16-byte chunks - Label(label) - t := XMM() - MOVUPS(s, t) - MOVUPS(t, d) - ADDQ(U8(16), ofs) - Label("copy_" + suffix + "_test") - CMPQ(ofs, length) - JB(LabelRef(label)) + Label("copy_" + suffix + "_add") + ADDQ(length, dst) + ADDQ(length, src) + + Label("copy_" + suffix + "_end") } // copyOverlappedMemory will copy one byte at the time from src to dst. 
diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 212c6cac30..9d76f0580f 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -1181,13 +1181,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JGE copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: XORQ R11, R11 TESTQ $0x00000001, R13 JZ copy_4_word @@ -1211,22 +1229,17 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(R11*1), R12 MOVQ R12, (BX)(R11*1) ADDQ $0x08, R11 - JMP copy_4_test -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 +copy_4_add: + ADDQ R13, BX + ADDQ R13, R14 -copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1234,6 +1247,24 @@ copy_4_test: JMP loop_finished copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, R11 JZ copy_5_word @@ -1257,21 +1288,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, R11 - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (BX)(R15*1) ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 -copy_5_test: - CMPQ R15, R11 - JB 
copy_5 +copy_5_add: ADDQ R11, BX + ADDQ R11, R14 + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 @@ -1382,6 +1408,24 @@ main_loop: // Copy literals TESTQ R11, R11 JZ check_offset + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: XORQ R14, R14 TESTQ $0x00000001, R11 JZ copy_1_word @@ -1405,22 +1449,16 @@ copy_1_dword: copy_1_qword: TESTQ $0x00000008, R11 - JZ copy_1_test + JZ copy_1_add MOVQ (SI)(R14*1), R15 MOVQ R15, (BX)(R14*1) ADDQ $0x08, R14 - JMP copy_1_test -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 - -copy_1_test: - CMPQ R14, R11 - JB copy_1 - ADDQ R11, SI +copy_1_add: ADDQ R11, BX + ADDQ R11, SI + +copy_1_end: ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -1432,13 +1470,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JGE copy_all_from_history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JGE copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: XORQ R11, R11 TESTQ $0x00000001, R13 JZ copy_4_word @@ -1462,22 +1518,17 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(R11*1), R12 MOVQ R12, (BX)(R11*1) ADDQ $0x08, R11 - JMP copy_4_test -copy_4: - MOVUPS (R14)(R11*1), X0 - MOVUPS X0, (BX)(R11*1) - ADDQ $0x10, R11 +copy_4_add: + ADDQ R13, BX + ADDQ R13, R14 
-copy_4_test: - CMPQ R11, R13 - JB copy_4 +copy_4_end: ADDQ R13, DI - ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX @@ -1485,6 +1536,24 @@ copy_4_test: JMP loop_finished copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, R11 JZ copy_5_word @@ -1508,21 +1577,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, R11 - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (BX)(R15*1) ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (BX)(R15*1) - ADDQ $0x10, R15 -copy_5_test: - CMPQ R15, R11 - JB copy_5 +copy_5_add: ADDQ R11, BX + ADDQ R11, R14 + +copy_5_end: ADDQ R11, DI SUBQ R11, R13 @@ -1538,7 +1602,25 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, DI + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: XORQ R12, R12 TESTQ $0x00000001, R13 JZ copy_2_word @@ -1562,22 +1644,17 @@ copy_2_dword: copy_2_qword: TESTQ $0x00000008, R13 - JZ copy_2_test + JZ copy_2_add MOVQ (R11)(R12*1), R14 MOVQ R14, (BX)(R12*1) ADDQ $0x08, R12 - JMP copy_2_test -copy_2: - MOVUPS (R11)(R12*1), X0 - MOVUPS X0, (BX)(R12*1) - ADDQ $0x10, R12 - -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_add: ADDQ R13, BX - JMP handle_loop + ADDQ R13, R11 + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -1934,13 +2011,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS 
copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JGE copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: XORQ AX, AX TESTQ $0x00000001, R13 JZ copy_4_word @@ -1964,26 +2059,39 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(AX*1), CX MOVQ CX, (R10)(AX*1) ADDQ $0x08, AX - JMP copy_4_test -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX +copy_4_add: + ADDQ R13, R10 + ADDQ R13, R14 -copy_4_test: - CMPQ AX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R12 - ADDQ R13, R10 JMP handle_loop JMP loop_finished copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, AX JZ copy_5_word @@ -2007,21 +2115,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, AX - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (R10)(R15*1) ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 -copy_5_test: - CMPQ R15, AX - JB copy_5 +copy_5_add: ADDQ AX, R10 + ADDQ AX, R14 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 @@ -2407,13 +2510,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history + MOVQ R12, CX + SUBQ 
R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JGE copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_4_word @@ -2437,26 +2558,39 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(CX*1), R12 MOVQ R12, (R9)(CX*1) ADDQ $0x08, CX - JMP copy_4_test -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX +copy_4_add: + ADDQ R13, R9 + ADDQ R13, R14 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, CX JZ copy_5_word @@ -2480,21 +2614,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, CX - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (R9)(R15*1) ADDQ $0x08, R15 - JMP copy_5_test - -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 -copy_5_test: - CMPQ R15, CX - JB copy_5 +copy_5_add: ADDQ CX, R9 + ADDQ CX, R14 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 @@ -2885,6 +3014,24 @@ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: // Copy literals TESTQ AX, AX JZ check_offset + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS 
-16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: XORQ R14, R14 TESTQ $0x00000001, AX JZ copy_1_word @@ -2908,22 +3055,16 @@ copy_1_dword: copy_1_qword: TESTQ $0x00000008, AX - JZ copy_1_test + JZ copy_1_add MOVQ (R11)(R14*1), R15 MOVQ R15, (R10)(R14*1) ADDQ $0x08, R14 - JMP copy_1_test -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 - -copy_1_test: - CMPQ R14, AX - JB copy_1 - ADDQ AX, R11 +copy_1_add: ADDQ AX, R10 + ADDQ AX, R11 + +copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -2936,13 +3077,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JGE copy_all_from_history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JGE copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: XORQ AX, AX TESTQ $0x00000001, R13 JZ copy_4_word @@ -2966,26 +3125,39 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(AX*1), CX MOVQ CX, (R10)(AX*1) ADDQ $0x08, AX - JMP copy_4_test -copy_4: - MOVUPS (R14)(AX*1), X0 - MOVUPS X0, (R10)(AX*1) - ADDQ $0x10, AX +copy_4_add: + ADDQ R13, R10 + ADDQ R13, R14 -copy_4_test: - CMPQ AX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R12 - ADDQ R13, R10 JMP handle_loop JMP loop_finished copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + 
JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, AX JZ copy_5_word @@ -3009,21 +3181,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, AX - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (R10)(R15*1) ADDQ $0x08, R15 - JMP copy_5_test -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R10)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, AX - JB copy_5 +copy_5_add: ADDQ AX, R10 + ADDQ AX, R14 + +copy_5_end: ADDQ AX, R12 SUBQ AX, R13 @@ -3039,7 +3206,25 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R12 + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small + +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_2_word @@ -3063,22 +3248,17 @@ copy_2_dword: copy_2_qword: TESTQ $0x00000008, R13 - JZ copy_2_test + JZ copy_2_add MOVQ (AX)(CX*1), R14 MOVQ R14, (R10)(CX*1) ADDQ $0x08, CX - JMP copy_2_test -copy_2: - MOVUPS (AX)(CX*1), X0 - MOVUPS X0, (R10)(CX*1) - ADDQ $0x10, CX - -copy_2_test: - CMPQ CX, R13 - JB copy_2 +copy_2_add: ADDQ R13, R10 - JMP handle_loop + ADDQ R13, AX + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: @@ -3415,6 +3595,24 @@ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: // Copy literals TESTQ CX, CX JZ check_offset + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: XORQ R14, R14 TESTQ $0x00000001, CX JZ copy_1_word @@ -3438,22 +3636,16 @@ copy_1_dword: copy_1_qword: TESTQ $0x00000008, CX - JZ copy_1_test + JZ copy_1_add MOVQ (R10)(R14*1), R15 
MOVQ R15, (R9)(R14*1) ADDQ $0x08, R14 - JMP copy_1_test -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 - -copy_1_test: - CMPQ R14, CX - JB copy_1 - ADDQ CX, R10 +copy_1_add: ADDQ CX, R9 + ADDQ CX, R10 + +copy_1_end: ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) @@ -3466,13 +3658,31 @@ check_offset: JG error_match_off_too_big // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JGE copy_all_from_history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JGE copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_4_word @@ -3496,26 +3706,39 @@ copy_4_dword: copy_4_qword: TESTQ $0x00000008, R13 - JZ copy_4_test + JZ copy_4_add MOVQ (R14)(CX*1), R12 MOVQ R12, (R9)(CX*1) ADDQ $0x08, CX - JMP copy_4_test -copy_4: - MOVUPS (R14)(CX*1), X0 - MOVUPS X0, (R9)(CX*1) - ADDQ $0x10, CX +copy_4_add: + ADDQ R13, R9 + ADDQ R13, R14 -copy_4_test: - CMPQ CX, R13 - JB copy_4 +copy_4_end: ADDQ R13, R11 - ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: XORQ R15, R15 TESTQ $0x00000001, CX JZ copy_5_word @@ -3539,21 +3762,16 @@ copy_5_dword: copy_5_qword: TESTQ $0x00000008, CX - JZ copy_5_test + JZ copy_5_add MOVQ (R14)(R15*1), BP MOVQ BP, (R9)(R15*1) ADDQ $0x08, R15 - JMP 
copy_5_test -copy_5: - MOVUPS (R14)(R15*1), X0 - MOVUPS X0, (R9)(R15*1) - ADDQ $0x10, R15 - -copy_5_test: - CMPQ R15, CX - JB copy_5 +copy_5_add: ADDQ CX, R9 + ADDQ CX, R14 + +copy_5_end: ADDQ CX, R11 SUBQ CX, R13 @@ -3569,7 +3787,25 @@ copy_match: JA copy_overlapping_match // Copy non-overlapping match - ADDQ R13, R11 + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: XORQ R12, R12 TESTQ $0x00000001, R13 JZ copy_2_word @@ -3593,22 +3829,17 @@ copy_2_dword: copy_2_qword: TESTQ $0x00000008, R13 - JZ copy_2_test + JZ copy_2_add MOVQ (CX)(R12*1), R14 MOVQ R14, (R9)(R12*1) ADDQ $0x08, R12 - JMP copy_2_test - -copy_2: - MOVUPS (CX)(R12*1), X0 - MOVUPS X0, (R9)(R12*1) - ADDQ $0x10, R12 -copy_2_test: - CMPQ R12, R13 - JB copy_2 +copy_2_add: ADDQ R13, R9 - JMP handle_loop + ADDQ R13, CX + +copy_2_end: + JMP handle_loop // Copy overlapping match copy_overlapping_match: