diff --git a/s2/_generate/cleanup.go b/s2/_generate/cleanup.go new file mode 100644 index 0000000000..bcd7964e69 --- /dev/null +++ b/s2/_generate/cleanup.go @@ -0,0 +1,34 @@ +//go:build custom +// +build custom + +package main + +import ( + "bytes" + "flag" + "io/ioutil" + "log" + "os" + + "github.com/klauspost/asmfmt" +) + +func main() { + flag.Parse() + args := flag.Args() + for _, file := range args { + data, err := ioutil.ReadFile(file) + if err != nil { + log.Fatalln(err) + } + data = bytes.Replace(data, []byte("\t// #"), []byte("#"), -1) + data, err = asmfmt.Format(bytes.NewBuffer(data)) + if err != nil { + log.Fatalln(err) + } + err = ioutil.WriteFile(file, data, os.ModePerm) + if err != nil { + log.Fatalln(err) + } + } +} diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index 1202ba1d3b..47d58ea1e6 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -2,9 +2,7 @@ package main //go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2 //go:generate gofmt -w ../encodeblock_amd64.go - -//go:generate go run gen.go -x64v3 -out ../encodeblock_v3_amd64.s -stubs ../encodeblock_v3_amd64.go -pkg=s2 -//go:generate gofmt -w ../encodeblock_v3_amd64.go +//go:generate go run cleanup.go ../encodeblock_amd64.s import ( "flag" @@ -31,23 +29,16 @@ const ( limit8B = 512 - 1 ) -var x64v3 = flag.Bool("x64v3", false, "Generate for amd64-v3") - func main() { flag.Parse() Constraint(buildtags.Not("appengine").ToConstraint()) Constraint(buildtags.Not("noasm").ToConstraint()) Constraint(buildtags.Term("gc").ToConstraint()) - if *x64v3 { - Constraint(buildtags.Term("goamd64_v3").ToConstraint()) - } else { - Constraint(buildtags.Not("goamd64_v3").ToConstraint()) - } Constraint(buildtags.Not("noasm").ToConstraint()) o := options{ - bmi1: *x64v3, - bmi2: *x64v3, // Currently unused.... + bmi1: false, + bmi2: false, snappy: false, outputMargin: 9, } @@ -2511,13 +2502,17 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re TESTQ(tmp, tmp) JZ(LabelRef("matchlen_loop_" + name)) // Not all match. - if o.bmi1 { - // 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c - // 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c - TZCNTQ(tmp, tmp) - } else { - BSFQ(tmp, tmp) - } + + Comment("#ifdef GOAMD64_v3") + // 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c + // 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c + TZCNTQ(tmp, tmp) + Comment("#elifdef GOAMD64_v4") + TZCNTQ(tmp, tmp) + Comment("#else") + BSFQ(tmp, tmp) + Comment("#endif") + SARQ(U8(3), tmp) LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched) JMP(end) diff --git a/s2/_generate/go.mod b/s2/_generate/go.mod index 572b422fd9..5a6f4b1d58 100644 --- a/s2/_generate/go.mod +++ b/s2/_generate/go.mod @@ -1,5 +1,8 @@ module github.com/klauspost/compress/s2/_generate -go 1.13 +go 1.15 -require github.com/mmcloughlin/avo v0.2.0 +require ( + github.com/klauspost/asmfmt v1.3.1 + github.com/mmcloughlin/avo v0.4.0 +) diff --git a/s2/_generate/go.sum b/s2/_generate/go.sum index dae47774ac..111e55305e 100644 --- a/s2/_generate/go.sum +++ b/s2/_generate/go.sum @@ -1,29 +1,32 @@ -github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w= -github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= +github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw= +github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= +github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= +github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= +github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57 h1:F5Gozwx4I1xtr/sr/8CFbb57iKi3297KFs0QDbGN60A= -golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.0 h1:po9/4sTYwZU9lPhi1tOrb4hCv3qrhiQ77LZfGa2OjwY= -golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= +golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ= +golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 68d40904e6..d9312e5b92 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -1,7 +1,7 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. -//go:build !appengine && !noasm && gc && !goamd64_v3 && !noasm -// +build !appengine,!noasm,gc,!goamd64_v3,!noasm +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm package s2 diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index bab4159861..cc35dce373 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -1,15 +1,12 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. -// +build !appengine -// +build !noasm -// +build gc -// +build !goamd64_v3 -// +build !noasm +//go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm #include "textflag.h" // func encodeBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -252,10 +249,20 @@ matchlen_loopback_repeat_extend_encodeBlockAsm: XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend_encodeBlockAsm: LEAL -8(R9), R9 @@ -772,10 +779,20 @@ matchlen_loopback_match_nolit_encodeBlockAsm: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: LEAL -8(DI), DI @@ -1194,7 +1211,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm: RET // func encodeBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -1429,10 +1446,20 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB: XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R9), R9 @@ -1908,10 +1935,20 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(DI), DI @@ -2300,7 +2337,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm4MB: RET // func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -2524,10 +2561,20 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B: XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R9), R9 @@ -2888,10 +2935,20 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(DI), DI @@ -3177,7 +3234,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm12B: RET // func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -3401,10 +3458,20 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B: XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R9), R9 @@ -3765,10 +3832,20 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: LEAL -8(DI), DI @@ -4054,7 +4131,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm10B: RET // func encodeBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -4278,10 +4355,20 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B: XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_loop_repeat_extend_encodeBlockAsm8B: LEAL -8(R9), R9 @@ -4632,10 +4719,20 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm8B matchlen_loop_match_nolit_encodeBlockAsm8B: LEAL -8(DI), DI @@ -4915,7 +5012,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: RET // func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -5044,10 +5141,20 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm matchlen_loop_match_nolit_encodeBetterBlockAsm: LEAL -8(R8), R8 @@ -5886,7 +5993,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm: RET // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -6015,10 +6122,20 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm4MB matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: LEAL -8(R8), R8 @@ -6800,7 +6917,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: RET // func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -6921,10 +7038,20 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_loop_match_nolit_encodeBetterBlockAsm12B: LEAL -8(R8), R8 @@ -7560,7 +7687,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: RET // func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -7681,10 +7808,20 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_loop_match_nolit_encodeBetterBlockAsm10B: LEAL -8(R8), R8 @@ -8320,7 +8457,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: RET // func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -8441,10 +8578,20 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B matchlen_loop_match_nolit_encodeBetterBlockAsm8B: LEAL -8(R8), R8 @@ -9070,7 +9217,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: RET // func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -9313,10 +9460,20 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#elifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend_encodeSnappyBlockAsm: LEAL -8(R8), R8 @@ -9629,10 +9786,20 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: LEAL -8(DI), DI @@ -9917,7 +10084,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm: RET // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -10141,10 +10308,20 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#elifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: LEAL -8(R8), R8 @@ -10414,10 +10591,20 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: LEAL -8(DI), DI @@ -10659,7 +10846,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: RET // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -10883,10 +11070,20 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#elifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: LEAL -8(R8), R8 @@ -11156,10 +11353,20 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: LEAL -8(DI), DI @@ -11401,7 +11608,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: RET // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -11625,10 +11832,20 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#elifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: LEAL -8(R8), R8 @@ -11898,10 +12115,20 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: LEAL -8(DI), DI @@ -12143,7 +12370,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: RET // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -12367,10 +12594,20 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#elifdef GOAMD64_v4 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: LEAL -8(R8), R8 @@ -12638,10 +12875,20 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#elifdef GOAMD64_v4 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: LEAL -8(DI), DI @@ -12881,7 +13128,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: RET // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -13010,10 +13257,20 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: LEAL -8(R8), R8 @@ -13478,7 +13735,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: RET // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -13599,10 +13856,20 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL -8(R8), R8 @@ -13996,7 +14263,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: RET // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -14117,10 +14384,20 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -8(R8), R8 @@ -14514,7 +14791,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: RET // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -14635,10 +14912,20 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL -8(R8), R8 @@ -15032,7 +15319,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: RET // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -15153,10 +15440,20 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#elifdef GOAMD64_v4 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -8(R8), R8 @@ -16087,6 +16384,7 @@ gen_emit_copy_end_snappy: RET // func matchLen(a []byte, b []byte) int +// Requires: BMI TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX @@ -16102,10 +16400,20 @@ matchlen_loopback_standalone: XORQ (CX)(SI*1), BX TESTQ BX, BX JZ matchlen_loop_standalone - BSFQ BX, BX - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#elifdef GOAMD64_v4 + TZCNTQ BX, BX + +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end matchlen_loop_standalone: LEAL -8(DX), DX diff --git a/s2/encodeblock_v3_amd64.go b/s2/encodeblock_v3_amd64.go deleted file mode 100644 index f947b972fc..0000000000 --- a/s2/encodeblock_v3_amd64.go +++ /dev/null @@ -1,189 +0,0 @@ -// Code generated by command: go run gen.go -x64v3 -out ../encodeblock_v3_amd64.s -stubs ../encodeblock_v3_amd64.go -pkg=s2. DO NOT EDIT. - -//go:build !appengine && !noasm && gc && goamd64_v3 && !noasm -// +build !appengine,!noasm,gc,goamd64_v3,!noasm - -package s2 - -// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm(dst []byte, src []byte) int - -// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4194304 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm4MB(dst []byte, src []byte) int - -// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm12B(dst []byte, src []byte) int - -// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm10B(dst []byte, src []byte) int - -// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBlockAsm8B(dst []byte, src []byte) int - -// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm(dst []byte, src []byte) int - -// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4194304 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm4MB(dst []byte, src []byte) int - -// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm12B(dst []byte, src []byte) int - -// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm10B(dst []byte, src []byte) int - -// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeBetterBlockAsm8B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm(dst []byte, src []byte) int - -// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 65535 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm64K(dst []byte, src []byte) int - -// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm12B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm10B(dst []byte, src []byte) int - -// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBlockAsm8B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4294967295 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 65535 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 16383 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 4095 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int - -// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. -// Maximum input 511 bytes. -// It assumes that the varint-encoded length of the decompressed bytes has already been written. -// -//go:noescape -func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int - -// emitLiteral writes a literal chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes with margin of 0 bytes -// 0 <= len(lit) && len(lit) <= math.MaxUint32 -// -//go:noescape -func emitLiteral(dst []byte, lit []byte) int - -// emitRepeat writes a repeat chunk and returns the number of bytes written. -// Length must be at least 4 and < 1<<32 -// -//go:noescape -func emitRepeat(dst []byte, offset int, length int) int - -// emitCopy writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -// -//go:noescape -func emitCopy(dst []byte, offset int, length int) int - -// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 -// -//go:noescape -func emitCopyNoRepeat(dst []byte, offset int, length int) int - -// matchLen returns how many bytes match in a and b -// -// It assumes that: -// len(a) <= len(b) -// -//go:noescape -func matchLen(a []byte, b []byte) int diff --git a/s2/encodeblock_v3_amd64.s b/s2/encodeblock_v3_amd64.s deleted file mode 100644 index 4ce36bfdae..0000000000 --- a/s2/encodeblock_v3_amd64.s +++ /dev/null @@ -1,16146 +0,0 @@ -// Code generated by command: go run gen.go -x64v3 -out ../encodeblock_v3_amd64.s -stubs ../encodeblock_v3_amd64.go -pkg=s2. DO NOT EDIT. - -// +build !appengine -// +build !noasm -// +build gc -// +build goamd64_v3 -// +build !noasm - -#include "textflag.h" - -// func encodeBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm - -repeat_extend_back_loop_encodeBlockAsm: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm - -repeat_extend_back_end_encodeBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_repeat_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -four_bytes_repeat_emit_encodeBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -three_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -two_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm - JMP memmove_long_repeat_emit_encodeBlockAsm - -one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm - -memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_match4_repeat_extend_encodeBlockAsm - -matchlen_loopback_repeat_extend_encodeBlockAsm: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_loop_repeat_extend_encodeBlockAsm: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_match4_repeat_extend_encodeBlockAsm: - CMPL R9, $0x04 - JL matchlen_match2_repeat_extend_encodeBlockAsm - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm - SUBL $0x04, R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm: - CMPL R9, $0x02 - JL matchlen_match1_repeat_extend_encodeBlockAsm - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm - SUBL $0x02, R9 - LEAL 2(R12), R12 - -matchlen_match1_repeat_extend_encodeBlockAsm: - CMPL R9, $0x01 - JL repeat_extend_forward_end_encodeBlockAsm - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm - -cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBlockAsm - CMPL SI, $0x0100ffff - JLT repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_repeat_encodeBlockAsm - -repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_as_copy_encodeBlockAsm: - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm - -four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -emit_copy_three_repeat_as_copy_encodeBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm - -no_repeat_found_encodeBlockAsm: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm - -candidate3_match_encodeBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm - -candidate2_match_encodeBlockAsm: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm - -match_extend_back_loop_encodeBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm - JMP match_extend_back_loop_encodeBlockAsm - -match_extend_back_end_encodeBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x01000000 - JLT four_bytes_match_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBlockAsm - -four_bytes_match_emit_encodeBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm - -three_bytes_match_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm - -two_bytes_match_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm - JMP memmove_long_match_emit_encodeBlockAsm - -one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm - -memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm: -match_nolit_loop_encodeBlockAsm: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeBlockAsm - -matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm - -matchlen_loop_match_nolit_encodeBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm - JZ match_nolit_end_encodeBlockAsm - -matchlen_match4_match_nolit_encodeBlockAsm: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeBlockAsm - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeBlockAsm - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeBlockAsm: - CMPL DI, $0x01 - JL match_nolit_end_encodeBlockAsm - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeBlockAsm - -four_bytes_loop_back_match_nolit_encodeBlockAsm: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x0100ffff - JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(R10), R10 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm - -four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_match_nolit_encodeBlockAsm: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, $0x0100ffff - JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(R10), R10 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP two_byte_offset_match_nolit_encodeBlockAsm - -two_byte_offset_short_match_nolit_encodeBlockAsm: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -emit_copy_three_match_nolit_encodeBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm - INCL CX - JMP search_loop_encodeBlockAsm - -emit_remainder_encodeBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -four_bytes_emit_remainder_encodeBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -three_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -two_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm - JMP memmove_long_emit_remainder_encodeBlockAsm - -one_byte_emit_remainder_encodeBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm - -memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm4MB(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm4MB(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm4MB - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm4MB - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm4MB - -repeat_extend_back_loop_encodeBlockAsm4MB: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm4MB - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm4MB - -repeat_extend_back_end_encodeBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -three_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -two_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm4MB - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -one_byte_repeat_emit_encodeBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB - -memmove_long_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm4MB: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_match4_repeat_extend_encodeBlockAsm4MB - -matchlen_loopback_repeat_extend_encodeBlockAsm4MB: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_loop_repeat_extend_encodeBlockAsm4MB: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match4_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x04 - JL matchlen_match2_repeat_extend_encodeBlockAsm4MB - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB - SUBL $0x04, R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x02 - JL matchlen_match1_repeat_extend_encodeBlockAsm4MB - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB - SUBL $0x02, R9 - LEAL 2(R12), R12 - -matchlen_match1_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x01 - JL repeat_extend_forward_end_encodeBlockAsm4MB - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm4MB - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm4MB: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm4MB - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB - -cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm4MB - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBlockAsm4MB - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_match_repeat_encodeBlockAsm4MB: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_match_repeat_encodeBlockAsm4MB: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_match_repeat_encodeBlockAsm4MB: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_match_repeat_encodeBlockAsm4MB: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_as_copy_encodeBlockAsm4MB: - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - -four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB - -four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm4MB: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm4MB - -no_repeat_found_encodeBlockAsm4MB: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm4MB - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm4MB - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm4MB - -candidate3_match_encodeBlockAsm4MB: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm4MB - -candidate2_match_encodeBlockAsm4MB: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm4MB - -match_extend_back_loop_encodeBlockAsm4MB: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm4MB - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm4MB - JMP match_extend_back_loop_encodeBlockAsm4MB - -match_extend_back_end_encodeBlockAsm4MB: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm4MB: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeBlockAsm4MB - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -three_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -two_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm4MB - JMP memmove_long_match_emit_encodeBlockAsm4MB - -one_byte_match_emit_encodeBlockAsm4MB: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm4MB: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm4MB - -memmove_long_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm4MB: -match_nolit_loop_encodeBlockAsm4MB: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeBlockAsm4MB - -matchlen_loopback_match_nolit_encodeBlockAsm4MB: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_loop_match_nolit_encodeBlockAsm4MB: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_match4_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeBlockAsm4MB - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm4MB - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeBlockAsm4MB - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm4MB - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x01 - JL match_nolit_end_encodeBlockAsm4MB - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm4MB - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm4MB: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeBlockAsm4MB - -four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeBlockAsm4MB - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBlockAsm4MB: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL R10, $0x00010100 - JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBlockAsm4MB: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -emit_copy_three_match_nolit_encodeBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm4MB: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm4MB - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm4MB: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm4MB - INCL CX - JMP search_loop_encodeBlockAsm4MB - -emit_remainder_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -three_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -two_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -one_byte_emit_remainder_encodeBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm4MB: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB - -memmove_long_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm12B - -repeat_extend_back_loop_encodeBlockAsm12B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm12B - -repeat_extend_back_end_encodeBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -two_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B - -memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm12B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_match4_repeat_extend_encodeBlockAsm12B - -matchlen_loopback_repeat_extend_encodeBlockAsm12B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_loop_repeat_extend_encodeBlockAsm12B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match4_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x04 - JL matchlen_match2_repeat_extend_encodeBlockAsm12B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm12B - SUBL $0x04, R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x02 - JL matchlen_match1_repeat_extend_encodeBlockAsm12B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm12B - SUBL $0x02, R9 - LEAL 2(R12), R12 - -matchlen_match1_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x01 - JL repeat_extend_forward_end_encodeBlockAsm12B - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm12B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm12B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm12B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm12B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm12B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_as_copy_encodeBlockAsm12B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm12B - -no_repeat_found_encodeBlockAsm12B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm12B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm12B - -candidate3_match_encodeBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm12B - -candidate2_match_encodeBlockAsm12B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm12B - -match_extend_back_loop_encodeBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B - -match_extend_back_end_encodeBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm12B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm12B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm12B - -two_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B - -one_byte_match_emit_encodeBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm12B - -memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeBlockAsm12B - -matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm12B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B - -matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm12B - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_match4_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeBlockAsm12B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm12B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeBlockAsm12B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm12B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x01 - JL match_nolit_end_encodeBlockAsm12B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm12B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm12B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBlockAsm12B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -emit_copy_three_match_nolit_encodeBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm12B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm12B - INCL CX - JMP search_loop_encodeBlockAsm12B - -emit_remainder_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -two_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm12B - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -one_byte_emit_remainder_encodeBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B - -memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm10B - -repeat_extend_back_loop_encodeBlockAsm10B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm10B - -repeat_extend_back_end_encodeBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -two_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -one_byte_repeat_emit_encodeBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B - -memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm10B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_match4_repeat_extend_encodeBlockAsm10B - -matchlen_loopback_repeat_extend_encodeBlockAsm10B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_loop_repeat_extend_encodeBlockAsm10B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match4_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x04 - JL matchlen_match2_repeat_extend_encodeBlockAsm10B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm10B - SUBL $0x04, R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x02 - JL matchlen_match1_repeat_extend_encodeBlockAsm10B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm10B - SUBL $0x02, R9 - LEAL 2(R12), R12 - -matchlen_match1_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x01 - JL repeat_extend_forward_end_encodeBlockAsm10B - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm10B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm10B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm10B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm10B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm10B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_as_copy_encodeBlockAsm10B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL R8, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm10B - -no_repeat_found_encodeBlockAsm10B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm10B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm10B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm10B - -candidate3_match_encodeBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm10B - -candidate2_match_encodeBlockAsm10B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm10B - -match_extend_back_loop_encodeBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm10B - JMP match_extend_back_loop_encodeBlockAsm10B - -match_extend_back_end_encodeBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm10B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm10B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm10B - -two_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B - -one_byte_match_emit_encodeBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm10B - -memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeBlockAsm10B - -matchlen_loopback_match_nolit_encodeBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm10B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_loop_match_nolit_encodeBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm10B - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_match4_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeBlockAsm10B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm10B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeBlockAsm10B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm10B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x01 - JL match_nolit_end_encodeBlockAsm10B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBlockAsm10B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm10B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -emit_copy_three_match_nolit_encodeBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm10B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm10B - INCL CX - JMP search_loop_encodeBlockAsm10B - -emit_remainder_encodeBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -two_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -one_byte_emit_remainder_encodeBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B - -memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm8B - -repeat_extend_back_loop_encodeBlockAsm8B: - CMPL DI, R8 - JLE repeat_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm8B - -repeat_extend_back_end_encodeBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -two_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -one_byte_repeat_emit_encodeBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B - -memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBlockAsm8B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R12, R12 - CMPL R9, $0x08 - JL matchlen_match4_repeat_extend_encodeBlockAsm8B - -matchlen_loopback_repeat_extend_encodeBlockAsm8B: - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_loop_repeat_extend_encodeBlockAsm8B: - LEAL -8(R9), R9 - LEAL 8(R12), R12 - CMPL R9, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match4_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x04 - JL matchlen_match2_repeat_extend_encodeBlockAsm8B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm8B - SUBL $0x04, R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x02 - JL matchlen_match1_repeat_extend_encodeBlockAsm8B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm8B - SUBL $0x02, R9 - LEAL 2(R12), R12 - -matchlen_match1_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x01 - JL repeat_extend_forward_end_encodeBlockAsm8B - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm8B: - ADDL R12, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm8B - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm8B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_as_copy_encodeBlockAsm8B: - // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - CMPL SI, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm8B - -no_repeat_found_encodeBlockAsm8B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm8B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm8B - -candidate3_match_encodeBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm8B - -candidate2_match_encodeBlockAsm8B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm8B - -match_extend_back_loop_encodeBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B - -match_extend_back_end_encodeBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm8B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeBlockAsm8B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm8B - -two_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B - -one_byte_match_emit_encodeBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeBlockAsm8B - -memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeBlockAsm8B - -matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBlockAsm8B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm8B - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_match4_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeBlockAsm8B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm8B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeBlockAsm8B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm8B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x01 - JL match_nolit_end_encodeBlockAsm8B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm8B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm8B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat - MOVL R10, SI - LEAL -4(R10), R10 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R10, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBlockAsm8B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -emit_copy_three_match_nolit_encodeBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm8B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL CX - JMP search_loop_encodeBlockAsm8B - -emit_remainder_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -two_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -one_byte_emit_remainder_encodeBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B - -memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm - -check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm: - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm - -candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm - -match_extend_back_loop_encodeBetterBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm - -match_extend_back_end_encodeBetterBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeBetterBlockAsm - -matchlen_loopback_match_nolit_encodeBetterBlockAsm: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_loop_match_nolit_encodeBetterBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_match4_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeBetterBlockAsm - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeBetterBlockAsm - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x01 - JL match_nolit_end_encodeBetterBlockAsm - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm - CMPL R12, $0x01 - JG match_length_ok_encodeBetterBlockAsm - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm - -match_length_ok_encodeBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -four_bytes_match_emit_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -three_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -two_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm - -one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm - -memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeBetterBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm - -four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -emit_copy_three_match_nolit_encodeBetterBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -match_is_repeat_encodeBetterBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -four_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -three_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -two_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - -memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat -emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x0100ffff - JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm - LEAL -16842747(R12), R12 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm - -repeat_five_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm - -emit_remainder_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -four_bytes_emit_remainder_encodeBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -three_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -two_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -one_byte_emit_remainder_encodeBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm - -memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeBetterBlockAsm4MB - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm4MB - -check_maxskip_ok_encodeBetterBlockAsm4MB: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm4MB: - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm4MB - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB - -candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - -match_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm4MB - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - JMP match_extend_back_loop_encodeBetterBlockAsm4MB - -match_extend_back_end_encodeBetterBlockAsm4MB: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm4MB: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB - -matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x01 - JL match_nolit_end_encodeBetterBlockAsm4MB - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm4MB - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm4MB: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x01 - JG match_length_ok_encodeBetterBlockAsm4MB - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm4MB - -match_length_ok_encodeBetterBlockAsm4MB: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -three_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -two_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -one_byte_match_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB - -memmove_long_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm4MB: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -match_is_repeat_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - -memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x00010100 - JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (AX) - MOVW R12, 2(AX) - SARL $0x10, R8 - MOVB R8, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm4MB - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm4MB: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm4MB - -emit_remainder_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -three_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -two_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -one_byte_emit_remainder_encodeBetterBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - -memmove_long_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 65560(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - -match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - JMP match_extend_back_loop_encodeBetterBlockAsm12B - -match_extend_back_end_encodeBetterBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm12B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_loop_match_nolit_encodeBetterBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x01 - JL match_nolit_end_encodeBetterBlockAsm12B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm12B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm12B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -two_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -one_byte_match_emit_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B - -memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm12B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeBetterBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -match_is_repeat_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -one_byte_match_emit_repeat_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm12B - -emit_remainder_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -two_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -one_byte_emit_remainder_encodeBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - -memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B - -candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - -match_extend_back_loop_encodeBetterBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - JMP match_extend_back_loop_encodeBetterBlockAsm10B - -match_extend_back_end_encodeBetterBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm10B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_loop_match_nolit_encodeBetterBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x01 - JL match_nolit_end_encodeBetterBlockAsm10B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm10B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm10B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -two_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -one_byte_match_emit_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B - -memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm10B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeBetterBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -match_is_repeat_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -one_byte_match_emit_repeat_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm10B - -emit_remainder_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -two_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -one_byte_emit_remainder_encodeBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - -memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 4120(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B - -candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm8B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - -match_extend_back_loop_encodeBetterBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeBetterBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - JMP match_extend_back_loop_encodeBetterBlockAsm8B - -match_extend_back_end_encodeBetterBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm8B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B - -matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_loop_match_nolit_encodeBetterBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x01 - JL match_nolit_end_encodeBetterBlockAsm8B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm8B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm8B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -two_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -one_byte_match_emit_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (AX) - MOVL R10, -4(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B - -memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -match_is_repeat_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -one_byte_match_emit_repeat_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R11 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: - CMPL R12, $0x00000104 - JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B - LEAL -256(R12), R12 - MOVW $0x0019, (AX) - MOVW R12, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: - LEAL -4(R12), R12 - MOVW $0x0015, (AX) - MOVB R12, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(AX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm8B - -emit_remainder_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -two_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -one_byte_emit_remainder_encodeBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - -memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm - -repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm - -repeat_extend_back_end_encodeSnappyBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_repeat_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL SI, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -three_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -two_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -one_byte_repeat_emit_encodeSnappyBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm - -memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeSnappyBlockAsm - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy - CMPL DI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(SI), SI - ADDQ $0x05, AX - CMPL SI, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeSnappyBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm - -no_repeat_found_encodeSnappyBlockAsm: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm - -candidate3_match_encodeSnappyBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm - -candidate2_match_encodeSnappyBlockAsm: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm - -match_extend_back_loop_encodeSnappyBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm - JMP match_extend_back_loop_encodeSnappyBlockAsm - -match_extend_back_end_encodeSnappyBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00010000 - JLT three_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x01000000 - JLT four_bytes_match_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW R8, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -three_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -two_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm - -memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm: -match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBlockAsm - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBlockAsm - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBlockAsm - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x01 - JL match_nolit_end_encodeSnappyBlockAsm - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeSnappyBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 - ADDQ $0x05, AX - CMPL R10, $0x04 - JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm - INCL CX - JMP search_loop_encodeSnappyBlockAsm - -emit_remainder_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -four_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -three_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -two_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -one_byte_emit_remainder_encodeSnappyBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm - -memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm64K: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm64K: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm64K - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm64K - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm64K - -repeat_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K - -repeat_extend_back_end_encodeSnappyBlockAsm64K: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -two_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm64K - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -one_byte_repeat_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - -memmove_long_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeSnappyBlockAsm64K - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm64K: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm64K - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm64K: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm64K - -no_repeat_found_encodeSnappyBlockAsm64K: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm64K - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm64K - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm64K - -candidate3_match_encodeSnappyBlockAsm64K: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm64K - -candidate2_match_encodeSnappyBlockAsm64K: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - -match_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBlockAsm64K - -match_extend_back_end_encodeSnappyBlockAsm64K: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm64K: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm64K - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -two_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -one_byte_match_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K - -memmove_long_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm64K: -match_nolit_loop_encodeSnappyBlockAsm64K: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x01 - JL match_nolit_end_encodeSnappyBlockAsm64K - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm64K - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm64K: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm64K - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm64K: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm64K - INCL CX - JMP search_loop_encodeSnappyBlockAsm64K - -emit_remainder_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm12B - -repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B - -repeat_extend_back_end_encodeSnappyBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -two_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm12B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -one_byte_repeat_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - -memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeSnappyBlockAsm12B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm12B - -no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm12B - -candidate3_match_encodeSnappyBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm12B - -candidate2_match_encodeSnappyBlockAsm12B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm12B - -match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBlockAsm12B - -match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -two_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B - -memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm12B: -match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x01 - JL match_nolit_end_encodeSnappyBlockAsm12B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm12B - INCL CX - JMP search_loop_encodeSnappyBlockAsm12B - -emit_remainder_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm10B - -repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B - -repeat_extend_back_end_encodeSnappyBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -two_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm10B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -one_byte_repeat_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - -memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeSnappyBlockAsm10B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL DI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm10B - -no_repeat_found_encodeSnappyBlockAsm10B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm10B - -candidate3_match_encodeSnappyBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm10B - -candidate2_match_encodeSnappyBlockAsm10B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - -match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBlockAsm10B - -match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -two_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B - -memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm10B: -match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x01 - JL match_nolit_end_encodeSnappyBlockAsm10B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm10B - INCL CX - JMP search_loop_encodeSnappyBlockAsm10B - -emit_remainder_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 24(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - LEAL 1(CX), R10 - MOVL R10, 24(SP)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm8B - -repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B - -repeat_extend_back_end_encodeSnappyBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -two_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm8B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -one_byte_repeat_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - -memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: - ADDL $0x05, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeSnappyBlockAsm8B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL SI, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(SI), SI - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL SI, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(SI*4), SI - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(SI*4), SI - MOVB SI, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm8B - -no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, DI - MOVL 24(SP)(R10*4), SI - LEAL 2(CX), R9 - CMPL (DX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R9, 24(SP)(R10*4) - SHRQ $0x08, DI - CMPL (DX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm8B - -candidate3_match_encodeSnappyBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm8B - -candidate2_match_encodeSnappyBlockAsm8B: - MOVL R9, 24(SP)(R10*4) - INCL CX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - -match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBlockAsm8B - -match_extend_back_end_encodeSnappyBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL CX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL R8, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -two_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB R8, 1(AX) - ADDQ $0x02, AX - CMPL R8, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (AX) - MOVQ DI, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ R8, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B - -memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R8, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm8B: -match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - TZCNTQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - SUBL $0x04, DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - SUBL $0x02, DI - LEAL 2(R10), R10 - -matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x01 - JL match_nolit_end_encodeSnappyBlockAsm8B - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm8B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(DX)(CX*1), DI - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(CX), R9 - LEAQ 24(SP)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 24(SP)(R8*4) - MOVL CX, (R10) - CMPL (DX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm8B - INCL CX - JMP search_loop_encodeSnappyBlockAsm8B - -emit_remainder_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JLE check_maxskip_ok_encodeSnappyBetterBlockAsm - LEAL 100(CX), SI - JMP check_maxskip_cont_encodeSnappyBetterBlockAsm - -check_maxskip_ok_encodeSnappyBetterBlockAsm: - LEAL 1(CX)(SI*1), SI - -check_maxskip_cont_encodeSnappyBetterBlockAsm: - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm - -candidateS_match_encodeSnappyBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - -match_extend_back_loop_encodeSnappyBetterBlockAsm: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm - -match_extend_back_end_encodeSnappyBetterBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 5(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x01 - JL match_nolit_end_encodeSnappyBetterBlockAsm - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL R12, $0x01 - JG match_length_ok_encodeSnappyBetterBlockAsm - CMPL R8, $0x0000ffff - JLE match_length_ok_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeSnappyBetterBlockAsm - -match_length_ok_encodeSnappyBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -four_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -three_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -two_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -one_byte_match_emit_encodeSnappyBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - -memmove_long_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xff, (AX) - MOVL R8, 1(AX) - LEAL -64(R12), R12 - ADDQ $0x05, AX - CMPL R12, $0x04 - JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVL R8, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm - -emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm64K: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm64K: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm64K - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm64K - -candidateS_match_encodeSnappyBetterBlockAsm64K: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - -match_extend_back_loop_encodeSnappyBetterBlockAsm64K: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K - -match_extend_back_end_encodeSnappyBetterBlockAsm64K: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm64K: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - JZ match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x01 - JL match_nolit_end_encodeSnappyBetterBlockAsm64K - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm64K - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm64K: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -one_byte_match_emit_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - -memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm64K - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm64K - -emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm12B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL 24(SP)(R10*4), SI - MOVL 65560(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B - -candidateS_match_encodeSnappyBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - -match_extend_back_loop_encodeSnappyBetterBlockAsm12B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B - -match_extend_back_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm12B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x01 - JL match_nolit_end_encodeSnappyBetterBlockAsm12B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm12B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -one_byte_match_emit_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm12B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm12B - -emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B - -candidateS_match_encodeSnappyBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - -match_extend_back_loop_encodeSnappyBetterBlockAsm10B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B - -match_extend_back_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm10B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x01 - JL match_nolit_end_encodeSnappyBetterBlockAsm10B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm10B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -one_byte_match_emit_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - CMPL R8, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm10B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm10B - -emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm8B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL 24(SP)(R10*4), SI - MOVL 4120(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B - -candidateS_match_encodeSnappyBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - DECL CX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - -match_extend_back_loop_encodeSnappyBetterBlockAsm8B: - CMPL CX, DI - JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B - MOVB -1(DX)(SI*1), BL - MOVB -1(DX)(CX*1), R8 - CMPB BL, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B - LEAL -1(CX), CX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B - -match_extend_back_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 3(AX)(DI*1), DI - CMPQ DI, (SP) - JL match_dst_size_check_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm8B: - MOVL CX, DI - ADDL $0x04, CX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), R10 - - // matchLen - XORL R12, R12 - CMPL R8, $0x08 - JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - TESTQ R11, R11 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R12), R12 - CMPL R8, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x04 - JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - SUBL $0x04, R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x02 - JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - SUBL $0x02, R8 - LEAL 2(R12), R12 - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x01 - JL match_nolit_end_encodeSnappyBetterBlockAsm8B - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm8B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_match_emit_encodeSnappyBetterBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -one_byte_match_emit_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (AX) - MOVQ R10, -8(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ SI, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: - ADDL R12, CX - ADDL $0x04, R12 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R12, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R12), R12 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R12, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R12*4), R12 - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R12*4), R12 - MOVB R12, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: - CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBetterBlockAsm8B - CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 - MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 - MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm8B - -emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B - CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func emitLiteral(dst []byte, lit []byte) int -// Requires: SSE2 -TEXT ·emitLiteral(SB), NOSPLIT, $0-56 - MOVQ lit_len+32(FP), DX - MOVQ dst_base+0(FP), AX - MOVQ lit_base+24(FP), CX - TESTQ DX, DX - JZ emit_literal_end_standalone_skip - MOVL DX, BX - LEAL -1(DX), SI - CMPL SI, $0x3c - JLT one_byte_standalone - CMPL SI, $0x00000100 - JLT two_bytes_standalone - CMPL SI, $0x00010000 - JLT three_bytes_standalone - CMPL SI, $0x01000000 - JLT four_bytes_standalone - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP memmove_long_standalone - -four_bytes_standalone: - MOVL SI, DI - SHRL $0x10, DI - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB DI, 3(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP memmove_long_standalone - -three_bytes_standalone: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP memmove_long_standalone - -two_bytes_standalone: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_standalone - JMP memmove_long_standalone - -one_byte_standalone: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, BX - ADDQ $0x01, AX - -memmove_standalone: - // genMemMoveShort - CMPQ DX, $0x03 - JB emit_lit_memmove_standalone_memmove_move_1or2 - JE emit_lit_memmove_standalone_memmove_move_3 - CMPQ DX, $0x08 - JB emit_lit_memmove_standalone_memmove_move_4through7 - CMPQ DX, $0x10 - JBE emit_lit_memmove_standalone_memmove_move_8through16 - CMPQ DX, $0x20 - JBE emit_lit_memmove_standalone_memmove_move_17through32 - JMP emit_lit_memmove_standalone_memmove_move_33through64 - -emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(DX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(DX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(DX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(DX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -memmove_long_standalone: - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVQ DX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_standalonelarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_standalonelarge_big_loop_back - -emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ DX, R8 - JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -emit_literal_end_standalone_skip: - XORQ BX, BX - -emit_literal_end_standalone: - MOVQ BX, ret+48(FP) - RET - -// func emitRepeat(dst []byte, offset int, length int) int -TEXT ·emitRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitRepeat -emit_repeat_again_standalone: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone - -cant_repeat_two_offset_standalone: - CMPL DX, $0x00000104 - JLT repeat_three_standalone - CMPL DX, $0x00010100 - JLT repeat_four_standalone - CMPL DX, $0x0100ffff - JLT repeat_five_standalone - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone - -repeat_five_standalone: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_repeat_end - -repeat_four_standalone: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_repeat_end - -repeat_three_standalone: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_repeat_end - -repeat_two_standalone: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_repeat_end - -repeat_two_offset_standalone: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - -gen_emit_repeat_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopy(dst []byte, offset int, length int) int -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JL two_byte_offset_standalone - -four_bytes_loop_back_standalone: - CMPL DX, $0x40 - JLE four_bytes_remain_standalone - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JL four_bytes_remain_standalone - - // emitRepeat -emit_repeat_again_standalone_emit_copy: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone_emit_copy - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone_emit_copy - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone_emit_copy - -cant_repeat_two_offset_standalone_emit_copy: - CMPL DX, $0x00000104 - JLT repeat_three_standalone_emit_copy - CMPL DX, $0x00010100 - JLT repeat_four_standalone_emit_copy - CMPL DX, $0x0100ffff - JLT repeat_five_standalone_emit_copy - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy - -repeat_five_standalone_emit_copy: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - JMP four_bytes_loop_back_standalone - -four_bytes_remain_standalone: - TESTL DX, DX - JZ gen_emit_copy_end - MOVB $0x03, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -two_byte_offset_standalone: - CMPL DX, $0x40 - JLE two_byte_offset_short_standalone - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - - // emitRepeat -emit_repeat_again_standalone_emit_copy_short: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JLE repeat_two_standalone_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_standalone_emit_copy_short - CMPL CX, $0x00000800 - JLT repeat_two_offset_standalone_emit_copy_short - -cant_repeat_two_offset_standalone_emit_copy_short: - CMPL DX, $0x00000104 - JLT repeat_three_standalone_emit_copy_short - CMPL DX, $0x00010100 - JLT repeat_four_standalone_emit_copy_short - CMPL DX, $0x0100ffff - JLT repeat_five_standalone_emit_copy_short - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short - -repeat_five_standalone_emit_copy_short: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - JMP two_byte_offset_standalone - -two_byte_offset_short_standalone: - CMPL DX, $0x0c - JGE emit_copy_three_standalone - CMPL CX, $0x00000800 - JGE emit_copy_three_standalone - MOVB $0x01, SI - LEAL -16(SI)(DX*4), DX - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -emit_copy_three_standalone: - MOVB $0x02, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopyNoRepeat(dst []byte, offset int, length int) int -TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JL two_byte_offset_standalone_snappy - -four_bytes_loop_back_standalone_snappy: - CMPL DX, $0x40 - JLE four_bytes_remain_standalone_snappy - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JL four_bytes_remain_standalone_snappy - JMP four_bytes_loop_back_standalone_snappy - -four_bytes_remain_standalone_snappy: - TESTL DX, DX - JZ gen_emit_copy_end_snappy - MOVB $0x03, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end_snappy - -two_byte_offset_standalone_snappy: - CMPL DX, $0x40 - JLE two_byte_offset_short_standalone_snappy - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - JMP two_byte_offset_standalone_snappy - -two_byte_offset_short_standalone_snappy: - CMPL DX, $0x0c - JGE emit_copy_three_standalone_snappy - CMPL CX, $0x00000800 - JGE emit_copy_three_standalone_snappy - MOVB $0x01, SI - LEAL -16(SI)(DX*4), DX - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end_snappy - -emit_copy_three_standalone_snappy: - MOVB $0x02, SI - LEAL -4(SI)(DX*4), DX - MOVB DL, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end_snappy: - MOVQ BX, ret+40(FP) - RET - -// func matchLen(a []byte, b []byte) int -// Requires: BMI -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - CMPL DX, $0x08 - JL matchlen_match4_standalone - -matchlen_loopback_standalone: - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - TESTQ BX, BX - JZ matchlen_loop_standalone - TZCNTQ BX, BX - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_loop_standalone: - LEAL -8(DX), DX - LEAL 8(SI), SI - CMPL DX, $0x08 - JGE matchlen_loopback_standalone - JZ gen_match_len_end - -matchlen_match4_standalone: - CMPL DX, $0x04 - JL matchlen_match2_standalone - MOVL (AX)(SI*1), BX - CMPL (CX)(SI*1), BX - JNE matchlen_match2_standalone - SUBL $0x04, DX - LEAL 4(SI), SI - -matchlen_match2_standalone: - CMPL DX, $0x02 - JL matchlen_match1_standalone - MOVW (AX)(SI*1), BX - CMPW (CX)(SI*1), BX - JNE matchlen_match1_standalone - SUBL $0x02, DX - LEAL 2(SI), SI - -matchlen_match1_standalone: - CMPL DX, $0x01 - JL gen_match_len_end - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - LEAL 1(SI), SI - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET