Skip to content

Commit

Permalink
s2: Improve matching (#484)
Browse files Browse the repository at this point in the history
Improve end-of-buffer speed.

Add a `goamd64_v3` version with a small improvement for matching. For now, it must be enabled explicitly by setting the `goamd64_v3` build tag.

For now, this is guarded by build tags to avoid duplicating all the code.

```
benchmark                                                                 old ns/op      new ns/op      delta
BenchmarkTwainEncode1e1/default-32                                        8.32           8.28           -0.49%
BenchmarkTwainEncode1e1/better-32                                         8.36           8.32           -0.53%
BenchmarkTwainEncode1e1/snappy-default-32                                 8.34           8.32           -0.17%
BenchmarkTwainEncode1e1/snappy-better-32                                  8.31           8.31           +0.00%
BenchmarkTwainEncode1e1/snappy-ref-noasm-32                               7.61           7.62           +0.22%
BenchmarkTwainEncode1e2/default-32                                        94.4           93.8           -0.70%
BenchmarkTwainEncode1e2/better-32                                         273            269            -1.36%
BenchmarkTwainEncode1e2/snappy-default-32                                 94.7           93.6           -1.17%
BenchmarkTwainEncode1e2/snappy-better-32                                  273            268            -1.58%
BenchmarkTwainEncode1e2/snappy-ref-noasm-32                               471            469            -0.25%
BenchmarkTwainEncode1e3/default-32                                        872            867            -0.62%
BenchmarkTwainEncode1e3/better-32                                         2416           2403           -0.54%
BenchmarkTwainEncode1e3/snappy-default-32                                 869            862            -0.84%
BenchmarkTwainEncode1e3/snappy-better-32                                  2415           2402           -0.54%
BenchmarkTwainEncode1e3/snappy-ref-noasm-32                               2317           2328           +0.47%
BenchmarkTwainEncode1e4/default-32                                        10080          9862           -2.16%
BenchmarkTwainEncode1e4/better-32                                         24173          23778          -1.63%
BenchmarkTwainEncode1e4/snappy-default-32                                 10038          9900           -1.37%
BenchmarkTwainEncode1e4/snappy-better-32                                  24088          23655          -1.80%
BenchmarkTwainEncode1e4/snappy-ref-noasm-32                               25050          24941          -0.44%
BenchmarkTwainEncode1e5/default-32                                        208338         204080         -2.04%
BenchmarkTwainEncode1e5/better-32                                         400069         382699         -4.34%
BenchmarkTwainEncode1e5/snappy-default-32                                 207783         200382         -3.56%
BenchmarkTwainEncode1e5/snappy-better-32                                  388589         378026         -2.72%
BenchmarkTwainEncode1e5/snappy-ref-noasm-32                               487332         484808         -0.52%
BenchmarkTwainEncode1e6/default-32                                        2305542        2251826        -2.33%
BenchmarkTwainEncode1e6/better-32                                         4023332        3904791        -2.95%
BenchmarkTwainEncode1e6/snappy-default-32                                 2300992        2179567        -5.28%
BenchmarkTwainEncode1e6/snappy-better-32                                  3938222        3879487        -1.49%
BenchmarkTwainEncode1e6/snappy-ref-noasm-32                               4890432        4866709        -0.49%
BenchmarkTwainEncode1e7/default-32                                        23717990       22395276       -5.58%
BenchmarkTwainEncode1e7/better-32                                         42845300       42508469       -0.79%
BenchmarkTwainEncode1e7/snappy-default-32                                 23335686       22315622       -4.37%
BenchmarkTwainEncode1e7/snappy-better-32                                  42227550       41652074       -1.36%
BenchmarkTwainEncode1e7/snappy-ref-noasm-32                               51418814       51197981       -0.43%
```
  • Loading branch information
klauspost committed Feb 1, 2022
1 parent 60b19fa commit a1a9cfc
Show file tree
Hide file tree
Showing 6 changed files with 17,142 additions and 289 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/go.yml
Expand Up @@ -31,6 +31,11 @@ jobs:
- name: Test
run: go test ./...

- name: Test S2 GOAMD64 v3
env:
GOAMD64: v3
run: go test -tags=goamd64_v3 ./s2/...

- name: Test Noasm
run: go test -tags=noasm ./...

Expand Down
64 changes: 55 additions & 9 deletions s2/_generate/gen.go
@@ -1,8 +1,13 @@
package main

//go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2
//go:generate gofmt -w ../encodeblock_amd64.go

//go:generate go run gen.go -x64v3 -out ../encodeblock_v3_amd64.s -stubs ../encodeblock_v3_amd64.go -pkg=s2
//go:generate gofmt -w ../encodeblock_v3_amd64.go

import (
"flag"
"fmt"
"math"
"runtime"
Expand All @@ -26,12 +31,23 @@ const (
limit8B = 512 - 1
)

var x64v3 = flag.Bool("x64v3", false, "Generate for amd64-v3")

func main() {
flag.Parse()
Constraint(buildtags.Not("appengine").ToConstraint())
Constraint(buildtags.Not("noasm").ToConstraint())
Constraint(buildtags.Term("gc").ToConstraint())
if *x64v3 {
Constraint(buildtags.Term("goamd64_v3").ToConstraint())
} else {
Constraint(buildtags.Not("goamd64_v3").ToConstraint())
}
Constraint(buildtags.Not("noasm").ToConstraint())

o := options{
bmi1: *x64v3,
bmi2: *x64v3, // Currently unused....
snappy: false,
outputMargin: 9,
}
Expand Down Expand Up @@ -114,7 +130,8 @@ func assert(fn func(ok LabelRef)) {

type options struct {
snappy bool
vmbi2 bool
bmi1 bool
bmi2 bool
maxLen int
outputMargin int // Should be at least 5.
maxSkip int
Expand Down Expand Up @@ -1550,8 +1567,12 @@ func (h hashGen) hash(val reg.GPVirtual) {
if h.bytes < 8 {
SHLQ(U8(64-8*h.bytes), val)
}
// 329 AMD64 :IMUL r64, r64 L: 0.86ns= 3.0c T: 0.29ns= 1.00c
// 2020 BMI2 :MULX r64, r64, r64 L: 1.14ns= 4.0c T: 0.29ns= 1.00c
IMULQ(h.mulreg, val)
// Move value to bottom
// 2032 BMI2 :SHRX r64, r64, r64 L: 0.29ns= 1.0c T: 0.12ns= 0.42c
// 236 AMD64 :SHR r64, imm8 L: 0.29ns= 1.0c T: 0.13ns= 0.46c
SHRQ(U8(64-h.tablebits), val)
}

Expand Down Expand Up @@ -2482,15 +2503,21 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re
XORL(matched, matched)

CMPL(len.As32(), U8(8))
JL(LabelRef("matchlen_single_" + name))
JL(LabelRef("matchlen_match4_" + name))

Label("matchlen_loopback_" + name)
MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
TESTQ(tmp, tmp)
JZ(LabelRef("matchlen_loop_" + name))
// Not all match.
BSFQ(tmp, tmp)
if o.bmi1 {
// 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c
// 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c
TZCNTQ(tmp, tmp)
} else {
BSFQ(tmp, tmp)
}
SARQ(U8(3), tmp)
LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
JMP(end)
Expand All @@ -2501,18 +2528,37 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re
LEAL(Mem{Base: matched, Disp: 8}, matched)
CMPL(len.As32(), U8(8))
JGE(LabelRef("matchlen_loopback_" + name))
JZ(end)

// Less than 8 bytes left.
Label("matchlen_single_" + name)
TESTL(len.As32(), len.As32())
JZ(end)
Label("matchlen_single_loopback_" + name)
// Test 4 bytes...
Label("matchlen_match4_" + name)
CMPL(len.As32(), U8(4))
JL(LabelRef("matchlen_match2_" + name))
MOVL(Mem{Base: a, Index: matched, Scale: 1}, tmp.As32())
CMPL(Mem{Base: b, Index: matched, Scale: 1}, tmp.As32())
JNE(LabelRef("matchlen_match2_" + name))
SUBL(U8(4), len.As32())
LEAL(Mem{Base: matched, Disp: 4}, matched)

// Test 2 bytes...
Label("matchlen_match2_" + name)
CMPL(len.As32(), U8(2))
JL(LabelRef("matchlen_match1_" + name))
MOVW(Mem{Base: a, Index: matched, Scale: 1}, tmp.As16())
CMPW(Mem{Base: b, Index: matched, Scale: 1}, tmp.As16())
JNE(LabelRef("matchlen_match1_" + name))
SUBL(U8(2), len.As32())
LEAL(Mem{Base: matched, Disp: 2}, matched)

// Test 1 byte...
Label("matchlen_match1_" + name)
CMPL(len.As32(), U8(1))
JL(end)
MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
JNE(end)
LEAL(Mem{Base: matched, Disp: 1}, matched)
DECL(len.As32())
JNZ(LabelRef("matchlen_single_loopback_" + name))
JMP(end)
return matched
}
Expand Down
4 changes: 2 additions & 2 deletions s2/encodeblock_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a1a9cfc

Please sign in to comment.