Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zstd: Improve decoder memcopy #637

Merged
merged 2 commits into from Jul 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
124 changes: 68 additions & 56 deletions zstd/_generate/gen.go
Expand Up @@ -1132,7 +1132,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
JZ(LabelRef("check_offset"))
// TODO: Investigate if it is possible to consistently overallocate literals.
if e.safeMem {
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
e.copyMemoryPrecise("1", c.literals, c.outBase, ll, 1)
} else {
e.copyMemoryND("1", c.literals, c.outBase, ll)
ADDQ(ll, c.literals)
Expand Down Expand Up @@ -1194,14 +1194,15 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
}
SUBQ(v, ptr) // ptr := &hist[len(hist) - v]
CMPQ(ml, v)
JGE(LabelRef("copy_all_from_history"))
JG(LabelRef("copy_all_from_history"))
/* if ml <= v {
copy(out[outPosition:], hist[start:start+seq.ml])
t += seq.ml
continue
}
*/
e.copyMemoryPrecise("4", ptr, c.outBase, ml)
// We know ml will be at least 3, since we didn't copy anything yet.
e.copyMemoryPrecise("4", ptr, c.outBase, ml, 3)
ADDQ(ml, c.outPosition)
// Note: for the current go tests this branch is taken in 99.53% cases,
// this is why we repeat a little code here.
Expand All @@ -1217,18 +1218,16 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
seq.ml -= v
}
*/
e.copyMemoryPrecise("5", ptr, c.outBase, v)
e.copyMemoryPrecise("5", ptr, c.outBase, v, 1)
ADDQ(v, c.outPosition)
SUBQ(v, ml)
// fallback to the next block
	// ml cannot be 0, since we only jump here if ml > v.
// Copy rest from current block.
}

Comment("Copy match from the current buffer")
Label("copy_match")
{
TESTQ(ml, ml)
JZ(LabelRef("handle_loop"))

src := GP64()
MOVQ(c.outBase, src)
SUBQ(mo, src) // src = &s.out[t - mo]
Expand All @@ -1251,7 +1250,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
{
ADDQ(ml, c.outPosition)
if e.safeMem {
e.copyMemoryPrecise("2", src, c.outBase, ml)
e.copyMemoryPrecise("2", src, c.outBase, ml, 1)
} else {
dst := GP64()
MOVQ(c.outBase, dst)
Expand Down Expand Up @@ -1311,7 +1310,16 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
// copyMemoryPrecise will copy memory in blocks of 16 bytes,
// without overreading. It adds length to src and dst,
// preserving length.
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual, minLength int) {
assert(func(ok LabelRef) {
// if length >= minLength, ok
CMPQ(length, U8(minLength))
JAE(ok)
})
if minLength == 0 {
TESTQ(length, length)
JZ(LabelRef("copy_" + suffix + "_end"))
}
n := GP64()
MOVQ(length, n)
SUBQ(U8(16), n)
Expand Down Expand Up @@ -1346,53 +1354,57 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV
}

Label("copy_" + suffix + "_small")
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}

tmp := GP64()
XORQ(ofs, ofs)

Label("copy_" + suffix + "_byte")
TESTQ(U32(0x1), length)
JZ(LabelRef("copy_" + suffix + "_word"))

// copy one byte if length & 0x01 != 0
MOVB(s, tmp.As8())
MOVB(tmp.As8(), d)
ADDQ(U8(1), ofs)

Label("copy_" + suffix + "_word")
TESTQ(U32(0x2), length)
JZ(LabelRef("copy_" + suffix + "_dword"))

// copy two bytes if length & 0x02 != 0
MOVW(s, tmp.As16())
MOVW(tmp.As16(), d)
ADDQ(U8(2), ofs)

Label("copy_" + suffix + "_dword")
TESTQ(U32(0x4), length)
JZ(LabelRef("copy_" + suffix + "_qword"))

// copy four bytes if length & 0x04 != 0
MOVL(s, tmp.As32())
MOVL(tmp.As32(), d)
ADDQ(U8(4), ofs)

Label("copy_" + suffix + "_qword")
TESTQ(U32(0x8), length)
JZ(LabelRef("copy_" + suffix + "_add"))

// copy eight bytes if length & 0x08 != 0
MOVQ(s, tmp)
MOVQ(tmp, d)
ADDQ(U8(8), ofs)

Label("copy_" + suffix + "_add")
ADDQ(length, dst)
ADDQ(length, src)
{
name := "copy_" + suffix + "_"
end := LabelRef("copy_" + suffix + "_end")
CMPQ(length, U8(3))
JE(LabelRef(name + "move_3"))
if minLength < 3 {
JB(LabelRef(name + "move_1or2"))
}
CMPQ(length, U8(8))
JB(LabelRef(name + "move_4through7"))
JMP(LabelRef(name + "move_8through16"))
AX, CX := GP64(), GP64()

if minLength < 3 {
Label(name + "move_1or2")
MOVB(Mem{Base: src}, AX.As8())
MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
MOVB(AX.As8(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)
}

Label(name + "move_3")
MOVW(Mem{Base: src}, AX.As16())
MOVB(Mem{Base: src, Disp: 2}, CX.As8())
MOVW(AX.As16(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)

Label(name + "move_4through7")
MOVL(Mem{Base: src}, AX.As32())
MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
MOVL(AX.As32(), Mem{Base: dst})
MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)

Label(name + "move_8through16")
MOVQ(Mem{Base: src}, AX)
MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
MOVQ(AX, Mem{Base: dst})
MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
ADDQ(length, src)
ADDQ(length, dst)
JMP(end)
}
Label("copy_" + suffix + "_end")
}

Expand Down