Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[huff0] Add x86 specialisation of Decode4X #512

Merged
merged 1 commit into from Mar 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions huff0/autogen.go
@@ -0,0 +1,4 @@
package huff0

//go:generate go run generate.go
//go:generate asmfmt -w decompress_amd64.s
5 changes: 5 additions & 0 deletions huff0/bitreader.go
Expand Up @@ -165,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}

// peekTopBits(n) is equvialent to peekBitFast(64 - n)
func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
return uint16(b.value >> n)
}

func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
Expand Down
183 changes: 0 additions & 183 deletions huff0/decompress.go
Expand Up @@ -725,189 +725,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
return dst, br.close()
}

// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress4X8bit(dst, src)
}

var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}

// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4

const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int

// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}

{
const stream = 0
const stream2 = 1
br[stream].fillFast()
br[stream2].fillFast()

val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)

val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}

{
const stream = 2
const stream2 = 3
br[stream].fillFast()
br[stream2].fillFast()

val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)

val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}

off += 2

if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}

// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}

// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}

// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
Expand Down
166 changes: 166 additions & 0 deletions huff0/decompress_amd64.go
@@ -0,0 +1,166 @@
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc

// This file contains the specialisation of Decoder.Decompress4X
// that uses an asm implementation of its main loop.
package huff0

import (
"errors"
"fmt"
)

// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X that uses BMI1 instructions.
// go:noescape
func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
peekBits uint8, buf *byte, tbl *dEntrySingle) uint8

// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress4X8bit(dst, src)
}

var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}

// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4

const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int

const debug = false

// see: bitReaderShifted.peekBitsFast()
peekBits := uint8((64 - d.actualTableLog) & 63)

// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}

off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
if debug {
fmt.Print("DEBUG: ")
fmt.Printf("off=%d,", off)
for i := 0; i < 4; i++ {
fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
i, br[i].bitsRead, br[i].value, br[i].off)
}
fmt.Println("")
}

if off != 0 {
break
}

if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}

// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}

// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}