diff --git a/huff0/autogen.go b/huff0/autogen.go new file mode 100644 index 0000000000..110883f13f --- /dev/null +++ b/huff0/autogen.go @@ -0,0 +1,4 @@ +package huff0 + +//go:generate go run generate.go +//go:generate asmfmt -w decompress_amd64.s diff --git a/huff0/bitreader.go b/huff0/bitreader.go index 03562db16f..451160edda 100644 --- a/huff0/bitreader.go +++ b/huff0/bitreader.go @@ -165,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 { return uint16(b.value >> ((64 - n) & 63)) } +// peekTopBits(n) is equvialent to peekBitFast(64 - n) +func (b *bitReaderShifted) peekTopBits(n uint8) uint16 { + return uint16(b.value >> n) +} + func (b *bitReaderShifted) advance(n uint8) { b.bitsRead += n b.value <<= n & 63 diff --git a/huff0/decompress.go b/huff0/decompress.go index 3ae7d46771..04f6529955 100644 --- a/huff0/decompress.go +++ b/huff0/decompress.go @@ -725,189 +725,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { return dst, br.close() } -// Decompress4X will decompress a 4X encoded stream. -// The length of the supplied input must match the end of a block exactly. -// The *capacity* of the dst slice must match the destination size of -// the uncompressed data exactly. -func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { - if len(d.dt.single) == 0 { - return nil, errors.New("no table loaded") - } - if len(src) < 6+(4*1) { - return nil, errors.New("input too small") - } - if use8BitTables && d.actualTableLog <= 8 { - return d.decompress4X8bit(dst, src) - } - - var br [4]bitReaderShifted - // Decode "jump table" - start := 6 - for i := 0; i < 3; i++ { - length := int(src[i*2]) | (int(src[i*2+1]) << 8) - if start+length >= len(src) { - return nil, errors.New("truncated input (or invalid offset)") - } - err := br[i].init(src[start : start+length]) - if err != nil { - return nil, err - } - start += length - } - err := br[3].init(src[start:]) - if err != nil { - return nil, err - } - - // destination, offset to match first output - dstSize := cap(dst) - dst = dst[:dstSize] - out := dst - dstEvery := (dstSize + 3) / 4 - - const tlSize = 1 << tableLogMax - const tlMask = tlSize - 1 - single := d.dt.single[:tlSize] - - // Use temp table to avoid bound checks/append penalty. - buf := d.buffer() - var off uint8 - var decoded int - - // Decode 2 values from each decoder/loop. - const bufoff = 256 - for { - if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { - break - } - - { - const stream = 0 - const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - v2 := single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[stream][off] = uint8(v.entry >> 8) - buf[stream2][off] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - v2 = single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[stream][off+1] = uint8(v.entry >> 8) - buf[stream2][off+1] = uint8(v2.entry >> 8) - } - - { - const stream = 2 - const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - v2 := single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[stream][off] = uint8(v.entry >> 8) - buf[stream2][off] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - v2 = single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[stream][off+1] = uint8(v.entry >> 8) - buf[stream2][off+1] = uint8(v2.entry >> 8) - } - - off += 2 - - if off == 0 { - if bufoff > dstEvery { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 1") - } - copy(out, buf[0][:]) - copy(out[dstEvery:], buf[1][:]) - copy(out[dstEvery*2:], buf[2][:]) - copy(out[dstEvery*3:], buf[3][:]) - out = out[bufoff:] - decoded += bufoff * 4 - // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 2") - } - } - } - if off > 0 { - ioff := int(off) - if len(out) < dstEvery*3+ioff { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 3") - } - copy(out, buf[0][:off]) - copy(out[dstEvery:], buf[1][:off]) - copy(out[dstEvery*2:], buf[2][:off]) - copy(out[dstEvery*3:], buf[3][:off]) - decoded += int(off) * 4 - out = out[off:] - } - - // Decode remaining. - remainBytes := dstEvery - (decoded / 4) - for i := range br { - offset := dstEvery * i - endsAt := offset + remainBytes - if endsAt > len(out) { - endsAt = len(out) - } - br := &br[i] - bitsLeft := br.remaining() - for bitsLeft > 0 { - br.fill() - if offset >= endsAt { - d.bufs.Put(buf) - return nil, errors.New("corruption detected: stream overrun 4") - } - - // Read value and increment offset. - val := br.peekBitsFast(d.actualTableLog) - v := single[val&tlMask].entry - nBits := uint8(v) - br.advance(nBits) - bitsLeft -= uint(nBits) - out[offset] = uint8(v >> 8) - offset++ - } - if offset != endsAt { - d.bufs.Put(buf) - return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) - } - decoded += offset - dstEvery*i - err = br.close() - if err != nil { - return nil, err - } - } - d.bufs.Put(buf) - if dstSize != decoded { - return nil, errors.New("corruption detected: short output block") - } - return dst, nil -} - // Decompress4X will decompress a 4X encoded stream. // The length of the supplied input must match the end of a block exactly. // The *capacity* of the dst slice must match the destination size of diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go new file mode 100644 index 0000000000..fa83f78040 --- /dev/null +++ b/huff0/decompress_amd64.go @@ -0,0 +1,166 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// This file contains the specialisation of Decoder.Decompress4X +// that uses an asm implementation of its main loop. +package huff0 + +import ( + "errors" + "fmt" +) + +// decompress4x_main_loop_x86 is an x86 assembler implementation +// of Decompress4X that uses BMI1 instructions. +// go:noescape +func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, + peekBits uint8, buf *byte, tbl *dEntrySingle) uint8 + +// Decompress4X will decompress a 4X encoded stream. +// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. +func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + buf := d.buffer() + var off uint8 + var decoded int + + const debug = false + + // see: bitReaderShifted.peekBitsFast() + peekBits := uint8((64 - d.actualTableLog) & 63) + + // Decode 2 values from each decoder/loop. + const bufoff = 256 + for { + if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { + break + } + + off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0]) + if debug { + fmt.Print("DEBUG: ") + fmt.Printf("off=%d,", off) + for i := 0; i < 4; i++ { + fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}", + i, br[i].bitsRead, br[i].value, br[i].off) + } + fmt.Println("") + } + + if off != 0 { + break + } + + if bufoff > dstEvery { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 1") + } + copy(out, buf[0][:]) + copy(out[dstEvery:], buf[1][:]) + copy(out[dstEvery*2:], buf[2][:]) + copy(out[dstEvery*3:], buf[3][:]) + out = out[bufoff:] + decoded += bufoff * 4 + // There must at least be 3 buffers left. + if len(out) < dstEvery*3 { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 2") + } + } + if off > 0 { + ioff := int(off) + if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 3") + } + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) + decoded += int(off) * 4 + out = out[off:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + d.bufs.Put(buf) + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s new file mode 100644 index 0000000000..6d883c60dc --- /dev/null +++ b/huff0/decompress_amd64.s @@ -0,0 +1,352 @@ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + +#define bufoff 256 // see decompress.go, we're using [4][256]byte table + +// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, +// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) +TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 +#define off R8 +#define buffer DI +#define table SI + +#define br_bits_read R9 +#define br_value R10 +#define br_offset R11 +#define peek_bits R12 +#define exhausted DX + +#define br0 R13 +#define br1 R14 +#define br2 R15 +#define br3 BP + + MOVQ BP, 0(SP) + + XORQ exhausted, exhausted // exhausted = false + XORQ off, off // off = 0 + + MOVBQZX peekBits+32(FP), peek_bits + MOVQ buf+40(FP), buffer + MOVQ tbl+48(FP), table + + MOVQ pbr0+0(FP), br0 + MOVQ pbr1+8(FP), br1 + MOVQ pbr2+16(FP), br2 + MOVQ pbr3+24(FP), br3 + +main_loop: + + // const stream = 0 + // br0.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read + MOVQ bitReaderShifted_value(br0), br_value + MOVQ bitReaderShifted_off(br0), br_offset + + // if b.bitsRead >= 32 { + CMPQ br_bits_read, $32 + JB skip_fill0 + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br0), AX + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVQ br_bits_read, CX + SHLQ CL, AX + ORQ AX, br_value + + // } +skip_fill0: + + // exhausted = exhausted || (br0.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + + // val0 := br0.peekTopBits(peekBits) + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br0.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // val1 := br0.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br0.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + MOVBQZX AL, CX + SHLQ CX, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, 0(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br0) + MOVQ br_value, bitReaderShifted_value(br0) + MOVQ br_offset, bitReaderShifted_off(br0) + + // const stream = 1 + // br1.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read + MOVQ bitReaderShifted_value(br1), br_value + MOVQ bitReaderShifted_off(br1), br_offset + + // if b.bitsRead >= 32 { + CMPQ br_bits_read, $32 + JB skip_fill1 + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br1), AX + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVQ br_bits_read, CX + SHLQ CL, AX + ORQ AX, br_value + + // } +skip_fill1: + + // exhausted = exhausted || (br1.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + + // val0 := br1.peekTopBits(peekBits) + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br1.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // val1 := br1.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br1.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + MOVBQZX AL, CX + SHLQ CX, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, 256(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br1) + MOVQ br_value, bitReaderShifted_value(br1) + MOVQ br_offset, bitReaderShifted_off(br1) + + // const stream = 2 + // br2.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read + MOVQ bitReaderShifted_value(br2), br_value + MOVQ bitReaderShifted_off(br2), br_offset + + // if b.bitsRead >= 32 { + CMPQ br_bits_read, $32 + JB skip_fill2 + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br2), AX + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVQ br_bits_read, CX + SHLQ CL, AX + ORQ AX, br_value + + // } +skip_fill2: + + // exhausted = exhausted || (br2.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + + // val0 := br2.peekTopBits(peekBits) + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br2.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // val1 := br2.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br2.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + MOVBQZX AL, CX + SHLQ CX, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, 512(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br2) + MOVQ br_value, bitReaderShifted_value(br2) + MOVQ br_offset, bitReaderShifted_off(br2) + + // const stream = 3 + // br3.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read + MOVQ bitReaderShifted_value(br3), br_value + MOVQ bitReaderShifted_off(br3), br_offset + + // if b.bitsRead >= 32 { + CMPQ br_bits_read, $32 + JB skip_fill3 + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br3), AX + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVQ br_bits_read, CX + SHLQ CL, AX + ORQ AX, br_value + + // } +skip_fill3: + + // exhausted = exhausted || (br3.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + + // val0 := br3.peekTopBits(peekBits) + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br3.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // val1 := br3.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br3.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + MOVBQZX AL, CX + SHLQ CX, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, 768(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br3) + MOVQ br_value, bitReaderShifted_value(br3) + MOVQ br_offset, bitReaderShifted_off(br3) + + ADDQ $2, off // off += 2 + + TESTB DH, DH // any br[i].ofs < 4? + JNZ end + + CMPQ off, $bufoff + JL main_loop + +end: + MOVQ 0(SP), BP + + MOVB off, ret+56(FP) + RET + +#undef off +#undef buffer +#undef table + +#undef br_bits_read +#undef br_value +#undef br_offset +#undef peek_bits +#undef exhausted + +#undef br0 +#undef br1 +#undef br2 +#undef br3 diff --git a/huff0/decompress_amd64.s.in b/huff0/decompress_amd64.s.in new file mode 100644 index 0000000000..34bb290b63 --- /dev/null +++ b/huff0/decompress_amd64.s.in @@ -0,0 +1,162 @@ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + + +#define bufoff 256 // see decompress.go, we're using [4][256]byte table + +//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, +// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool) +TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8 +#define off R8 +#define buffer DI +#define table SI + +#define br_bits_read R9 +#define br_value R10 +#define br_offset R11 +#define peek_bits R12 +#define exhausted DX + +#define br0 R13 +#define br1 R14 +#define br2 R15 +#define br3 BP + + MOVQ BP, 0(SP) + + XORQ exhausted, exhausted // exhausted = false + XORQ off, off // off = 0 + + MOVBQZX peekBits+32(FP), peek_bits + MOVQ buf+40(FP), buffer + MOVQ tbl+48(FP), table + + MOVQ pbr0+0(FP), br0 + MOVQ pbr1+8(FP), br1 + MOVQ pbr2+16(FP), br2 + MOVQ pbr3+24(FP), br3 + +main_loop: +{{ define "decode_2_values_x86" }} + // const stream = {{ var "id" }} + // br{{ var "id"}}.fillFast() + MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read + MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value + MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset + + // if b.bitsRead >= 32 { + CMPQ br_bits_read, $32 + JB skip_fill{{ var "id" }} + + SUBQ $32, br_bits_read // b.bitsRead -= 32 + SUBQ $4, br_offset // b.off -= 4 + + // v := b.in[b.off-4 : b.off] + // v = v[:4] + // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + MOVQ bitReaderShifted_in(br{{ var "id" }}), AX + MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVQ br_bits_read, CX + SHLQ CL, AX + ORQ AX, br_value + // } +skip_fill{{ var "id" }}: + + // exhausted = exhausted || (br{{ var "id"}}.off < 4) + CMPQ br_offset, $4 + SETLT DL + ORB DL, DH + + // val0 := br{{ var "id"}}.peekTopBits(peekBits) + MOVQ br_value, AX + MOVQ peek_bits, CX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v0 := table[val0&mask] + MOVW 0(table)(AX*2), AX // AX - v0 + + // br{{ var "id"}}.advance(uint8(v0.entry)) + MOVB AH, BL // BL = uint8(v0.entry >> 8) + MOVBQZX AL, CX + SHLQ CL, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + // val1 := br{{ var "id"}}.peekTopBits(peekBits) + MOVQ peek_bits, CX + MOVQ br_value, AX + SHRQ CL, AX // AX = (value >> peek_bits) & mask + + // v1 := table[val1&mask] + MOVW 0(table)(AX*2), AX // AX - v1 + + // br{{ var "id"}}.advance(uint8(v1.entry)) + MOVB AH, BH // BH = uint8(v1.entry >> 8) + MOVBQZX AL, CX + SHLQ CX, br_value // value <<= n + ADDQ CX, br_bits_read // bits_read += n + + + // these two writes get coalesced + // buf[stream][off] = uint8(v0.entry >> 8) + // buf[stream][off+1] = uint8(v1.entry >> 8) + MOVW BX, {{ var "bufofs" }}(buffer)(off*1) + + // update the bitrader reader structure + MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }}) + MOVQ br_value, bitReaderShifted_value(br{{ var "id" }}) + MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }}) +{{ end }} + + {{ set "id" "0" }} + {{ set "ofs" "0" }} + {{ set "bufofs" "0" }} {{/* id * bufoff */}} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "1" }} + {{ set "ofs" "8" }} + {{ set "bufofs" "256" }} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "2" }} + {{ set "ofs" "16" }} + {{ set "bufofs" "512" }} + {{ template "decode_2_values_x86" . }} + + {{ set "id" "3" }} + {{ set "ofs" "24" }} + {{ set "bufofs" "768" }} + {{ template "decode_2_values_x86" . }} + + ADDQ $2, off // off += 2 + + TESTB DH, DH // any br[i].ofs < 4? + JNZ end + + CMPQ off, $bufoff + JL main_loop +end: + MOVQ 0(SP), BP + + MOVB off, ret+56(FP) + RET +#undef off +#undef buffer +#undef table + +#undef br_bits_read +#undef br_value +#undef br_offset +#undef peek_bits +#undef exhausted + +#undef br0 +#undef br1 +#undef br2 +#undef br3 diff --git a/huff0/decompress_generic.go b/huff0/decompress_generic.go new file mode 100644 index 0000000000..126b4d68a9 --- /dev/null +++ b/huff0/decompress_generic.go @@ -0,0 +1,193 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// This file contains a generic implementation of Decoder.Decompress4X. +package huff0 + +import ( + "errors" + "fmt" +) + +// Decompress4X will decompress a 4X encoded stream. +// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. +func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + buf := d.buffer() + var off uint8 + var decoded int + + // Decode 2 values from each decoder/loop. + const bufoff = 256 + for { + if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { + break + } + + { + const stream = 0 + const stream2 = 1 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + { + const stream = 2 + const stream2 = 3 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + off += 2 + + if off == 0 { + if bufoff > dstEvery { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 1") + } + copy(out, buf[0][:]) + copy(out[dstEvery:], buf[1][:]) + copy(out[dstEvery*2:], buf[2][:]) + copy(out[dstEvery*3:], buf[3][:]) + out = out[bufoff:] + decoded += bufoff * 4 + // There must at least be 3 buffers left. + if len(out) < dstEvery*3 { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 2") + } + } + } + if off > 0 { + ioff := int(off) + if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 3") + } + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) + decoded += int(off) * 4 + out = out[off:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + d.bufs.Put(buf) + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} diff --git a/huff0/generate.go b/huff0/generate.go new file mode 100644 index 0000000000..9415cae393 --- /dev/null +++ b/huff0/generate.go @@ -0,0 +1,74 @@ +//go:build ignore +// +build ignore + +package main + +import ( + "log" + "os" + "path" + "text/template" +) + +func main() { + mapping := []struct { + template string + output string + }{{ + template: "decompress_amd64.s.in", + output: "decompress_amd64.s", + }, + } + + for i := range mapping { + + state := make(map[string]string) + + funcMap := template.FuncMap{ + "var": func(name string) string { return state[name] }, + "set": func(name, value string) string { + state[name] = value + return "" + }, + } + + input := mapping[i].template + output := mapping[i].output + if !shouldRegenerate(input, output) { + log.Printf("%q is up to date", output) + continue + } + + tmpl, err := template.New(path.Base(input)).Funcs(funcMap).ParseFiles(input) + die(err) + + f, err := os.Create(output) + die(err) + defer f.Close() + + log.Printf("Generating %q from %q", output, input) + err = tmpl.Execute(f, nil) + die(err) + } +} + +func die(err error) { + if err != nil { + log.Fatal(err) + os.Exit(1) + } +} + +func shouldRegenerate(srcpath, dstpath string) bool { + src, err1 := os.Stat(srcpath) + if err1 != nil { + return true // I/O errors will be rediscovered later + } + + dst, err2 := os.Stat(dstpath) + if err2 != nil { + return true + } + + return src.ModTime().After(dst.ModTime()) +}