diff --git a/huff0/autogen.go b/huff0/autogen.go
new file mode 100644
index 0000000000..110883f13f
--- /dev/null
+++ b/huff0/autogen.go
@@ -0,0 +1,4 @@
+package huff0
+
+//go:generate go run generate.go
+//go:generate asmfmt -w decompress_amd64.s
diff --git a/huff0/bitreader.go b/huff0/bitreader.go
index 03562db16f..451160edda 100644
--- a/huff0/bitreader.go
+++ b/huff0/bitreader.go
@@ -165,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
+// peekTopBits(n) is equivalent to peekBitsFast(64 - n) for n < 64.
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+	return uint16(b.value >> n)
+}
+
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
diff --git a/huff0/decompress.go b/huff0/decompress.go
index 3ae7d46771..04f6529955 100644
--- a/huff0/decompress.go
+++ b/huff0/decompress.go
@@ -725,189 +725,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	return dst, br.close()
 }
 
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[0][:])
-			copy(out[dstEvery:], buf[1][:])
-			copy(out[dstEvery*2:], buf[2][:])
-			copy(out[dstEvery*3:], buf[3][:])
-			out = out[bufoff:]
-			decoded += bufoff * 4
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go
new file mode 100644
index 0000000000..fa83f78040
--- /dev/null
+++ b/huff0/decompress_amd64.go
@@ -0,0 +1,166 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// decompress4x_main_loop_x86 is the amd64 assembler implementation of the main loop of
+// Decompress4X (decompress_amd64.s); the result is the per-stream byte count in buf, as a uint8.
+//go:noescape
+func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// Decompress4X will decompress a 4X encoded stream; this amd64 variant runs
+// its main loop in assembly. The length of the supplied input must match the
+// end of a block exactly. The *capacity* of the dst slice must match the
+// destination size of the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table": three 16-bit little-endian lengths for streams 0-2; stream 3 is the rest of src.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// Destination: the full capacity of dst is used; stream i is written starting at out[dstEvery*i].
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	const debug = false
+
+	// Shift count handed to the assembly loop; see bitReaderShifted.peekBitsFast().
+	peekBits := uint8((64 - d.actualTableLog) & 63)
+
+	// Each assembly call decodes 2 values per stream per iteration into buf and returns the fill level.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		if debug {
+			fmt.Print("DEBUG: ")
+			fmt.Printf("off=%d,", off)
+			for i := 0; i < 4; i++ {
+				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
+					i, br[i].bitsRead, br[i].value, br[i].off)
+			}
+			fmt.Println("")
+		}
+
+		if off != 0 { // buf is only partially filled; it is flushed after the loop
+			break
+		}
+
+		if bufoff > dstEvery {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 1")
+		}
+		copy(out, buf[0][:])
+		copy(out[dstEvery:], buf[1][:])
+		copy(out[dstEvery*2:], buf[2][:])
+		copy(out[dstEvery*3:], buf[3][:])
+		out = out[bufoff:]
+		decoded += bufoff * 4
+		// There must at least be 3 buffers left.
+		if len(out) < dstEvery*3 {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 2")
+		}
+	}
+	if off > 0 { // flush the off bytes decoded per stream by the last assembly call
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode the remainder of each stream one symbol at a time, directly into out.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s
new file mode 100644
index 0000000000..6d883c60dc
--- /dev/null
+++ b/huff0/decompress_amd64.s
@@ -0,0 +1,352 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+	MOVQ BP, 0(SP) // BP is used as br3 below; save it
+
+	XORQ exhausted, exhausted // exhausted = false
+	XORQ off, off             // off = 0
+
+	MOVBQZX peekBits+32(FP), peek_bits
+	MOVQ    buf+40(FP), buffer
+	MOVQ    tbl+48(FP), table
+
+	MOVQ pbr0+0(FP), br0
+	MOVQ pbr1+8(FP), br1
+	MOVQ pbr2+16(FP), br2
+	MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+	// const stream = 0
+	// br0.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+	MOVQ    bitReaderShifted_value(br0), br_value
+	MOVQ    bitReaderShifted_off(br0), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill0
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br0), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// }
+skip_fill0:
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 0(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+	MOVQ br_value, bitReaderShifted_value(br0)
+	MOVQ br_offset, bitReaderShifted_off(br0)
+
+	// const stream = 1
+	// br1.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+	MOVQ    bitReaderShifted_value(br1), br_value
+	MOVQ    bitReaderShifted_off(br1), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill1
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br1), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// }
+skip_fill1:
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 256(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+	MOVQ br_value, bitReaderShifted_value(br1)
+	MOVQ br_offset, bitReaderShifted_off(br1)
+
+	// const stream = 2
+	// br2.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+	MOVQ    bitReaderShifted_value(br2), br_value
+	MOVQ    bitReaderShifted_off(br2), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill2
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br2), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// }
+skip_fill2:
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 512(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+	MOVQ br_value, bitReaderShifted_value(br2)
+	MOVQ br_offset, bitReaderShifted_off(br2)
+
+	// const stream = 3
+	// br3.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+	MOVQ    bitReaderShifted_value(br3), br_value
+	MOVQ    bitReaderShifted_off(br3), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill3
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br3), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// }
+skip_fill3:
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = value >> peek_bits
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 768(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+	MOVQ br_value, bitReaderShifted_value(br3)
+	MOVQ br_offset, bitReaderShifted_off(br3)
+
+	ADDQ $2, off // off += 2
+
+	TESTB DH, DH // any br[i].off < 4?
+	JNZ   end
+
+	CMPQ off, $bufoff
+	JL   main_loop
+
+end:
+	MOVQ 0(SP), BP // restore BP
+
+	MOVB off, ret+56(FP) // only the low byte is returned, so off == 256 is reported as 0
+	RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/huff0/decompress_amd64.s.in b/huff0/decompress_amd64.s.in
new file mode 100644
index 0000000000..34bb290b63
--- /dev/null
+++ b/huff0/decompress_amd64.s.in
@@ -0,0 +1,162 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+
+#define bufoff      256     // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+    MOVQ    BP, 0(SP)               // BP is used as br3 below; save it
+
+    XORQ    exhausted, exhausted    // exhausted = false
+    XORQ    off, off                // off = 0
+
+    MOVBQZX peekBits+32(FP), peek_bits
+    MOVQ    buf+40(FP), buffer
+    MOVQ    tbl+48(FP), table
+
+    MOVQ    pbr0+0(FP), br0
+    MOVQ    pbr1+8(FP), br1
+    MOVQ    pbr2+16(FP), br2
+    MOVQ    pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+    // const stream = {{ var "id" }}
+    // br{{ var "id"}}.fillFast()
+    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
+    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+	// if b.bitsRead >= 32 {
+    CMPQ    br_bits_read, $32
+    JB      skip_fill{{ var "id" }}
+
+    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
+    SUBQ    $4, br_offset           // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
+    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+    MOVQ    br_bits_read, CX
+    SHLQ    CL, AX
+    ORQ     AX, br_value
+    // }
+skip_fill{{ var "id" }}:
+
+    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+    CMPQ    br_offset, $4
+    SETLT   DL
+    ORB     DL, DH
+
+    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = value >> peek_bits
+
+    // v0 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = value >> peek_bits
+
+    // v1 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CX, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off] = uint8(v0.entry >> 8)
+    // buf[stream][off+1] = uint8(v1.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
+
+    // update the bit reader structure
+    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
+    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+    {{ set "id" "0" }}
+    {{ set "ofs" "0" }}
+    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "1" }}
+    {{ set "ofs" "8" }}
+    {{ set "bufofs" "256" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "2" }}
+    {{ set "ofs" "16" }}
+    {{ set "bufofs" "512" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "3" }}
+    {{ set "ofs" "24" }}
+    {{ set "bufofs" "768" }}
+    {{ template "decode_2_values_x86" . }}
+
+    ADDQ    $2, off     // off += 2
+
+    TESTB   DH, DH      // any br[i].off < 4?
+    JNZ     end
+
+    CMPQ    off, $bufoff
+    JL      main_loop
+end:
+    MOVQ    0(SP), BP   // restore BP
+
+    MOVB    off, ret+56(FP) // only the low byte is returned, so off == 256 is reported as 0
+    RET
+#undef  off
+#undef  buffer
+#undef  table
+
+#undef  br_bits_read
+#undef  br_value
+#undef  br_offset
+#undef  peek_bits
+#undef  exhausted
+
+#undef  br0
+#undef  br1
+#undef  br2
+#undef  br3
diff --git a/huff0/decompress_generic.go b/huff0/decompress_generic.go
new file mode 100644
index 0000000000..126b4d68a9
--- /dev/null
+++ b/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream (pure Go implementation).
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table": three 16-bit little-endian lengths for streams 0-2; stream 3 is the rest of src.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// Destination: the full capacity of dst is used; stream i is written starting at out[dstEvery*i].
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 { // off is a uint8, so it wraps to 0 after bufoff (256) bytes per stream
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 { // flush the partially filled buffers
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode the remainder of each stream one symbol at a time, directly into out.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/huff0/generate.go b/huff0/generate.go
new file mode 100644
index 0000000000..9415cae393
--- /dev/null
+++ b/huff0/generate.go
@@ -0,0 +1,74 @@
+//go:build ignore
+// +build ignore
+
+package main
+
+import (
+	"log"
+	"os"
+	"path"
+	"text/template"
+)
+
+// main renders every template in mapping to its output file, skipping
+// outputs that are already at least as new as their template.
+func main() {
+	mapping := []struct {
+		template string
+		output   string
+	}{{
+		template: "decompress_amd64.s.in",
+		output:   "decompress_amd64.s",
+	},
+	}
+
+	for i := range mapping {
+		// state backs the "var"/"set" template helpers; fresh for each template.
+		state := make(map[string]string)
+		funcMap := template.FuncMap{
+			"var": func(name string) string { return state[name] },
+			"set": func(name, value string) string {
+				state[name] = value
+				return ""
+			},
+		}
+
+		input := mapping[i].template
+		output := mapping[i].output
+		if !shouldRegenerate(input, output) {
+			log.Printf("%q is up to date", output)
+			continue
+		}
+
+		tmpl, err := template.New(path.Base(input)).Funcs(funcMap).ParseFiles(input)
+		die(err)
+
+		f, err := os.Create(output)
+		die(err)
+		log.Printf("Generating %q from %q", output, input)
+		die(tmpl.Execute(f, nil))
+		// Close here and check the error: a deferred Close is skipped by log.Fatal and hides write errors.
+		die(f.Close())
+	}
+}
+
+// die logs err and terminates the generator when err is non-nil; it is a no-op for nil.
+func die(err error) {
+	if err != nil {
+		log.Fatal(err) // prints err, then calls os.Exit(1) itself
+	}
+}
+
+// shouldRegenerate reports true if either path cannot be stat'ed or srcpath is newer than dstpath.
+func shouldRegenerate(srcpath, dstpath string) bool {
+	srcInfo, srcErr := os.Stat(srcpath)
+	if srcErr != nil {
+		// Regenerate anyway: the I/O error will be rediscovered later.
+		return true
+	}
+	dstInfo, dstErr := os.Stat(dstpath)
+	if dstErr != nil {
+		return true
+	}
+	return dstInfo.ModTime().Before(srcInfo.ModTime())
+}