diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go
new file mode 100644
index 0000000000..04870d91dc
--- /dev/null
+++ b/huff0/_generate/gen.go
@@ -0,0 +1,310 @@
+package main
+
+//go:generate go run gen.go -out ../decompress_amd64.s -pkg=huff0
+
+import (
+	"flag"
+	"fmt"
+	"strconv"
+
+	_ "github.com/klauspost/compress"
+
+	. "github.com/mmcloughlin/avo/build"
+	. "github.com/mmcloughlin/avo/operand"
+	"github.com/mmcloughlin/avo/reg"
+)
+
+func main() { // generator entry point: declares both Decompress4X main loops and writes the assembly
+	flag.Parse() // options such as -out and -pkg (see the go:generate line) arrive as command-line flags
+
+	ConstraintExpr("amd64,!appengine,!noasm,gc") // build constraint placed on the generated file
+
+	decompress := decompress4x{}
+	decompress.generateProcedure("decompress4x_main_loop_amd64")          // decodes 2 values per stream per iteration
+	decompress.generateProcedure4x8bit("decompress4x_8b_main_loop_amd64") // decodes 4 values per stream per iteration
+
+	Generate() // has to come last: emits everything declared above
+}
+
+type decompress4x struct { // field-less receiver that groups the generator methods for the 4-stream decoder loops
+}
+
+func (d decompress4x) generateProcedure(name string) { // emits assembly function `name`: the 4-stream loop decoding 2 values per stream per iteration
+	Package("github.com/klauspost/compress/huff0")
+	TEXT(name, 0, "func(ctx* decompress4xContext)")
+	Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.", "")
+	Pragma("noescape")
+
+	exhausted := GP64()
+	XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
+
+	limitPtr := AllocLocal(8) // local slot that keeps ctx.limit for the comparison in the loop
+
+	bufferOrigin := GP64() // stream 0's write position for the current iteration
+	peekBits := GP64()
+	buffer := GP64() // write position of the stream currently being decoded
+	dstEvery := GP64()
+	table := GP64()
+
+	br0 := GP64()
+	br1 := GP64()
+	br2 := GP64()
+	br3 := GP64()
+
+	Comment("Preload values")
+	{
+		ctx := Dereference(Param("ctx"))
+		Load(ctx.Field("peekBits"), peekBits)
+		Load(ctx.Field("out"), buffer)
+		MOVQ(buffer, bufferOrigin) // the first iteration starts writing at ctx.out
+		limit := Load(ctx.Field("limit"), GP64())
+		MOVQ(limit, limitPtr)
+		Load(ctx.Field("dstEvery"), dstEvery)
+		Load(ctx.Field("tbl"), table)
+		Load(ctx.Field("pbr0"), br0)
+		Load(ctx.Field("pbr1"), br1)
+		Load(ctx.Field("pbr2"), br2)
+		Load(ctx.Field("pbr3"), br3)
+	}
+
+	Comment("Main loop")
+	Label("main_loop")
+
+	MOVQ(bufferOrigin, buffer) // every iteration starts again at stream 0
+	// Flag this as the last iteration once the write position has reached ctx.limit.
+	CMPQ(buffer, limitPtr)
+	SETGE(exhausted.As8())
+	d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer) // the next stream writes dstEvery bytes further on
+	d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer)
+	d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer)
+	d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)
+
+	ADDQ(U8(2), bufferOrigin) // off += 2 (two bytes were written per stream)
+
+	TESTB(exhausted.As8(), exhausted.As8()) // set by the limit check above or by a refill that saw br[i].off < 4
+	JZ(LabelRef("main_loop"))
+
+	{
+		ctx := Dereference(Param("ctx"))
+		tmp := Load(ctx.Field("out"), GP64())
+		decoded := GP64()
+		MOVQ(bufferOrigin, decoded)
+		SUBQ(tmp, decoded)   // bytes written per stream
+		SHLQ(U8(2), decoded) // decoded *= 4 (four streams)
+
+		Store(decoded, ctx.Field("decoded"))
+	}
+
+	RET()
+}
+
+// Hand-maintained byte offsets of the bit reader fields that the generated code reaches through raw pointers.
+// TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal with arbitrary pointers to a given type
+const bitReader_in = 0
+const bitReader_off = bitReader_in + 3*8 // `in` takes three words: {ptr, len, cap}
+const bitReader_value = bitReader_off + 8
+const bitReader_bitsRead = bitReader_value + 8
+
+func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { // emits: decode 2 values from stream id and store them at buffer
+	brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
+
+	val := GP64()
+	Commentf("val0 := br%d.peekTopBits(peekBits)", id)
+	CX := reg.CL // physical register: variable shifts take their count in CL; wider views of it are used below
+	MOVQ(brValue, val.As64())
+	MOVQ(peekBits, CX.As64())
+	SHRQ(CX, val.As64()) // val = value >> peekBits, used as the table index
+
+	Comment("v0 := table[val0&mask]")
+	MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) // low byte (CL) = bit count n, high byte (CH) = output byte
+
+	Commentf("br%d.advance(uint8(v0.entry))", id)
+	out := reg.RAX             // Fixed since we need 8H
+	MOVB(CX.As8H(), out.As8()) // AL = uint8(v0.entry >> 8)
+
+	SHLQ(CX, brValue)                // value <<= n
+	ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n
+
+	Commentf("val1 := br%d.peekTopBits(peekBits)", id)
+	MOVQ(peekBits, CX.As64())
+	MOVQ(brValue, val.As64())
+	SHRQ(CX, val.As64()) // val = value >> peekBits, used as the table index
+
+	Comment("v1 := table[val1&mask]")
+	MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16()) // tmp - v1
+
+	Commentf("br%d.advance(uint8(v1.entry))", id)
+	MOVB(CX.As8H(), out.As8H())      // AH = uint8(v1.entry >> 8)
+	SHLQ(CX, brValue)                // value <<= n
+	ADDB(CX.As8(), brBitsRead.As8()) // bits_read += n
+
+	Comment("these two writes get coalesced")
+	Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)")
+	Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
+	MOVW(out.As16(), Mem{Base: buffer}) // AL and AH go out in a single 16-bit store
+
+	Comment("update the bitreader structure")
+	MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
+	MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
+}
+
+func (d decompress4x) generateProcedure4x8bit(name string) { // emits assembly function `name`: the 4-stream loop decoding 4 values per stream per iteration
+	Package("github.com/klauspost/compress/huff0")
+	TEXT(name, 0, "func(ctx* decompress4xContext)")
+	Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog <= 8 which decodes 4 entries per loop.", "")
+	Pragma("noescape")
+
+	exhausted := GP64()                      // only the low byte is used, as a boolean
+	XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
+
+	bufferOrigin := AllocLocal(8) // local slot: stream 0's write position for the current iteration
+	limitPtr := AllocLocal(8)     // local slot: copy of ctx.limit
+
+	peekBits := GP64()
+	buffer := GP64() // write position of the stream currently being decoded
+	dstEvery := GP64()
+	table := GP64()
+
+	br0 := GP64()
+	br1 := GP64()
+	br2 := GP64()
+	br3 := GP64()
+
+	Comment("Preload values")
+	{
+		ctx := Dereference(Param("ctx"))
+		Load(ctx.Field("peekBits"), peekBits)
+		Load(ctx.Field("out"), buffer)
+		MOVQ(buffer, bufferOrigin) // the first iteration starts writing at ctx.out
+		limit := Load(ctx.Field("limit"), GP64())
+		MOVQ(limit, limitPtr)
+		Load(ctx.Field("dstEvery"), dstEvery)
+		Load(ctx.Field("tbl"), table)
+		Load(ctx.Field("pbr0"), br0)
+		Load(ctx.Field("pbr1"), br1)
+		Load(ctx.Field("pbr2"), br2)
+		Load(ctx.Field("pbr3"), br3)
+	}
+
+	Comment("Main loop")
+	Label("main_loop")
+
+	MOVQ(bufferOrigin, buffer) // every iteration starts again at stream 0
+	// Flag this as the last iteration once the write position has reached ctx.limit.
+	CMPQ(buffer, limitPtr)
+	SETGE(exhausted.As8())
+	d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer) // the next stream writes dstEvery bytes further on
+	d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer)
+	d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
+	ADDQ(dstEvery, buffer)
+	d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)
+
+	ADDQ(U8(4), bufferOrigin) // off += 4 (four bytes were written per stream)
+
+	TESTB(exhausted.As8(), exhausted.As8()) // set by the limit check above or by a refill that saw br[i].off < 4
+	JZ(LabelRef("main_loop"))
+
+	{
+		ctx := Dereference(Param("ctx"))
+		tmp := Load(ctx.Field("out"), GP64())
+		decoded := GP64()
+		MOVQ(bufferOrigin, decoded)
+		SUBQ(tmp, decoded)   // bytes written per stream
+		SHLQ(U8(2), decoded) // decoded *= 4 (four streams)
+
+		Store(decoded, ctx.Field("decoded"))
+	}
+	RET()
+}
+
+func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { // emits: decode 4 values from stream id and store them at buffer
+	brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted) // id+1000 yields skip_fill label names that differ from those decodeTwoValues requests
+
+	decompress := func(valID int, outByte reg.Register) { // emits one table lookup; the decoded byte lands in outByte
+		CX := reg.CL
+		val := GP64()
+		Commentf("val%d := br%d.peekTopBits(peekBits)", valID, id)
+		MOVQ(brValue, val.As64())
+		MOVQ(peekBits, CX.As64())
+		SHRQ(CX, val.As64()) // val = value >> peekBits, used as the table index
+
+		Commentf("v%d := table[val%d&mask]", valID, valID)
+		MOVW(Mem{Base: table, Index: val.As64(), Scale: 2}, CX.As16())
+
+		Commentf("br%d.advance(uint8(v%d.entry))", id, valID)
+		MOVB(CX.As8H(), outByte) // outByte = high byte of the entry, i.e. uint8(entry >> 8)
+
+		SHLQ(CX, brValue)          // value <<= n
+		ADDB(CX, brBitsRead.As8()) // bits_read += n
+	}
+
+	out := reg.RAX // Fixed since we need 8H
+	decompress(0, out.As8L())
+	decompress(1, out.As8H())
+	BSWAPL(out.As32()) // v0 and v1 move to the two upper bytes of EAX, which frees AL/AH for the next pair
+	decompress(2, out.As8H())
+	decompress(3, out.As8L())
+	BSWAPL(out.As32()) // swap back: EAX bytes from low to high are now v0, v1, v2, v3
+
+	Comment("these four writes get coalesced")
+	Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)")
+	Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
+	Comment("out[id * dstEvery + 2] = uint8(v2.entry >> 8)")
+	Comment("out[id * dstEvery + 3] = uint8(v3.entry >> 8)")
+	MOVL(out.As32(), Mem{Base: buffer})
+
+	Comment("update the bitreader structure")
+	MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
+	MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
+}
+
+func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { // emits the conditional 32-bit refill of the bit reader at br; returns the registers holding its value and bitsRead
+	if atLeast > 32 { // generator-time check: a refill adds only 32 bits
+		panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast))
+	}
+	Commentf("br%d.fillFast32()", id)
+	brValue = GP64()
+	brBitsRead = GP64()
+	MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
+	MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)
+
+	// Skip the refill while bitsRead <= 64-atLeast, i.e. while the 64-bit value still has at least atLeast unread bits.
+	CMPQ(brBitsRead, U8(64-atLeast))
+	JBE(LabelRef("skip_fill" + strconv.Itoa(id)))
+	brOffset := GP64()
+	MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset)
+
+	SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
+	SUBQ(U8(4), brOffset)    // b.off -= 4
+
+	// Go equivalent of the load emitted below, written with b.off as it was before the decrement:
+	//   v := b.in[b.off-4 : b.off]
+	//   low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	tmp := GP64()
+	MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp) // first word of `in`, i.e. its data pointer
+
+	Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
+	addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}
+	CX := reg.CL
+	MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4])
+	MOVQ(brBitsRead, CX.As64())
+	SHLQ(CX, tmp.As64())
+
+	MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off}) // only off is stored here; value and bitsRead stay in the returned registers
+	ORQ(tmp.As64(), brValue)
+	{
+		Commentf("exhausted = exhausted || (br%d.off < 4)", id)
+		CMPQ(brOffset, U8(4))
+		tmp = GP64() // fresh virtual register for the comparison result
+		SETLT(tmp.As8())
+		ORB(tmp.As8(), exhausted.As8())
+	}
+
+	Label("skip_fill" + strconv.Itoa(id)) // label name is derived from id
+	return
+}
diff --git a/huff0/_generate/go.mod b/huff0/_generate/go.mod
new file mode 100644
index 0000000000..41c4458869
--- /dev/null
+++ b/huff0/_generate/go.mod
@@ -0,0 +1,10 @@
+module github.com/klauspost/compress/s2/_generate
+
+go 1.15
+
+require (
+	github.com/klauspost/compress v1.15.1
+	github.com/mmcloughlin/avo v0.4.0
+)
+
+replace github.com/klauspost/compress => ../..
diff --git a/huff0/_generate/go.sum b/huff0/_generate/go.sum
new file mode 100644
index 0000000000..b4b59140f0
--- /dev/null
+++ b/huff0/_generate/go.sum
@@ -0,0 +1,32 @@
+github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU=
+github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s=
+github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
+golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo=
+golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk=
+golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ=
+golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
diff --git a/huff0/autogen.go b/huff0/autogen.go
deleted file mode 100644
index ff2c69d60c..0000000000
--- a/huff0/autogen.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/huff0/decompress_8b_amd64.s b/huff0/decompress_8b_amd64.s
deleted file mode 100644
index 0d6cb1a962..0000000000
--- a/huff0/decompress_8b_amd64.s
+++ /dev/null
@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
-
-main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill0
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill0:
-
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 0+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill1:
-
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 256+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill2:
-
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 512+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
-
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill3
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill3:
-
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 768+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
-
-	ADDQ $4, off // off += 2
-
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
-
-	CMPQ off, $bufoff
-	JL   main_loop
-
-end:
-	MOVQ 0(SP), BP
-
-	MOVB off, ret+56(FP)
-	RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/huff0/decompress_8b_amd64.s.in b/huff0/decompress_8b_amd64.s.in
deleted file mode 100644
index 6d477a2c11..0000000000
--- a/huff0/decompress_8b_amd64.s.in
+++ /dev/null
@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-	// if b.bitsRead >= 32 {
-    CMPQ    br_bits_read, $32
-    JB      skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // SECOND PART:
-    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v2 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v3 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off+2] = uint8(v2.entry >> 8)
-    // buf[stream][off+3] = uint8(v3.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $4, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3
diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go
index ce8e93bcd0..3415e5da22 100644
--- a/huff0/decompress_amd64.go
+++ b/huff0/decompress_amd64.go
@@ -13,19 +13,30 @@ import (
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
 //go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
 //go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 
 // fallback8BitSize is the size where using Go version is faster.
 const fallback8BitSize = 800
 
+type decompress4xContext struct { // argument/result block shared with the amd64 main loops
+	pbr0     *bitReaderShifted // bit reader of stream 0
+	pbr1     *bitReaderShifted // bit reader of stream 1
+	pbr2     *bitReaderShifted // bit reader of stream 2
+	pbr3     *bitReaderShifted // bit reader of stream 3
+	peekBits uint8             // right-shift applied to the reader value to form the table index
+	out      *byte             // start of the output region of stream 0
+	dstEvery int               // byte distance between the output regions of consecutive streams
+	tbl      *dEntrySingle     // first entry of the decoding table
+	decoded  int               // written by the assembly: total bytes decoded across all four streams
+	limit    *byte             // the loop stops once the stream 0 write pointer reaches this address
+}
+
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -42,6 +53,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if cap(dst) < fallback8BitSize && use8BitTables {
 		return d.decompress4X8bit(dst, src)
 	}
+
 	var br [4]bitReaderShifted
 	// Decode "jump table"
 	start := 6
@@ -71,70 +83,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
 	var decoded int
 
-	const debug = false
-
-	// see: bitReaderShifted.peekBitsFast()
-	peekBits := uint8((64 - d.actualTableLog) & 63)
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr0:     &br[0],
+			pbr1:     &br[1],
+			pbr2:     &br[2],
+			pbr3:     &br[3],
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
 		}
-
 		if use8BitTables {
-			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+			decompress4x_8b_main_loop_amd64(&ctx)
 		} else {
-			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
-		}
-		if debug {
-			fmt.Print("DEBUG: ")
-			fmt.Printf("off=%d,", off)
-			for i := 0; i < 4; i++ {
-				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
-					i, br[i].bitsRead, br[i].value, br[i].off)
-			}
-			fmt.Println("")
-		}
-
-		if off != 0 {
-			break
+			decompress4x_main_loop_amd64(&ctx)
 		}
 
-		if bufoff > dstEvery {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 1")
-		}
-		copy(out, buf[0][:])
-		copy(out[dstEvery:], buf[1][:])
-		copy(out[dstEvery*2:], buf[2][:])
-		copy(out[dstEvery*3:], buf[3][:])
-		out = out[bufoff:]
-		decoded += bufoff * 4
-		// There must at least be 3 buffers left.
-		if len(out) < dstEvery*3 {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 2")
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
+		decoded = ctx.decoded
+		out = out[decoded/4:]
 	}
 
 	// Decode remaining.
@@ -150,7 +120,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 		for bitsLeft > 0 {
 			br.fill()
 			if offset >= endsAt {
-				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
@@ -164,7 +133,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			offset++
 		}
 		if offset != endsAt {
-			d.bufs.Put(buf)
 			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
 		}
 		decoded += offset - dstEvery*i
@@ -173,7 +141,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			return nil, err
 		}
 	}
-	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s
index 2edad3ea5a..06287f5685 100644
--- a/huff0/decompress_amd64.s
+++ b/huff0/decompress_amd64.s
@@ -1,506 +1,662 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
-
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 32(AX), SI
+	MOVQ    40(AX), DI
+	MOVQ    DI, BX
+	MOVQ    72(AX), CX
+	MOVQ    CX, (SP)
+	MOVQ    48(AX), R8
+	MOVQ    56(AX), R9
+	MOVQ    (AX), R10
+	MOVQ    8(AX), R11
+	MOVQ    16(AX), R12
+	MOVQ    24(AX), R13
+
+	// Main loop
 main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill0
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
+	MOVQ  BX, DI
+	CMPQ  DI, (SP)
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R10), R14
+	MOVBQZX 40(R10), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R10), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R10), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R10)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill0:
-
 	// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	MOVW (R9)(BP*2), CX
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// br0.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br0.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
+	MOVQ R14, 32(R10)
+	MOVB R15, 40(R10)
+	ADDQ R8, DI
+
+	// br1.fillFast32()
+	MOVQ    32(R11), R14
+	MOVBQZX 40(R11), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill1
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R11), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R11)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill1:
-
 	// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	MOVW (R9)(BP*2), CX
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// br1.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br1.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
+	MOVQ R14, 32(R11)
+	MOVB R15, 40(R11)
+	ADDQ R8, DI
+
+	// br2.fillFast32()
+	MOVQ    32(R12), R14
+	MOVBQZX 40(R12), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill2
+	MOVQ    24(R12), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R12), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R12)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill2:
-
 	// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
+	MOVW (R9)(BP*2), CX
 
-	// br2.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+	// br2.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R9)(BP*2), CX
 
 	// br2.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
 	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
-
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill3
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
+	MOVQ R14, 32(R12)
+	MOVB R15, 40(R12)
+	ADDQ R8, DI
+
+	// br3.fillFast32()
+	MOVQ    32(R13), R14
+	MOVBQZX 40(R13), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill3
+	MOVQ    24(R13), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R13), BP
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R13)
+	ORQ  BP, R14
 
 	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill3:
-
 	// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
+	MOVW (R9)(BP*2), CX
 
-	// br3.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+	// br3.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
 
-#endif
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
 
-#else
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+	// update the bit reader structure
+	MOVQ  R14, 32(R13)
+	MOVB  R15, 40(R13)
+	ADDQ  $0x02, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  BX, DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
 
-#endif
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 32(CX), BX
+	MOVQ    40(CX), SI
+	MOVQ    SI, (SP)
+	MOVQ    72(CX), DX
+	MOVQ    DX, 8(SP)
+	MOVQ    48(CX), DI
+	MOVQ    56(CX), R8
+	MOVQ    (CX), R9
+	MOVQ    8(CX), R10
+	MOVQ    16(CX), R11
+	MOVQ    24(CX), R12
+
+	// Main loop
+main_loop:
+	MOVQ  (SP), SI
+	CMPQ  SI, 8(SP)
+	SETGE DL
+
+	// br1000.fillFast32()
+	MOVQ    32(R9), R13
+	MOVBQZX 40(R9), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1000
+	MOVQ    24(R9), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R9), BP
 
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R9)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1000.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1000:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	// br3.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R9)
+	MOVB R14, 40(R9)
+	ADDQ DI, SI
+
+	// br1001.fillFast32()
+	MOVQ    32(R10), R13
+	MOVBQZX 40(R10), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1001
+	MOVQ    24(R10), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R10), BP
 
-#endif
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R10)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1001.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1001:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R10)
+	MOVB R14, 40(R10)
+	ADDQ DI, SI
+
+	// br1002.fillFast32()
+	MOVQ    32(R11), R13
+	MOVBQZX 40(R11), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1002
+	MOVQ    24(R11), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R11), BP
 
-	ADDQ $2, off // off += 2
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R11)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1002.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1002:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-	CMPQ off, $bufoff
-	JL   main_loop
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-end:
-	MOVQ 0(SP), BP
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R11)
+	MOVB R14, 40(R11)
+	ADDQ DI, SI
+
+	// br1003.fillFast32()
+	MOVQ    32(R12), R13
+	MOVBQZX 40(R12), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1003
+	MOVQ    24(R12), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R12), BP
 
-	MOVB off, ret+56(FP)
-	RET
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R12)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1003.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1003:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
 
-#undef off
-#undef buffer
-#undef table
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
 
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
 
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val1&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val2&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val3&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ  R13, 32(R12)
+	MOVB  R14, 40(R12)
+	ADDQ  $0x04, (SP)
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  (SP), DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
diff --git a/huff0/decompress_amd64.s.in b/huff0/decompress_amd64.s.in
deleted file mode 100644
index 330d86ae15..0000000000
--- a/huff0/decompress_amd64.s.in
+++ /dev/null
@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-    // We must have at least 2 * max tablelog left
-    CMPQ    br_bits_read, $64-22
-    JBE     skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-#endif
-
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-#ifdef GOAMD64_v3
-    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
-#else
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $2, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3
diff --git a/huff0/generate.go b/huff0/generate.go
deleted file mode 100644
index 95b082fd34..0000000000
--- a/huff0/generate.go
+++ /dev/null
@@ -1,78 +0,0 @@
-//go:build ignore
-// +build ignore
-
-package main
-
-import (
-	"log"
-	"os"
-	"path"
-	"text/template"
-)
-
-func main() {
-	mapping := []struct {
-		template string
-		output   string
-	}{{
-		template: "decompress_amd64.s.in",
-		output:   "decompress_amd64.s",
-	},
-		{
-			template: "decompress_8b_amd64.s.in",
-			output:   "decompress_8b_amd64.s",
-		},
-	}
-
-	for i := range mapping {
-
-		state := make(map[string]string)
-
-		funcMap := template.FuncMap{
-			"var": func(name string) string { return state[name] },
-			"set": func(name, value string) string {
-				state[name] = value
-				return ""
-			},
-		}
-
-		input := mapping[i].template
-		output := mapping[i].output
-		if !shouldRegenerate(input, output) {
-			log.Printf("%q is up to date", output)
-			continue
-		}
-
-		tmpl, err := template.New(path.Base(input)).Funcs(funcMap).ParseFiles(input)
-		die(err)
-
-		f, err := os.Create(output)
-		die(err)
-		defer f.Close()
-
-		log.Printf("Generating %q from %q", output, input)
-		err = tmpl.Execute(f, nil)
-		die(err)
-	}
-}
-
-func die(err error) {
-	if err != nil {
-		log.Fatal(err)
-		os.Exit(1)
-	}
-}
-
-func shouldRegenerate(srcpath, dstpath string) bool {
-	src, err1 := os.Stat(srcpath)
-	if err1 != nil {
-		return true // I/O errors will be rediscovered later
-	}
-
-	dst, err2 := os.Stat(dstpath)
-	if err2 != nil {
-		return true
-	}
-
-	return src.ModTime().After(dst.ModTime())
-}