diff --git a/btf/core.go b/btf/core.go
index 223de819c..f952b654e 100644
--- a/btf/core.go
+++ b/btf/core.go
@@ -156,16 +156,17 @@ func (k coreKind) String() string {
 	}
 }
 
-// CORERelocate calculates the difference in types between local and target.
+// CORERelocate calculates changes needed to adjust eBPF instructions for differences
+// in types.
 //
 // Returns a list of fixups which can be applied to instructions to make them
 // match the target type(s).
 //
 // Fixups are returned in the order of relos, e.g. fixup[i] is the solution
 // for relos[i].
-func CORERelocate(local, target *Spec, relos []*CORERelocation) ([]COREFixup, error) {
-	if local.byteOrder != target.byteOrder {
-		return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder)
+func CORERelocate(relos []*CORERelocation, target *Spec, bo binary.ByteOrder) ([]COREFixup, error) {
+	if bo != target.byteOrder {
+		return nil, fmt.Errorf("can't relocate %s against %s", bo, target.byteOrder)
 	}
 
 	type reloGroup struct {
@@ -185,15 +186,14 @@ func CORERelocate(local, target *Spec, relos []*CORERelocation) ([]COREFixup, er
 			return nil, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor)
 		}
 
-		id, err := local.TypeID(relo.typ)
-		if err != nil {
-			return nil, fmt.Errorf("%s: %w", relo.kind, err)
-		}
-
 		result[i] = COREFixup{
-			kind:   relo.kind,
-			local:  uint32(id),
-			target: uint32(id),
+			kind:  relo.kind,
+			local: uint32(relo.id),
+			// NB: Using relo.id as the target here is incorrect, since
+			// it doesn't match the BTF we generate on the fly. This isn't
+			// too bad for now since there are no uses of the local type ID
+			// in the kernel, yet.
+			target: uint32(relo.id),
 		}
 		continue
 	}
@@ -214,7 +214,7 @@ func CORERelocate(local, target *Spec, relos []*CORERelocation) ([]COREFixup, er
 		}
 
 		targets := target.namedTypes[newEssentialName(localTypeName)]
-		fixups, err := coreCalculateFixups(local, target, localType, targets, group.relos)
+		fixups, err := coreCalculateFixups(group.relos, target, targets, bo)
 		if err != nil {
 			return nil, fmt.Errorf("relocate %s: %w", localType, err)
 		}
@@ -230,18 +230,13 @@ func CORERelocate(local, target *Spec, relos []*CORERelocation) ([]COREFixup, er
 var errAmbiguousRelocation = errors.New("ambiguous relocation")
 var errImpossibleRelocation = errors.New("impossible relocation")
 
-// coreCalculateFixups calculates the fixups for the given relocations using
-// the "best" target.
+// coreCalculateFixups finds the target type that best matches all relocations.
+//
+// All relos must target the same type.
 //
 // The best target is determined by scoring: the less poisoning we have to do
 // the better the target is.
-func coreCalculateFixups(localSpec, targetSpec *Spec, local Type, targets []Type, relos []*CORERelocation) ([]COREFixup, error) {
-	localID, err := localSpec.TypeID(local)
-	if err != nil {
-		return nil, fmt.Errorf("local type ID: %w", err)
-	}
-	local = Copy(local, UnderlyingType)
-
+func coreCalculateFixups(relos []*CORERelocation, targetSpec *Spec, targets []Type, bo binary.ByteOrder) ([]COREFixup, error) {
 	bestScore := len(relos)
 	var bestFixups []COREFixup
 	for i := range targets {
@@ -254,7 +249,7 @@ func coreCalculateFixups(localSpec, targetSpec *Spec, local Type, targets []Type
 		score := 0 // lower is better
 		fixups := make([]COREFixup, 0, len(relos))
 		for _, relo := range relos {
-			fixup, err := coreCalculateFixup(localSpec.byteOrder, local, localID, target, targetID, relo)
+			fixup, err := coreCalculateFixup(relo, target, targetID, bo)
 			if err != nil {
 				return nil, fmt.Errorf("target %s: %w", target, err)
 			}
@@ -305,7 +300,7 @@ func coreCalculateFixups(localSpec, targetSpec *Spec, local Type, targets []Type
 
 // coreCalculateFixup calculates the fixup for a single local type, target type
 // and relocation.
-func coreCalculateFixup(byteOrder binary.ByteOrder, local Type, localID TypeID, target Type, targetID TypeID, relo *CORERelocation) (COREFixup, error) {
+func coreCalculateFixup(relo *CORERelocation, target Type, targetID TypeID, bo binary.ByteOrder) (COREFixup, error) {
 	fixup := func(local, target uint32) (COREFixup, error) {
 		return COREFixup{kind: relo.kind, local: local, target: target}, nil
 	}
@@ -320,6 +315,8 @@ func coreCalculateFixup(byteOrder binary.ByteOrder, local Type, localID TypeID,
 	}
 	zero := COREFixup{}
 
+	local := Copy(relo.typ, UnderlyingType)
+
 	switch relo.kind {
 	case reloTypeIDTarget, reloTypeSize, reloTypeExists:
 		if len(relo.accessor) > 1 || relo.accessor[0] != 0 {
@@ -339,7 +336,7 @@ func coreCalculateFixup(byteOrder binary.ByteOrder, local Type, localID TypeID,
 		return fixup(1, 1)
 
 	case reloTypeIDTarget:
-		return fixup(uint32(localID), uint32(targetID))
+		return fixup(uint32(relo.id), uint32(targetID))
 
 	case reloTypeSize:
 		localSize, err := Sizeof(local)
@@ -427,7 +424,7 @@ func coreCalculateFixup(byteOrder binary.ByteOrder, local Type, localID TypeID,
 	case reloFieldLShiftU64:
 		var target uint32
-		if byteOrder == binary.LittleEndian {
+		if bo == binary.LittleEndian {
 			targetSize, err := targetField.sizeBits()
 			if err != nil {
 				return zero, err
 			}
@@ -858,7 +855,7 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
 		depth = 0
 	)
 
-	for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() {
+	for ; l != nil && t != nil; l, t = localTs.Shift(), targetTs.Shift() {
 		if depth >= maxTypeDepth {
 			return errors.New("types are nested too deep")
 		}
@@ -876,8 +873,8 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
 
 		case *Pointer, *Array:
 			depth++
-			localType.walk(&localTs)
-			targetType.walk(&targetTs)
+			walkType(localType, localTs.Push)
+			walkType(targetType, targetTs.Push)
 
 		case *FuncProto:
 			tv := targetType.(*FuncProto)
@@ -886,8 +883,8 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
 			}
 
 			depth++
-			localType.walk(&localTs)
-			targetType.walk(&targetTs)
+			walkType(localType, localTs.Push)
+			walkType(targetType, targetTs.Push)
 
 		default:
 			return fmt.Errorf("unsupported type %T", localType)
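CORERelocate no longer receives the local Spec: each CORERelocation now carries its local type and type ID itself, and callers pass the byte order of the program being relocated. A minimal sketch of the new calling convention, modelled on applyRelocations further down; the helper name and variables are illustrative, not part of this change:

	// Gather CO-RE relocations from instruction metadata, then resolve them
	// against the target BTF in the program's byte order.
	func relocateAgainst(insns asm.Instructions, target *btf.Spec, bo binary.ByteOrder) ([]btf.COREFixup, error) {
		var relos []*btf.CORERelocation
		iter := insns.Iterate()
		for iter.Next() {
			if relo := btf.CORERelocationMetadata(iter.Ins); relo != nil {
				relos = append(relos, relo)
			}
		}
		// fixups[i] is the solution for relos[i].
		return btf.CORERelocate(relos, target, bo)
	}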
diff --git a/btf/core_test.go b/btf/core_test.go
index 97675787b..f98eb4fa3 100644
--- a/btf/core_test.go
+++ b/btf/core_test.go
@@ -546,7 +546,7 @@ func TestCORERelocation(t *testing.T) {
 				relos = append(relos, reloInfo.relo)
 			}
 
-			fixups, err := CORERelocate(spec, spec, relos)
+			fixups, err := CORERelocate(relos, spec, spec.byteOrder)
 			if want := errs[name]; want != nil {
 				if !errors.Is(err, want) {
 					t.Fatal("Expected", want, "got", err)
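Passing spec.byteOrder keeps the test on the happy path of the new byte-order guard in CORERelocate. A hypothetical extra case exercising the guard directly could look like this; oppositeByteOrder is an assumed helper, not part of this change:

	// Relocating against BTF of a different endianness must fail outright.
	_, err := CORERelocate(relos, spec, oppositeByteOrder(spec.byteOrder))
	if err == nil {
		t.Fatal("Expected relocation against mismatched byte order to fail")
	}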
diff --git a/btf/ext_info.go b/btf/ext_info.go
index 2c0e1afe2..36e38abae 100644
--- a/btf/ext_info.go
+++ b/btf/ext_info.go
@@ -605,9 +605,12 @@ type bpfCORERelo struct {
 }
 
 type CORERelocation struct {
+	// The local type of the relocation, stripped of typedefs and qualifiers.
 	typ      Type
 	accessor coreAccessor
 	kind     coreKind
+	// The ID of the local type in the source BTF.
+	id TypeID
 }
 
 func CORERelocationMetadata(ins *asm.Instruction) *CORERelocation {
@@ -641,6 +644,7 @@ func newRelocationInfo(relo bpfCORERelo, ts types, strings *stringTable) (*coreR
 			typ,
 			accessor,
 			relo.Kind,
+			relo.TypeID,
 		},
 		asm.RawInstructionOffset(relo.InsnOff),
 	}, nil
diff --git a/btf/traversal.go b/btf/traversal.go
new file mode 100644
index 000000000..a9ff1f703
--- /dev/null
+++ b/btf/traversal.go
@@ -0,0 +1,56 @@
+package btf
+
+import "fmt"
+
+// walkType calls fn on each child of typ.
+func walkType(typ Type, fn func(*Type)) {
+	// Explicitly type switch on the most common types to allow the inliner to
+	// do its work. This avoids allocating intermediate slices from walk() on
+	// the heap.
+	switch v := typ.(type) {
+	case *Void, *Int, *Enum, *Fwd, *Float:
+		// No children to traverse.
+	case *Pointer:
+		fn(&v.Target)
+	case *Array:
+		fn(&v.Index)
+		fn(&v.Type)
+	case *Struct:
+		for i := range v.Members {
+			fn(&v.Members[i].Type)
+		}
+	case *Union:
+		for i := range v.Members {
+			fn(&v.Members[i].Type)
+		}
+	case *Typedef:
+		fn(&v.Type)
+	case *Volatile:
+		fn(&v.Type)
+	case *Const:
+		fn(&v.Type)
+	case *Restrict:
+		fn(&v.Type)
+	case *Func:
+		fn(&v.Type)
+	case *FuncProto:
+		fn(&v.Return)
+		for i := range v.Params {
+			fn(&v.Params[i].Type)
+		}
+	case *Var:
+		fn(&v.Type)
+	case *Datasec:
+		for i := range v.Vars {
+			fn(&v.Vars[i].Type)
+		}
+	case *declTag:
+		fn(&v.Type)
+	case *typeTag:
+		fn(&v.Type)
+	case *cycle:
+		// cycle has children, but we ignore them deliberately.
+	default:
+		panic(fmt.Sprintf("don't know how to walk Type %T", v))
+	}
+}
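walkType replaces the per-type walk methods: instead of pushing its children onto a typeDeque itself, each type hands child pointers to fn, and the caller decides how to accumulate them. A sketch of a breadth-first traversal built on top of it, using the typeDeque alias declared in btf/types.go below; the helper is hypothetical:

	// countReachable returns the number of distinct types reachable from root,
	// including root itself. The seen map guards against cycles in the graph.
	func countReachable(root Type) int {
		seen := make(map[Type]bool)
		var work typeDeque // alias for internal.Deque[*Type]
		work.Push(&root)
		for t := work.Shift(); t != nil; t = work.Shift() {
			if seen[*t] {
				continue
			}
			seen[*t] = true
			walkType(*t, work.Push)
		}
		return len(seen)
	}

copier.copy below drives the same machinery with Pop instead of Shift, i.e. depth-first.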
diff --git a/btf/types.go b/btf/types.go
index b34aaeefc..81980e8d8 100644
--- a/btf/types.go
+++ b/btf/types.go
@@ -8,6 +8,7 @@ import (
 	"strings"
 
 	"github.com/cilium/ebpf/asm"
+	"github.com/cilium/ebpf/internal"
 )
 
 const maxTypeDepth = 32
@@ -35,9 +36,7 @@ type Type interface {
 	// Make a copy of the type, without copying Type members.
 	copy() Type
 
-	// Enumerate all nested Types. Repeated calls must visit nested
-	// types in the same order.
-	walk(*typeDeque)
+	// New implementations must update walkType.
 }
 
 var (
@@ -51,6 +50,9 @@ var (
 	_ Type = (*Var)(nil)
 	_ Type = (*Datasec)(nil)
 	_ Type = (*Float)(nil)
+	_ Type = (*declTag)(nil)
+	_ Type = (*typeTag)(nil)
+	_ Type = (*cycle)(nil)
 )
 
 // types is a list of Type.
@@ -72,7 +74,6 @@ func (v *Void) Format(fs fmt.State, verb rune) { formatType(fs, verb, v) }
 
 func (v *Void) TypeName() string { return "" }
 func (v *Void) size() uint32     { return 0 }
 func (v *Void) copy() Type       { return (*Void)(nil) }
-func (v *Void) walk(*typeDeque)  {}
 
 type IntEncoding byte
@@ -126,7 +127,6 @@ func (i *Int) Format(fs fmt.State, verb rune) {
 func (i *Int) TypeName() string { return i.Name }
 
 func (i *Int) size() uint32    { return i.Size }
-func (i *Int) walk(*typeDeque) {}
 func (i *Int) copy() Type {
 	cpy := *i
 	return &cpy
@@ -141,9 +141,8 @@ func (p *Pointer) Format(fs fmt.State, verb rune) {
 	formatType(fs, verb, p, "target=", p.Target)
 }
 
-func (p *Pointer) TypeName() string    { return "" }
-func (p *Pointer) size() uint32        { return 8 }
-func (p *Pointer) walk(tdq *typeDeque) { tdq.push(&p.Target) }
+func (p *Pointer) TypeName() string { return "" }
+func (p *Pointer) size() uint32     { return 8 }
 func (p *Pointer) copy() Type {
 	cpy := *p
 	return &cpy
@@ -162,11 +161,6 @@ func (arr *Array) Format(fs fmt.State, verb rune) {
 
 func (arr *Array) TypeName() string { return "" }
 
-func (arr *Array) walk(tdq *typeDeque) {
-	tdq.push(&arr.Index)
-	tdq.push(&arr.Type)
-}
-
 func (arr *Array) copy() Type {
 	cpy := *arr
 	return &cpy
@@ -188,12 +182,6 @@ func (s *Struct) TypeName() string { return s.Name }
 
 func (s *Struct) size() uint32 { return s.Size }
 
-func (s *Struct) walk(tdq *typeDeque) {
-	for i := range s.Members {
-		tdq.push(&s.Members[i].Type)
-	}
-}
-
 func (s *Struct) copy() Type {
 	cpy := *s
 	cpy.Members = copyMembers(s.Members)
@@ -220,12 +208,6 @@ func (u *Union) TypeName() string { return u.Name }
 
 func (u *Union) size() uint32 { return u.Size }
 
-func (u *Union) walk(tdq *typeDeque) {
-	for i := range u.Members {
-		tdq.push(&u.Members[i].Type)
-	}
-}
-
 func (u *Union) copy() Type {
 	cpy := *u
 	cpy.Members = copyMembers(u.Members)
@@ -293,8 +275,7 @@ type EnumValue struct {
 	Value uint64
 }
 
-func (e *Enum) size() uint32    { return e.Size }
-func (e *Enum) walk(*typeDeque) {}
+func (e *Enum) size() uint32 { return e.Size }
 func (e *Enum) copy() Type {
 	cpy := *e
 	cpy.Values = make([]EnumValue, len(e.Values))
@@ -334,7 +315,6 @@ func (f *Fwd) Format(fs fmt.State, verb rune) {
 
 func (f *Fwd) TypeName() string { return f.Name }
 
-func (f *Fwd) walk(*typeDeque) {}
 func (f *Fwd) copy() Type {
 	cpy := *f
 	return &cpy
@@ -352,7 +332,6 @@ func (td *Typedef) Format(fs fmt.State, verb rune) {
 
 func (td *Typedef) TypeName() string { return td.Name }
 
-func (td *Typedef) walk(tdq *typeDeque) { tdq.push(&td.Type) }
 func (td *Typedef) copy() Type {
 	cpy := *td
 	return &cpy
@@ -369,8 +348,7 @@ func (v *Volatile) Format(fs fmt.State, verb rune) {
 
 func (v *Volatile) TypeName() string { return "" }
 
-func (v *Volatile) qualify() Type       { return v.Type }
-func (v *Volatile) walk(tdq *typeDeque) { tdq.push(&v.Type) }
+func (v *Volatile) qualify() Type { return v.Type }
 func (v *Volatile) copy() Type {
 	cpy := *v
 	return &cpy
@@ -387,8 +365,7 @@ func (c *Const) Format(fs fmt.State, verb rune) {
 
 func (c *Const) TypeName() string { return "" }
 
-func (c *Const) qualify() Type       { return c.Type }
-func (c *Const) walk(tdq *typeDeque) { tdq.push(&c.Type) }
+func (c *Const) qualify() Type { return c.Type }
 func (c *Const) copy() Type {
 	cpy := *c
 	return &cpy
@@ -405,8 +382,7 @@ func (r *Restrict) Format(fs fmt.State, verb rune) {
 
 func (r *Restrict) TypeName() string { return "" }
 
-func (r *Restrict) qualify() Type       { return r.Type }
-func (r *Restrict) walk(tdq *typeDeque) { tdq.push(&r.Type) }
+func (r *Restrict) qualify() Type { return r.Type }
 func (r *Restrict) copy() Type {
 	cpy := *r
 	return &cpy
@@ -430,7 +406,6 @@ func (f *Func) Format(fs fmt.State, verb rune) {
 
 func (f *Func) TypeName() string { return f.Name }
 
-func (f *Func) walk(tdq *typeDeque) { tdq.push(&f.Type) }
 func (f *Func) copy() Type {
 	cpy := *f
 	return &cpy
@@ -448,13 +423,6 @@ func (fp *FuncProto) Format(fs fmt.State, verb rune) {
 
 func (fp *FuncProto) TypeName() string { return "" }
 
-func (fp *FuncProto) walk(tdq *typeDeque) {
-	tdq.push(&fp.Return)
-	for i := range fp.Params {
-		tdq.push(&fp.Params[i].Type)
-	}
-}
-
 func (fp *FuncProto) copy() Type {
 	cpy := *fp
 	cpy.Params = make([]FuncParam, len(fp.Params))
@@ -480,7 +448,6 @@ func (v *Var) Format(fs fmt.State, verb rune) {
 
 func (v *Var) TypeName() string { return v.Name }
 
-func (v *Var) walk(tdq *typeDeque) { tdq.push(&v.Type) }
 func (v *Var) copy() Type {
 	cpy := *v
 	return &cpy
@@ -501,12 +468,6 @@ func (ds *Datasec) TypeName() string { return ds.Name }
 
 func (ds *Datasec) size() uint32 { return ds.Size }
 
-func (ds *Datasec) walk(tdq *typeDeque) {
-	for i := range ds.Vars {
-		tdq.push(&ds.Vars[i].Type)
-	}
-}
-
 func (ds *Datasec) copy() Type {
 	cpy := *ds
 	cpy.Vars = make([]VarSecinfo, len(ds.Vars))
@@ -537,7 +498,6 @@ func (f *Float) Format(fs fmt.State, verb rune) {
 
 func (f *Float) TypeName() string { return f.Name }
 func (f *Float) size() uint32     { return f.Size }
-func (f *Float) walk(*typeDeque)  {}
 func (f *Float) copy() Type {
 	cpy := *f
 	return &cpy
@@ -557,8 +517,7 @@ func (dt *declTag) Format(fs fmt.State, verb rune) {
 	formatType(fs, verb, dt, "type=", dt.Type, "value=", dt.Value, "index=", dt.Index)
 }
 
-func (dt *declTag) TypeName() string   { return "" }
-func (dt *declTag) walk(td *typeDeque) { td.push(&dt.Type) }
+func (dt *declTag) TypeName() string { return "" }
 func (dt *declTag) copy() Type {
 	cpy := *dt
 	return &cpy
@@ -574,9 +533,8 @@ func (tt *typeTag) Format(fs fmt.State, verb rune) {
 	formatType(fs, verb, tt, "type=", tt.Type, "value=", tt.Value)
 }
 
-func (tt *typeTag) TypeName() string   { return "" }
-func (tt *typeTag) qualify() Type      { return tt.Type }
-func (tt *typeTag) walk(td *typeDeque) { td.push(&tt.Type) }
+func (tt *typeTag) TypeName() string { return "" }
+func (tt *typeTag) qualify() Type    { return tt.Type }
 func (tt *typeTag) copy() Type {
 	cpy := *tt
 	return &cpy
@@ -590,7 +548,6 @@ type cycle struct {
 func (c *cycle) ID() TypeID                     { return math.MaxUint32 }
 func (c *cycle) Format(fs fmt.State, verb rune) { formatType(fs, verb, c, "root=", c.root) }
 func (c *cycle) TypeName() string               { return "" }
-func (c *cycle) walk(*typeDeque)                {}
 func (c *cycle) copy() Type {
 	cpy := *c
 	return &cpy
@@ -721,7 +678,7 @@ type copier map[Type]Type
 
 func (c copier) copy(typ *Type, transform Transformer) {
 	var work typeDeque
-	for t := typ; t != nil; t = work.pop() {
+	for t := typ; t != nil; t = work.Pop() {
 		// *t is the identity of the type.
 		if cpy := c[*t]; cpy != nil {
 			*t = cpy
@@ -739,83 +696,11 @@ func (c copier) copy(typ *Type, transform Transformer) {
 		*t = cpy
 
 		// Mark any nested types for copying.
-		cpy.walk(&work)
+		walkType(cpy, work.Push)
 	}
 }
 
-// typeDeque keeps track of pointers to types which still
-// need to be visited.
-type typeDeque struct {
-	types       []*Type
-	read, write uint64
-	mask        uint64
-}
-
-func (dq *typeDeque) empty() bool {
-	return dq.read == dq.write
-}
-
-// push adds a type to the stack.
-func (dq *typeDeque) push(t *Type) {
-	if dq.write-dq.read < uint64(len(dq.types)) {
-		dq.types[dq.write&dq.mask] = t
-		dq.write++
-		return
-	}
-
-	new := len(dq.types) * 2
-	if new == 0 {
-		new = 8
-	}
-
-	types := make([]*Type, new)
-	pivot := dq.read & dq.mask
-	n := copy(types, dq.types[pivot:])
-	n += copy(types[n:], dq.types[:pivot])
-	types[n] = t
-
-	dq.types = types
-	dq.mask = uint64(new) - 1
-	dq.read, dq.write = 0, uint64(n+1)
-}
-
-// shift returns the first element or null.
-func (dq *typeDeque) shift() *Type {
-	if dq.empty() {
-		return nil
-	}
-
-	index := dq.read & dq.mask
-	t := dq.types[index]
-	dq.types[index] = nil
-	dq.read++
-	return t
-}
-
-// pop returns the last element or null.
-func (dq *typeDeque) pop() *Type {
-	if dq.empty() {
-		return nil
-	}
-
-	dq.write--
-	index := dq.write & dq.mask
-	t := dq.types[index]
-	dq.types[index] = nil
-	return t
-}
-
-// all returns all elements.
-//
-// The deque is empty after calling this method.
-func (dq *typeDeque) all() []*Type {
-	length := dq.write - dq.read
-	types := make([]*Type, 0, length)
-	for t := dq.shift(); t != nil; t = dq.shift() {
-		types = append(types, t)
-	}
-	return types
-}
+type typeDeque = internal.Deque[*Type]
 
 // inflateRawTypes takes a list of raw btf types linked via type IDs, and turns
 // it into a graph of Types connected via pointers.
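With walk gone from the Type interface, both traversal and copying go through walkType, and Copy with the UnderlyingType transformer is what coreCalculateFixup uses to strip typedefs and qualifiers from the local type. A small in-package sketch; the concrete types are made up for illustration:

	td := &Typedef{Name: "u32", Type: &Int{Name: "unsigned int", Size: 4}}
	local := Copy(td, UnderlyingType)
	// local is a copy of the underlying Int: the Typedef wrapper is peeled
	// off during the copy, and the original type graph is left untouched.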
diff --git a/btf/types_test.go b/btf/types_test.go
index c1437533a..569d32bd5 100644
--- a/btf/types_test.go
+++ b/btf/types_test.go
@@ -97,6 +97,7 @@ func ExampleType_validTypes() {
 	var _ Type = &FuncProto{}
 	var _ Type = &Var{}
 	var _ Type = &Datasec{}
+	var _ Type = &Float{}
 }
 
 func TestType(t *testing.T) {
@@ -153,91 +154,43 @@ func TestType(t *testing.T) {
 				t.Error("Copy doesn't copy")
 			}
 
-			var first, second typeDeque
-			typ.walk(&first)
-			typ.walk(&second)
+			var a []*Type
+			walkType(typ, func(t *Type) { a = append(a, t) })
 
-			if diff := cmp.Diff(first.all(), second.all(), compareTypes); diff != "" {
+			if _, ok := typ.(*cycle); !ok {
+				if n := countChildren(t, reflect.TypeOf(typ)); len(a) < n {
+					t.Errorf("walkType visited %d children, expected at least %d", len(a), n)
+				}
+			}
+
+			var b []*Type
+			walkType(typ, func(t *Type) { b = append(b, t) })
+
+			if diff := cmp.Diff(a, b, compareTypes); diff != "" {
 				t.Errorf("Walk mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
 }
 
-func TestTypeDeque(t *testing.T) {
-	a, b := new(Type), new(Type)
-
-	t.Run("pop", func(t *testing.T) {
-		var td typeDeque
-		td.push(a)
-		td.push(b)
-
-		if td.pop() != b {
-			t.Error("Didn't pop b first")
-		}
-
-		if td.pop() != a {
-			t.Error("Didn't pop a second")
-		}
-
-		if td.pop() != nil {
-			t.Error("Didn't pop nil")
-		}
-	})
-
-	t.Run("shift", func(t *testing.T) {
-		var td typeDeque
-		td.push(a)
-		td.push(b)
-
-		if td.shift() != a {
-			t.Error("Didn't shift a second")
-		}
-
-		if td.shift() != b {
-			t.Error("Didn't shift b first")
-		}
-
-		if td.shift() != nil {
-			t.Error("Didn't shift nil")
-		}
-	})
-
-	t.Run("push", func(t *testing.T) {
-		var td typeDeque
-		td.push(a)
-		td.push(b)
-		td.shift()
-
-		ts := make([]Type, 12)
-		for i := range ts {
-			td.push(&ts[i])
-		}
-
-		if td.shift() != b {
-			t.Error("Didn't shift b first")
-		}
-		for i := range ts {
-			if td.shift() != &ts[i] {
-				t.Fatal("Shifted wrong Type at pos", i)
-			}
-		}
-	})
+func countChildren(t *testing.T, typ reflect.Type) int {
+	if typ.Kind() != reflect.Pointer {
+		t.Fatal("Expected pointer, got", typ.Kind())
+	}
 
-	t.Run("all", func(t *testing.T) {
-		var td typeDeque
-		td.push(a)
-		td.push(b)
+	typ = typ.Elem()
+	if typ.Kind() != reflect.Struct {
+		t.Fatal("Expected struct, got", typ.Kind())
+	}
 
-		all := td.all()
-		if len(all) != 2 {
-			t.Fatal("Expected 2 elements, got", len(all))
+	var n int
+	for i := 0; i < typ.NumField(); i++ {
+		if typ.Field(i).Type == reflect.TypeOf((*Type)(nil)).Elem() {
+			n++
 		}
+	}
 
-		if all[0] != a || all[1] != b {
-			t.Fatal("Elements don't match")
-		}
-	})
+	return n
 }
 
 type testFormattableType struct {
@@ -415,6 +368,38 @@ func TestInflateLegacyBitfield(t *testing.T) {
 	}
 }
 
+func BenchmarkWalk(b *testing.B) {
+	types := []Type{
+		&Void{},
+		&Int{},
+		&Pointer{},
+		&Array{},
+		&Struct{Members: make([]Member, 2)},
+		&Union{Members: make([]Member, 2)},
+		&Enum{},
+		&Fwd{},
+		&Typedef{},
+		&Volatile{},
+		&Const{},
+		&Restrict{},
+		&Func{},
+		&FuncProto{Params: make([]FuncParam, 2)},
+		&Var{},
+		&Datasec{Vars: make([]VarSecinfo, 2)},
+	}
+
+	for _, typ := range types {
+		b.Run(fmt.Sprint(typ), func(b *testing.B) {
+			b.ReportAllocs()
+
+			for i := 0; i < b.N; i++ {
+				var dq typeDeque
+				walkType(typ, dq.Push)
+			}
+		})
+	}
+}
+
 func BenchmarkUnderlyingType(b *testing.B) {
 	b.Run("no unwrapping", func(b *testing.B) {
 		v := &Int{}
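countChildren only counts struct fields whose type is exactly Type, which is why TestType checks a lower bound rather than equality. Worked through for two cases; the values follow from the type definitions rather than from running the test:

	// countChildren(t, reflect.TypeOf(&Array{}))  == 2: Index and Type are Type
	// fields, and walkType visits exactly those two children.
	// countChildren(t, reflect.TypeOf(&Struct{})) == 0: Members is a []Member,
	// so no field has type Type, yet walkType still visits one child per member.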
diff --git a/internal/deque.go b/internal/deque.go
new file mode 100644
index 000000000..1abc9a9ba
--- /dev/null
+++ b/internal/deque.go
@@ -0,0 +1,89 @@
+package internal
+
+import "math/bits"
+
+// Deque implements a double ended queue.
+type Deque[T any] struct {
+	elems       []T
+	read, write uint64
+	mask        uint64
+}
+
+func (dq *Deque[T]) Empty() bool {
+	return dq.read == dq.write
+}
+
+func (dq *Deque[T]) remainingCap() int {
+	return len(dq.elems) - int(dq.write-dq.read)
+}
+
+// Push adds an element to the end.
+func (dq *Deque[T]) Push(e T) {
+	if dq.remainingCap() >= 1 {
+		dq.elems[dq.write&dq.mask] = e
+		dq.write++
+		return
+	}
+
+	elems := dq.linearise(1)
+	elems = append(elems, e)
+
+	dq.elems = elems[:cap(elems)]
+	dq.mask = uint64(cap(elems)) - 1
+	dq.read, dq.write = 0, uint64(len(elems))
+}
+
+// Shift returns the first element or the zero value.
+func (dq *Deque[T]) Shift() T {
+	var zero T
+
+	if dq.Empty() {
+		return zero
+	}
+
+	index := dq.read & dq.mask
+	t := dq.elems[index]
+	dq.elems[index] = zero
+	dq.read++
+	return t
+}
+
+// Pop returns the last element or the zero value.
+func (dq *Deque[T]) Pop() T {
+	var zero T
+
+	if dq.Empty() {
+		return zero
+	}
+
+	dq.write--
+	index := dq.write & dq.mask
+	t := dq.elems[index]
+	dq.elems[index] = zero
+	return t
+}
+
+// linearise the contents of the deque.
+//
+// The returned slice has space for at least n more elements and has power
+// of two capacity.
+func (dq *Deque[T]) linearise(n int) []T {
+	length := dq.write - dq.read
+	need := length + uint64(n)
+	if need < length {
+		panic("overflow")
+	}
+
+	// Round up to the new power of two which is at least 8.
+	// See https://jameshfisher.com/2018/03/30/round-up-power-2/
+	capacity := 1 << (64 - bits.LeadingZeros64(need-1))
+	if capacity < 8 {
+		capacity = 8
+	}
+
+	types := make([]T, length, capacity)
+	pivot := dq.read & dq.mask
+	copied := copy(types, dq.elems[pivot:])
+	copy(types[copied:], dq.elems[:pivot])
+	return types
+}
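Deque is a growable ring buffer: Push appends at the write index, Shift consumes from the read index (FIFO), Pop consumes from the write index (LIFO), and an empty deque yields the zero value. A short sketch of the behaviour from inside package internal:

	var dq Deque[int]
	dq.Push(1)
	dq.Push(2)
	dq.Push(3)
	fmt.Println(dq.Shift()) // 1: front of the queue
	fmt.Println(dq.Pop())   // 3: back of the queue
	fmt.Println(dq.Pop())   // 2
	fmt.Println(dq.Pop())   // 0: the zero value signals "empty"

On growth, linearise rounds the required size up to a power of two of at least 8, so pushing into a full deque of capacity 8 reallocates to 16: need = 9, bits.LeadingZeros64(8) = 60, and 1 << (64 - 60) = 16.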
diff --git a/internal/deque_test.go b/internal/deque_test.go
new file mode 100644
index 000000000..d611c0719
--- /dev/null
+++ b/internal/deque_test.go
@@ -0,0 +1,80 @@
+package internal
+
+import "testing"
+
+func TestDeque(t *testing.T) {
+	t.Run("pop", func(t *testing.T) {
+		var dq Deque[int]
+		dq.Push(1)
+		dq.Push(2)
+
+		if dq.Pop() != 2 {
+			t.Error("Didn't pop 2 first")
+		}
+
+		if dq.Pop() != 1 {
+			t.Error("Didn't pop 1 second")
+		}
+
+		if dq.Pop() != 0 {
+			t.Error("Didn't pop zero")
+		}
+	})
+
+	t.Run("shift", func(t *testing.T) {
+		var td Deque[int]
+		td.Push(1)
+		td.Push(2)
+
+		if td.Shift() != 1 {
+			t.Error("Didn't shift 1 first")
+		}
+
+		if td.Shift() != 2 {
+			t.Error("Didn't shift 2 second")
+		}
+
+		if td.Shift() != 0 {
+			t.Error("Didn't shift zero")
+		}
+	})
+
+	t.Run("push", func(t *testing.T) {
+		var td Deque[int]
+		td.Push(1)
+		td.Push(2)
+		td.Shift()
+
+		for i := 1; i <= 12; i++ {
+			td.Push(i)
+		}
+
+		if td.Shift() != 2 {
+			t.Error("Didn't shift 2 first")
+		}
+		for i := 1; i <= 12; i++ {
+			if v := td.Shift(); v != i {
+				t.Fatalf("Shifted %d at pos %d", v, i)
+			}
+		}
+	})
+
+	t.Run("linearise", func(t *testing.T) {
+		var td Deque[int]
+		td.Push(1)
+		td.Push(2)
+
+		all := td.linearise(0)
+		if len(all) != 2 {
+			t.Fatal("Expected 2 elements, got", len(all))
+		}
+
+		if cap(all)&(cap(all)-1) != 0 {
+			t.Fatalf("Capacity %d is not a power of two", cap(all))
+		}
+
+		if all[0] != 1 || all[1] != 2 {
+			t.Fatal("Elements don't match")
+		}
+	})
+}
diff --git a/linker.go b/linker.go
index e6276b182..2a2c1b639 100644
--- a/linker.go
+++ b/linker.go
@@ -1,12 +1,14 @@
 package ebpf
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"sync"
 
 	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/btf"
+	"github.com/cilium/ebpf/internal"
 )
 
 // splitSymbols splits insns into subsections delimited by Symbol Instructions.
@@ -67,7 +69,7 @@ func hasFunctionReferences(insns asm.Instructions) bool {
 //
 // Passing a nil target will relocate against the running kernel. insns are
 // modified in place.
-func applyRelocations(insns asm.Instructions, local, target *btf.Spec) error {
+func applyRelocations(insns asm.Instructions, target *btf.Spec, bo binary.ByteOrder) error {
 	var relos []*btf.CORERelocation
 	var reloInsns []*asm.Instruction
 	iter := insns.Iterate()
@@ -82,12 +84,16 @@ func applyRelocations(insns asm.Instructions, local, target *btf.Spec) error {
 		return nil
 	}
 
+	if bo == nil {
+		bo = internal.NativeEndian
+	}
+
 	target, err := maybeLoadKernelBTF(target)
 	if err != nil {
 		return err
 	}
 
-	fixups, err := btf.CORERelocate(local, target, relos)
+	fixups, err := btf.CORERelocate(relos, target, bo)
 	if err != nil {
 		return err
 	}
diff --git a/prog.go b/prog.go
index bec7d2347..3bec8e094 100644
--- a/prog.go
+++ b/prog.go
@@ -243,10 +243,6 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
 
 	var btfDisabled bool
 	if spec.BTF != nil {
-		if err := applyRelocations(insns, spec.BTF, opts.KernelTypes); err != nil {
-			return nil, fmt.Errorf("apply CO-RE relocations: %w", err)
-		}
-
 		handle, err := handles.btfHandle(spec.BTF)
 		btfDisabled = errors.Is(err, btf.ErrNotSupported)
 		if err != nil && !btfDisabled {
@@ -271,6 +267,10 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
 		}
 	}
 
+	if err := applyRelocations(insns, opts.KernelTypes, spec.ByteOrder); err != nil {
+		return nil, fmt.Errorf("apply CO-RE relocations: %w", err)
+	}
+
 	if err := fixupAndValidate(insns); err != nil {
 		return nil, err
 	}
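Relocations now run after subprograms have been handled and no longer depend on the program's own Spec: the target comes from ProgramOptions.KernelTypes (nil means the running kernel) and the byte order from the ProgramSpec, falling back to the native one. From the caller's side nothing changes beyond where the custom BTF is supplied; a minimal sketch, with illustrative paths and variable names:

	spec, err := ebpf.LoadCollectionSpec("bpf_prog.o")
	if err != nil {
		return err
	}

	// customKernelBTF is assumed to have been loaded elsewhere, e.g. from a
	// vmlinux BTF blob shipped alongside the application. Leaving it nil
	// relocates against the running kernel instead.
	var customKernelBTF *btf.Spec

	coll, err := ebpf.NewCollectionWithOptions(spec, ebpf.CollectionOptions{
		Programs: ebpf.ProgramOptions{KernelTypes: customKernelBTF},
	})
	if err != nil {
		return err
	}
	defer coll.Close()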