Skip to content

Commit

Permalink
Initial GPU Support (#512)
Browse files Browse the repository at this point in the history
  • Loading branch information
majst01 committed May 6, 2024
1 parent d81ecca commit 9aa6fc7
Show file tree
Hide file tree
Showing 16 changed files with 1,457 additions and 526 deletions.
8 changes: 7 additions & 1 deletion cmd/metal-api/internal/datastore/machine_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ func (_ *machineTestable) defaultBody(m *metal.Machine) *metal.Machine {
if m.Hardware.Disks == nil {
m.Hardware.Disks = []metal.BlockDevice{}
}
if m.Hardware.MetalCPUs == nil {
m.Hardware.MetalCPUs = []metal.MetalCPU{}
}
if m.Hardware.MetalGPUs == nil {
m.Hardware.MetalGPUs = []metal.MetalGPU{}
}
if m.Tags == nil {
m.Tags = []string{}
}
Expand Down Expand Up @@ -935,7 +941,7 @@ func TestRethinkStore_UpdateMachine(t *testing.T) {
},
want: &metal.Machine{
Base: metal.Base{ID: "1"},
Hardware: metal.MachineHardware{Nics: metal.Nics{}, Disks: []metal.BlockDevice{}},
Hardware: metal.MachineHardware{Nics: metal.Nics{}, Disks: []metal.BlockDevice{}, MetalCPUs: []metal.MetalCPU{}, MetalGPUs: []metal.MetalGPU{}},
Tags: []string{"a=b"},
},
},
Expand Down
28 changes: 24 additions & 4 deletions cmd/metal-api/internal/grpc/boot-service.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,31 @@ func (b *BootService) Register(ctx context.Context, req *v1.BootServiceRegisterR
})
}

cpus := []metal.MetalCPU{}
for _, cpu := range req.Hardware.Cpus {
cpus = append(cpus, metal.MetalCPU{
Vendor: cpu.Vendor,
Model: cpu.Model,
Cores: cpu.Cores,
Threads: cpu.Threads,
})
}

gpus := []metal.MetalGPU{}
for _, gpu := range req.Hardware.Gpus {
gpus = append(gpus, metal.MetalGPU{
Vendor: gpu.Vendor,
Model: gpu.Model,
})
}

machineHardware := metal.MachineHardware{
Memory: req.Hardware.Memory,
CPUCores: int(req.Hardware.CpuCores),
Disks: disks,
Nics: nics,
Memory: req.Hardware.Memory,
CPUCores: int(req.Hardware.CpuCores),
Disks: disks,
Nics: nics,
MetalCPUs: cpus,
MetalGPUs: gpus,
}

size, _, err := b.ds.FromHardware(machineHardware)
Expand Down
45 changes: 38 additions & 7 deletions cmd/metal-api/internal/metal/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -457,10 +457,24 @@ func (n NetworkType) String() string {

// MachineHardware stores the data which is collected by our system on the hardware when it registers itself.
type MachineHardware struct {
Memory uint64 `rethinkdb:"memory" json:"memory"`
CPUCores int `rethinkdb:"cpu_cores" json:"cpu_cores"`
Nics Nics `rethinkdb:"network_interfaces" json:"network_interfaces"`
Disks []BlockDevice `rethinkdb:"block_devices" json:"block_devices"`
Memory uint64 `rethinkdb:"memory" json:"memory"`
CPUCores int `rethinkdb:"cpu_cores" json:"cpu_cores"`
Nics Nics `rethinkdb:"network_interfaces" json:"network_interfaces"`
Disks []BlockDevice `rethinkdb:"block_devices" json:"block_devices"`
MetalCPUs []MetalCPU `rethinkdb:"cpus" json:"cpus"`
MetalGPUs []MetalGPU `rethinkdb:"gpus" json:"gpus"`
}

// MetalCPU describes a single CPU entry of the hardware a machine reports about itself.
// It is stored as one element of MachineHardware.MetalCPUs.
type MetalCPU struct {
// Vendor is the vendor string reported for this CPU.
Vendor string `rethinkdb:"vendor" json:"vendor"`
// Model is the model string reported for this CPU.
Model string `rethinkdb:"model" json:"model"`
// Cores is the core count reported for this CPU.
Cores uint32 `rethinkdb:"cores" json:"cores"`
// Threads is the thread count reported for this CPU.
Threads uint32 `rethinkdb:"threads" json:"threads"`
}

// MetalGPU describes a single GPU entry of the hardware a machine reports about itself.
// It is stored as one element of MachineHardware.MetalGPUs.
type MetalGPU struct {
// Vendor is the vendor string reported for this GPU.
Vendor string `rethinkdb:"vendor" json:"vendor"`
// Model is the model string reported for this GPU; GPUs are grouped and counted by this value.
Model string `rethinkdb:"model" json:"model"`
}

// MachineLiveliness indicates the liveliness of a machine
Expand All @@ -484,9 +498,22 @@ func (hw *MachineHardware) DiskCapacity() uint64 {
return c
}

// GPUModels returns how many GPUs of each model are present in the hardware.
// The returned map is keyed by the GPU model string and is never nil; it is
// empty when the hardware has no GPUs.
func (hw *MachineHardware) GPUModels() map[string]uint64 {
	models := make(map[string]uint64, len(hw.MetalGPUs))
	for _, gpu := range hw.MetalGPUs {
		// A missing key reads as zero, so incrementing covers both the first
		// and every subsequent occurrence of a model.
		models[gpu.Model]++
	}
	return models
}

// ReadableSpec returns a human readable string for the hardware.
func (hw *MachineHardware) ReadableSpec() string {
return fmt.Sprintf("Cores: %d, Memory: %s, Storage: %s", hw.CPUCores, humanize.Bytes(hw.Memory), humanize.Bytes(hw.DiskCapacity()))
return fmt.Sprintf("Cores: %d, Memory: %s, Storage: %s GPUs:%s", hw.CPUCores, humanize.Bytes(hw.Memory), humanize.Bytes(hw.DiskCapacity()), hw.MetalGPUs)
}

// BlockDevice information.
Expand Down Expand Up @@ -621,10 +648,14 @@ func NewIPMISuperUser(log *slog.Logger, path string) MachineIPMISuperUser {
password := ""

if raw, err := os.ReadFile(path); err == nil {
log.Info("ipmi superuser password found, feature is enabled")
password = strings.TrimSpace(string(raw))
if password != "" {
log.Info("ipmi superuser password found, feature is enabled")
} else {
log.Warn("ipmi superuser password file found, but password is empty, feature is disabled")
}
} else {
log.Info("ipmi superuser password could not be read, feature is disabled", "error", err)
log.Warn("ipmi superuser password could not be read, feature is disabled", "error", err)
}

return MachineIPMISuperUser{
Expand Down
89 changes: 78 additions & 11 deletions cmd/metal-api/internal/metal/size.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package metal

import (
"fmt"
"path/filepath"
"slices"

mdmv1 "github.com/metal-stack/masterdata-api/api/v1"
"github.com/samber/lo"
)

// A Size represents a supported machine size.
Expand Down Expand Up @@ -33,14 +35,16 @@ const (
CoreConstraint ConstraintType = "cores"
MemoryConstraint ConstraintType = "memory"
StorageConstraint ConstraintType = "storage"
GPUConstraint ConstraintType = "gpu"
)

// A Constraint describes the hardware constraints for a given size. Constraints can be placed
// on the cpu cores, the memory, the storage capacity and the gpus of a machine.
type Constraint struct {
Type ConstraintType `rethinkdb:"type" json:"type"`
Min uint64 `rethinkdb:"min" json:"min"`
Max uint64 `rethinkdb:"max" json:"max"`
Type ConstraintType `rethinkdb:"type" json:"type"`
Min uint64 `rethinkdb:"min" json:"min"`
Max uint64 `rethinkdb:"max" json:"max"`
Identifier string `rethinkdb:"identifier" json:"identifier" description:"glob of the identifier of this type"`
}

// Sizes is a list of sizes.
Expand Down Expand Up @@ -84,6 +88,20 @@ func (c *Constraint) Matches(hw MachineHardware) (ConstraintMatchingLog, bool) {
case StorageConstraint:
res = hw.DiskCapacity() >= c.Min && hw.DiskCapacity() <= c.Max
cml.Log = fmt.Sprintf(logentryFmt, hw.DiskCapacity(), hw.DiskCapacity())
case GPUConstraint:
for model, count := range hw.GPUModels() {
idMatches, err := filepath.Match(c.Identifier, model)
if err != nil {
cml.Log = fmt.Sprintf("cannot match gpu model:%v", err)
return cml, false
}
res = count >= c.Min && count <= c.Max && idMatches
if res {
break
}
}

cml.Log = fmt.Sprintf("existing gpus:%#v required gpus:%s count %d-%d", hw.MetalGPUs, c.Identifier, c.Min, c.Max)
}
cml.Match = res
return cml, res
Expand Down Expand Up @@ -121,25 +139,71 @@ nextsize:
}

func (s *Size) overlaps(so *Size) bool {
if len(so.Constraints) == 0 {
if len(lo.FromPtr(so).Constraints) == 0 {
return false
}
for _, c := range s.Constraints {
for _, co := range so.Constraints {
if c.Type == co.Type && ((c.Min < co.Min && c.Max < co.Min) || (c.Min > co.Min && c.Min > co.Max)) {
return false
srcTypes := lo.GroupBy(s.Constraints, func(item Constraint) ConstraintType {
return item.Type
})
destTypes := lo.GroupBy(so.Constraints, func(item Constraint) ConstraintType {
return item.Type
})
for t, srcConstraints := range srcTypes {
constraints, ok := destTypes[t]
if !ok {
return false
}
for _, sc := range srcConstraints {
for _, c := range constraints {
if !c.overlaps(sc) {
return false
}
}
}
}

return true
}

// overlaps reports whether c and other cover intersecting min/max ranges for the
// same constraint type and the same identifier. Both constraints must have been
// validated beforehand so that max is not smaller than min.
func (c *Constraint) overlaps(other Constraint) bool {
	sameKind := c.Type == other.Type && c.Identifier == other.Identifier
	if !sameKind {
		return false
	}

	// Two closed ranges intersect exactly when each one starts no later than the other ends.
	return c.Min <= other.Max && other.Min <= c.Max
}

// Validate a size, returns an error if an invalid size is passed
func (s *Size) Validate(partitions PartitionMap, projects map[string]*mdmv1.Project) error {
constraintTypes := map[ConstraintType]bool{}
for _, c := range s.Constraints {
if c.Max < c.Min {
return fmt.Errorf("size:%q type:%q max:%d is smaller than min:%d", s.ID, c.Type, c.Max, c.Min)
}

_, ok := constraintTypes[c.Type]
if ok {
return fmt.Errorf("size:%q type:%q min:%d max:%d has duplicate constraint type", s.ID, c.Type, c.Min, c.Max)
}

// Ensure GPU Constraints always have identifier specified
if c.Type == GPUConstraint && c.Identifier == "" {
return fmt.Errorf("size:%q type:%q min:%d max:%d is a gpu size but has no identifier specified", s.ID, c.Type, c.Min, c.Max)
}

constraintTypes[c.Type] = true
}

if err := s.Reservations.Validate(partitions, projects); err != nil {
Expand All @@ -151,9 +215,12 @@ func (s *Size) Validate(partitions PartitionMap, projects map[string]*mdmv1.Proj

// Overlaps returns nil if Size does not overlap with any other size, otherwise returns overlapping Size
func (s *Size) Overlaps(ss *Sizes) *Size {
for i := range *ss {
so := (*ss)[i]
if s.Name != so.Name && s.overlaps(&so) {
for _, so := range *ss {
so := so
if s.ID == so.ID {
continue
}
if s.overlaps(&so) {
return &so
}
}
Expand Down

0 comments on commit 9aa6fc7

Please sign in to comment.