Skip to content

Commit 9fd7799

Browse files
committed
update vfio validation
Signed-off-by: Arjun <[email protected]>
1 parent 0428e9c commit 9fd7799

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

assets/state-vgpu-device-manager/0600_daemonset.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ spec:
2525
command: ['sh', '-c']
2626
# TODO: Account for pre-installed vGPU Manager. Currently the validator
2727
# creates a different status file when the driver is pre-installed.
28-
args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"]
28+
args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ] && ls /host/sys/bus/pci/devices/*/virtfn*/nvidia/creatable_vgpu_types >/dev/null 2>&1; do echo waiting for NVIDIA vGPU Manager and SR-IOV Virtual Functions to be ready; sleep 5; done"]
2929
securityContext:
3030
privileged: true
3131
volumeMounts:

cmd/nvidia-validator/main.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1734,6 +1734,31 @@ func (v *VGPUDevices) validate() error {
17341734
}
17351735

17361736
func (v *VGPUDevices) runValidation() error {
1737+
nvpci := nvpci.New()
1738+
GPUDevices, err := nvpci.GetGPUs()
1739+
if err != nil {
1740+
return fmt.Errorf("error checking for GPU devices on the host: %w", err)
1741+
}
1742+
1743+
mdevBusPath := "/sys/class/mdev_bus"
1744+
entries, err := os.ReadDir(mdevBusPath)
1745+
if err != nil {
1746+
return fmt.Errorf("unable to read mdev_bus directory: %v", err)
1747+
}
1748+
1749+
if len(entries) == 0 {
1750+
for _, device := range GPUDevices {
1751+
if device.SriovInfo.PhysicalFunction == nil {
1752+
continue
1753+
}
1754+
totalVF := int(device.SriovInfo.PhysicalFunction.TotalVFs)
1755+
if totalVF > 0 {
1756+
log.Infof("Found GPU device with SR-IOV VFs: %s (TotalVFs: %d)", device.Address, totalVF)
1757+
return nil
1758+
}
1759+
}
1760+
}
1761+
17371762
nvmdev := nvmdev.New()
17381763
vGPUDevices, err := nvmdev.GetAllDevices()
17391764
if err != nil {
@@ -1746,14 +1771,14 @@ func (v *VGPUDevices) runValidation() error {
17461771
return fmt.Errorf("no vGPU devices found")
17471772
}
17481773

1749-
log.Infof("Found %d vGPU devices", numDevices)
1774+
log.Infof("Found %d MDEV vGPU devices", numDevices)
17501775
return nil
17511776
}
17521777

17531778
for {
17541779
numDevices := len(vGPUDevices)
17551780
if numDevices > 0 {
1756-
log.Infof("Found %d vGPU devices", numDevices)
1781+
log.Infof("Found %d MDEV vGPU devices", numDevices)
17571782
return nil
17581783
}
17591784
log.Infof("No vGPU devices found, retrying after %d seconds", sleepIntervalSecondsFlag)

0 commit comments

Comments
 (0)