Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 158 additions & 6 deletions lisa/features/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dataclasses import dataclass
from enum import Enum
from functools import partial
from typing import Any, List, Type
from typing import Any, Dict, List, Type

from dataclasses_json import dataclass_json

Expand Down Expand Up @@ -135,18 +135,170 @@ def install_compute_sdk(self, version: str = "") -> None:
else:
raise LisaException(f"{driver} is not a valid value of ComputeSDK")

def get_gpu_count_with_lsvmbus(self) -> int:
def get_gpu_count_with_lsvmbus(self, expected_count: int = 0) -> int:
    """
    Count GPU devices using lsvmbus.

    First matches VMBus devices against the known GPU device-ID list.
    When that finds nothing and more than one GPU is expected, falls back
    to grouping PCI Express pass-through devices by the last segment of
    their device IDs.

    Args:
        expected_count: GPU count expected from node capability. The
            segment-grouping fallback only runs when it is greater than 1.

    Returns:
        Detected GPU count (0 when nothing matched).
    """
    lsvmbus_tool = self._node.tools[Lsvmbus]

    # Get all VMBus devices
    vmbus_devices = lsvmbus_tool.get_device_channels()
    self._log.debug(f"Found {len(vmbus_devices)} VMBus devices")

    # First try the known list (original approach)
    gpu_count = self._get_gpu_count_from_known_list(vmbus_devices)

    if gpu_count > 0:
        self._log.debug(f"Found {gpu_count} GPU(s) using known list")
        return gpu_count

    if isinstance(expected_count, int) and expected_count <= 1:
        # Not an error: with 0 or 1 expected GPUs the grouping heuristic
        # cannot add signal, so log at debug level and bail out.
        self._log.debug(
            f"No GPUs found in known list. Expected count is {expected_count}. "
            "Skipping segment grouping for single/no GPU scenarios."
        )
        return 0

    # Heuristic fallback for multi-GPU SKUs whose device IDs are not in
    # the known list yet. A warning (not error) since this is a supported
    # fallback path, not a failure.
    self._log.warning(
        f"No GPUs found in known list. Expected count is {expected_count}. "
        "Trying last-segment grouping to detect possible GPUs."
    )
    return self._get_gpu_count_by_device_id_segment(vmbus_devices)

def _get_gpu_count_by_device_id_segment(self, vmbus_devices: List[Any]) -> int:
    """
    Group VMBus PCI Express pass-through devices by the last segment of
    their device IDs and report the largest sequentially-numbered group
    (a pattern observed on some multi-GPU SKUs, likely GPUs).

    NOTE(review): this method always returns 0 — it only logs/warns about
    candidate GPU groups so the pattern can be added to the known list.
    Confirm this warn-only behavior is intended.
    """
    try:
        # Bucket pass-through devices by the last dash-separated segment
        # of the ID (format: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX).
        groups: Dict[str, List[Any]] = {}
        for dev in vmbus_devices:
            if "PCI Express pass-through" not in dev.name:
                continue
            pieces = dev.device_id.split("-")
            if len(pieces) >= 5:
                groups.setdefault(pieces[-1].lower(), []).append(dev)

        if not groups:
            self._log.debug("No PCI Express pass-through devices found")
            return 0

        # Pick the biggest group whose members are sequentially numbered.
        best_segment = None
        max_gpu_count = 0
        for segment, members in groups.items():
            if not self._has_sequential_pattern(members):
                self._log.debug(
                    f"Segment '{segment}' has {len(members)} devices "
                    "but not in sequential pattern"
                )
                continue

            self._log.debug(
                f"Found {len(members)} sequential PCI Express devices "
                f"with last segment '{segment}'"
            )
            if len(members) > max_gpu_count:
                max_gpu_count = len(members)
                best_segment = segment

        if max_gpu_count > 0 and best_segment is not None:
            self._log.info(
                f"Detected {max_gpu_count} potential GPU(s) with last "
                f"segment '{best_segment}' "
                "using segment grouping method"
            )
            # Log the matched devices for troubleshooting.
            for dev in groups[best_segment]:
                self._log.debug(f"  Device: {dev.device_id}")

            # Ask the user to promote this pattern into the known list.
            self._log.warning(
                f"Found {max_gpu_count} PCI Express pass-through device(s) "
                f"with sequential pattern and common"
                f" last segment '{best_segment}'. "
                "These might be GPU devices. Please add this pattern to the "
                "gpu_devices list in NvidiaSmi class if confirmed as GPUs. "
                "Example: ('<ModelName>', '<common_segment>', 0)"
            )
        # Warn-only: candidates are never counted as detected GPUs.
        return 0

    except Exception as e:
        self._log.error(f"Failed to detect GPUs by segment grouping: {e}")
        return 0

def _has_sequential_pattern(self, devices: List[Any]) -> bool:
"""
Check if devices have sequential numbering in their IDs.
GPUs typically have patterns like 0101, 0102, 0103, 0104.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did you find this info? Could you add a link above? If there are other types of devices, maybe they’re listed in a similar way too.

Copy link
Collaborator Author

@umfranci umfranci Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could not find an official doc for it but this was a usual trend observed for multi-GPU SKUs like GB200 and MI300. Example:

Device_ID = {56475055-0002-0000-3130-303237344131}
Device_ID = {56475055-0003-0000-3130-303237344131}
Device_ID = {56475055-0004-0000-3130-303237344131}
Device_ID = {56475055-0005-0000-3130-303237344131}
Device_ID = {56475055-0006-0000-3130-303237344131}
Device_ID = {56475055-0007-0000-3130-303237344131}
Device_ID = {56475055-0008-0000-3130-303237344131}
Device_ID = {56475055-0009-0000-3130-303237344131}

Device_ID = {00000003-0101-0000-3135-423331303142}
Device_ID = {00000203-0102-0000-3135-423331303142}
Device_ID = {00001003-0103-0001-3135-423331303142}
Device_ID = {00001203-0104-0001-3135-423331303142}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not an official pattern, and maybe confusing by other devices type in future. Please remove them.

"""
if len(devices) < 2:
# Single device is considered sequential
return True

# Extract second segment which typically contains sequence numbers
segments = []
for device in devices:
parts = device.device_id.split("-")
if len(parts) >= 2:
# Second segment often contains the sequence (0101, 0102, etc.)
segments.append(parts[1])

if not segments:
return False

# Check if segments form a sequential pattern
try:
# Try to parse as integers
segment_values = []
for seg in segments:
# Handle both pure numbers and alphanumeric (extract numeric part)
numeric_part = "".join(filter(str.isdigit, seg))
if numeric_part:
segment_values.append(int(numeric_part))

if len(segment_values) == len(devices):
segment_values.sort()
# Check if sequential (difference of 1 between consecutive values)
for i in range(1, len(segment_values)):
if segment_values[i] - segment_values[i - 1] != 1:
return False
return True
except Exception as e:
self._log.error(f"Error while detecting sequential patterns: {e}")

return False

def _get_gpu_count_from_known_list(self, vmbus_devices: List[Any]) -> int:
    """
    Original method - match VMBus devices against the known GPU list
    (``NvidiaSmi.gpu_devices``) and return the matched-device count minus
    the bridge-device count.
    """
    matched = 0
    bridges = 0

    for dev in vmbus_devices:
        # Each device matches at most one known GPU entry.
        for gpu_name, gpu_id, bridge_count in NvidiaSmi.gpu_devices:
            if gpu_id not in dev.device_id:
                continue
            matched += 1
            # NOTE(review): overwritten on every match — assumes all
            # matched devices share a single GPU model; confirm.
            bridges = bridge_count
            self._log.debug(
                f"GPU device {gpu_name} found using hardcoded list! "
                f"Device ID: {dev.device_id}"
            )
            break

    return matched - bridges
Expand Down
29 changes: 23 additions & 6 deletions lisa/tools/nvidiasmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,33 @@ def can_install(self) -> bool:
return False

def get_gpu_count(self) -> int:
    """
    Get the GPU count from ``nvidia-smi -L`` output.

    Retries with sudo when the unprivileged run fails or prints nothing,
    since some images require elevated permissions to enumerate GPUs.

    Returns:
        Number of GPUs reported by nvidia-smi (all models counted).

    Raises:
        LisaException: if nvidia-smi fails even when run with sudo.
    """
    result = self.run("-L")
    # "exit_code != 0 or empty stdout" — the previous redundant
    # "(exit_code == 0 and ...)" guard is simplified.
    if result.exit_code != 0 or result.stdout == "":
        result = self.run("-L", sudo=True)
        if result.exit_code != 0 or result.stdout == "":
            raise LisaException(
                f"nvidia-smi command exited with exit_code {result.exit_code}"
            )

    # Count all GPUs regardless of model: `nvidia-smi -L` prints one
    # "GPU <index>: <model> (UUID: ...)" line per device.
    gpu_lines = [
        line
        for line in result.stdout.splitlines()
        if line.strip().startswith("GPU ")
    ]
    gpu_count = len(gpu_lines)

    self._log.debug(f"nvidia-smi detected {gpu_count} GPU(s)")
    for line in gpu_lines:
        self._log.debug(f"  {line}")

    return gpu_count
2 changes: 1 addition & 1 deletion microsoft/testsuites/gpu/gpusuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def verify_gpu_adapter_count(self, node: Node, log_path: Path, log: Logger) -> N
assert isinstance(node.capability.gpu_count, int)
expected_count = node.capability.gpu_count

lsvmbus_device_count = gpu_feature.get_gpu_count_with_lsvmbus()
lsvmbus_device_count = gpu_feature.get_gpu_count_with_lsvmbus(expected_count)
assert_that(
lsvmbus_device_count,
"Expected device count didn't match Actual device count from lsvmbus",
Expand Down
Loading