-
Notifications
You must be signed in to change notification settings - Fork 223
Fixing GPU Adapter Count test to be more dynamic and failure-resistant #4038
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 11 commits
01b3d6c
57121bd
1750183
c2aaf45
7ce447f
51351e7
69c3735
027528a
29269e8
378e101
d6b7bb9
810ac73
bbe2fc4
2eee4a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
| from dataclasses import dataclass | ||
| from enum import Enum | ||
| from functools import partial | ||
| from typing import Any, List, Type | ||
| from typing import Any, Dict, List, Type | ||
|
|
||
| from dataclasses_json import dataclass_json | ||
|
|
||
|
|
@@ -135,18 +135,170 @@ def install_compute_sdk(self, version: str = "") -> None: | |
| else: | ||
| raise LisaException(f"{driver} is not a valid value of ComputeSDK") | ||
|
|
||
| def get_gpu_count_with_lsvmbus(self) -> int: | ||
| def get_gpu_count_with_lsvmbus(self, expected_count: int = 0) -> int: | ||
| """ | ||
| Count GPU devices using lsvmbus. | ||
| First tries known list, then groups devices by last segment of device ID. | ||
| """ | ||
| lsvmbus_tool = self._node.tools[Lsvmbus] | ||
|
|
||
| # Get all VMBus devices | ||
| vmbus_devices = lsvmbus_tool.get_device_channels() | ||
| self._log.debug(f"Found {len(vmbus_devices)} VMBus devices") | ||
|
|
||
| # First try the known list (original approach) | ||
| gpu_count = self._get_gpu_count_from_known_list(vmbus_devices) | ||
squirrelsc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if gpu_count > 0: | ||
| self._log.debug(f"Found {gpu_count} GPU(s) using known list") | ||
| return gpu_count | ||
|
|
||
| if isinstance(expected_count, int) and expected_count <= 1: | ||
| self._log.error( | ||
squirrelsc marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| f"No GPUs found in known list. Expected count is {expected_count}. " | ||
| "Skipping segment grouping for single/no GPU scenarios." | ||
| ) | ||
| return 0 | ||
|
|
||
| # Only try segment grouping if expected count > 1 | ||
| self._log.error( | ||
| f"No GPUs found in known list. Expected count is {expected_count}. " | ||
| "Trying last-segment grouping for sharing Possible GPU detection." | ||
| ) | ||
| return self._get_gpu_count_by_device_id_segment(vmbus_devices) | ||
|
|
||
| def _get_gpu_count_by_device_id_segment(self, vmbus_devices: List[Any]) -> int: | ||
|
||
| """ | ||
| Group VMBus devices by last segment and find the largest group of | ||
| sequential PCI Express pass-through devices (likely GPUs). | ||
| """ | ||
| try: | ||
| # Group PCI Express pass-through devices by last segment | ||
| last_segment_groups: Dict[str, List[Any]] = {} | ||
|
|
||
| for device in vmbus_devices: | ||
| # Only consider PCI Express pass-through devices | ||
| if "PCI Express pass-through" not in device.name: | ||
| continue | ||
|
|
||
| device_id = device.device_id | ||
| # Device ID format: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX | ||
| id_parts = device_id.split("-") | ||
| if len(id_parts) >= 5: | ||
| last_segment = id_parts[-1].lower() | ||
| if last_segment not in last_segment_groups: | ||
| last_segment_groups[last_segment] = [] | ||
| last_segment_groups[last_segment].append(device) | ||
|
|
||
| if not last_segment_groups: | ||
| self._log.debug("No PCI Express pass-through devices found") | ||
| return 0 | ||
|
|
||
| # Find the largest group with sequential pattern | ||
| max_gpu_count = 0 | ||
| best_segment = None | ||
|
|
||
| for last_segment, devices in last_segment_groups.items(): | ||
| # Check if devices have sequential numbering | ||
| if self._has_sequential_pattern(devices): | ||
| device_count = len(devices) | ||
| self._log.debug( | ||
| f"Found {device_count} sequential PCI Express devices " | ||
| f"with last segment '{last_segment}'" | ||
| ) | ||
|
|
||
| if device_count > max_gpu_count: | ||
| max_gpu_count = device_count | ||
| best_segment = last_segment | ||
| else: | ||
| self._log.debug( | ||
| f"Segment '{last_segment}' has {len(devices)} devices " | ||
| "but not in sequential pattern" | ||
| ) | ||
|
|
||
| if max_gpu_count > 0 and best_segment is not None: | ||
| self._log.info( | ||
| f"Detected {max_gpu_count} potential GPU(s) with last " | ||
| f"segment '{best_segment}' " | ||
| "using segment grouping method" | ||
| ) | ||
| # Log the matched devices | ||
| for device in last_segment_groups[best_segment]: | ||
| self._log.debug(f" Device: {device.device_id}") | ||
|
|
||
| # Issue warning to user about adding this pattern | ||
| self._log.warning( | ||
| f"Found {max_gpu_count} PCI Express pass-through device(s) " | ||
| f"with sequential pattern and common" | ||
| f" last segment '{best_segment}'. " | ||
| "These might be GPU devices. Please add this pattern to the " | ||
| "gpu_devices list in NvidiaSmi class if confirmed as GPUs. " | ||
| "Example: ('<ModelName>', '<common_segment>', 0)" | ||
| ) | ||
| return 0 | ||
|
|
||
| except Exception as e: | ||
| self._log.error(f"Failed to detect GPUs by segment grouping: {e}") | ||
| return 0 | ||
|
|
||
| def _has_sequential_pattern(self, devices: List[Any]) -> bool: | ||
| """ | ||
| Check if devices have sequential numbering in their IDs. | ||
| GPUs typically have patterns like 0101, 0102, 0103, 0104. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Where did you find this info? Could you add a link above? If there are other types of devices, maybe they’re listed in a similar way too.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I could not find an official doc for it, but this is a trend commonly observed on multi-GPU SKUs like GB200 and MI300. Example:
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's not an official pattern, and it may be confused with other device types in the future. Please remove them. |
||
| """ | ||
| if len(devices) < 2: | ||
| # Single device is considered sequential | ||
| return True | ||
|
|
||
| # Extract second segment which typically contains sequence numbers | ||
| segments = [] | ||
| for device in devices: | ||
| parts = device.device_id.split("-") | ||
| if len(parts) >= 2: | ||
| # Second segment often contains the sequence (0101, 0102, etc.) | ||
| segments.append(parts[1]) | ||
|
|
||
| if not segments: | ||
| return False | ||
|
|
||
| # Check if segments form a sequential pattern | ||
| try: | ||
| # Try to parse as integers | ||
| segment_values = [] | ||
| for seg in segments: | ||
| # Handle both pure numbers and alphanumeric (extract numeric part) | ||
| numeric_part = "".join(filter(str.isdigit, seg)) | ||
| if numeric_part: | ||
| segment_values.append(int(numeric_part)) | ||
|
|
||
| if len(segment_values) == len(devices): | ||
| segment_values.sort() | ||
| # Check if sequential (difference of 1 between consecutive values) | ||
| for i in range(1, len(segment_values)): | ||
| if segment_values[i] - segment_values[i - 1] != 1: | ||
| return False | ||
| return True | ||
| except Exception as e: | ||
| self._log.error(f"Error while detecting sequential patterns: {e}") | ||
|
|
||
| return False | ||
|
|
||
| def _get_gpu_count_from_known_list(self, vmbus_devices: List[Any]) -> int: | ||
| """ | ||
| Original method - check against known list of GPUs | ||
| """ | ||
| lsvmbus_device_count = 0 | ||
| bridge_device_count = 0 | ||
|
|
||
| lsvmbus_tool = self._node.tools[Lsvmbus] | ||
| device_list = lsvmbus_tool.get_device_channels() | ||
| for device in device_list: | ||
| for device in vmbus_devices: | ||
| for name, id_, bridge_count in NvidiaSmi.gpu_devices: | ||
| if id_ in device.device_id: | ||
| lsvmbus_device_count += 1 | ||
| bridge_device_count = bridge_count | ||
| self._log.debug(f"GPU device {name} found!") | ||
| self._log.debug( | ||
| f"GPU device {name} found using hardcoded list! " | ||
| f"Device ID: {device.device_id}" | ||
| ) | ||
| break | ||
|
|
||
| return lsvmbus_device_count - bridge_device_count | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.