3 changes: 3 additions & 0 deletions collector/.gitignore
@@ -0,0 +1,3 @@
moe_perf.txt
*.log
moe_*/
17 changes: 15 additions & 2 deletions collector/collect.py
@@ -128,6 +128,12 @@ def worker(queue, device_id: int, func, progress_value, lock, error_queue=None,
for handler in worker_logger.handlers:
handler.flush()

# This error could be fatal and require a process restart.
if isinstance(e, torch.AcceleratorError):
# Exiting with a non-zero code would add an additional error to the summary,
# which we don't want.
exit(0)
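
For context, here is a minimal parent-side sketch of the restart pattern the comment alludes to. It is hypothetical and not part of this PR: it assumes the parent owns the worker lifecycle via multiprocessing and treats any non-zero exit code as a crash worth one retry.

import multiprocessing as mp

def run_with_restart(target, args, max_restarts=1):
    """Hypothetical helper: spawn `target`, respawning once if it dies."""
    for _ in range(max_restarts + 1):
        proc = mp.Process(target=target, args=args)
        proc.start()
        proc.join()
        if proc.exitcode == 0:
            # Covers normal completion and the deliberate exit(0) above.
            return
        # Non-zero exit: the worker crashed hard (e.g. a fatal accelerator
        # error outside the handler); retry in a fresh process.
    raise RuntimeError(f"worker failed after {max_restarts + 1} attempts")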


def parallel_run(tasks, func, num_processes, module_name="unknown"):
"""parallel runner with error collection"""
@@ -427,7 +433,7 @@ def collect_sglang(num_processes: int, ops: list[str] | None = None):

def collect_vllm(num_processes: int, ops: list[str] | None = None):
"""
Collect performance data for VLLM v1.
Collect performance data for VLLM.
"""

try:
@@ -441,7 +447,7 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):

collections = [
# GEMM collections
# vllm v1 GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
# vllm GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
{
"name": "vllm",
"type": "gemm",
Expand All @@ -464,6 +470,13 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):
"get_func": "get_generation_attention_test_cases",
"run_func": "run_attention_torch",
},
{
"name": "vllm",
"type": "moe",
"module": "collector.vllm.collect_moe",
"get_func": "get_moe_test_cases",
"run_func": "run_moe_torch",
},
]

all_errors = collect_ops(num_processes, collections, ops, version)
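
A rough sketch of how collect_ops might dispatch a registry entry such as the new MoE one above. The importlib-based lookup below is an assumption inferred from the string-valued "module", "get_func", and "run_func" fields, not the actual implementation:

import importlib

def dispatch_collection(entry: dict) -> None:
    # Resolve the module and the two functions named in the entry,
    # then run every generated test case.
    mod = importlib.import_module(entry["module"])
    get_cases = getattr(mod, entry["get_func"])
    run_case = getattr(mod, entry["run_func"])
    for case in get_cases():
        run_case(case)

Under that assumption, the MoE entry would resolve to collector.vllm.collect_moe.get_moe_test_cases and run_moe_torch.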
132 changes: 132 additions & 0 deletions collector/common_test_cases.py
@@ -0,0 +1,132 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import dataclasses
import itertools
from typing import Optional


@dataclasses.dataclass
class MoeCommonTestCase:
num_tokens_list: list[int]
hidden_size: int
inter_size: int
topk: int
num_experts: int
tp: int
ep: int
model_name: str
token_expert_distribution: str
power_law_alpha: Optional[float]


def get_common_moe_test_cases():
num_tokens = [
1,
2,
4,
8,
16,
32,
48,
64,
80,
96,
128,
160,
192,
256,
320,
384,
512,
768,
1024,
1536,
2048,
3072,
4096,
6144,
8192,
12288,
16384,
20480,
32768,
65536,
]
tp_list = [1, 2, 4, 8, 16, 32]
ep_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
num_gpu_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]

token_distributions = [
("balanced", 0.0),
("power_law", 1.01),
("power_law", 1.2),
]

# alpha_list = [1.01, 1.2]
# hidden_size, inter_s, topk, num_experts, gated act
# [15360,30720,2,16],# GPT-MOE-1.8T
# [15360,3840,16,128],# GPT-MOE-1.8T-FineGrained
# [3584,2560,8,64],# Qwen2-57B
# [2048,1408,4,60], #qwen1.5_moe
# [2048,1408,6,64], #deepseekv1_moe
# [5120,1536,6,160], #deepseekv2
model_config_list = [
[4096, 14336, 2, 8, "MOE_Mixtral8x7B"], # mixtral_8x7b
[6144, 16384, 2, 8, "MOE_Mixtral8x22B"], # mixtral_8x22b
[7168, 2048, 8, 256, "DEEPSEEK_V3"], # deepseekv3, will have 1 shared expert
[2048, 768, 8, 128, "QWEN3_30B_A3B"], # qwen3-moe, 30b-a3b
[4096, 1536, 8, 128, "QWEN3_235B"], # qwen3-moe, 235b-a22b
[6144, 2560, 8, 160, "QWEN3_480B"], # qwen3-moe, 480b-a35b
[7168, 2048, 8, 384, "KIMI_K2"], # kimi k2
[2880, 2880, 4, 128, "GPT_OSS_120B"],
[2880, 2880, 4, 32, "GPT_OSS_20B"],
]

test_cases: list[MoeCommonTestCase] = []

for (
num_gpu,  # start from fewer GPUs; workaround for a potential buffer bug in the MoE impl.
model_config,
tp,
ep,
(token_distribution, power_law_alpha),
) in itertools.product(
num_gpu_list,
model_config_list,
tp_list,
ep_list,
token_distributions,
):
hs, inter_s, topk, num_experts, model_name = model_config

# QWEN3_30B_A3B: exclude tp >= 8, since such configs are not used in actual deployments
if model_name == "QWEN3_30B_A3B" and tp >= 8:
continue

if tp * ep != num_gpu:
continue
if ep > num_experts:
continue
if num_experts % ep != 0:
continue
# Ensure inter_s is divisible by tp.
if inter_s % tp != 0:
continue

test_cases.append(
MoeCommonTestCase(
num_tokens_list=num_tokens,
hidden_size=hs,
inter_size=inter_s,
topk=topk,
num_experts=num_experts,
tp=tp,
ep=ep,
model_name=model_name,
token_expert_distribution=token_distribution,
power_law_alpha=power_law_alpha,
)
)

return test_cases
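
A small usage sketch (not part of the PR) for sanity-checking the generated grid, e.g. how many cases survive the tp/ep filters per model:

from collections import Counter

from collector.common_test_cases import get_common_moe_test_cases

cases = get_common_moe_test_cases()
print(f"{len(cases)} MoE test cases")

# Cases per model after the tp * ep == num_gpu and divisibility filters.
print(Counter(c.model_name for c in cases).most_common())

# Distinct parallelism layouts surviving for one model (deduplicated
# across the three token distributions).
layouts = sorted({(c.tp, c.ep) for c in cases if c.model_name == "DEEPSEEK_V3"})
print("DEEPSEEK_V3 (tp, ep) layouts:", layouts)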
50 changes: 33 additions & 17 deletions collector/helper.py
@@ -10,11 +10,6 @@
import signal
import sys
import traceback

try:
from cuda import cuda
except:
from cuda.bindings import driver as cuda
from datetime import datetime
from pathlib import Path

@@ -218,21 +213,42 @@ def save_error_report(errors, filename):


def get_sm_version():
# Init
(err,) = cuda.cuInit(0)
"""Get CUDA compute capability (SM version)"""
try:
import torch

# Device
err, cu_device = cuda.cuDeviceGet(0)
if torch.cuda.is_available():
device = torch.cuda.current_device()
capability = torch.cuda.get_device_capability(device)
return capability[0] * 10 + capability[1]
except Exception:
pass

# Fall back to cuda-python if the PyTorch path is unavailable.
try:
from cuda import cuda

# Get target architecture
err, sm_major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
)
err, sm_minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
)
# Init
(err,) = cuda.cuInit(0)
if err != 0:
raise RuntimeError(f"cuInit failed with error code: {err}")

# Device
err, cu_device = cuda.cuDeviceGet(0)
if err != 0:
raise RuntimeError(f"cuDeviceGet failed with error code: {err}")

# Get target architecture
err, sm_major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
)
err, sm_minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
)

return sm_major * 10 + sm_minor
return sm_major * 10 + sm_minor
except Exception as e:
raise RuntimeError(f"Cannot get SM version: both PyTorch and cuda-python failed. Error: {e}") from e


def create_test_case_id(test_case, test_type, module_name):