 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.version import __version__ as vllm_version

-# from helper import get_sm_version, log_perf
+from helper import get_sm_version, log_perf

-def get_sm_version():
-    return 86
+# def get_sm_version():
+#     return 86

-def log_perf(*args, **kwargs):
-    pass
+# def log_perf(*args, **kwargs):
+#     pass

 aic_debug = int(os.getenv("aic_moe_debug", "0"))  # noqa: SIM112

@@ -34,7 +34,7 @@ def balanced_logits(num_tokens, num_experts, topk):
             h_selected_experts[token_i][i] = (token_i * stride / num_tokens + i * stride) % num_experts

     expert_map = F.one_hot(h_selected_experts.long(), num_classes=num_experts).sum(1)
-    router_logits = F.softmax(expert_map.bfloat16(), dim=1)
+    router_logits = F.softmax(expert_map.half(), dim=1)
     return router_logits

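Note: balanced_logits synthesizes perfectly uniform routing for the benchmark. A minimal standalone sketch of the construction, assuming stride = num_experts / topk (the assignment of stride sits outside this hunk):

    import torch
    import torch.nn.functional as F

    num_tokens, num_experts, topk = 4, 8, 2
    stride = num_experts / topk  # assumed; spaces each token's picks evenly
    picks = torch.empty(num_tokens, topk, dtype=torch.long)
    for token_i in range(num_tokens):
        for i in range(topk):
            picks[token_i, i] = int(token_i * stride / num_tokens + i * stride) % num_experts
    # Summing the one-hot picks gives per-token expert counts; softmaxing the
    # counts yields logits whose top-k recovers exactly the picked experts.
    expert_map = F.one_hot(picks, num_classes=num_experts).sum(1)
    router_logits = F.softmax(expert_map.half(), dim=1)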
@@ -146,7 +146,7 @@ def power_law_logits_v3(num_tokens, num_experts, topk, ep, alpha, return_first_gpu_only):


     expert_map = F.one_hot(h_selected_experts.long(), num_classes=num_experts).sum(1)
-    router_logits = F.softmax(expert_map.bfloat16(), dim=1)
+    router_logits = F.softmax(expert_map.half(), dim=1)
     return router_logits

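Note: the body of power_law_logits_v3 is not shown in this diff; it skews expert popularity by alpha before the same one-hot/softmax tail as above. A hypothetical sketch of one way to draw such a skew (not necessarily the v3 algorithm):

    import torch

    num_tokens, num_experts, topk, alpha = 16, 8, 2, 1.01
    # Expert e gets probability proportional to (e + 1) ** (-alpha).
    probs = torch.arange(1, num_experts + 1, dtype=torch.float).pow(-alpha)
    probs = (probs / probs.sum()).expand(num_tokens, -1).contiguous()
    # Sample topk distinct experts per token under that popularity.
    h_selected_experts = torch.multinomial(probs, topk, replacement=False)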
@@ -227,6 +227,10 @@ def get_moe_test_cases():
             if inter_s % tp != 0:
                 continue

+            # vllm does not support TP when EP is enabled.
+            if tp > 1 and ep > 1:
+                continue
+
             for power_law_alpha in alpha_list:
                 test_cases.append(
                     [
@@ -245,7 +249,7 @@ def get_moe_test_cases():
                     ]
                 )

-    return test_cases[:20]
+    return test_cases

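Note: with the [:20] cap removed, get_moe_test_cases returns the full sweep; the driver at the bottom of this file still slices test_cases[:40], so at most 40 cases actually run.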
 def run_moe_torch(
     moe_type,
@@ -267,7 +271,7 @@ def run_moe_torch(
     torch.set_default_device(device)

     # Configure quantization parameters
-    dtype = torch.bfloat16
+    dtype = torch.float16
     quant_config = None

     if moe_type == "fp8":
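Note: the benchmark dtype flips from bfloat16 to float16 here; every tensor construction below (weights, hidden states, synthesized logits) follows the same swap.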
@@ -300,14 +304,14 @@ def run_moe_torch(
         local_num_experts,
         2 * local_inter_size,
         hidden_size,
-        dtype=torch.bfloat16,
+        dtype=torch.float16,
         device=device
     )
     w2 = torch.randn(
         local_num_experts,
         hidden_size,
         local_inter_size,
-        dtype=torch.bfloat16,
+        dtype=torch.float16,
         device=device
     )

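Note: the first randn above (whose opening line sits outside this hunk) builds w13 with a 2 * local_inter_size leading dimension, consistent with the fused-MoE layout that stacks the gate and up projections into one tensor; w2 is the down projection back to hidden_size.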
@@ -319,7 +323,7 @@ def run_moe_torch(
     for num_tokens_idx, num_tokens in enumerate(num_tokens_lists):
         print("num_tokens", num_tokens)
         print("topk", topk)
-        hidden_states = torch.randn([num_tokens, hidden_size]).bfloat16().to(device)
+        hidden_states = torch.randn([num_tokens, hidden_size]).half().to(device)

         # Generate topk_weights and topk_ids
         num_iter = 10 if distributed == "power_law" else 1
@@ -335,7 +339,7 @@ def run_moe_torch(
                 moe_ep_size,
                 power_law_alpha,
                 return_first_gpu_only=True
-            ).bfloat16().to(device)
+            ).half().to(device)
             weights, ids = torch.topk(logits, local_topk, dim=-1)
             topk_weights_list.append(F.softmax(weights, dim=-1))
             topk_ids_list.append(ids)
@@ -344,7 +348,7 @@ def run_moe_torch(

         elif distributed == "balanced":
             local_num_tokens = math.ceil(num_tokens / moe_ep_size)
-            actual_logits = balanced_logits(local_num_tokens, local_num_experts, local_topk).bfloat16().to(device)
+            actual_logits = balanced_logits(local_num_tokens, local_num_experts, local_topk).half().to(device)
             topk_weights, topk_ids = torch.topk(actual_logits, local_topk, dim=-1)
             topk_weights = F.softmax(topk_weights, dim=-1)
             print("actual num_tokens: ", actual_logits.shape[0])
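Note: both branches renormalize after selection: topk runs on the raw logits first, then softmax over only the selected columns, so each token's topk weights sum to 1.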
@@ -459,6 +463,8 @@ def run_iterations(use_cuda_graph=False):
     # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 7168, 2048, 8, 384, 1, 4, 'KIMI_K2', 'moe_perf.txt', 'power_law', 1.01]]
     # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 2048, 768, 8, 128, 2, 32, 'QWEN3_30B_A3B', 'moe_perf.txt', 'power_law', 1.01]]

+    test_cases = [['float16', [65536], 4096, 14336, 2, 8, 1, 1, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.01]]
+
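Note: field order inferred from the commented examples above (the list layout is not spelled out in this hunk): dtype, num-tokens list, hidden_size, intermediate_size, topk, num_experts, then what appear to be tp and ep, model tag, output file, token distribution, and power-law alpha. The new case reads as Mixtral 8x7B: hidden 4096, intermediate 14336, top-2 of 8 experts, TP=1, EP=1, a single 65536-token batch.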
     print(f"Total test cases: {len(test_cases)}")

     for test_case in test_cases[:40]: