@@ -8,6 +8,7 @@
 import torch.nn.functional as F
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 from vllm.version import __version__ as vllm_version
 
 from helper import get_sm_version, log_perf
@@ -97,8 +98,10 @@ def power_law_logits_v3(num_tokens, num_experts, topk, ep, alpha, return_first_g
     res = conv1d(num_tokens_per_expert.unsqueeze(0).unsqueeze(0).float())
     max_ep_idx = torch.argmax(res).item()
 
+    # Number of experts per GPU
+    ep_group_size = num_experts // ep
+
     if max_ep_idx != 0:
-        ep_group_size = num_experts // ep
         num_tokens_per_expert_reshaped = num_tokens_per_expert.view(ep, ep_group_size)
         num_tokens_per_expert_reshaped[0], num_tokens_per_expert_reshaped[max_ep_idx] = (
             num_tokens_per_expert_reshaped[max_ep_idx].clone(),
@@ -109,9 +112,6 @@ def power_law_logits_v3(num_tokens, num_experts, topk, ep, alpha, return_first_g
     revised_num_tokens = num_tokens
     revised_topk = topk
     if return_first_gpu_only:
-        # Number of experts per GPU
-        ep_group_size = num_experts // ep
-
         # How many experts will be run on the first GPU.
         # Can't exceed the number of experts per GPU.
         revised_topk = min(topk, ep_group_size)
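For context, the hoisted `ep_group_size` now feeds both the group-swap above and the `return_first_gpu_only` path. Below is a minimal sketch of the swap with made-up token counts; it is not the benchmark's code, and it stands in for the real `max_ep_idx` selection, which comes from a conv1d over the per-expert counts (equivalent to finding the busiest group when the kernel spans one EP group):

```python
import torch

# Minimal sketch: move the busiest expert-parallel group into slot 0 so that
# rank 0 measures the worst-case load. Token counts are illustrative.
num_experts, ep = 8, 2
ep_group_size = num_experts // ep  # experts per GPU
num_tokens_per_expert = torch.tensor([1, 2, 3, 4, 10, 20, 30, 40])

groups = num_tokens_per_expert.view(ep, ep_group_size)  # view shares storage
max_ep_idx = torch.argmax(groups.sum(dim=-1)).item()    # busiest group -> 1
if max_ep_idx != 0:
    groups[0], groups[max_ep_idx] = groups[max_ep_idx].clone(), groups[0].clone()

print(num_tokens_per_expert)  # tensor([10, 20, 30, 40,  1,  2,  3,  4])
```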
@@ -294,7 +294,8 @@ def run_moe_torch(
     local_inter_size = inter_size // moe_tp_size
 
     # How many experts will be run on this GPU
-    local_topk = min(topk, local_num_experts)
+    # local_topk = min(topk, local_num_experts)
+    local_topk = topk
 
     # Create weight tensors
     # w1: gate + up projection weights [num_experts, 2 * inter_size, hidden_size]
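The dropped clamp only mattered under the old per-GPU routing, where top-k could not exceed the experts resident on one rank. With global routing via the expert map (added below), every token keeps its full top-k. A tiny illustration with assumed values, not taken from the test cases:

```python
# Assumed values for illustration only.
topk, local_num_experts = 8, 4
old_local_topk = min(topk, local_num_experts)  # old behaviour: clamp to 4
new_local_topk = topk                          # new behaviour: keep 8
print(old_local_topk, new_local_topk)
```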
@@ -314,6 +315,9 @@ def run_moe_torch(
         device=device,
     )
 
+    # Maps global expert index to local expert index.
+    _, expert_map = determine_expert_map(moe_ep_size, 0, num_experts)
+
    if dtype == torch.float8_e4m3fn:
        w1 = w1.to(dtype)
        w2 = w2.to(dtype)
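A sketch of the mapping the benchmark relies on here. This is a hypothetical re-implementation for illustration, assuming `determine_expert_map(ep_size, ep_rank, global_num_experts)` returns `(local_num_experts, expert_map)` with -1 marking experts owned by other ranks; it is not necessarily vLLM's exact code:

```python
import torch

def expert_map_sketch(ep_size: int, ep_rank: int, global_num_experts: int):
    # Hypothetical stand-in for determine_expert_map, for illustration only:
    # expert_map[g] is the local index of global expert g on this rank,
    # or -1 if expert g lives on another expert-parallel rank.
    local_num_experts = global_num_experts // ep_size
    expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32)
    start = ep_rank * local_num_experts
    expert_map[start : start + local_num_experts] = torch.arange(
        local_num_experts, dtype=torch.int32
    )
    return local_num_experts, expert_map

# Rank 0 of 2 with 8 global experts:
print(expert_map_sketch(2, 0, 8))
# (4, tensor([ 0,  1,  2,  3, -1, -1, -1, -1], dtype=torch.int32))
```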
@@ -333,23 +337,29 @@ def run_moe_torch(
         for _ in range(num_iter):
             logits = (
                 power_law_logits_v3(
-                    num_tokens, num_experts, topk, moe_ep_size, power_law_alpha, return_first_gpu_only=True
+                    # num_tokens, num_experts, topk, moe_ep_size, power_law_alpha, return_first_gpu_only=True
+                    num_tokens,
+                    num_experts,
+                    topk,
+                    moe_ep_size,
+                    power_law_alpha,
+                    return_first_gpu_only=False,
                 )
                 .half()
                 .to(device)
             )
-            weights, ids = torch.topk(logits, local_topk, dim=-1)
+            weights, ids = torch.topk(logits, topk, dim=-1)
+            # weights, ids = torch.topk(logits, local_topk, dim=-1)
             topk_weights_list.append(F.softmax(weights, dim=-1))
             topk_ids_list.append(ids)
 
         print("actual num_tokens: ", [topk_ids.shape[0] for topk_ids in topk_ids_list])
 
     elif distributed == "balanced":
-        local_num_tokens = math.ceil(num_tokens / moe_ep_size)
-        actual_logits = balanced_logits(local_num_tokens, local_num_experts, local_topk).half().to(device)
+        # actual_logits = balanced_logits(num_tokens, local_num_experts, local_topk).half().to(device)
+        actual_logits = balanced_logits(num_tokens, num_experts, topk).half().to(device)
         topk_weights, topk_ids = torch.topk(actual_logits, local_topk, dim=-1)
         topk_weights = F.softmax(topk_weights, dim=-1)
 
     else:
         raise ValueError(f"Unsupported distributed mode: {distributed}")
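Why taking top-k over all `num_experts` is now safe: the ids produced here are global expert indices, and the `expert_map` passed to `fused_experts` below lets a rank translate them and skip tokens routed elsewhere. A self-contained sketch of that masking, illustrative rather than the benchmark's code:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_tokens, num_experts, topk = 4, 8, 2
logits = torch.randn(num_tokens, num_experts)

weights, ids = torch.topk(logits, topk, dim=-1)  # ids are *global* expert indices
weights = F.softmax(weights, dim=-1)

# Rank 0 owns experts 0-3 (see the expert_map sketch above).
expert_map = torch.tensor([0, 1, 2, 3, -1, -1, -1, -1])
local_ids = expert_map[ids]  # -1 marks (token, expert) pairs owned by other ranks
print(local_ids)             # this rank only computes entries >= 0
```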
@@ -372,6 +382,8 @@ def run_single_iteration():
                 ti,
                 inplace=True,
                 quant_config=quant_config,
+                global_num_experts=num_experts,
+                expert_map=expert_map,
             )
         else:
             _ = fused_experts(
@@ -382,6 +394,8 @@ def run_single_iteration():
                 topk_ids,
                 inplace=True,
                 quant_config=quant_config,
+                global_num_experts=num_experts,
+                expert_map=expert_map,
             )
 
     def run_iterations(use_cuda_graph=False):
@@ -453,6 +467,8 @@ def run_iterations(use_cuda_graph=False):
 test_cases = get_moe_test_cases()
 print(f"Total test cases: {len(test_cases)}")
 
+# test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 4096, 14336, 2, 8, 1, 2, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.01]]
+
 for test_case in test_cases:
     print(f"Running test case: {test_case}")
     try: