@@ -20,6 +20,21 @@ def torch_dtype_to_tl(dtype: torch.dtype):
2020 raise ValueError (f"Unsupported dtype: { dtype } " )
2121
2222
23+ @triton .autotune (
24+ configs = [
25+ triton .Config ({"ROWS_PER_PID" : 1 , "num_stages" : 2 , "num_warps" : 1 }),
26+ triton .Config ({"ROWS_PER_PID" : 1 , "num_stages" : 2 , "num_warps" : 2 }),
27+ triton .Config ({"ROWS_PER_PID" : 2 , "num_stages" : 2 , "num_warps" : 2 }),
28+ triton .Config ({"ROWS_PER_PID" : 4 , "num_stages" : 2 , "num_warps" : 2 }),
29+ triton .Config ({"ROWS_PER_PID" : 16 , "num_stages" : 2 , "num_warps" : 4 }),
30+ triton .Config ({"ROWS_PER_PID" : 16 , "num_stages" : 3 , "num_warps" : 8 }),
31+ triton .Config ({"ROWS_PER_PID" : 32 , "num_stages" : 2 , "num_warps" : 4 }),
32+ triton .Config ({"ROWS_PER_PID" : 32 , "num_stages" : 3 , "num_warps" : 8 }),
33+ triton .Config ({"ROWS_PER_PID" : 64 , "num_stages" : 3 , "num_warps" : 8 }),
34+ triton .Config ({"ROWS_PER_PID" : 128 , "num_stages" : 3 , "num_warps" : 8 }),
35+ ],
36+ key = ["N" , "topk" , "RENORM" ],
37+ )
2338@triton .jit
2439def _topk_softmax_kernel (
2540 logits_ptr ,
@@ -37,8 +52,8 @@ def _topk_softmax_kernel(
3752 stride_ik ,
3853 BLOCK_N : tl .constexpr ,
3954 RENORM : tl .constexpr ,
40- num_stages : tl .constexpr ,
4155 ROWS_PER_PID : tl .constexpr ,
56+ num_stages : tl .constexpr ,
4257):
4358 pid = tl .program_id (0 )
4459 num_programs = tl .num_programs (0 )
@@ -48,10 +63,10 @@ def _topk_softmax_kernel(
4863 mask_n = offs_n < N
4964 store_mask = offs_k < topk
5065
51- # specify topk<=2 and RENORM specialization by tl.constexpr,
52- # similar as ` constexpr if` in C++17
66+ # implement the topk<=2 and RENORM specializations via tl.constexpr,
67+ # analogous to `if constexpr` in C++17
5368 if topk == 1 :
54- for row_idx in tl .range (pid , M , num_programs , num_stages ):
69+ for row_idx in tl .range (pid , M , num_programs , num_stages , warp_specialize = True ):
5570 logits = tl .load (
5671 logits_ptr + row_idx * stride_lm + offs_n * stride_ln ,
5772 mask = mask_n ,
@@ -71,10 +86,10 @@ def _topk_softmax_kernel(
7186 cur_max = 1
7287
7388 tl .store (weights_ptr + row_idx * stride_wm + 0 * stride_wk , cur_max )
74- tl .store (indices_ptr + row_idx * stride_im + 0 * stride_wk , cur_idx )
89+ tl .store (indices_ptr + row_idx * stride_im + 0 * stride_ik , cur_idx )
7590
7691 elif topk == 2 :
77- for row_idx in tl .range (pid , M , num_programs , num_stages ):
92+ for row_idx in tl .range (pid , M , num_programs , num_stages , warp_specialize = True ):
7893 logits = tl .load (
7994 logits_ptr + row_idx * stride_lm + offs_n * stride_ln ,
8095 mask = mask_n ,
@@ -103,7 +118,7 @@ def _topk_softmax_kernel(
103118 tl .store (weights_ptr + row_idx * stride_wm , val0 )
104119 tl .store (indices_ptr + row_idx * stride_im , idx0 )
105120 tl .store (weights_ptr + row_idx * stride_wm + 1 * stride_wk , val1 )
106- tl .store (indices_ptr + row_idx * stride_im + 1 * stride_wk , idx1 )
121+ tl .store (indices_ptr + row_idx * stride_im + 1 * stride_ik , idx1 )
107122
108123 else :
109124 topk_vals = tl .zeros ([ROWS_PER_PID , topk_padded ], dtype = tl .float32 ) + float (
@@ -113,7 +128,11 @@ def _topk_softmax_kernel(
113128
114129 rows = tl .arange (0 , ROWS_PER_PID )
115130 for row_idx in tl .range (
116- pid * ROWS_PER_PID , M , num_programs * ROWS_PER_PID , num_stages
131+ pid * ROWS_PER_PID ,
132+ M ,
133+ num_programs * ROWS_PER_PID ,
134+ num_stages ,
135+ warp_specialize = True ,
117136 ):
118137 row_indices = row_idx + rows # [ROWS_PER_PID,]
119138 row_mask = row_indices < M
@@ -183,16 +202,15 @@ def fused_topk_softmax(
183202) -> tuple [torch .Tensor , torch .Tensor ]:
184203 M , N = router_logits .shape # num_tokens, num_experts
185204
186- weights = torch .empty (
187- (M , topk ), device = router_logits .device , dtype = router_logits .dtype
188- )
205+ weights = torch .empty ((M , topk ), device = router_logits .device , dtype = torch .float32 )
189206 indices = torch .empty ((M , topk ), device = router_logits .device , dtype = torch .int32 )
190207
191208 BLOCK_N = triton .next_power_of_2 (N ) # num_padded_experts
192209 topk_padded = triton .next_power_of_2 (topk )
193- grid = (M ,)
194- num_stages = 2
195- ROWS_PER_PID = 4
210+
211+ # enable autotune to pick the right number of thread blocks;
212+ # refer to https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
213+ grid = lambda META : (triton .cdiv (M , META ["ROWS_PER_PID" ]),)
196214
197215 _topk_softmax_kernel [grid ](
198216 logits_ptr = router_logits ,
@@ -210,8 +228,6 @@ def fused_topk_softmax(
210228 stride_ik = indices .stride (1 ),
211229 BLOCK_N = BLOCK_N ,
212230 RENORM = renormalize ,
213- num_stages = num_stages ,
214- ROWS_PER_PID = ROWS_PER_PID ,
215231 )
216232
217233 return weights , indices
0 commit comments