@@ -47,48 +47,106 @@ def _topk_softmax_kernel(
     mask_n = offs_n < N
     store_mask = offs_k < topk
 
-    topk_vals = tl.zeros([topk_padded], dtype=tl.float32) + float("-inf")
-    topk_idxs = tl.zeros([topk_padded], dtype=tl.int32)
-
-    for row_idx in tl.range(pid, M, num_programs, num_stages):
-        logits = tl.load(
-            logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
-            mask=mask_n,
-            other=float("-inf"),
-        )
-
-        if not RENORM:
-            row_sub_max = logits - tl.max(logits, axis=0)
-            numerator = tl.exp(row_sub_max)
-            denominator = tl.sum(numerator, axis=0)
-            logits = numerator / denominator
-
-        for k in tl.static_range(topk):
+    # Specialize on topk <= 2 and RENORM via tl.constexpr; the branches below
+    # are resolved at compile time, similar to `constexpr if` in C++17.
+    if topk == 1:
+        for row_idx in tl.range(pid, M, num_programs, num_stages):
+            logits = tl.load(
+                logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
+                mask=mask_n,
+                other=float("-inf"),
+            )
+
+            if not RENORM:
+                row_sub_max = logits - tl.max(logits, axis=0)
+                numerator = tl.exp(row_sub_max)
+                denominator = tl.sum(numerator, axis=0)
+                logits = numerator / denominator
+
             cur_max = tl.max(logits, axis=0)
             cur_idx = tl.argmax(logits, axis=0)
 
-            k_mask = offs_k == k
-            topk_vals = tl.where(k_mask, cur_max, topk_vals)
-            topk_idxs = tl.where(k_mask, cur_idx, topk_idxs)
-
-            logits = tl.where(offs_n == cur_idx, float("-inf"), logits)
-
-        if RENORM:
-            topk_vals = topk_vals - tl.max(topk_vals, axis=0)
-            numerator = tl.exp(topk_vals)
-            denominator = tl.sum(numerator, axis=0)
-            topk_vals = numerator / denominator
-
-        tl.store(
-            weights_ptr + row_idx * stride_wm + offs_k * stride_wk,
-            topk_vals,
-            mask=store_mask,
-        )
-        tl.store(
-            indices_ptr + row_idx * stride_im + offs_k * stride_ik,
-            topk_idxs,
-            mask=store_mask,
-        )
+            if RENORM:
+                cur_max = 1
+
+            tl.store(weights_ptr + row_idx * stride_wm + 0 * stride_wk, cur_max)
+            tl.store(indices_ptr + row_idx * stride_im + 0 * stride_ik, cur_idx)
+
+    elif topk == 2:
+        for row_idx in tl.range(pid, M, num_programs, num_stages):
+            logits = tl.load(
+                logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
+                mask=mask_n,
+                other=float("-inf"),
+            )
+
+            if not RENORM:
+                row_sub_max = logits - tl.max(logits, axis=0)
+                numerator = tl.exp(row_sub_max)
+                denominator = tl.sum(numerator, axis=0)
+                logits = numerator / denominator
+
+            val0 = tl.max(logits, axis=0)
+            idx0 = tl.argmax(logits, axis=0)
+            logits = tl.where(offs_n == idx0, float("-inf"), logits)
+            val1 = tl.max(logits, axis=0)
+            idx1 = tl.argmax(logits, axis=0)
+
+            if RENORM:
+                max_val = tl.maximum(val0, val1)
+                exp0 = tl.exp(val0 - max_val)
+                exp1 = tl.exp(val1 - max_val)
+                val0 = exp0 / (exp0 + exp1)
+                val1 = exp1 / (exp0 + exp1)
+
+            tl.store(weights_ptr + row_idx * stride_wm, val0)
+            tl.store(indices_ptr + row_idx * stride_im, idx0)
+            tl.store(weights_ptr + row_idx * stride_wm + 1 * stride_wk, val1)
+            tl.store(indices_ptr + row_idx * stride_im + 1 * stride_ik, idx1)
+
+    else:
+        topk_vals = tl.zeros([topk_padded], dtype=tl.float32) + float("-inf")
+        topk_idxs = tl.zeros([topk_padded], dtype=tl.int32)
+
+        for row_idx in tl.range(pid, M, num_programs, num_stages):
+            logits = tl.load(
+                logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
+                mask=mask_n,
+                other=float("-inf"),
+            )
+
+            if not RENORM:
+                row_sub_max = logits - tl.max(logits, axis=0)
+                numerator = tl.exp(row_sub_max)
+                denominator = tl.sum(numerator, axis=0)
+                logits = numerator / denominator
+
+            for k in tl.static_range(topk):
+                cur_max = tl.max(logits, axis=0)
+                cur_idx = tl.argmax(logits, axis=0)
+
+                k_mask = offs_k == k
+                topk_vals = tl.where(k_mask, cur_max, topk_vals)
+                topk_idxs = tl.where(k_mask, cur_idx, topk_idxs)
+
+                logits = tl.where(offs_n == cur_idx, float("-inf"), logits)
+
+            if RENORM:
+                topk_vals = topk_vals - tl.max(topk_vals, axis=0)
+                numerator = tl.exp(topk_vals)
+                denominator = tl.sum(numerator, axis=0)
+                topk_vals = numerator / denominator
+
+            tl.store(
+                weights_ptr + row_idx * stride_wm + offs_k * stride_wk,
+                topk_vals,
+                mask=store_mask,
+            )
+            tl.store(
+                indices_ptr + row_idx * stride_im + offs_k * stride_ik,
+                topk_idxs,
+                mask=store_mask,
+            )
 
 
 def fused_topk_softmax(
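
The comment in the diff likens `tl.constexpr` specialization to C++17 `constexpr if`: because `topk` and `RENORM` are compile-time constants of the Triton kernel, each distinct value produces its own compiled variant and untaken branches are removed, so the `topk == 1` and `topk == 2` fast paths add no overhead to the generic path. Below is a minimal, self-contained sketch of that mechanism, using a hypothetical `_scale_or_copy_kernel` that is not part of this PR:

import torch
import triton
import triton.language as tl


@triton.jit
def _scale_or_copy_kernel(x_ptr, out_ptr, n, SCALE: tl.constexpr, BLOCK: tl.constexpr):
    # SCALE is tl.constexpr, so this `if` is resolved during compilation and
    # only the taken branch is emitted, much like `if constexpr` in C++17.
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    if SCALE == 1:
        tl.store(out_ptr + offs, x, mask=mask)  # plain copy, no multiply emitted
    else:
        tl.store(out_ptr + offs, x * SCALE, mask=mask)


x = torch.randn(1024, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
# Each distinct SCALE value triggers a separate specialization/compile.
_scale_or_copy_kernel[grid](x, out, x.numel(), SCALE=2, BLOCK=256)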