
Commit 4ef122b

[Optimization] Add Fused Triton Kernel for GPT-OSS Router
Signed-off-by: ijpq <[email protected]>
Parent: d64429b

2 files changed: +218 −2 lines
vllm/model_executor/layers/fused_moe/gpt_oss_fused_router.py (new file)

Lines changed: 207 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Fused Router Kernel for GPT-OSS MoE.
Fuses the router linear layer (GEMM) and Top-K selection + Softmax.
"""

import torch

from vllm.triton_utils import tl, triton


@triton.jit
def fused_moe_router_kernel(
    # Pointers
    x_ptr,  # Input [M, K]
    w_ptr,  # Weight [N, K]
    out_w_ptr,  # Output Weights [M, TopK]
    out_i_ptr,  # Output Indices [M, TopK]
    # Dimensions
    M,
    K,
    N,
    TopK: tl.constexpr,
    # Strides
    stride_xm,
    stride_xk,
    stride_wn,
    stride_wk,
    stride_wm,  # output weights row stride
    stride_wk_out,  # output weights column stride
    stride_im,  # output indices row stride
    stride_ik_out,  # output indices column stride
    # Meta-parameters
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,  # Must be >= N (number of experts)
):
    # 1. Program ID
    pid = tl.program_id(axis=0)

    # 2. Create offsets
    offs_m = pid * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)

    # 3. Initialize accumulator for GEMM (Logits)
    # Accumulator shape: [BLOCK_M, BLOCK_N]
    # We perform the computation in float32 for numerical stability
    acc = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)

    # 4. GEMM loop over the K dimension
    for k in range(0, K, BLOCK_K):
        # Load Input X [BLOCK_M, BLOCK_K]
        offs_k = k + tl.arange(0, BLOCK_K)
        x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
        # Mask the M dimension boundary and the K dimension boundary
        x_mask = (offs_m[:, None] < M) & (offs_k[None, :] < K)
        x = tl.load(x_ptrs, mask=x_mask, other=0.0)

        # Load Weight W as a [BLOCK_K, BLOCK_N] tile
        # PyTorch Linear weights are [Out_Features, In_Features] -> W is [N, K],
        # and the router computes logits = X @ W^T. Indexing offs_k along rows
        # and offs_n along columns reads a [BLOCK_K, BLOCK_N] tile of W^T
        # directly, so tl.dot(x, w) below yields the [BLOCK_M, BLOCK_N] block
        # of logits.
        w_ptrs = w_ptr + (offs_n[None, :] * stride_wn + offs_k[:, None] * stride_wk)
        w_mask = (offs_n[None, :] < N) & (offs_k[:, None] < K)
        w = tl.load(w_ptrs, mask=w_mask, other=0.0)

        # Matrix Multiply: [BLOCK_M, BLOCK_K] @ [BLOCK_K, BLOCK_N] -> [BLOCK_M, BLOCK_N]
        acc += tl.dot(x, w)

    # 5. Top-K Selection in SRAM
    # acc now contains the logits [BLOCK_M, BLOCK_N].
    # We only care about valid experts (column index < N), so mask any padding
    # columns with -inf (BLOCK_N is the next power of 2 >= N).
    logits = acc
    if BLOCK_N > N:
        logits = tl.where(tl.arange(0, BLOCK_N)[None, :] < N, logits, float("-inf"))

    # TopK is a constexpr and small (e.g. 4), so we find the top-k entries by
    # repeatedly taking the row-wise max and masking out the selected expert.
    # Triton block tensors are immutable (no slice assignment), so each result
    # is scattered into column i of the storage tensors with tl.where.
    col_idx = tl.arange(0, TopK)
    topk_val_storage = tl.zeros([BLOCK_M, TopK], dtype=tl.float32)
    topk_idx_storage = tl.zeros([BLOCK_M, TopK], dtype=tl.int32)

    for i in range(TopK):
        # Find max along the expert dimension
        val_max, idx_max = tl.max(logits, axis=1, return_indices=True)

        # Store the current max / argmax into column i
        col_mask = col_idx[None, :] == i
        topk_val_storage = tl.where(col_mask, val_max[:, None], topk_val_storage)
        topk_idx_storage = tl.where(col_mask, idx_max[:, None], topk_idx_storage)

        # Mask out the selected expert to find the next max in the next iteration
        mask = tl.arange(0, BLOCK_N)[None, :] == idx_max[:, None]
        logits = tl.where(mask, float("-inf"), logits)

    # 6. Softmax Renormalization
    # Now we have the TopK logits in topk_val_storage [BLOCK_M, TopK].
    # We perform softmax on these TopK values.

    # Subtract max for numerical stability
    val_max_for_softmax = tl.max(topk_val_storage, axis=1)
    numerator = tl.exp(topk_val_storage - val_max_for_softmax[:, None])
    denominator = tl.sum(numerator, axis=1)
    softmax_res = numerator / denominator[:, None]

    # 7. Write Output
    # We only write valid rows (M boundary)
    output_mask = offs_m[:, None] < M

    # Pointers for output
    # out_w_ptr shape: [M, TopK]
    # out_i_ptr shape: [M, TopK]
    offs_topk = tl.arange(0, TopK)
    out_w_ptrs = out_w_ptr + (
        offs_m[:, None] * stride_wm + offs_topk[None, :] * stride_wk_out
    )
    out_i_ptrs = out_i_ptr + (
        offs_m[:, None] * stride_im + offs_topk[None, :] * stride_ik_out
    )

    tl.store(out_w_ptrs, softmax_res, mask=output_mask)
    tl.store(out_i_ptrs, topk_idx_storage, mask=output_mask)


def fused_router(
    hidden_states: torch.Tensor,
    router_weights: torch.Tensor,
    top_k: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Args:
        hidden_states: [num_tokens, hidden_size]
        router_weights: [num_experts, hidden_size]
        top_k: int
    Returns:
        topk_weights: [num_tokens, top_k] (after softmax)
        topk_indices: [num_tokens, top_k]
    """
    assert hidden_states.ndim == 2
    assert router_weights.ndim == 2
    # Triton block shapes and tl.arange require power-of-2 sizes, so top_k
    # must be a power of 2 (e.g. 4 for GPT-OSS).
    assert top_k > 0 and (top_k & (top_k - 1)) == 0

    M, K = hidden_states.shape
    N, _ = router_weights.shape

    # Outputs
    topk_weights = torch.empty(
        (M, top_k), device=hidden_states.device, dtype=torch.float32
    )
    topk_indices = torch.empty(
        (M, top_k), device=hidden_states.device, dtype=torch.int32
    )

    # Heuristics for block sizes
    BLOCK_M = 32
    BLOCK_K = 128
    # BLOCK_N must be a power of 2 and >= N
    BLOCK_N = triton.next_power_of_2(N)

    grid = (triton.cdiv(M, BLOCK_M), 1, 1)

    fused_moe_router_kernel[grid](
        hidden_states,
        router_weights,
        topk_weights,
        topk_indices,
        M,
        K,
        N,
        TopK=top_k,
        stride_xm=hidden_states.stride(0),
        stride_xk=hidden_states.stride(1),
        stride_wn=router_weights.stride(0),
        stride_wk=router_weights.stride(1),
        stride_wm=topk_weights.stride(0),
        stride_wk_out=topk_weights.stride(1),
        stride_im=topk_indices.stride(0),
        stride_ik_out=topk_indices.stride(1),
        BLOCK_M=BLOCK_M,
        BLOCK_K=BLOCK_K,
        BLOCK_N=BLOCK_N,
    )

    return topk_weights, topk_indices
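
The fused path is meant to be numerically equivalent to the unfused routing sequence (router GEMM, then top-k, then softmax over the selected logits). A minimal PyTorch sketch of that unfused reference, handy for spot-checking fused_router, is below; reference_router and the shapes in the usage comment are illustrative assumptions, not part of this commit.

# Reference (unfused) routing path for verification -- illustrative only,
# not part of this commit. Assumes the same [num_tokens, hidden_size] input
# and [num_experts, hidden_size] router weight layout as fused_router().
import torch


def reference_router(
    hidden_states: torch.Tensor,
    router_weights: torch.Tensor,
    top_k: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Router GEMM in float32, matching the kernel's accumulator precision
    logits = hidden_states.float() @ router_weights.float().t()
    # Top-K selection followed by softmax over the selected logits only
    topk_vals, topk_idx = torch.topk(logits, top_k, dim=-1)
    topk_weights = torch.softmax(topk_vals, dim=-1)
    # Note: when logits tie, index order may differ from the kernel's
    # iterative argmax, so compare weights rather than raw indices on ties.
    return topk_weights, topk_idx.to(torch.int32)


# Example spot check (hypothetical shapes):
#   x = torch.randn(64, 2880, device="cuda", dtype=torch.bfloat16)
#   w = torch.randn(128, 2880, device="cuda", dtype=torch.bfloat16)
#   ref_w, ref_i = reference_router(x, w, top_k=4)
#   ker_w, ker_i = fused_router(x, w, top_k=4)
#   torch.testing.assert_close(ker_w, ref_w, atol=1e-3, rtol=1e-3)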

vllm/model_executor/models/gpt_oss.py

Lines changed: 11 additions & 2 deletions
@@ -20,6 +20,7 @@
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
+from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import fused_router
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -184,9 +185,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             g = rocm_unquantized_gemm(
                 self, x[:, : self.hidden_size], self.router.weight, self.router.bias
             )
+            x = self.experts(hidden_states=x, router_logits=g)
         else:
-            g = self.router(x)
-        x = self.experts(hidden_states=x, router_logits=g)
+            topk_weights, topk_indices = fused_router(
+                hidden_states=x,
+                router_weights=self.router.weight,
+                top_k=self.experts_per_token,
+            )
+
+            x = self.experts(
+                hidden_states=x, topk_weights=topk_weights, topk_ids=topk_indices
+            )
 
         if self.is_sequence_parallel:
             x = tensor_model_parallel_all_gather(x.contiguous(), 0)
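
To gauge the win from fusing the router, the kernel can be timed in isolation against the unfused path. A rough micro-benchmark sketch follows, assuming a CUDA device, the fused_router function from the new file, and the reference_router helper sketched above; shapes and iteration counts are illustrative.

# Router-only micro-benchmark -- illustrative, not part of this commit.
# Assumes fused_router and reference_router are importable/defined as above.
import torch


def bench(fn, *args, iters: int = 100) -> float:
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(10):  # warmup
        fn(*args)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn(*args)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average ms per call


if __name__ == "__main__":
    x = torch.randn(4096, 2880, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(128, 2880, device="cuda", dtype=torch.bfloat16)
    print("fused  :", bench(fused_router, x, w, 4), "ms")
    print("unfused:", bench(reference_router, x, w, 4), "ms")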
