Commit da1644d
[fix][cpu] Use a SwigluOAI impl which supports interleaved gate-up weights
The current implementation of `swigluoai_and_mul` for CPU assumes that the gate-up weights have been de-interleaved at load time, which is not the case. The new implementation we dispatch to is the same one used for the BF16 path on GPU and handles interleaved gate-up weights.

Signed-off-by: Fadi Arafeh <[email protected]>
1 parent 6fb0215 commit da1644d
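To illustrate the layout mismatch the commit message describes, here is a minimal sketch (tensor and variable names are illustrative, not taken from the patch): the removed helper split the fused gate-up activation into two contiguous halves, while the checkpoint actually stores the gate and up projections interleaved along the last dimension, assuming they alternate element by element.

import torch

# Illustrative fused gate-up activation for one token, intermediate size 4.
# De-interleaved layout (what the removed swigluoai_and_mul assumed):
#   [g0, g1, g2, g3, u0, u1, u2, u3]
# Interleaved layout (what the checkpoint actually provides, per the commit message):
#   [g0, u0, g1, u1, g2, u2, g3, u3]
x = torch.randn(8)

# Old CPU helper: split into contiguous halves.
d = x.shape[-1] // 2
gate_halved, up_halved = x[..., :d], x[..., d:]

# Interleaved-aware split (assuming gate and up alternate along the last
# dimension, as the BF16 GPU path interprets the same tensor).
gate_strided, up_strided = x[..., ::2], x[..., 1::2]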

1 file changed: 2 additions, 13 deletions

vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
@@ -6,24 +6,13 @@
 from torch.nn import functional as F
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.activation import SwigluOAIAndMul
 
 
 def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
     d = x.shape[-1] // 2
     return F.silu(x[..., :d]) * x[..., d:]
 
-
-def swigluoai_and_mul(
-    x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0
-) -> torch.Tensor:
-    d = x.shape[-1] // 2
-    gate, up = x[..., :d], x[..., d:]
-    gate = gate.clamp(max=limit)
-    up = up.clamp(min=-limit, max=limit)
-    glu = gate * torch.sigmoid(alpha * gate)
-    return (up + 1) * glu
-
-
 def grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -284,7 +273,7 @@ def __call__(
 
         gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
         if activation == "swigluoai":
-            gate_up = swigluoai_and_mul(gate_up)
+            gate_up = SwigluOAIAndMul().forward_native(gate_up)
         else:
             gate_up = silu_and_mul(gate_up)
         expert_out = layer.down_linear[i](gate_up)
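For reference, a rough sketch of an interleaved-aware SwiGLU-OAI activation like the one the CPU path now dispatches to. This only approximates what `SwigluOAIAndMul.forward_native` in vllm/model_executor/layers/activation.py does, by combining the removed helper's clamp/sigmoid math with a strided gate/up split; it is not a verbatim copy of that method.

import torch


def swigluoai_interleaved(
    x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0
) -> torch.Tensor:
    # Gate and up projections are assumed to alternate along the last
    # dimension (g0, u0, g1, u1, ...), so split by stride instead of halving.
    gate, up = x[..., ::2], x[..., 1::2]
    gate = gate.clamp(max=limit)
    up = up.clamp(min=-limit, max=limit)
    glu = gate * torch.sigmoid(alpha * gate)
    return (up + 1) * glu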
