Commit 02fef8e

lint

Signed-off-by: Barbara Suslova <[email protected]>

1 parent 0c89e7f

20 files changed: +108 -97 lines changed

csrc/moe/moe_fused_gate.cu
Lines changed: 13 additions & 15 deletions

@@ -287,19 +287,18 @@ struct KernelParams {
 
 template <typename T, int VPT, int NUM_EXPERTS, int THREADS_PER_ROW,
           int ROWS_PER_WARP, int ROWS_PER_CTA, int WARPS_PER_CTA>
-__global__ void moe_fused_gate_kernel(void* input, void* bias,
-                                      float* output_ptr, int32_t* indices_ptr,
-                                      int64_t num_rows, int64_t topk_group,
-                                      int64_t topk,
-                                      int64_t num_fused_shared_experts,
-                                      double routed_scaling_factor,
-                                      bool apply_routed_scaling_factor_on_output) {
+__global__ void moe_fused_gate_kernel(
+    void* input, void* bias, float* output_ptr, int32_t* indices_ptr,
+    int64_t num_rows, int64_t topk_group, int64_t topk,
+    int64_t num_fused_shared_experts, double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
   KernelParams<VPT, NUM_EXPERTS, THREADS_PER_ROW, ROWS_PER_WARP, ROWS_PER_CTA,
                WARPS_PER_CTA>
       params;
   moe_fused_gate_impl<T>(input, bias, output_ptr, indices_ptr, num_rows,
                          topk_group, topk, num_fused_shared_experts,
-                         routed_scaling_factor, apply_routed_scaling_factor_on_output, params);
+                         routed_scaling_factor,
+                         apply_routed_scaling_factor_on_output, params);
 }
 
 // Macro to compute compile-time constants and launch the kernel.
@@ -352,18 +351,17 @@ __global__ void moe_fused_gate_kernel_dynamic(
 
   moe_fused_gate_impl<T>(input, bias, output_ptr, indices_ptr, num_rows,
                          topk_group, topk, num_fused_shared_experts,
-                         routed_scaling_factor, apply_routed_scaling_factor_on_output, params);
+                         routed_scaling_factor,
+                         apply_routed_scaling_factor_on_output, params);
 }
 
 //------------------------------------------------------------------------------
 // Host Launcher Function
 //------------------------------------------------------------------------------
-std::vector<at::Tensor> moe_fused_gate(at::Tensor& input, at::Tensor& bias,
-                                       int64_t num_expert_group,
-                                       int64_t topk_group, int64_t topk,
-                                       int64_t num_fused_shared_experts,
-                                       double routed_scaling_factor,
-                                       bool apply_routed_scaling_factor_on_output) {
+std::vector<at::Tensor> moe_fused_gate(
+    at::Tensor& input, at::Tensor& bias, int64_t num_expert_group,
+    int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts,
+    double routed_scaling_factor, bool apply_routed_scaling_factor_on_output) {
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options =

csrc/moe/moe_ops.h
Lines changed: 4 additions & 8 deletions

@@ -28,14 +28,10 @@ void moe_lora_align_block_size(
     torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
     torch::Tensor lora_ids);
 
-std::vector<at::Tensor> moe_fused_gate(torch::Tensor& input,
-                                       torch::Tensor& bias,
-                                       int64_t num_expert_group,
-                                       int64_t topk_group, int64_t topk,
-                                       int64_t num_fused_shared_experts,
-                                       double routed_scaling_factor,
-                                       bool apply_routed_scaling_factor_on_output
-);
+std::vector<at::Tensor> moe_fused_gate(
+    torch::Tensor& input, torch::Tensor& bias, int64_t num_expert_group,
+    int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts,
+    double routed_scaling_factor, bool apply_routed_scaling_factor_on_output);
 
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
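
For context, this host launcher is exposed to Python as the moe_fused_gate wrapper in vllm/_custom_ops.py (also touched by this commit). A minimal usage sketch; the shapes, dtypes, and group sizes below are illustrative assumptions, not values taken from this diff:

import torch
from vllm import _custom_ops as ops

# Illustrative sizes only; the fused router path is gated on 128/256 experts.
num_tokens, num_experts = 32, 256
router_logits = torch.randn(
    num_tokens, num_experts, dtype=torch.float32, device="cuda"
)
bias = torch.zeros(num_experts, dtype=torch.float32, device="cuda")

topk_weights, topk_ids = ops.moe_fused_gate(
    input_tensor=router_logits,
    bias=bias,
    num_expert_group=8,
    topk_group=4,
    topk=8,
    num_fused_shared_experts=0,
    routed_scaling_factor=1.0,
    apply_routed_scaling_factor_on_output=False,
)
# topk_weights: float32 [num_tokens, topk]; topk_ids: int32 [num_tokens, topk]

In vllm/model_executor/layers/fused_moe/layer.py below, select_experts only takes this path when the fused router is enabled and the expert count passes a power-of-two check.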

tests/kernels/moe/test_moe_fused_gate.py
Lines changed: 15 additions & 14 deletions

@@ -10,14 +10,12 @@
 
 @pytest.mark.parametrize(
     "seq_length",
-    list(range(1, 10)) +
-    [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
+    list(range(1, 10))
+    + [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
 )
 @pytest.mark.parametrize(
     "dtype",
-    [
-        torch.float32
-    ]  # torch.float16, torch.bfloat16 - aren't working correctly yet
+    [torch.float32],  # torch.float16, torch.bfloat16 - aren't working correctly yet
 )
 @pytest.mark.parametrize(
     "params",
@@ -36,7 +34,7 @@
 )
 def test_moe_fused_gate_combined(
     seq_length, dtype, params, num_fused_shared_experts, monkeypatch
-    ):
+):
     num_experts, num_expert_group, topk_group, topk = params
     topk += 1 if num_fused_shared_experts > 0 else 0
 
@@ -82,16 +80,19 @@ def test_moe_fused_gate_combined(
         shared_indices = original_indices[:, -1]
         shared_ref_indices = original_ref_indices[:, -1]
         if shared_indices is not None:
-            assert torch.all((shared_indices >= valid_min) & (
-                shared_indices < valid_max)), (
-                "Shared expert indices out of range: ",
-                f"found values outside [{valid_min}, {valid_max})")
+            assert torch.all(
+                (shared_indices >= valid_min) & (shared_indices < valid_max)
+            ), (
+                "Shared expert indices out of range: ",
+                f"found values outside [{valid_min}, {valid_max})",
+            )
         if shared_ref_indices is not None:
             assert torch.all(
-                (shared_ref_indices >= valid_min)
-                & (shared_ref_indices < valid_max)), (
-                "Shared expert reference indices out of range: ",
-                f"found values outside [{valid_min}, {valid_max})")
+                (shared_ref_indices >= valid_min) & (shared_ref_indices < valid_max)
+            ), (
+                "Shared expert reference indices out of range: ",
+                f"found values outside [{valid_min}, {valid_max})",
+            )
 
     vllm_idx_check = torch.allclose(
         ref_vllm_indices.sort()[0].to(torch.int32),

vllm/_custom_ops.py
Lines changed: 2 additions & 0 deletions

@@ -1874,6 +1874,7 @@ def moe_lora_align_block_size(
         lora_ids,
     )
 
+
 def moe_fused_gate(
     input_tensor: torch.Tensor,
     bias: torch.Tensor,
@@ -1919,6 +1920,7 @@ def _moe_fused_gate_fake(
         device=input_tensor.device,
     )
 
+
 def moe_wna16_gemm(
     input: torch.Tensor,
     output: torch.Tensor,

vllm/config/parallel.py
Lines changed: 2 additions & 2 deletions

@@ -256,10 +256,10 @@ class is dynamically inherited by the worker class. This is used to inject
     This is an internal config that is only valid for and
     should only be set by API server scale-out.
     """
-
+
    enable_fused_shared_experts: bool = False
    """Enable the fusion of the shared experts of the model with other experts."""
-
+
    enable_fused_moe_router: bool = False
    """Use the fused grouped top-k MoE expert selection router"""

vllm/envs.py
Lines changed: 6 additions & 5 deletions

@@ -1486,12 +1486,13 @@ def get_vllm_port() -> int | None:
     # FlatLogprobs.
     "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
     # Enable the fusion of the shared experts of the model with other experts.
-    "VLLM_USE_CUDA_FUSION_SHARED_EXPERTS":
-    lambda: bool(int(os.getenv("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS", "0"))),
-
+    "VLLM_USE_CUDA_FUSION_SHARED_EXPERTS": lambda: bool(
+        int(os.getenv("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS", "0"))
+    ),
     # Use the fused grouped top-k MoE expert selection router
-    "VLLM_USE_FUSED_MOE_ROUTER":
-    lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_ROUTER", "0"))),
+    "VLLM_USE_FUSED_MOE_ROUTER": lambda: bool(
+        int(os.getenv("VLLM_USE_FUSED_MOE_ROUTER", "0"))
+    ),
 }
 
 # --8<-- [end:env-vars-definition]
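
Both flags follow the same parsing pattern. As a standalone illustration (the _env_flag helper below is hypothetical; only the variable names are real): an unset variable or "0" disables the feature, any other integer string enables it, and non-integer values raise ValueError.

import os


def _env_flag(name: str) -> bool:
    # Same pattern as above: bool(int(os.getenv(name, "0"))).
    return bool(int(os.getenv(name, "0")))


print(_env_flag("VLLM_USE_FUSED_MOE_ROUTER"))  # False unless exported as "1"
print(_env_flag("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS"))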

vllm/model_executor/layers/fused_moe/fused_moe.py
Lines changed: 14 additions & 16 deletions

@@ -1173,7 +1173,7 @@ def grouped_topk(
         logger.info(
             "Fused MoE grouped topk is enabled with fused shared experts.",
             "Only one of these options can be used at a time",
-            "Fused MoE grouped topk is disabled."
+            "Fused MoE grouped topk is disabled.",
         )
         use_fused_moe_grouped_topk = False
 
@@ -1239,21 +1239,20 @@ def grouped_topk(
         # Use original unbiased scores for the routing weights
         topk_weights = original_scores.gather(1, topk_ids)
     else:
-        topk_weights, topk_ids = torch.topk(tmp_scores,
-                                            k=topk,
-                                            dim=-1,
-                                            sorted=use_sorted)
+        topk_weights, topk_ids = torch.topk(
+            tmp_scores, k=topk, dim=-1, sorted=use_sorted
+        )
 
     if num_fused_shared_experts > 0:
-        assert routed_scaling_factor is not None, \
-            "With num_fused_shared_experts>0"
+        assert routed_scaling_factor is not None, "With num_fused_shared_experts>0"
         ", routed_scaling_factor need to be provided"
-        topk_ids[:, -1] = torch.randint(low=num_experts,
-                                        high=num_experts +
-                                        num_fused_shared_experts,
-                                        size=(topk_ids.size(0), ),
-                                        dtype=topk_ids.dtype,
-                                        device=topk_ids.device)
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
         topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
 
     if renormalize:
@@ -1263,9 +1262,8 @@ def grouped_topk(
         topk_weights_sum = topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         topk_weights = topk_weights / topk_weights_sum
 
-    if num_fused_shared_experts == 0:
-        if routed_scaling_factor != 1.0:
-            topk_weights = topk_weights * routed_scaling_factor
+    if num_fused_shared_experts == 0 and routed_scaling_factor != 1.0:
+        topk_weights = topk_weights * routed_scaling_factor
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
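
To make the reformatted shared-expert branch of grouped_topk easier to follow, here is a self-contained sketch of the same slot-filling step with toy sizes (the sizes are assumptions; the indexing mirrors the code above): the last top-k slot is pointed at a randomly chosen fused shared-expert id, and its weight is set to the sum of the routed weights divided by routed_scaling_factor.

import torch

num_tokens, num_experts, topk = 4, 8, 3
num_fused_shared_experts = 1
routed_scaling_factor = 2.5

scores = torch.rand(num_tokens, num_experts)
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)

# Point the last slot at a fused shared expert (ids start at num_experts).
topk_ids[:, -1] = torch.randint(
    low=num_experts,
    high=num_experts + num_fused_shared_experts,
    size=(topk_ids.size(0),),
    dtype=topk_ids.dtype,
    device=topk_ids.device,
)
# Its weight carries the rescaled sum of the routed weights.
topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor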

vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
Lines changed: 1 addition & 0 deletions

@@ -103,6 +103,7 @@ def apply(
         expert_load_view: torch.Tensor | None = None,
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
+        enable_fused_moe_router: bool = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         # Is getattr needed?
         zero_expert_num = getattr(layer, "zero_expert_num", 0)

vllm/model_executor/layers/fused_moe/layer.py
Lines changed: 28 additions & 22 deletions

@@ -2,11 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from abc import abstractmethod
 from collections.abc import Callable, Iterable
 from contextlib import nullcontext
 from enum import Enum
-from functools import partial
 from typing import Literal, get_args, overload
 
 import torch
@@ -54,8 +52,9 @@
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
 
 if current_platform.is_cuda_alike():
-    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
     from vllm._custom_ops import moe_fused_gate
+
+    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
 else:
     fused_experts = None  # type: ignore
     FusedMoEPermuteExpertsUnpermute = object  # type: ignore
@@ -371,9 +370,10 @@ def __init__(
             dp_size_=dp_size_,
             vllm_parallel_config=vllm_config.parallel_config,
         )
-
+
         self.enable_fused_shared_experts = enable_fused_shared_experts
         if self.enable_fused_shared_experts:
+            assert n_shared_experts is not None
             num_experts += n_shared_experts
             top_k += n_shared_experts
 
@@ -414,10 +414,11 @@ def __init__(
 
         self.num_fused_shared_experts = (
             n_shared_experts
-            if (
-                n_shared_experts is not None
-                and self.aiter_fmoe_shared_expert_enabled
-            ) or self.enable_fused_shared_experts
+            if n_shared_experts is not None
+            and (
+                self.aiter_fmoe_shared_expert_enabled
+                or self.enable_fused_shared_experts
+            )
             else 0
         )
         if (
@@ -487,12 +488,15 @@ def __init__(
                 self.global_num_experts,
                 get_compressed_expert_map(self.expert_map),
             )
-            if (self.num_fused_shared_experts > 0):
+            if self.num_fused_shared_experts > 0:
                 logger.warning(
                     "With EP enabled and share expert fusion enabled"
                     ", share expert replica should be same as ep_size"
                     "got share expert replica = %d"
-                    "and ep_size = %d", self.num_fused_shared_experts, self.ep_size)
+                    "and ep_size = %d",
+                    self.num_fused_shared_experts,
+                    self.ep_size,
+                )
         else:
             self.local_num_experts, self.expert_map, self.expert_mask = (
                 self.global_num_experts,
@@ -1375,23 +1379,24 @@ def select_experts(
             assert topk_group is not None
             assert num_expert_group is not None
             if hidden_states.shape[0] == 0:
-                topk_ids = torch.full((0, top_k),
-                                      -1,
-                                      dtype=torch.int,
-                                      device=hidden_states.device)
-                topk_weights = torch.empty((0, top_k),
-                                           dtype=torch.float32,
-                                           device=hidden_states.device)
+                topk_ids = torch.full(
+                    (0, top_k), -1, dtype=torch.int, device=hidden_states.device
+                )
+                topk_weights = torch.empty(
+                    (0, top_k), dtype=torch.float32, device=hidden_states.device
+                )
             elif rocm_aiter_ops.is_fused_moe_enabled():
                 if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
                     assert num_fused_shared_experts == 0
                 grouped_topk_impl = rocm_aiter_grouped_topk
             else:
                 grouped_topk_impl = grouped_topk
 
-            if (enable_fused_moe_router
-                    and e_score_correction_bias is not None
-                    and is_power_of_two(e_score_correction_bias.shape[0])):
+            if (
+                enable_fused_moe_router
+                and e_score_correction_bias is not None
+                and is_power_of_two(e_score_correction_bias.shape[0])
+            ):
                 # The fused kernel can only work with 128/256 experts
                 topk_weights, topk_ids = moe_fused_gate(
                     input_tensor=router_logits.to(dtype=torch.float32),
@@ -1401,7 +1406,8 @@ def select_experts(
                     topk=top_k,
                     num_fused_shared_experts=num_fused_shared_experts,
                     routed_scaling_factor=routed_scaling_factor
-                    if routed_scaling_factor is not None else 1.0,
+                    if routed_scaling_factor is not None
+                    else 1.0,
                     apply_routed_scaling_factor_on_output=False,
                 )
             else:
@@ -1415,7 +1421,7 @@ def select_experts(
                     scoring_func=scoring_func,
                     routed_scaling_factor=routed_scaling_factor,
                     e_score_correction_bias=e_score_correction_bias,
-                    num_fused_shared_experts=num_fused_shared_experts
+                    num_fused_shared_experts=num_fused_shared_experts,
                 )
                 if indices_type is not None:
                     topk_ids = topk_ids.to(dtype=indices_type)
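
The select_experts dispatch above only calls moe_fused_gate when the number of experts passes a power-of-two check (hence the 128/256-expert comment). A minimal stand-in for that predicate, assuming vLLM's is_power_of_two utility behaves equivalently:

def is_power_of_two(n: int) -> bool:
    # A positive integer is a power of two iff exactly one bit is set.
    return n > 0 and (n & (n - 1)) == 0


assert is_power_of_two(256)
assert not is_power_of_two(160)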

vllm/model_executor/layers/quantization/awq_marlin.py
Lines changed: 1 addition & 1 deletion

@@ -639,7 +639,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
             num_fused_shared_experts=layer.num_fused_shared_experts,
             enable_fused_moe_router=enable_fused_moe_router,
-            )
+        )
 
         return fused_marlin_moe(
             x,
