
Commit 52d75e8

Author: PatchouliTaisa
Commit message: change code to address review comments
Signed-off-by: PatchouliTaisa <[email protected]>
Merge commit 52d75e8 (2 parents: bf65328 + 71b0dca)

82 files changed: +1735 / -484 lines


.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 2 deletions

```diff
@@ -903,11 +903,12 @@ steps:
 - label: Transformers Nightly Models Test
   working_dir: "/vllm-workspace/"
   optional: true
+  soft_fail: true
   commands:
     - pip install --upgrade git+https://github.com/huggingface/transformers
-    - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
+    - pytest -v -s tests/models/test_initialization.py
     - pytest -v -s tests/models/test_transformers.py
-    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/processing/
     - pytest -v -s tests/models/multimodal/test_mapping.py
     - python3 examples/offline_inference/basic/chat.py
     - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
```

.github/workflows/cleanup_pr_body.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ jobs:
 
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Set up Python
         uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
```

.github/workflows/macos-smoke-test.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ jobs:
     timeout-minutes: 30
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - uses: astral-sh/setup-uv@v7
         with:
```

.github/workflows/pre-commit.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: "3.12"
```

CMakeLists.txt

Lines changed: 4 additions & 1 deletion

```diff
@@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${FP4_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
     message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
   else()
     message(STATUS "Not building NVFP4 as no compatible archs were found.")
```
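The new `-DENABLE_CUTLASS_MOE_SM120=1` define follows the same pattern as `ENABLE_NVFP4_SM120`: it is only set when a compatible arch is found in `FP4_ARCHS`, so C++ code consuming it needs a fallback for builds where it is absent. Below is a minimal, hypothetical sketch of that guard pattern; the function name is illustrative and not taken from this commit.

```cpp
// Hypothetical sketch of consuming the ENABLE_CUTLASS_MOE_SM120 gate;
// compiles with or without the define and only shows the guard pattern.
#include <cstdio>

bool cutlass_moe_sm120_available() {
#ifdef ENABLE_CUTLASS_MOE_SM120
  return true;   // SM120 CUTLASS MoE kernels were compiled in
#else
  return false;  // caller falls back to another MoE path or raises an error
#endif
}

int main() {
  std::printf("SM120 CUTLASS MoE built: %s\n",
              cutlass_moe_sm120_available() ? "yes" : "no");
}
```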

csrc/attention/merge_attn_states.cu

Lines changed: 12 additions & 15 deletions

```diff
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
     scalar_t* output, float* output_lse, const scalar_t* prefix_output,
     const float* prefix_lse, const scalar_t* suffix_output,
     const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size) {
+    const uint head_size, const uint prefix_head_stride,
+    const uint output_head_stride) {
   using pack_128b_t = uint4;
   const uint pack_size = 16 / sizeof(scalar_t);
   const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
   const uint head_idx = token_head_idx % num_heads;
 
   const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint head_offset =
-      token_idx * num_heads * head_size + head_idx * head_size;
-  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
-  scalar_t* output_head_ptr = output + head_offset;
+  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+                               head_idx * prefix_head_stride;
+  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+                               head_idx * output_head_stride;
+  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+  scalar_t* output_head_ptr = output + dst_head_offset;
 
   float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
   float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
       reinterpret_cast<float*>(prefix_lse.data_ptr()),                  \
       reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),            \
       reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,      \
-      num_heads, head_size);                                            \
+      num_heads, head_size, prefix_head_stride, output_head_stride);    \
   }
 
 /*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint num_tokens = output.size(0);
   const uint num_heads = output.size(1);
   const uint head_size = output.size(2);
+  const uint prefix_head_stride = prefix_output.stride(1);
+  const uint output_head_stride = output.stride(1);
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
-  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
-              "output heads must be contiguous in memory");
-  TORCH_CHECK(
-      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
-      "prefix_output heads must be contiguous in memory");
-  TORCH_CHECK(
-      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
-      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
```
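The core of this change is replacing the single contiguous `head_offset` with separate source and destination offsets computed from per-head strides, so `prefix_output`/`suffix_output` and `output` may use different (for example padded or sliced) layouts; the three `TORCH_CHECK` contiguity assertions are therefore dropped. A plain-C++ sketch of the same arithmetic, with hypothetical sizes:

```cpp
// Plain C++ illustration of the new stride-based addressing used by
// merge_attn_states_kernel; the sizes below are hypothetical.
#include <cstdio>

int main() {
  const unsigned num_heads = 8;
  const unsigned prefix_head_stride = 160;  // padded source layout
  const unsigned output_head_stride = 128;  // contiguous dest (== head_size)
  const unsigned token_idx = 2, head_idx = 3;

  // Same formulas as the kernel: each tensor's head base offset is
  // token_idx * num_heads * stride + head_idx * stride.
  const unsigned src_head_offset = token_idx * num_heads * prefix_head_stride +
                                   head_idx * prefix_head_stride;
  const unsigned dst_head_offset = token_idx * num_heads * output_head_stride +
                                   head_idx * output_head_stride;

  std::printf("src offset = %u, dst offset = %u\n",
              src_head_offset, dst_head_offset);
}
```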

csrc/ops.h

Lines changed: 1 addition & 2 deletions

```diff
@@ -52,14 +52,13 @@ void paged_attention_v2(
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step);
 
-#ifndef USE_ROCM
 void merge_attn_states(torch::Tensor& output,
                        std::optional<torch::Tensor> output_lse,
                        const torch::Tensor& prefix_output,
                        const torch::Tensor& prefix_lse,
                        const torch::Tensor& suffix_output,
                        const torch::Tensor& suffix_lse);
-
+#ifndef USE_ROCM
 void convert_vertical_slash_indexes(
     torch::Tensor& block_count,   // [BATCH, N_HEADS, NUM_ROWS]
     torch::Tensor& block_offset,  // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
```
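Moving the declaration above the `#ifndef USE_ROCM` guard makes `merge_attn_states` visible on ROCm builds as well. For reference, a hedged host-side usage sketch (it assumes the built vLLM extension is linked in; the shapes follow the launcher above, with outputs of `[num_tokens, num_heads, head_size]` and LSE tensors of `[num_heads, num_tokens]`):

```cpp
// Hypothetical usage sketch; requires linking against the built vLLM
// extension and a CUDA (or ROCm) device. Not part of this commit.
#include <optional>
#include <torch/torch.h>

// Declaration as in csrc/ops.h.
void merge_attn_states(torch::Tensor& output,
                       std::optional<torch::Tensor> output_lse,
                       const torch::Tensor& prefix_output,
                       const torch::Tensor& prefix_lse,
                       const torch::Tensor& suffix_output,
                       const torch::Tensor& suffix_lse);

void merge_example() {
  const int64_t num_tokens = 4, num_heads = 8, head_size = 128;
  auto opts = torch::dtype(torch::kFloat16).device(torch::kCUDA);
  auto lse_opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);

  auto output = torch::empty({num_tokens, num_heads, head_size}, opts);
  auto prefix = torch::randn({num_tokens, num_heads, head_size}, opts);
  auto suffix = torch::randn({num_tokens, num_heads, head_size}, opts);
  auto p_lse = torch::randn({num_heads, num_tokens}, lse_opts);
  auto s_lse = torch::randn({num_heads, num_tokens}, lse_opts);

  // Merge prefix and suffix attention states into `output` in place.
  merge_attn_states(output, std::nullopt, prefix, p_lse, suffix, s_lse);
}
```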
