
Commit 6dea7df

Merge branch 'main' into patchy/async_ngram
2 parents e9524b8 + 3085478 commit 6dea7df

File tree: 198 files changed (+7694 / -1222 lines)


.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 6 additions & 4 deletions
@@ -25,20 +25,22 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -xve
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib
 
     # Note: disable Bart until supports V1
     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
     # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log

.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh

Lines changed: 11 additions & 1 deletion
@@ -17,7 +17,17 @@ wait_for_server() {
 }
 
 MODEL="deepseek-ai/DeepSeek-V2-lite"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
 
 cleanup() {
   if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then

.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh

Lines changed: 10 additions & 1 deletion
@@ -17,7 +17,16 @@ wait_for_server() {
 }
 
 MODEL="QWen/Qwen3-30B-A3B-FP8"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
 
 cleanup() {
   if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then

.buildkite/test-amd.yaml

Lines changed: 5 additions & 1 deletion
@@ -754,6 +754,7 @@ steps:
   torch_nightly: true
   source_file_dependencies:
   - vllm/model_executor/models/
+  - vllm/transformers_utils/
   - tests/models/test_initialization.py
   commands:
     # Only when vLLM model source is modified - test initialization of a large
@@ -1319,7 +1320,10 @@ steps:
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
   - pytest -v -s -x lora/test_olmoe_tp.py
-  - pytest -v -s -x lora/test_gptoss_tp.py
+
+  # Disabled for now because MXFP4 backend on non-cuda platform
+  # doesn't support LoRA yet
+  #- pytest -v -s -x lora/test_gptoss_tp.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min

.buildkite/test-pipeline.yaml

Lines changed: 13 additions & 0 deletions
@@ -346,6 +346,18 @@ steps:
   commands:
   - pytest -v -s v1/attention
 
+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
 - label: V1 Test attention (B200) # 10min
   timeout_in_minutes: 30
   gpu: b200
@@ -679,6 +691,7 @@ steps:
   torch_nightly: true
   source_file_dependencies:
   - vllm/model_executor/models/
+  - vllm/transformers_utils/
   - tests/models/test_initialization.py
   commands:
     # Only when vLLM model source is modified - test initialization of a large

.github/CODEOWNERS

Lines changed: 5 additions & 0 deletions
@@ -9,6 +9,7 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
+/vllm/model_executor/layers/batch_invariant.py @yewentao256
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -35,6 +36,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin
 /tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -56,6 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256
 
 # Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
       GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-      GIT_TAG 71bb26f6295449be880344b93b51791cc009237d
+      GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
       GIT_PROGRESS TRUE
       # Don't share the vllm-flash-attn build between build types
      BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/cpu/cpu_attn.cpp

Lines changed: 17 additions & 0 deletions
@@ -13,6 +13,18 @@
 #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
 #endif
 
+#ifdef __aarch64__
+#include "cpu_attn_neon.hpp"
+#define NEON_DISPATCH(...)                                                   \
+  case cpu_attention::ISA::NEON: {                                           \
+    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
+                                                   scalar_t, head_dim>;      \
+    return __VA_ARGS__();                                                    \
+  }
+#else
+#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
+#endif  // #ifdef __aarch64__
+
 #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
   case HEAD_DIM: {                            \
     constexpr size_t head_dim = HEAD_DIM;     \
@@ -41,6 +53,7 @@
   [&] {                                                                \
     switch (ISA_TYPE) {                                                \
       AMX_DISPATCH(__VA_ARGS__)                                        \
+      NEON_DISPATCH(__VA_ARGS__)                                       \
       case cpu_attention::ISA::VEC: {                                  \
         using attn_impl =                                              \
             cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
@@ -73,6 +86,8 @@ torch::Tensor get_scheduler_metadata(
     isa = cpu_attention::ISA::VEC;
   } else if (isa_hint == "vec16") {
     isa = cpu_attention::ISA::VEC16;
+  } else if (isa_hint == "neon") {
+    isa = cpu_attention::ISA::NEON;
   } else {
     TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
   }
@@ -158,6 +173,8 @@ void cpu_attn_reshape_and_cache(
     return cpu_attention::ISA::VEC;
   } else if (isa == "vec16") {
     return cpu_attention::ISA::VEC16;
+  } else if (isa == "neon") {
+    return cpu_attention::ISA::NEON;
   } else {
     TORCH_CHECK(false, "Invalid ISA type: " + isa);
   }
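For readers less familiar with the dispatch idiom above: NEON_DISPATCH mirrors the pre-existing AMX_DISPATCH. On aarch64 builds it expands to a real switch case that aliases attn_impl to the NEON specialization before invoking the dispatched callback; on other targets it collapses to a bare case label, so the switch still covers ISA::NEON and simply falls through to the next handled case. A condensed, self-contained sketch of that pattern, with simplified stand-in names rather than the actual vLLM sources:

// Simplified stand-in for the csrc/cpu dispatch macros; names are illustrative.
#include <cstdio>

enum class ISA { AMX, VEC, VEC16, NEON };

// Toy per-ISA implementation; the real code also templates on scalar type
// and head_dim.
template <ISA isa>
struct AttentionImpl {
  static void run() { std::printf("ISA %d\n", static_cast<int>(isa)); }
};

#ifdef __aarch64__
// aarch64: a real case that selects the NEON specialization.
#define NEON_DISPATCH(...)                      \
  case ISA::NEON: {                             \
    using attn_impl = AttentionImpl<ISA::NEON>; \
    return __VA_ARGS__();                       \
  }
#else
// Other targets: a bare label, falling through to the next handled case.
#define NEON_DISPATCH(...) case ISA::NEON:
#endif

// The callback body is pasted into each case, so its use of `attn_impl`
// binds to whichever alias the taken case declared.
#define ISA_DISPATCH(ISA_TYPE, ...)                \
  [&] {                                            \
    switch (ISA_TYPE) {                            \
      NEON_DISPATCH(__VA_ARGS__)                   \
      case ISA::VEC: {                             \
        using attn_impl = AttentionImpl<ISA::VEC>; \
        return __VA_ARGS__();                      \
      }                                            \
      default:                                     \
        std::printf("unsupported ISA\n");          \
    }                                              \
  }()

int main() {
  ISA_DISPATCH(ISA::VEC, [&] { attn_impl::run(); });  // prints "ISA 1"
  return 0;
}

In the actual sources the AttentionImpl specialization is also parameterized on scalar_t and head_dim, which the NEON_DISPATCH case picks up from the surrounding dispatch scope.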

csrc/cpu/cpu_attn_impl.hpp

Lines changed: 7 additions & 1 deletion
@@ -14,7 +14,7 @@
 #include "utils.hpp"
 
 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16 };
+enum class ISA { AMX, VEC, VEC16, NEON };
 
 template <ISA isa, typename scalar_t, int64_t head_dim>
 class AttentionImpl {};
@@ -143,6 +143,12 @@ struct AttentionMetadata {
       case ISA::VEC:
         ss << "VEC, ";
         break;
+      case ISA::VEC16:
+        ss << "VEC16, ";
+        break;
+      case ISA::NEON:
+        ss << "NEON, ";
+        break;
     }
     ss << "workitem_group_num: " << workitem_group_num
        << ", reduction_item_num: " << reduction_item_num
