
Commit 47c7e05

Merge branch 'main' into restart
2 parents: ce6bada + c6fa389

282 files changed: +10359 / -3446 lines changed


.buildkite/scripts/annotate-release.sh

Lines changed: 5 additions & 4 deletions
@@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF
+EOF
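
Context for the manifest change above (a sketch, not part of the diff): `docker manifest create --amend` appends to whatever manifest list is already cached on the build host, so a stale local list can keep referencing old per-architecture digests. Removing the list first and recreating it means the pushed manifest references only the freshly built images. The same pattern in isolation, with placeholder image names:

    docker manifest rm example/app:latest || true   # tolerate "no such manifest" on a clean host
    docker manifest create example/app:latest example/app:latest-x86_64 example/app:latest-aarch64
    docker manifest push example/app:latest
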
Lines changed: 64 additions & 0 deletions (new file)
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+NUMA_NODE=${NUMA_NODE:-0}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$NUMA_NODE" || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+  # basic online serving
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All CPU tests are expected to finish in less than 40 mins.
+export -f cpu_tests
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
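
The last two lines of the new script rely on `export -f`: `timeout` starts a fresh `bash -c` process, and the `cpu_tests` function defined in the parent shell is visible there only because it was exported. A minimal, self-contained illustration of the same pattern (function name and arguments are placeholders):

    my_checks() { echo "core range: $1, numa node: $2"; }
    export -f my_checks                       # make the function visible to child bash processes
    timeout 10m bash -c "my_checks 0-16 0"    # run it inside a new shell, capped at 10 minutes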

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 6 additions & 4 deletions
@@ -25,20 +25,22 @@ function cpu_tests() {

   # offline inference
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -xve
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

   # Run basic model test
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib

     # Note: disable Bart until supports V1
     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
     # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
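
The added `export TORCH_COMPILE_DISABLE=1` lines turn `torch.compile` into a no-op inside the container, so these ppc64le jobs run the models eagerly. Assuming a PyTorch build that honours this variable, the same effect can be reproduced outside CI like so:

    export TORCH_COMPILE_DISABLE=1   # torch.compile-decorated code runs eagerly, no compilation
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m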

.buildkite/test-amd.yaml

Lines changed: 29 additions & 17 deletions
@@ -61,7 +61,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
   timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -73,13 +73,15 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -390,6 +392,15 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -529,7 +540,7 @@ steps:
   - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -694,7 +705,7 @@ steps:
   - vllm/model_executor/models/whisper.py
   commands: # LMEval
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/

 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -995,12 +1006,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  # - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -1045,7 +1056,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -1066,7 +1077,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -1088,15 +1101,13 @@ steps:
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   agent_pool: mi325_1
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
@@ -1323,21 +1334,20 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  gpu: a100
   optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]
@@ -1417,7 +1427,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
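
Several of the rewritten commands in this file change the shape of the pytest `-k` filters, e.g. from `'TRITON and -quant_fp8'` to `'TRITON and not +quant_fp8 and not Llama-4'`. A `-k` expression combines plain substrings of the test ID with `and`/`or`/`not`, so `+quant_fp8` and `Llama-4` are matched literally against the parametrized IDs and exclusion is spelled with `not`. A quick way to preview what a filter would select (path as in the diff above, run from the tests working directory) is a collect-only invocation:

    pytest --collect-only -q -k 'TRITON and not +quant_fp8 and not Llama-4' compile/distributed/test_fusions_e2e.py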

.buildkite/test-pipeline.yaml

Lines changed: 32 additions & 0 deletions
@@ -346,6 +346,18 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
 - label: V1 Test attention (B200) # 10min
   timeout_in_minutes: 30
   gpu: b200
@@ -550,6 +562,25 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s kernels/moe/test_deepgemm.py
+  - pytest -v -s kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true
@@ -921,6 +952,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40

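The new H100 steps above use paths such as `v1/determinism/test_batch_invariance.py`; elsewhere in the pipeline the working directory is `/vllm-workspace/tests`, so these appear to resolve to `tests/v1/determinism/...` in the repository. A rough local equivalent of the batch-invariance step, assuming an H100 machine and a vLLM development checkout, would be:

    export VLLM_WORKER_MULTIPROC_METHOD=spawn   # spawn workers instead of fork (CUDA contexts cannot be shared across fork)
    pip install pytest-timeout pytest-forked
    pytest -v -s tests/v1/determinism/test_batch_invariance.py
    pytest -v -s tests/v1/determinism/test_rms_norm_batch_invariant.py
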
.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
@@ -35,6 +35,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin
 /tests/distributed/test_multi_node_assignment.py @youkaichao

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
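
The CMake change drops the `CACHE STRING` qualifier: in the old form, `set()` only takes effect when no `CUTLASS_REVISION` entry already exists in `CMakeCache.txt`, so a stale cached value (or an external `-DCUTLASS_REVISION=...` override) silently wins over the pinned tag. The plain `set()` assigns a normal variable on every configure, so the value written in CMakeLists.txt is always the one used. Illustrative configure commands (the override below is hypothetical):

    cmake -B build -DCUTLASS_REVISION=v4.2.0 .   # old form: this cached value would override the file's pin
    cmake -B build .                             # new form: "v4.2.1" from CMakeLists.txt is used regardless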
