
Commit 7fe33be

Merge branch 'main' into dcp-bugfix
2 parents: 92f0085 + 745a3ba

File tree: 230 files changed (+3838, -1558 lines)


.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

Lines changed: 12 additions & 14 deletions
@@ -7,58 +7,56 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-0-16}
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
 
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$NUMA_NODE" || true;
+  docker rm -f cpu-test || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
 
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
 
 function cpu_tests() {
   set -e
-  export NUMA_NODE=$2
 
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
     pip list"
 
   # offline inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
 
  # basic online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
+  docker exec cpu-test bash -c '
    set -e
-   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
-     --model meta-llama/Llama-3.2-3B-Instruct \
+     --model Qwen/Qwen3-0.6B \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -632,6 +632,7 @@ steps:
     # we can only upgrade after this is resolved
     # TODO(jerryzh168): resolve the above comment
     - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+    - uv pip install --system conch-triton-kernels
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
@@ -972,7 +973,6 @@ steps:
     - vllm/model_executor/layers/layernorm.py
     - vllm/model_executor/layers/activation.py
     - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - vllm/model_executor/layers/fused_moe/layer.py
     - tests/compile/test_fusion_attn.py
     - tests/compile/test_silu_mul_quant_fusion.py
     - tests/compile/distributed/test_fusion_all_reduce.py

.github/workflows/issue_autolabel.yml

Lines changed: 25 additions & 0 deletions
@@ -105,6 +105,31 @@ jobs:
           }
         ],
       },
+      cpu: {
+        // Keyword search - matches whole words only (with word boundaries)
+        keywords: [
+          {
+            term: "CPU Backend",
+            searchIn: "title"
+          },
+          {
+            term: "x86",
+            searchIn: "title"
+          },
+          {
+            term: "ARM",
+            searchIn: "title"
+          },
+          {
+            term: "Apple Silicon",
+            searchIn: "title"
+          },
+          {
+            term: "IBM Z",
+            searchIn: "title"
+          },
+        ],
+      },
       // Add more label configurations here as needed
       // example: {
       //   keywords: [...],
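
The comment in the added block notes that the keyword search "matches whole words only (with word boundaries)". As a rough illustration, here is a hypothetical Python sketch of that behavior for the new `cpu` label's keyword list; it is not the workflow's actual matcher (the configuration above is JavaScript), and case-insensitive matching is an assumption:

```python
import re

# Hypothetical sketch of word-boundary keyword matching for the `cpu` label.
CPU_KEYWORDS = ["CPU Backend", "x86", "ARM", "Apple Silicon", "IBM Z"]

def title_matches_cpu(title: str) -> bool:
    # \b restricts matches to whole words, so "ARM" matches
    # "[Bug]: ARM build fails" but not the "arm" inside "alarm".
    return any(
        re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE)
        for term in CPU_KEYWORDS
    )

print(title_matches_cpu("[Bug]: CPU Backend crash on x86"))  # True
print(title_matches_cpu("False alarm in scheduler"))         # False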

README.md

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 10 additions & 8 deletions
@@ -489,14 +489,16 @@ __global__ void Marlin(
   #pragma unroll
   for (int i = 0; i < 4; i++) {
     int idx = tid4 * 4 + i;
-    idx = idx < block_num_valid_tokens ? idx : 0;
-    if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
-      sh_block_topk_weights[idx] = __hmul2(
-          global_scale, Dtype::num2num2(Dtype::float2num(
-                            topk_weights_ptr[sh_block_sorted_ids[idx]])));
-    } else {
-      sh_block_topk_weights[idx] = Dtype::num2num2(
-          Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+    if (idx < block_num_valid_tokens) {
+      if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+        sh_block_topk_weights[idx] =
+            __hmul2(global_scale,
+                    Dtype::num2num2(Dtype::float2num(
+                        topk_weights_ptr[sh_block_sorted_ids[idx]])));
+      } else {
+        sh_block_topk_weights[idx] = Dtype::num2num2(
+            Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+      }
     }
   }
 }
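
To make the indexing change easier to follow, here is a toy Python sketch (illustrative names only; plain lists stand in for the kernel's shared-memory buffer, and the half2/FP4 arithmetic is omitted). It contrasts the old clamp-to-zero form, where every out-of-range lane is redirected to slot 0 and redoes that load, with the new bounds guard that skips such lanes entirely:

```python
# Toy sketch of the two indexing strategies visible in the diff above.
def fill_clamped(topk_weights, sorted_ids, num_valid, num_lanes):
    shared, writes = [None] * num_lanes, []
    for i in range(num_lanes):
        idx = i if i < num_valid else 0          # old: redirect invalid lanes to slot 0
        shared[idx] = topk_weights[sorted_ids[idx]]
        writes.append(idx)
    return shared, writes

def fill_guarded(topk_weights, sorted_ids, num_valid, num_lanes):
    shared, writes = [None] * num_lanes, []
    for i in range(num_lanes):
        if i < num_valid:                        # new: invalid lanes write nothing
            shared[i] = topk_weights[sorted_ids[i]]
            writes.append(i)
    return shared, writes

w, ids = [0.5, 0.25, 0.125], [2, 0, 1]
print(fill_clamped(w, ids, num_valid=2, num_lanes=4)[1])  # [0, 1, 0, 0] - slot 0 written three times
print(fill_guarded(w, ids, num_valid=2, num_lanes=4)[1])  # [0, 1]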

docker/Dockerfile

Lines changed: 9 additions & 3 deletions
@@ -244,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
 
 COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
 # Install EP kernels(pplx-kernels and DeepEP)
+ARG PPLX_COMMIT_HASH
+ARG DEEPEP_COMMIT_HASH
 RUN --mount=type=cache,target=/root/.cache/uv \
     export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
-    /tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
+    /tmp/install_python_libraries.sh \
+        --workspace /tmp/ep_kernels_workspace \
+        --mode wheel \
+        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
+        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
     find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
 
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
@@ -392,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
     --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

docker/Dockerfile.cpu

Lines changed: 0 additions & 4 deletions
@@ -119,7 +119,6 @@ FROM base AS vllm-test-deps
 
 WORKDIR /workspace/vllm
 
-# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
@@ -132,9 +131,6 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     esac; \
     }; \
     remove_packages_not_supported_on_aarch64 && \
-    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
-    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
-    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
     uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
 
 RUN --mount=type=cache,target=/root/.cache/uv \

docs/community/meetups.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
 
 Below you'll find slides and recordings from our previous meetups:
 
+- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
 - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

docs/contributing/model/basic.md

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ The initialization code should look like this:
 ```python
 from torch import nn
 from vllm.config import VllmConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 
 class MyAttention(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):

docs/design/cuda_graphs.md

Lines changed: 5 additions & 3 deletions
@@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren
 ```python
 class BatchDescriptor(NamedTuple):
     num_tokens: int
-    uniform_decode: bool = False
+    num_reqs: int
+    uniform: bool = False
+    has_lora: bool = False
 ```
 
-where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`.
+where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`.
 
-The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
+The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item.
 
 !!! note
     The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
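
For intuition about the new fields, here is a minimal sketch reusing the `BatchDescriptor` definition from the updated snippet above; the concrete batch sizes and the `num_spec_tokens=3` setting are illustrative assumptions, not values from the document:

```python
from typing import NamedTuple

class BatchDescriptor(NamedTuple):
    num_tokens: int
    num_reqs: int
    uniform: bool = False
    has_lora: bool = False

# Pure decode: 8 requests, each with query length 1, so num_tokens == num_reqs.
pure_decode = BatchDescriptor(num_tokens=8, num_reqs=8, uniform=True)

# Spec-decode validation with num_spec_tokens=3: still uniform (every request
# has query length 1 + 3 = 4), but num_tokens is now 4x num_reqs.
spec_validation = BatchDescriptor(num_tokens=32, num_reqs=8, uniform=True)

assert pure_decode.num_tokens == pure_decode.num_reqs
assert spec_validation.num_tokens == 4 * spec_validation.num_reqs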
