Commit 2b34bc9

Merge branch 'main' into mistral-tool-parser-streaming-update
2 parents: 83e86c5 + 15b1511

File tree: 76 files changed (+2296 / -648 lines)
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8030}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
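To run the new check locally, the script takes the same three positional arguments the Buildkite step later in this commit passes (accuracy threshold, GSM8K question count, starting port). A minimal sketch, assuming a multi-GPU host with the repo checked out and the eval harness available under tests/evals:

# Gate at 0.25 accuracy over the full 1319-question GSM8K set,
# starting the first server on port 8030 (the port increments per backend).
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030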

.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
     --data-parallel-size 2 \
     --enable-expert-parallel \
     --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600}' \
     --trust-remote-code \
     --max-model-len 2048 \
     --port $PORT &
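The only difference from the EPLB payload in the new async scripts is the use_async flag. Reading the field names at face value (the diff itself does not define them), window_size is the expert-load statistics window and step_interval the rebalance cadence, both in engine steps:

# Existing synchronous EPLB test (this file)
--eplb-config '{"window_size":200,"step_interval":600}'
# New async-EPLB tests (the two scripts added in this commit)
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}'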
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8040}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+  # ROCm platform
+  BACKENDS=("allgather_reducescatter")
+  # Disable MOE padding for ROCm since it is causing eplb to fail
+  export VLLM_ROCM_MOE_PADDING=0
+else
+  # Non-ROCm platform (CUDA/other)
+  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 4 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --gpu-memory-utilization 0.9 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
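The same invocation pattern applies here; the pipeline step below passes a higher accuracy gate for this stronger model, and the MTP speculative config is applied unchanged across backends:

# Gate at 0.8 accuracy over all 1319 GSM8K questions, first server on port 8040.
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040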

.buildkite/test-pipeline.yaml

Lines changed: 19 additions & 1 deletion

@@ -1373,4 +1373,22 @@ steps:
   num_gpus: 2
   working_dir: "/vllm-workspace"
   commands:
-    - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+    - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+    - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+    - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
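Both new steps ultimately gate on the GSM8K eval client that the scripts invoke against the locally served model. Run standalone, with host, port, and question count taken from the DeepSeek step and a hypothetical results path, the call looks like:

python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 --port 8030 \
    --num-questions 1319 \
    --save-results /tmp/vllm-scheduled/results.json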

README.md

Lines changed: 1 addition & 0 deletions

@@ -137,6 +137,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks

docker/Dockerfile

Lines changed: 58 additions & 8 deletions

@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
+#################### CSRC BUILD IMAGE ####################
+FROM base AS csrc-build
 ARG TARGETPLATFORM

 ARG PIP_INDEX_URL UV_INDEX_URL
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+WORKDIR /workspace
+
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 ARG VLLM_USE_PRECOMPILED=""
 ARG VLLM_MAIN_CUDA_VERSION=""

+# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
+
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
+#################### CSRC BUILD IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+WORKDIR /workspace
+
+COPY --from=csrc-build /workspace/dist /precompiled-wheels
+
+COPY . .
+
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+ARG vllm_target_device="cuda"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+
+# Skip adding +precompiled suffix to version (preserves git-derived version)
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
+    fi && \
+    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REF
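The intent of the restructuring is visible in the COPY lists: the csrc-build stage sees only csrc/, cmake/, the build config files, and two vllm stub modules, so Python-only changes no longer invalidate the expensive extension compile; the build stage then points setup.py at the precompiled wheel via VLLM_PRECOMPILED_WHEEL_LOCATION. A sketch of exercising the two stages directly (image tags here are assumptions, and any required --build-arg values are omitted):

# Compile only the C++/CUDA extensions into a dummy-versioned wheel
docker build -f docker/Dockerfile --target csrc-build -t vllm-csrc .
# Full wheel build; on CUDA targets it reuses the .so files from csrc-build
docker build -f docker/Dockerfile --target build -t vllm-wheel .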

docs/community/sponsors.md

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks

docs/getting_started/installation/gpu.rocm.inc.md

Lines changed: 0 additions & 3 deletions

@@ -5,9 +5,6 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
 !!! tip
     [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.

-!!! warning
-    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]

docs/getting_started/installation/gpu.xpu.inc.md

Lines changed: 0 additions & 3 deletions

@@ -2,9 +2,6 @@

 vLLM initially supports basic model inference and serving on Intel GPU platform.

-!!! warning
-    There are no pre-built wheels for this device, so you need build vLLM from source. Or you can use pre-built images which are based on vLLM released versions.
-
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
