Skip to content

Commit db14f61

Browse files
authored
[ci] Refactor CI file structure (#29343)
1 parent 78c7503 commit db14f61

31 files changed

+1570
-0
lines changed

.buildkite/ci_config.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
# Top-level settings for the vllm_ci pipeline.
name: vllm_ci

# Directories containing the job definition files (one YAML group per file).
job_dirs:
  - '.buildkite/test_areas'
  - '.buildkite/image_build'

# NOTE(review): presumably a change under any of these paths triggers every
# job -- confirm against the pipeline generator that consumes this file.
run_all_patterns:
  - 'docker/Dockerfile'
  - 'CMakeLists.txt'
  - 'requirements/common.txt'
  - 'requirements/cuda.txt'
  - 'requirements/build.txt'
  - 'requirements/test.txt'
  - 'setup.py'
  - 'csrc/'
  - 'cmake/'

# NOTE(review): presumably paths carved out of run_all_patterns (other
# Dockerfile variants, CPU/ROCm sources) -- confirm matching semantics.
run_all_exclude_patterns:
  - 'docker/Dockerfile.'
  - 'csrc/cpu/'
  - 'csrc/rocm/'
  - 'cmake/hipify.py'
  - 'cmake/cpu_extension.cmake'

# Container registry host and the repository name used per build kind.
registries: 'public.ecr.aws/q9t5s3a7'
repositories:
  main: 'vllm-ci-postmerge-repo'
  premerge: 'vllm-ci-test-repo'
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# Build the CI test image with docker buildx and push it to the registry.
#
# Usage:
#   image_build.sh <registry> <repo> <commit> <branch> <vllm_use_precompiled> \
#                  <vllm_merge_base_commit> <cache_from> <cache_to>
#
# The image is pushed as <registry>/<repo>:<commit>; when <branch> is "main" it
# is additionally tagged :latest. If the commit tag already exists in the
# registry the script exits 0 without building.
#
# pipefail makes a failing `aws ... get-login-password` fail the login pipeline
# directly instead of relying on `docker login` rejecting empty input.
set -eo pipefail

if [[ $# -lt 8 ]]; then
  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}"

# authenticate with AWS ECR (public registry, then the private account registry)
aws ecr-public get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin "${REGISTRY}"
aws ecr get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx: `buildx create` fails when the builder name is already taken,
# which would abort the script under `set -e`, so reuse an existing builder.
if docker buildx inspect vllm-builder > /dev/null 2>&1; then
  docker buildx use vllm-builder
else
  docker buildx create --name vllm-builder --driver docker-container --use
fi
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists (decided by exit status; stderr stays
# visible in the job log)
if docker manifest inspect "${IMAGE}" > /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# Assemble the build command as an array so that values containing spaces
# (e.g. the arch lists) and the optional flags stay intact as single arguments.
build_args=(
  --file docker/Dockerfile
  --build-arg max_jobs=16
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}"
  --build-arg USE_SCCACHE=1
  --build-arg "TORCH_CUDA_ARCH_LIST=8.0 8.9 9.0 10.0"
  --build-arg "FI_TORCH_CUDA_ARCH_LIST=8.0 8.9 9.0a 10.0a"
  --build-arg "VLLM_USE_PRECOMPILED=${VLLM_USE_PRECOMPILED:-0}"
)

# The merge-base commit is only passed along for precompiled builds.
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
  build_args+=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
fi

build_args+=(
  --cache-from "type=registry,ref=${CACHE_FROM},mode=max"
  --cache-to "type=registry,ref=${CACHE_TO},mode=max"
  --tag "${IMAGE}"
)

# Builds from main also move the :latest tag.
if [[ "${BRANCH}" == "main" ]]; then
  build_args+=(--tag "${REGISTRY}/${REPO}:latest")
fi

build_args+=(--push --target test --progress plain)

# build
docker buildx build "${build_args[@]}" .
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# Image build steps. Every step has an empty depends_on list and is retried
# automatically (limit 2) on exit status -1 or -10.
group: Abuild
steps:
  # Main test image.
  - key: image-build
    label: ':docker: Build image'
    depends_on: []
    commands:
      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # CPU image.
  - key: image-build-cpu
    label: ':docker: Build CPU image'
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: '1'
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # HPU image; marked soft_fail.
  - key: image-build-hpu
    label: ':docker: Build HPU image'
    soft_fail: true
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: '1'
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # CPU arm64 image; marked optional.
  - key: cpu-arm64-image-build
    label: ':docker: Build CPU arm64 image'
    optional: true
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: '1'
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu (target vllm-test, with
# the AVX512BF16 / AVX512VNNI / AMXBF16 build args enabled) and push it as
# <registry>/<repo>:<commit>-cpu.
#
# Usage: image_build_cpu.sh <registry> <repo> <commit>
#
# Exits 0 without building when that tag already exists in the registry.
# pipefail makes a failing `aws` call fail the login pipeline directly.
set -eo pipefail

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists (decided by exit status; stderr stays
# visible in the job log)
if docker manifest inspect "${IMAGE}" > /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --build-arg VLLM_CPU_AVX512BF16=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Build the CPU test image from docker/Dockerfile.cpu (target vllm-test, no
# instruction-set build args) and push it as <registry>/<repo>:<commit>-cpu.
#
# Usage: <script> <registry> <repo> <commit>
#
# Exits 0 without building when that tag already exists in the registry.
# pipefail makes a failing `aws` call fail the login pipeline directly.
#
# NOTE(review): the "-cpu" tag suffix used here is the same one pushed by the
# CPU build script that passes the VLLM_CPU_AVX512* build args. If both run for
# one commit, whichever pushes first makes the other hit "Image found" and
# skip. Confirm whether a distinct suffix is intended; it is left unchanged
# here because the consumers of this tag are not visible from this script.
set -eo pipefail

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists (decided by exit status; stderr stays
# visible in the job log)
if docker manifest inspect "${IMAGE}" > /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Build the HPU test image and push it as <registry>/<repo>:<commit>-hpu.
#
# Usage: <script> <registry> <repo> <commit>
#
# The build context is the remote vllm-gaudi git repository rather than the
# local checkout, and --file names the Dockerfile path used for that build.
# Exits 0 without building when the tag already exists in the registry.
# pipefail makes a failing `aws` call fail the login pipeline directly.
set -eo pipefail

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists (decided by exit status; stderr stays
# visible in the job log)
if docker manifest inspect "${IMAGE}" > /dev/null; then
  echo "Image found"
  exit 0
fi
echo "Image not found, proceeding with build..."

# build
docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
  --build-arg "buildkite_commit=${BUILDKITE_COMMIT}" \
  --tag "${IMAGE}" \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

# push
docker push "${IMAGE}"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# V1 attention tests, run once per GPU type. The group depends on the
# image-build step.
group: Attention
depends_on:
  - image-build
steps:
  - label: V1 attention (H100)
    gpu: h100
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/v1/attention
      - tests/v1/attention
    commands:
      - pytest -v -s v1/attention

  - label: V1 attention (B200)
    gpu: b200
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/v1/attention
      - tests/v1/attention
    commands:
      # TODO: FI prefill is bugged and causes incorrectness, fix this
      - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# Basic correctness tests. The group depends on the image-build step.
group: Basic Correctness
depends_on:
  - image-build
steps:
  - label: Basic Correctness
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/basic_correctness/test_basic_correctness
      - tests/basic_correctness/test_cpu_offload
      - tests/basic_correctness/test_cumem.py
    commands:
      # Exported first so that it applies to the three pytest runs below.
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -v -s basic_correctness/test_cumem.py
      - pytest -v -s basic_correctness/test_basic_correctness.py
      - pytest -v -s basic_correctness/test_cpu_offload.py
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
# Benchmark script run plus the benchmarks CLI tests. The group depends on the
# image-build step.
group: Benchmarks
depends_on:
  - image-build
steps:
  # Runs from the .buildkite directory, so the script path is relative to it.
  - label: Benchmarks
    working_dir: '/vllm-workspace/.buildkite'
    timeout_in_minutes: 20
    source_file_dependencies:
      - benchmarks/
    commands:
      - bash scripts/run-benchmarks.sh

  - label: Benchmarks CLI Test
    timeout_in_minutes: 20
    source_file_dependencies:
      - vllm/
      - tests/benchmarks/
    commands:
      - pytest -v -s benchmarks/

.buildkite/test_areas/compile.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# Compilation and fusion tests on B200. The group depends on the image-build
# step; both steps run from the workspace root, so test paths start at tests/.
group: Compile
depends_on:
  - image-build
steps:
  - label: Fusion and Compile Tests (B200)
    gpu: b200
    working_dir: '/vllm-workspace/'
    timeout_in_minutes: 40
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/v1/worker/
      - vllm/v1/cudagraph_dispatcher.py
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/test_fusion_attn.py
      - tests/compile/test_silu_mul_quant_fusion.py
      - tests/compile/distributed/test_fusion_all_reduce.py
      - tests/compile/distributed/test_fusions_e2e.py
      - tests/compile/fullgraph/test_full_graph.py
    commands:
      - nvidia-smi
      - pytest -v -s tests/compile/test_fusion_attn.py
      - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
      # this runner has 2 GPUs available even though num_gpus=2 is not set
      - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
      # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
      # Wrap with quotes to escape yaml
      - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
      # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
      - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

  - label: Fusion E2E (2 GPUs)(B200)
    gpu: b200
    num_gpus: 2
    optional: true
    working_dir: '/vllm-workspace/'
    timeout_in_minutes: 40
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/distributed/test_fusions_e2e.py
    commands:
      - nvidia-smi
      # Run all e2e fusion tests
      - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

0 commit comments

Comments
 (0)