From a7f11ca733519f489ac4e091adc8585633484d95 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 24 Nov 2025 12:42:54 -0800 Subject: [PATCH 01/24] p Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/attention.yaml | 19 ++ .buildkite/test_areas/basic_correctness.yaml | 14 ++ .buildkite/test_areas/benchmarks.yaml | 17 ++ .buildkite/test_areas/compile.yaml | 55 +++++ .buildkite/test_areas/cuda.yaml | 20 ++ .buildkite/test_areas/distributed.yaml | 205 ++++++++++++++++++ .buildkite/test_areas/e2e_integration.yaml | 19 ++ .buildkite/test_areas/engine.yaml | 27 +++ .buildkite/test_areas/entrypoints.yaml | 66 ++++++ .buildkite/test_areas/expert_parallelism.yaml | 21 ++ .buildkite/test_areas/kernels.yaml | 115 ++++++++++ .buildkite/test_areas/lm_eval.yaml | 44 ++++ .buildkite/test_areas/lora.yaml | 38 ++++ .buildkite/test_areas/misc.yaml | 150 +++++++++++++ .buildkite/test_areas/model_executor.yaml | 15 ++ .buildkite/test_areas/models_basic.yaml | 65 ++++++ .buildkite/test_areas/models_distributed.yaml | 20 ++ .buildkite/test_areas/models_language.yaml | 94 ++++++++ .buildkite/test_areas/models_multimodal.yaml | 67 ++++++ .buildkite/test_areas/plugins.yaml | 32 +++ .buildkite/test_areas/pytorch.yaml | 48 ++++ .buildkite/test_areas/quantization.yaml | 35 +++ .buildkite/test_areas/samplers.yaml | 12 + .buildkite/test_areas/tool_use.yaml | 20 ++ .buildkite/test_areas/weight_loading.yaml | 25 +++ 25 files changed, 1243 insertions(+) create mode 100644 .buildkite/test_areas/attention.yaml create mode 100644 .buildkite/test_areas/basic_correctness.yaml create mode 100644 .buildkite/test_areas/benchmarks.yaml create mode 100644 .buildkite/test_areas/compile.yaml create mode 100644 .buildkite/test_areas/cuda.yaml create mode 100644 .buildkite/test_areas/distributed.yaml create mode 100644 .buildkite/test_areas/e2e_integration.yaml create mode 100644 .buildkite/test_areas/engine.yaml create mode 100644 .buildkite/test_areas/entrypoints.yaml create mode 100644 .buildkite/test_areas/expert_parallelism.yaml create mode 100644 .buildkite/test_areas/kernels.yaml create mode 100644 .buildkite/test_areas/lm_eval.yaml create mode 100644 .buildkite/test_areas/lora.yaml create mode 100644 .buildkite/test_areas/misc.yaml create mode 100644 .buildkite/test_areas/model_executor.yaml create mode 100644 .buildkite/test_areas/models_basic.yaml create mode 100644 .buildkite/test_areas/models_distributed.yaml create mode 100644 .buildkite/test_areas/models_language.yaml create mode 100644 .buildkite/test_areas/models_multimodal.yaml create mode 100644 .buildkite/test_areas/plugins.yaml create mode 100644 .buildkite/test_areas/pytorch.yaml create mode 100644 .buildkite/test_areas/quantization.yaml create mode 100644 .buildkite/test_areas/samplers.yaml create mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 .buildkite/test_areas/weight_loading.yaml diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml new file mode 100644 index 000000000000..af57cc6681b4 --- /dev/null +++ b/.buildkite/test_areas/attention.yaml @@ -0,0 +1,19 @@ +group: Attention +steps: +- label: V1 attention (H100) + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 attention (B200) + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill 
is bugged and causes incorrectness, fix this diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml new file mode 100644 index 000000000000..27c4d96aeb8c --- /dev/null +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -0,0 +1,14 @@ +group: Basic Correctness +steps: +- label: Basic Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml new file mode 100644 index 000000000000..c48c72fb405b --- /dev/null +++ b/.buildkite/test_areas/benchmarks.yaml @@ -0,0 +1,17 @@ +group: Benchmarks +steps: +- label: Benchmarks + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml new file mode 100644 index 000000000000..4b05bd8976e4 --- /dev/null +++ b/.buildkite/test_areas/compile.yaml @@ -0,0 +1,55 @@ +group: Compile +steps: +- label: Fusion and Compile Tests (B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Fusion E2E (2 GPUs)(B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - 
vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml new file mode 100644 index 000000000000..6c8ff70ba45a --- /dev/null +++ b/.buildkite/test_areas/cuda.yaml @@ -0,0 +1,20 @@ +group: CUDA +steps: +- label: Platform Tests (CUDA) + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Cudagraph + timeout_in_minutes: 20 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py \ No newline at end of file diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml new file mode 100644 index 000000000000..56dee5c31389 --- /dev/null +++ b/.buildkite/test_areas/distributed.yaml @@ -0,0 +1,205 @@ +group: Distributed +steps: +- label: Distributed Comm Ops + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: Distributed (2 GPUs) + timeout_in_minutes: 90 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Tests (4 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - 
examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: Distributed Tests (8 GPUs)(H100) + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: Distributed Tests (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: Distributed Tests (2 GPUs)(H200) + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s 
tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: Distributed Tests (2 GPUs)(B200) + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: 2 Node Test (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed NixlConnector PD accuracy (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs)) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py \ No 
newline at end of file diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml new file mode 100644 index 000000000000..dca7c1fcdf31 --- /dev/null +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -0,0 +1,19 @@ +group: E2E Integration +steps: +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml new file mode 100644 index 000000000000..be099758eb88 --- /dev/null +++ b/.buildkite/test_areas/engine.yaml @@ -0,0 +1,27 @@ +group: Engine +steps: +- label: Engine + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 e2e + engine + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml new file mode 100644 index 000000000000..adbd6e96291e --- /dev/null +++ b/.buildkite/test_areas/entrypoints.yaml @@ -0,0 +1,66 @@ +group: Entrypoints +steps: +- label: Entrypoints Unit Tests + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration (LLM) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration (API Server) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py 
--ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + + +- label: Entrypoints Integration (Pooling) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + + +- label: Entrypoints V1 + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: OpenAI API Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml new file mode 100644 index 000000000000..a1316f289d59 --- /dev/null +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -0,0 +1,21 @@ +group: Expert Parallelism +steps: +- label: EPLB Algorithm + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml new file mode 100644 index 000000000000..91c682ca9546 --- /dev/null +++ b/.buildkite/test_areas/kernels.yaml @@ -0,0 +1,115 @@ +group: Kernels +steps: +- label: Kernels Core Operation Test + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N + timeout_in_minutes: 35 + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N + timeout_in_minutes: 60 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test + timeout_in_minutes: 45 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) + 
timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml new file mode 100644 index 000000000000..c6498c032440 --- /dev/null +++ b/.buildkite/test_areas/lm_eval.yaml @@ -0,0 +1,44 @@ +group: LM Eval +steps: +- label: LM Eval Small Models + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - 
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (4 GPUs)(H100) + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: LM Eval Small Models (B200) + timeout_in_minutes: 120 + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml new file mode 100644 index 000000000000..3f41b5ff0f8f --- /dev/null +++ b/.buildkite/test_areas/lora.yaml @@ -0,0 +1,38 @@ +group: LoRA +steps: +- label: LoRA %N + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + + +- label: LoRA TP (Distributed) + timeout_in_minutes: 30 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml new file mode 100644 index 000000000000..ddca24efed1f --- /dev/null +++ b/.buildkite/test_areas/misc.yaml @@ -0,0 +1,150 @@ +group: Miscellaneous +steps: +- label: V1 Others + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Others (CPU) + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: Regression + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Metrics, Tracing (2 GPUs) + timeout_in_minutes: 20 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Python-only Installation + timeout_in_minutes: 20 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Async Engine, 
Inputs, Utils, Worker + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: GPT-OSS Eval (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 \ No newline at end of file diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml new file mode 100644 index 000000000000..c2d52654f0d2 --- /dev/null +++ b/.buildkite/test_areas/model_executor.yaml @@ -0,0 +1,15 @@ +group: Model Executor +steps: +- label: Model Executor + timeout_in_minutes: 35 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml new file mode 100644 index 000000000000..9506a613790c --- /dev/null +++ b/.buildkite/test_areas/models_basic.yaml @@ -0,0 +1,65 @@ +group: Models - Basic +steps: +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) 
Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Transformers Nightly Models + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml new file mode 100644 index 000000000000..ea38fdb12d2e --- /dev/null +++ b/.buildkite/test_areas/models_distributed.yaml @@ -0,0 +1,20 @@ +group: Models - Distributed +steps: +- label: Distributed Model Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml new file mode 100644 index 000000000000..65303f049613 --- /dev/null +++ b/.buildkite/test_areas/models_language.yaml @@ -0,0 +1,94 @@ +group: Models - Language +steps: +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s 
models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml new file mode 100644 index 000000000000..5d31192d169a --- /dev/null +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -0,0 +1,67 @@ +group: Models - Multimodal +steps: +- label: Multi-Modal Models (Standard) # 60min + timeout_in_minutes: 80 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore 
models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Processor # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models (Extended) 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml new file mode 100644 index 000000000000..f922d5c919f8 --- /dev/null +++ b/.buildkite/test_areas/plugins.yaml @@ -0,0 +1,32 @@ +group: Plugins +steps: +- label: Plugin Tests (2 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml new file mode 100644 index 000000000000..34c0c87fb2c6 --- /dev/null +++ b/.buildkite/test_areas/pytorch.yaml @@ -0,0 +1,48 @@ +group: PyTorch +steps: +- label: PyTorch Compilation Unit Tests + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Smoke Test + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh \ No newline at end of file diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml new file mode 100644 index 000000000000..554d6447d791 --- /dev/null +++ b/.buildkite/test_areas/quantization.yaml @@ -0,0 +1,35 @@ +group: Quantization +steps: +- label: Quantization + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: Quantized MoE Test (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml new file mode 100644 index 000000000000..0d26ffbd00ac --- /dev/null +++ b/.buildkite/test_areas/samplers.yaml @@ -0,0 +1,12 @@ +group: Samplers +steps: +- label: Samplers Test + timeout_in_minutes: 75 + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml new file mode 100644 
index 000000000000..328158d0a948 --- /dev/null +++ b/.buildkite/test_areas/tool_use.yaml @@ -0,0 +1,20 @@ +group: Tool use +steps: +- label: OpenAI-Compatible Tool Use + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml new file mode 100644 index 000000000000..98ac8ef2ec2a --- /dev/null +++ b/.buildkite/test_areas/weight_loading.yaml @@ -0,0 +1,25 @@ +group: Weight Loading +steps: +- label: Weight Loading Multiple GPU # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU - Large Models # optional + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt From 27a893bb054b0988fcb733476d606a9c27db6dc5 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 25 Nov 2025 01:58:45 -0800 Subject: [PATCH 02/24] move primerl Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/e2e_integration.yaml | 13 ++++++++++++- .buildkite/test_areas/misc.yaml | 13 +------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index dca7c1fcdf31..b7255737f889 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -16,4 +16,15 @@ steps: num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index ddca24efed1f..ef57557b568f 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -47,17 +47,6 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Prime-RL Integration (2 GPUs) - timeout_in_minutes: 30 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh - commands: - - bash .buildkite/scripts/run-prime-rl-test.sh - - label: Examples timeout_in_minutes: 45 working_dir: "/vllm-workspace/examples" @@ -147,4 +136,4 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 
\ No newline at end of file
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

From 77011542b42a43b643a47ed42b84333bf191e822 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Wed, 26 Nov 2025 02:05:22 -0800
Subject: [PATCH 03/24] test

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml              |   10 +
 .buildkite/test_areas/distributed.yaml |    4 +-
 buildkite_steps.yaml                   | 2212 ++++++++++++++++++++++++
 3 files changed, 2224 insertions(+), 2 deletions(-)
 create mode 100644 .buildkite/ci_config.yaml
 create mode 100644 buildkite_steps.yaml

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 000000000000..40e923a24b71
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,10 @@
+name: ci
+job_dirs:
+  - ".buildkite/test_areas"
+run_all_patterns:
+  - ".*"
+run_all_exclude_patterns:
+  - ".*"
+registries:
+  main: "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo"
+  premerge: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo"
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 56dee5c31389..67d7527e36c1 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -168,13 +168,13 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
diff --git a/buildkite_steps.yaml b/buildkite_steps.yaml
new file mode 100644
index 000000000000..7b489a91ef01
--- /dev/null
+++ b/buildkite_steps.yaml
@@ -0,0 +1,2212 @@
+steps:
+- group: Attention
+  steps:
+  - label: V1 attention (B200)
+    agents:
+      queue: gpu_1_queue
+    commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
+    soft_fail: false
+    plugins:
+    - docker#v5.2.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None
+        always_pull: true
+        propagate_environment: true
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - 
NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 attention (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/attention + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Basic Correctness + steps: + - label: Basic Correctness + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Benchmarks + steps: + - label: Benchmarks + agents: + queue: gpu_1_queue + commands: + - bash scripts/run-benchmarks.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Benchmarks CLI Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s benchmarks/ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: CUDA + steps: + - label: Cudagraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Platform Tests (CUDA) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s cuda/test_cuda_context.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Compile + steps: + - label: Fusion 
E2E (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Fusion and Compile Tests (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + -k 'True and not +quant_fp8 and not +rms_norm' + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Distributed + steps: + - label: 2 Node Test (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s 
entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 + distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Comm Ops + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed NixlConnector PD accuracy (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(H200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 + VLLM_LOGGING_LEVEL=DEBUG 
python3 examples/offline_inference/data_parallel.py + --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (8 GPUs)(H100) + agents: + queue: gpu_1_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pipeline + Context Parallelism (4 GPUs)) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: E2E Integration + steps: + - label: DeepSeek V2-Lite Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh + 0.25 200 8010 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Prime-RL Integration (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Qwen3-30B-A3B-FP8-block Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh + 0.8 200 8020 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Engine + steps: + - label: Engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s tokenization + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 e2e + engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Entrypoints + steps: + - label: Entrypoints Integration (API Server) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py + --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py + --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (LLM) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (Pooling) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Unit Tests + agents: + queue: gpu_1_queue + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai + --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints V1 + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/entrypoints + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI API Correctness + agents: + queue: gpu_1_queue + commands: + - pytest -s entrypoints/openai/correctness/ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Expert Parallelism + steps: + - label: EPLB Algorithm + agents: + queue: gpu_1_queue + commands: + - pytest -v -s distributed/test_eplb_algo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: EPLB Execution + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Kernels + steps: + - label: Kernels (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: 
Kernels Attention Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Core Operation Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels DeepGEMM Test (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Mamba Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/mamba + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels MoE Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Quantization Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LM Eval + steps: + - label: LM Eval Large Models (4 
GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Large Models (4 GPUs)(H100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LoRA + steps: + - label: LoRA %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py + \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py + \ --ignore=lora/test_qwen3moe_tp.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LoRA TP (Distributed) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - 
pytest -v -s -x lora/test_gptoss_tp.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Miscellaneous + steps: + - label: Async Engine, Inputs, Utils, Worker + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Examples + agents: + queue: gpu_1_queue + commands: + - pip install tensorizer + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf + --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory + /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m + deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper + --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: 
true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: GPT-OSS Eval (B200) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b + --metric 0.58 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Metrics, Tracing (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' + 'opentelemetry-semantic-conventions-ai>=0.4.1' + - pytest -v -s v1/tracing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Python-only Installation + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/python_only_compile.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Regression + agents: + queue: gpu_1_queue + commands: + - pip install modelscope + - pytest -v -s test_regression.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others + agents: + queue: gpu_1_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Model Executor + steps: + - label: Model Executor + agents: + queue: gpu_1_queue + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Basic + steps: + - label: Basic Models Test (Other CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Extra Initialization) %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Initialization) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Other) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_transformers.py 
models/test_registry.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Transformers Nightly Models + agents: + queue: gpu_1_queue + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal + or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR + or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py + --model-type whisper + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Distributed + steps: + - label: Distributed Model Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py + -v -s -m 'distributed(num_gpus=2)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Language + steps: + - label: Language Models Test (Extended Generation) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (Extended Pooling) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s 
models/language/pooling -m 'not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (MTEB) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling_mteb_test + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (PPL) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/generation_ppl_test + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Extra Standard) %N + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Hybrid) %N + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Standard) + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + 
mount_buildkite_agent: true +- group: Models - Multimodal + steps: + - label: Custom Models + agents: + queue: gpu_1_queue + commands: + - echo 'Testing custom models...' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Accuracy Eval (Small Models) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 1 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + --ignore models/multimodal/processing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 2 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) + and not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 3 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) + and not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Standard) + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py + --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py + -m core_model + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Plugins + steps: + - label: Plugin Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: PyTorch + steps: + - label: PyTorch Compilation Unit Tests + agents: + queue: gpu_1_queue + commands: + - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 + and not Llama-4' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - 
/fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph Smoke Test + agents: + queue: gpu_1_queue + commands: + - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + pytest -s -v {} \\; + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pytorch Nightly Dependency Override Check + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + soft_fail: true + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Quantization + steps: + - label: Quantization + agents: + queue: gpu_1_queue + commands: + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Quantized MoE Test (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Samplers + steps: + - label: Samplers Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Tool use + steps: + - label: OpenAI-Compatible Tool Use + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' tool_use + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: 
OpenAI-Compatible Tool Use (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' tool_use + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Weight Loading + steps: + - label: Weight Loading Multiple GPU + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Weight Loading Multiple GPU - Large Models + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true From 0a7642e28828600763418730efee543427fcdddc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:41:13 -0800 Subject: [PATCH 04/24] update pipeline yaml Signed-off-by: Kevin H. 
Luu --- .buildkite/ci_config.yaml | 8 +- .buildkite/pipeline.yaml | 2700 +++++++++++++++++ .buildkite/test_areas/attention.yaml | 2 + .buildkite/test_areas/basic_correctness.yaml | 2 + .buildkite/test_areas/benchmarks.yaml | 2 + .buildkite/test_areas/compile.yaml | 2 + .buildkite/test_areas/cuda.yaml | 2 + .buildkite/test_areas/distributed.yaml | 2 + .buildkite/test_areas/e2e_integration.yaml | 2 + .buildkite/test_areas/engine.yaml | 2 + .buildkite/test_areas/entrypoints.yaml | 2 + .buildkite/test_areas/expert_parallelism.yaml | 2 + .buildkite/test_areas/kernels.yaml | 2 + .buildkite/test_areas/lm_eval.yaml | 2 + .buildkite/test_areas/lora.yaml | 2 + .buildkite/test_areas/misc.yaml | 5 + .buildkite/test_areas/model_executor.yaml | 2 + .buildkite/test_areas/models_basic.yaml | 5 +- .buildkite/test_areas/models_distributed.yaml | 2 + .buildkite/test_areas/models_language.yaml | 2 + .buildkite/test_areas/models_multimodal.yaml | 2 + .buildkite/test_areas/plugins.yaml | 2 + .buildkite/test_areas/pytorch.yaml | 2 + .buildkite/test_areas/quantization.yaml | 2 + .buildkite/test_areas/samplers.yaml | 2 + .buildkite/test_areas/tool_use.yaml | 3 + .buildkite/test_areas/weight_loading.yaml | 4 +- 27 files changed, 2759 insertions(+), 8 deletions(-) create mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index 40e923a24b71..2b0908bd3bd7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -1,10 +1,12 @@ name: ci job_dirs: - ".buildkite/test_areas" + - ".buildkite/build" run_all_patterns: - ".*" run_all_exclude_patterns: - ".*" -registries: - main: "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo" - premerge: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo" +registries: public.ecr.aws/q9t5s3a7 +repositories: + main: "vllm-ci-postmerge-repo" + premerge: "vllm-ci-test-repo" diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml new file mode 100644 index 000000000000..818400f36234 --- /dev/null +++ b/.buildkite/pipeline.yaml @@ -0,0 +1,2700 @@ +steps: +- group: Abuild + steps: + - label: ':docker: Build CPU arm64 image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + - label: ':docker: Build CPU image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build CUDA 11.8 image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build HPU image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: true + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + 
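As a sketch only (the helper name and layout below are assumptions, not code from this patch): the build steps above pass the registry, repository, and tag to the image build scripts as three separate arguments, e.g. ".buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123", matching the registries/repositories split introduced in ci_config.yaml. A pullable Docker reference joins registry and repository with "/" and appends the tag after ":", so the downstream test steps would pull something of the form registry/repository:tag.

# Hypothetical helper (not part of this patch): compose the image reference that
# test steps pull, from the split registries/repositories fields in
# .buildkite/ci_config.yaml plus the build number used as the tag.
def image_reference(registry: str, repository: str, tag: str) -> str:
    # Standard Docker form: <registry>/<repository>:<tag>,
    # e.g. public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123
    return f"{registry}/{repository}:{tag}"

if __name__ == "__main__":
    # Values taken from the build commands above; "123" stands in for the real build number.
    print(image_reference("public.ecr.aws/q9t5s3a7", "vllm-ci-test-repo", "123"))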
DOCKER_BUILDKIT: '1' +- group: Attention + steps: + - label: V1 attention (B200) + agents: + queue: gpu_1_queue + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 attention (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/attention + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate +- group: Basic Correctness + steps: + - label: Basic Correctness + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Benchmarks + steps: + - label: Benchmarks + agents: + queue: gpu_1_queue + commands: + - bash scripts/run-benchmarks.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Benchmarks CLI Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s benchmarks/ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - 
/fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: CUDA + steps: + - label: Cudagraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Platform Tests (CUDA) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s cuda/test_cuda_context.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Compile + steps: + - label: Fusion E2E (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Fusion and Compile Tests (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + -k 'True and not +quant_fp8 and not +rms_norm' + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Distributed + steps: + - label: 2 Node Test (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s 
distributed/test_pipeline_parallel.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 + distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Comm Ops + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed NixlConnector PD accuracy (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - 
NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(H200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 + VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py + --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 
RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Distributed Tests (8 GPUs)(H100) + agents: + queue: gpu_1_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Pipeline + Context Parallelism (4 GPUs)) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: E2E Integration + steps: + - label: DeepSeek V2-Lite Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh + 0.25 200 8010 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Prime-RL Integration (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Qwen3-30B-A3B-FP8-block Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh + 0.8 200 8020 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate +- group: Engine + steps: + - label: Engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s engine test_sequence.py 
test_config.py test_logger.py test_vllm_port.py + - pytest -v -s tokenization + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 e2e + engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Entrypoints + steps: + - label: Entrypoints Integration (API Server) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py + --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py + --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (LLM) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (Pooling) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Unit Tests + agents: + queue: gpu_1_queue + commands: + 
- pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai + --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints V1 + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/entrypoints + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI API Correctness + agents: + queue: gpu_1_queue + commands: + - pytest -s entrypoints/openai/correctness/ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Expert Parallelism + steps: + - label: EPLB Algorithm + agents: + queue: gpu_1_queue + commands: + - pytest -v -s distributed/test_eplb_algo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: EPLB Execution + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Kernels + steps: + - label: Kernels (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Attention Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Core Operation Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels DeepGEMM Test (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: 
devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Kernels Mamba Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/mamba + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels MoE Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Quantization Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LM Eval + steps: + - label: LM Eval Large Models (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: LM Eval Large Models (4 GPUs)(H100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + 
podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: LM Eval Small Models + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LoRA + steps: + - label: LoRA %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py + \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py + \ --ignore=lora/test_qwen3moe_tp.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LoRA TP (Distributed) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Miscellaneous + steps: + - label: Async Engine, Inputs, Utils, Worker + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Examples + agents: + queue: gpu_1_queue + commands: + - pip install tensorizer + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf + --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory + /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m + deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper + --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: 
all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: GPT-OSS Eval (B200) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b + --metric 0.58 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Metrics, Tracing (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' + 'opentelemetry-semantic-conventions-ai>=0.4.1' + - pytest -v -s v1/tracing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Python-only Installation + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/python_only_compile.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Regression + agents: + queue: gpu_1_queue + commands: + - pip install modelscope + - pytest -v -s test_regression.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others + agents: + queue: gpu_1_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Model Executor + steps: + - label: Model Executor + agents: + queue: gpu_1_queue + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Basic + steps: + - label: Basic Models Test (Other CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Extra Initialization) %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Initialization) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - 
HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Other) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Transformers Nightly Models + agents: + queue: gpu_1_queue + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal + or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR + or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py + --model-type whisper + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Distributed + steps: + - label: Distributed Model Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py + -v -s -m 'distributed(num_gpus=2)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Language + steps: + - label: Language Models Test (Extended Generation) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + 
propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (Extended Pooling) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (MTEB) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling_mteb_test + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (PPL) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/generation_ppl_test + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Extra Standard) %N + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Hybrid) %N + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests 
(Standard) + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Multimodal + steps: + - label: Custom Models + agents: + queue: gpu_1_queue + commands: + - echo 'Testing custom models...' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Accuracy Eval (Small Models) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 1 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + --ignore models/multimodal/processing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 2 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) + and not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 3 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) + and not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + 
always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Standard) + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py + --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py + -m core_model + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Plugins + steps: + - label: Plugin Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: PyTorch + steps: + - label: PyTorch Compilation Unit Tests + agents: + queue: gpu_1_queue + commands: + - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - 
HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 + and not Llama-4' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph Smoke Test + agents: + queue: gpu_1_queue + commands: + - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + pytest -s -v {} \\; + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pytorch Nightly Dependency Override Check + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + depends_on: + - image-build + soft_fail: true + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Quantization + steps: + - label: Quantization + agents: + queue: gpu_1_queue + commands: + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Quantized MoE Test (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Samplers + steps: + - label: Samplers Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + depends_on: + - image-build + soft_fail: false + plugins: + - 
docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Tool use + steps: + - label: OpenAI-Compatible Tool Use + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' tool_use + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI-Compatible Tool Use (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' tool_use + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Weight Loading + steps: + - label: Weight Loading Multiple GPU + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Weight Loading Multiple GPU - Large Models + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index af57cc6681b4..6e444eae14c7 100644 --- 
a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -1,4 +1,6 @@ group: Attention +depends_on: + - image-build steps: - label: V1 attention (H100) timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml index 27c4d96aeb8c..759d2b535871 100644 --- a/.buildkite/test_areas/basic_correctness.yaml +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -1,4 +1,6 @@ group: Basic Correctness +depends_on: + - image-build steps: - label: Basic Correctness timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index c48c72fb405b..574b642d407b 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -1,4 +1,6 @@ group: Benchmarks +depends_on: + - image-build steps: - label: Benchmarks timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 4b05bd8976e4..0ba00925a483 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -1,4 +1,6 @@ group: Compile +depends_on: + - image-build steps: - label: Fusion and Compile Tests (B200) timeout_in_minutes: 40 diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml index 6c8ff70ba45a..50c0c338c243 100644 --- a/.buildkite/test_areas/cuda.yaml +++ b/.buildkite/test_areas/cuda.yaml @@ -1,4 +1,6 @@ group: CUDA +depends_on: + - image-build steps: - label: Platform Tests (CUDA) timeout_in_minutes: 15 diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 67d7527e36c1..e6ae13b8156d 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -1,4 +1,6 @@ group: Distributed +depends_on: + - image-build steps: - label: Distributed Comm Ops timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index b7255737f889..817b995574bc 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -1,4 +1,6 @@ group: E2E Integration +depends_on: + - image-build steps: - label: DeepSeek V2-Lite Accuracy timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index be099758eb88..e4d12f3453f1 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -1,4 +1,6 @@ group: Engine +depends_on: + - image-build steps: - label: Engine timeout_in_minutes: 40 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index adbd6e96291e..0a789be943f3 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -1,4 +1,6 @@ group: Entrypoints +depends_on: + - image-build steps: - label: Entrypoints Unit Tests timeout_in_minutes: 10 diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml index a1316f289d59..feb8252148c7 100644 --- a/.buildkite/test_areas/expert_parallelism.yaml +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -1,4 +1,6 @@ group: Expert Parallelism +depends_on: + - image-build steps: - label: EPLB Algorithm timeout_in_minutes: 15 diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 91c682ca9546..7ca099516d64 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -1,4 +1,6 @@ group: Kernels 
+depends_on: + - image-build steps: - label: Kernels Core Operation Test timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index c6498c032440..9af43e0c375a 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -1,4 +1,6 @@ group: LM Eval +depends_on: + - image-build steps: - label: LM Eval Small Models timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 3f41b5ff0f8f..45e3af03591d 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -1,4 +1,6 @@ group: LoRA +depends_on: + - image-build steps: - label: LoRA %N timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index ef57557b568f..ec719825b377 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -1,4 +1,6 @@ group: Miscellaneous +depends_on: + - image-build steps: - label: V1 Others timeout_in_minutes: 60 @@ -25,6 +27,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: V1 Others (CPU) + depends_on: ~ source_file_dependencies: - vllm/ - tests/v1 @@ -88,6 +91,7 @@ steps: - pytest -v -s v1/tracing - label: Python-only Installation + depends_on: ~ timeout_in_minutes: 20 source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -106,6 +110,7 @@ steps: - pytest -v -s utils_ - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: ~ timeout_in_minutes: 10 source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml index c2d52654f0d2..996c8bb8b780 100644 --- a/.buildkite/test_areas/model_executor.yaml +++ b/.buildkite/test_areas/model_executor.yaml @@ -1,4 +1,6 @@ group: Model Executor +depends_on: + - image-build steps: - label: Model Executor timeout_in_minutes: 35 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 9506a613790c..ceddf841f87a 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -1,4 +1,6 @@ group: Models - Basic +depends_on: + - image-build steps: - label: Basic Models Tests (Initialization) timeout_in_minutes: 45 @@ -30,8 +32,6 @@ steps: - label: Basic Models Tests (Other) timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true source_file_dependencies: - vllm/ - tests/models/test_transformers.py @@ -41,7 +41,6 @@ steps: - label: Basic Models Test (Other CPU) # 5min timeout_in_minutes: 10 - torch_nightly: true source_file_dependencies: - vllm/ - tests/models/test_utils.py diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml index ea38fdb12d2e..b6bfbf2ddab4 100644 --- a/.buildkite/test_areas/models_distributed.yaml +++ b/.buildkite/test_areas/models_distributed.yaml @@ -1,4 +1,6 @@ group: Models - Distributed +depends_on: + - image-build steps: - label: Distributed Model Tests (2 GPUs) timeout_in_minutes: 50 diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 65303f049613..fdf78dc48746 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -1,4 +1,6 @@ group: Models - Language +depends_on: + - image-build steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 diff --git 
a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 5d31192d169a..68e5e485c316 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -1,4 +1,6 @@ group: Models - Multimodal +depends_on: + - image-build steps: - label: Multi-Modal Models (Standard) # 60min timeout_in_minutes: 80 diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index f922d5c919f8..60c179aa098e 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -1,4 +1,6 @@ group: Plugins +depends_on: + - image-build steps: - label: Plugin Tests (2 GPUs) timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index 34c0c87fb2c6..dab6e674990b 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -1,4 +1,6 @@ group: PyTorch +depends_on: + - image-build steps: - label: PyTorch Compilation Unit Tests timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 554d6447d791..cff4a7189806 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -1,4 +1,6 @@ group: Quantization +depends_on: + - image-build steps: - label: Quantization timeout_in_minutes: 90 diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 0d26ffbd00ac..ad377148fd07 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -1,4 +1,6 @@ group: Samplers +depends_on: + - image-build steps: - label: Samplers Test timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml index 328158d0a948..7040cd1d253b 100644 --- a/.buildkite/test_areas/tool_use.yaml +++ b/.buildkite/test_areas/tool_use.yaml @@ -1,4 +1,6 @@ group: Tool use +depends_on: + - image-build steps: - label: OpenAI-Compatible Tool Use timeout_in_minutes: 35 @@ -11,6 +13,7 @@ steps: - pytest -v -s -m 'not cpu_test' tool_use - label: OpenAI-Compatible Tool Use (CPU) + depends_on: ~ timeout_in_minutes: 10 source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml index 98ac8ef2ec2a..cfc5bb20fe7a 100644 --- a/.buildkite/test_areas/weight_loading.yaml +++ b/.buildkite/test_areas/weight_loading.yaml @@ -1,8 +1,9 @@ group: Weight Loading +depends_on: + - image-build steps: - label: Weight Loading Multiple GPU # 33min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true @@ -13,7 +14,6 @@ steps: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - label: Weight Loading Multiple GPU - Large Models # optional - mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 gpu: a100 From 265bf9f6e24be205a87474f5e654e0877cc8c636 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:44:01 -0800 Subject: [PATCH 05/24] key Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 818400f36234..91a9208ebedb 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2,6 +2,7 @@ steps: - group: Abuild steps: - label: ':docker: Build CPU arm64 image' + key: image-build-cpu-arm64 agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -9,6 +10,7 @@ steps: 123 soft_fail: false - label: ':docker: Build CPU image' + key: image-build-cpu agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -24,6 +26,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build CUDA 11.8 image' + key: image-build-cu118 agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -39,6 +42,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build HPU image' + key: image-build-hpu agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -54,6 +58,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build image' + key: image-build agents: queue: cpu_queue_premerge_us_east_1 commands: From 0784707b3fc7abbc55a3fcc3fb8b0eac5cc11b5c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:49:06 -0800 Subject: [PATCH 06/24] build files Signed-off-by: Kevin H. Luu --- .buildkite/ci_config.yaml | 2 +- .buildkite/image_build/image_build.sh | 38 ++++++++++++ .buildkite/image_build/image_build.yaml | 61 +++++++++++++++++++ .buildkite/image_build/image_build_cpu.sh | 36 +++++++++++ .../image_build/image_build_cpu_arm64.sh | 33 ++++++++++ .buildkite/image_build/image_build_cu118.sh | 36 +++++++++++ .buildkite/image_build/image_build_hpu.sh | 34 +++++++++++ .buildkite/pipeline.yaml | 11 ++-- 8 files changed, 245 insertions(+), 6 deletions(-) create mode 100644 .buildkite/image_build/image_build.sh create mode 100644 .buildkite/image_build/image_build.yaml create mode 100644 .buildkite/image_build/image_build_cpu.sh create mode 100644 .buildkite/image_build/image_build_cpu_arm64.sh create mode 100644 .buildkite/image_build/image_build_cu118.sh create mode 100644 .buildkite/image_build/image_build_hpu.sh diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index 2b0908bd3bd7..5b00e1cab6c7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -1,7 +1,7 @@ name: ci job_dirs: - ".buildkite/test_areas" - - ".buildkite/build" + - ".buildkite/image_build" run_all_patterns: - ".*" run_all_exclude_patterns: diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh new file mode 100644 index 000000000000..87e35acd5e84 --- /dev/null +++ b/.buildkite/image_build/image_build.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \ + --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT \ + --target test \ + --progress plain . 
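+
+# Hedged usage sketch (not part of the CI flow): pipeline.yaml calls this script as
+#   .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo $BUILDKITE_COMMIT
+# so a local dry run with the same arguments would build and push
+# <registry>/<repo>:<commit> and retag it as :latest, as the push step below shows.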
+ +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT +docker tag $REGISTRY/$REPO:$BUILDKITE_COMMIT $REGISTRY/$REPO:latest +docker push $REGISTRY/$REPO:latest diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml new file mode 100644 index 000000000000..4b2c1da458af --- /dev/null +++ b/.buildkite/image_build/image_build.yaml @@ -0,0 +1,61 @@ +group: Abuild +steps: + - label: ":docker: Build image" + key: image-build + commands: + - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU image" + key: image-build-cpu + commands: + - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CUDA 11.8 image" + key: image-build-cu118 + commands: + - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build HPU image" + soft_fail: true + key: image-build-hpu + commands: + - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU arm64 image" + key: image-build-cpu-arm64 + optional: true + commands: + - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh new file mode 100644 index 000000000000..a69732f43098 --- /dev/null +++ b/.buildkite/image_build/image_build_cpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg VLLM_CPU_AVX512BF16=true \ + --build-arg VLLM_CPU_AVX512VNNI=true \ + --build-arg VLLM_CPU_AMXBF16=true \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . 
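+
+# Hedged example invocation, mirroring how pipeline.yaml drives this script:
+#   .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo $BUILDKITE_COMMIT
+# which produces the CPU test image <registry>/<repo>:<commit>-cpu pushed below.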
+ +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh new file mode 100644 index 000000000000..615298b6555b --- /dev/null +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh new file mode 100644 index 000000000000..699cef2ad60f --- /dev/null +++ b/.buildkite/image_build/image_build_cu118.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg CUDA_VERSION=11.8.0 \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 \ + --target test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh new file mode 100644 index 000000000000..192447ef4577 --- /dev/null +++ b/.buildkite/image_build/image_build_hpu.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then + echo "Image not found, proceeding with build..." 
+else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ + --progress plain \ + https://github.com/vllm-project/vllm-gaudi.git + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 91a9208ebedb..b4646952fa18 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -6,7 +6,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false - label: ':docker: Build CPU image' @@ -14,7 +14,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false retry: @@ -30,7 +30,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false retry: @@ -46,7 +46,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: true retry: @@ -62,7 +62,8 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 soft_fail: false retry: automatic: From c8707ff0f92086d563a2efeb36a587640ff797f8 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:51:31 -0800 Subject: [PATCH 07/24] permission Signed-off-by: Kevin H. 
Luu --- .buildkite/image_build/image_build.sh | 0 .buildkite/image_build/image_build_cpu.sh | 0 .buildkite/image_build/image_build_cpu_arm64.sh | 0 .buildkite/image_build/image_build_cu118.sh | 0 .buildkite/image_build/image_build_hpu.sh | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .buildkite/image_build/image_build.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cpu.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cpu_arm64.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cu118.sh mode change 100644 => 100755 .buildkite/image_build/image_build_hpu.sh diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh old mode 100644 new mode 100755 From b9a6433cd91d1f94f1753690074d3fd9f16bdccc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 03:09:13 -0800 Subject: [PATCH 08/24] key change Signed-off-by: Kevin H. Luu --- .buildkite/image_build/image_build.yaml | 10 +- .buildkite/pipeline.yaml | 835 +++++---- buildkite_steps.yaml | 2212 ----------------------- 3 files changed, 446 insertions(+), 2611 deletions(-) delete mode 100644 buildkite_steps.yaml diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 4b2c1da458af..26ec10bc8f8d 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -2,6 +2,7 @@ group: Abuild steps: - label: ":docker: Build image" key: image-build + depends_on: ~ commands: - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -54,8 +55,15 @@ steps: limit: 2 - label: ":docker: Build CPU arm64 image" - key: image-build-cpu-arm64 + key: cpu-arm64-image-build optional: true commands: - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index b4646952fa18..471f24d74b75 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -1,6 +1,9 @@ steps: - group: Abuild steps: + - block: 'Run :docker: Build CPU arm64 image' + depends_on: image-build + key: block--docker--build-cpu-arm64-image - label: ':docker: Build CPU arm64 image' key: image-build-cpu-arm64 agents: @@ -8,6 +11,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: block--docker--build-cpu-arm64-image soft_fail: false - label: ':docker: Build CPU image' key: image-build-cpu @@ -109,47 +113,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export 
VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - group: Basic Correctness steps: - label: Basic Correctness @@ -282,14 +285,16 @@ steps: mount_buildkite_agent: true - group: Compile steps: + - block: Run Fusion E2E (2 GPUs)(B200) + depends_on: image-build + key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: queue: gpu_4_queue commands: - nvidia-smi - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - depends_on: - - image-build + depends_on: block-fusion-e2e-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: @@ -469,6 +474,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (2 GPUs)(B200) + depends_on: image-build + key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: queue: gpu_4_queue @@ -476,8 +484,7 @@ steps: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: - - image-build + depends_on: block-distributed-tests-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: @@ -495,6 +502,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (2 GPUs)(H200) + depends_on: image-build + key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: queue: gpu_4_queue @@ -509,8 +519,7 @@ steps: VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: - - image-build + depends_on: block-distributed-tests-2-gpush200 soft_fail: false plugins: - docker#v5.2.0: @@ -573,6 +582,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (4 
GPUs)(A100) + depends_on: image-build + key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: queue: gpu_4_queue @@ -581,53 +593,51 @@ steps: - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - depends_on: - - image-build + depends_on: block-distributed-tests-4-gpusa100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Distributed Tests (8 GPUs)(H100) agents: queue: gpu_1_queue @@ -640,47 +650,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + 
command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Pipeline + Context Parallelism (4 GPUs)) agents: queue: gpu_4_queue @@ -708,65 +717,68 @@ steps: mount_buildkite_agent: true - group: E2E Integration steps: + - block: Run DeepSeek V2-Lite Accuracy + depends_on: image-build + key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - depends_on: - - image-build + depends_on: block-deepseek-v2-lite-accuracy soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - block: Run Prime-RL Integration (2 GPUs) + depends_on: image-build + key: 
block-prime-rl-integration-2-gpus - label: Prime-RL Integration (2 GPUs) agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/run-prime-rl-test.sh - depends_on: - - image-build + depends_on: block-prime-rl-integration-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -784,58 +796,59 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Qwen3-30B-A3B-FP8-block Accuracy + depends_on: image-build + key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - depends_on: - - image-build + depends_on: block-qwen3-30b-a3b-fp8-block-accuracy soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - group: Engine steps: - label: Engine @@ -1202,47 +1215,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - 
valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Kernels Mamba Test agents: queue: gpu_1_queue @@ -1317,6 +1329,9 @@ steps: mount_buildkite_agent: true - group: LM Eval steps: + - block: Run LM Eval Large Models (4 GPUs)(A100) + depends_on: image-build + key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: queue: gpu_4_queue @@ -1324,53 +1339,54 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - depends_on: - - image-build + depends_on: block-lm-eval-large-models-4-gpusa100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + 
secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - block: Run LM Eval Large Models (4 GPUs)(H100) + depends_on: image-build + key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: queue: gpu_4_queue @@ -1378,52 +1394,50 @@ steps: - export VLLM_USE_DEEP_GEMM=0 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - depends_on: - - image-build + depends_on: block-lm-eval-large-models-4-gpush100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: LM Eval Small Models agents: queue: gpu_1_queue @@ -1449,14 +1463,16 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run LM Eval Small Models (B200) + depends_on: image-build + key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: queue: gpu_1_queue commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-lm-eval-small-models-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1636,6 +1652,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run GPT-OSS Eval (B200) + depends_on: image-build + key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: queue: gpu_1_queue @@ -1643,8 +1662,7 @@ steps: - uv pip install --system 
'gpt-oss[eval]==0.0.5' - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - depends_on: - - image-build + depends_on: block-gpt-oss-eval-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1931,6 +1949,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Transformers Nightly Models + depends_on: image-build + key: block-transformers-nightly-models - label: Transformers Nightly Models agents: queue: gpu_1_queue @@ -1945,8 +1966,7 @@ steps: - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - depends_on: - - image-build + depends_on: block-transformers-nightly-models soft_fail: false plugins: - docker#v5.2.0: @@ -1998,6 +2018,9 @@ steps: mount_buildkite_agent: true - group: Models - Language steps: + - block: Run Language Models Test (Extended Generation) + depends_on: image-build + key: block-language-models-test-extended-generation - label: Language Models Test (Extended Generation) agents: queue: gpu_1_queue @@ -2005,8 +2028,7 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - depends_on: - - image-build + depends_on: block-language-models-test-extended-generation soft_fail: false plugins: - docker#v5.2.0: @@ -2024,13 +2046,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (Extended Pooling) + depends_on: image-build + key: block-language-models-test-extended-pooling - label: Language Models Test (Extended Pooling) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/pooling -m 'not core_model' - depends_on: - - image-build + depends_on: block-language-models-test-extended-pooling soft_fail: false plugins: - docker#v5.2.0: @@ -2048,13 +2072,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (MTEB) + depends_on: image-build + key: block-language-models-test-mteb - label: Language Models Test (MTEB) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/pooling_mteb_test - depends_on: - - image-build + depends_on: block-language-models-test-mteb soft_fail: false plugins: - docker#v5.2.0: @@ -2072,13 +2098,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (PPL) + depends_on: image-build + key: block-language-models-test-ppl - label: Language Models Test (PPL) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/generation_ppl_test - depends_on: - - image-build + depends_on: block-language-models-test-ppl soft_fail: false plugins: - docker#v5.2.0: @@ -2176,13 +2204,15 @@ steps: mount_buildkite_agent: true - group: Models - Multimodal steps: + - block: Run Custom Models + depends_on: image-build + key: block-custom-models - label: Custom Models agents: queue: gpu_1_queue commands: - echo 'Testing custom models...' 
- depends_on: - - image-build + depends_on: block-custom-models soft_fail: false plugins: - docker#v5.2.0: @@ -2225,6 +2255,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 1 + depends_on: image-build + key: block-multi-modal-models-extended-1 - label: Multi-Modal Models (Extended) 1 agents: queue: gpu_1_queue @@ -2232,8 +2265,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-1 soft_fail: false plugins: - docker#v5.2.0: @@ -2251,6 +2283,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 2 + depends_on: image-build + key: block-multi-modal-models-extended-2 - label: Multi-Modal Models (Extended) 2 agents: queue: gpu_1_queue @@ -2258,8 +2293,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-2 soft_fail: false plugins: - docker#v5.2.0: @@ -2277,6 +2311,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 3 + depends_on: image-build + key: block-multi-modal-models-extended-3 - label: Multi-Modal Models (Extended) 3 agents: queue: gpu_1_queue @@ -2284,8 +2321,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-3 soft_fail: false plugins: - docker#v5.2.0: @@ -2628,13 +2664,15 @@ steps: mount_buildkite_agent: true - group: Weight Loading steps: + - block: Run Weight Loading Multiple GPU + depends_on: image-build + key: block-weight-loading-multiple-gpu - label: Weight Loading Multiple GPU agents: queue: gpu_4_queue commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - depends_on: - - image-build + depends_on: block-weight-loading-multiple-gpu soft_fail: false plugins: - docker#v5.2.0: @@ -2652,55 +2690,56 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Weight Loading Multiple GPU - Large Models + depends_on: image-build + key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: queue: gpu_4_queue commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - depends_on: - - image-build + depends_on: block-weight-loading-multiple-gpu---large-models soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: 
/root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/buildkite_steps.yaml b/buildkite_steps.yaml deleted file mode 100644 index 7b489a91ef01..000000000000 --- a/buildkite_steps.yaml +++ /dev/null @@ -1,2212 +0,0 @@ -steps: -- group: Attention - steps: - - label: V1 attention (B200) - agents: - queue: gpu_1_queue - commands: - - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 attention (H100) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/attention - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Basic Correctness - steps: - - label: Basic Correctness - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Benchmarks - steps: - - label: Benchmarks - agents: - queue: gpu_1_queue - commands: - - bash scripts/run-benchmarks.sh - soft_fail: false - plugins: - - docker#v5.2.0: 
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Benchmarks CLI Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s benchmarks/ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: CUDA - steps: - - label: Cudagraph - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Platform Tests (CUDA) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s cuda/test_cuda_context.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Compile - steps: - - label: Fusion E2E (2 GPUs)(B200) - agents: - queue: gpu_4_queue - commands: - - nvidia-smi - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Fusion and Compile Tests (B200) - agents: - queue: gpu_1_queue - commands: - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k 'True and not +quant_fp8 and not +rms_norm' - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Distributed - steps: - - label: 2 Node Test (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - 
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Comm Ops - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - 
/dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed NixlConnector PD accuracy (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (2 GPUs)(B200) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (2 GPUs)(H200) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/distributed/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py - --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s 
v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (4 GPUs)(A100) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (8 GPUs)(H100) - agents: - queue: gpu_1_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py - --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Pipeline + Context Parallelism (4 GPUs)) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: E2E Integration - steps: - - label: DeepSeek V2-Lite Accuracy - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh - 0.25 200 8010 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Prime-RL 
Integration (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/run-prime-rl-test.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Qwen3-30B-A3B-FP8-block Accuracy - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh - 0.8 200 8020 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Engine - steps: - - label: Engine - agents: - queue: gpu_1_queue - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - - pytest -v -s tokenization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 e2e + engine - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Entrypoints - steps: - - label: Entrypoints Integration (API Server) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py - --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py - --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - --ignore=entrypoints/openai/tool_parsers/ - - pytest -v -s entrypoints/test_chat_utils.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Integration (LLM) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - - 
pytest -v -s entrypoints/offline_mode - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Integration (Pooling) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Unit Tests - agents: - queue: gpu_1_queue - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai - --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints V1 - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/entrypoints - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: OpenAI API Correctness - agents: - queue: gpu_1_queue - commands: - - pytest -s entrypoints/openai/correctness/ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Expert Parallelism - steps: - - label: EPLB Algorithm - agents: - queue: gpu_1_queue - commands: - - pytest -v -s distributed/test_eplb_algo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: EPLB Execution - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: 
true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Kernels - steps: - - label: Kernels (B200) - agents: - queue: gpu_1_queue - commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Attention Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Core Operation Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels DeepGEMM Test (H100) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - 
propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Mamba Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/mamba - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels MoE Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Quantization Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: LM Eval - steps: - - label: LM Eval Large Models (4 GPUs)(A100) - agents: - queue: gpu_4_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Large Models (4 GPUs)(H100) - agents: - queue: gpu_4_queue - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Small Models - agents: - queue: gpu_1_queue - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: 
true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Small Models (B200) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: LoRA - steps: - - label: LoRA %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py - \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py - \ --ignore=lora/test_qwen3moe_tp.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LoRA TP (Distributed) - agents: - queue: gpu_4_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Miscellaneous - steps: - - label: Async Engine, Inputs, Utils, Worker - agents: - queue: gpu_1_queue - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Async Engine, Inputs, Utils, Worker, Config (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s transformers_utils - - pytest -v -s config - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - 
- NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Examples - agents: - queue: gpu_1_queue - commands: - - pip install tensorizer - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf - --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory - /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m - deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper - --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: GPT-OSS Eval (B200) - agents: - queue: gpu_1_queue - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b - --metric 0.58 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Metrics, Tracing (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' - 'opentelemetry-semantic-conventions-ai>=0.4.1' - - pytest -v -s v1/tracing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Python-only Installation - 
agents: - queue: gpu_1_queue - commands: - - bash standalone_tests/python_only_compile.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Regression - agents: - queue: gpu_1_queue - commands: - - pip install modelscope - - pytest -v -s test_regression.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 Others - agents: - queue: gpu_1_queue - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 Others (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Model Executor - steps: - - label: Model Executor - agents: - queue: gpu_1_queue - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - 
mount_buildkite_agent: true -- group: Models - Basic - steps: - - label: Basic Models Test (Other CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Extra Initialization) %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' - \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Initialization) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Other) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_transformers.py models/test_registry.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Transformers Nightly Models - agents: - queue: gpu_1_queue - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal - or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR - or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py - --model-type whisper - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Distributed - steps: - - label: Distributed 
Model Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m 'distributed(num_gpus=2)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Language - steps: - - label: Language Models Test (Extended Generation) - agents: - queue: gpu_1_queue - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (Extended Pooling) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (MTEB) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/pooling_mteb_test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (PPL) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/generation_ppl_test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Extra Standard) %N - agents: - queue: gpu_1_queue - commands: - - pip 
freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Hybrid) %N - agents: - queue: gpu_1_queue - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Standard) - agents: - queue: gpu_1_queue - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Multimodal - steps: - - label: Custom Models - agents: - queue: gpu_1_queue - commands: - - echo 'Testing custom models...' 
- soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Accuracy Eval (Small Models) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 1 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py - --ignore models/multimodal/processing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 2 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) - and not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 3 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) - and not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Standard) - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py - --ignore models/multimodal/processing - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py - -m core_model - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Processor - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Plugins - steps: - - label: Plugin Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py - - pytest -v -s models/test_oot_registration.py - - pytest -v -s plugins/lora_resolvers - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: PyTorch - steps: - - label: PyTorch Compilation Unit Tests - agents: - queue: gpu_1_queue - commands: - - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: PyTorch Fullgraph - agents: - queue: gpu_1_queue - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 - and not Llama-4' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - 
/fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: PyTorch Fullgraph Smoke Test - agents: - queue: gpu_1_queue - commands: - - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec - pytest -s -v {} \\; - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Pytorch Nightly Dependency Override Check - agents: - queue: gpu_1_queue - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Quantization - steps: - - label: Quantization - agents: - queue: gpu_1_queue - commands: - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Quantized MoE Test (B200) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Samplers - steps: - - label: Samplers Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Tool use - steps: - - label: OpenAI-Compatible Tool Use - agents: - queue: gpu_1_queue - commands: - - pytest -v -s -m 'not cpu_test' tool_use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: 
OpenAI-Compatible Tool Use (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s -m 'cpu_test' tool_use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Weight Loading - steps: - - label: Weight Loading Multiple GPU - agents: - queue: gpu_4_queue - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Weight Loading Multiple GPU - Large Models - agents: - queue: gpu_4_queue - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true From ac8206580de8ec1310eb82f46a23708ff61f1771 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 03:21:49 -0800 Subject: [PATCH 09/24] depends_on for build job Signed-off-by: Kevin H. 
Luu --- .buildkite/image_build/image_build.yaml | 6 +++++- .buildkite/pipeline.yaml | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 26ec10bc8f8d..af23621a598c 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -2,7 +2,7 @@ group: Abuild steps: - label: ":docker: Build image" key: image-build - depends_on: ~ + depends_on: [] commands: - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -16,6 +16,7 @@ steps: - label: ":docker: Build CPU image" key: image-build-cpu + depends_on: [] commands: - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -29,6 +30,7 @@ steps: - label: ":docker: Build CUDA 11.8 image" key: image-build-cu118 + optional: true commands: - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -42,6 +44,7 @@ steps: - label: ":docker: Build HPU image" soft_fail: true + depends_on: [] key: image-build-hpu commands: - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT @@ -56,6 +59,7 @@ steps: - label: ":docker: Build CPU arm64 image" key: cpu-arm64-image-build + depends_on: [] optional: true commands: - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 471f24d74b75..97580e597409 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2,10 +2,10 @@ steps: - group: Abuild steps: - block: 'Run :docker: Build CPU arm64 image' - depends_on: image-build + depends_on: [] key: block--docker--build-cpu-arm64-image - label: ':docker: Build CPU arm64 image' - key: image-build-cpu-arm64 + key: cpu-arm64-image-build agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -13,6 +13,14 @@ steps: 123 depends_on: block--docker--build-cpu-arm64-image soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' - label: ':docker: Build CPU image' key: image-build-cpu agents: @@ -20,6 +28,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: false retry: automatic: @@ -29,6 +38,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build CUDA 11.8 image' + depends_on: [] + key: block--docker--build-cuda-11.8-image - label: ':docker: Build CUDA 11.8 image' key: image-build-cu118 agents: @@ -36,6 +48,7 @@ steps: commands: - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: block--docker--build-cuda-11.8-image soft_fail: false retry: automatic: @@ -52,6 +65,7 @@ steps: commands: - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: true retry: automatic: @@ -68,6 +82,7 @@ steps: commands: - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: false retry: automatic: From 8b886aa54d00956688c1146c66999dadd17d598c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 13:37:33 -0800 Subject: [PATCH 10/24] Revert "[CI] fix url-encoding behavior in nightly metadata generation (#29787)" This reverts commit 37593deb02423826e9206ff28e77f57a0ff8a0b0. 
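For context on what this revert removes: the url-encoding change percent-encoded the '+' in wheel filenames when emitting index hrefs for S3/CloudFront hosting, while leaving '/' and already-encoded '%2B' untouched. A minimal Python sketch of that behavior (the filename below is only an example, not a real artifact):

```python
# Illustrative only: the encoding applied to wheel paths by the reverted change.
# quote() with safe=":%/" escapes '+' as '%2B' but leaves '/' and any
# pre-encoded '%2B' alone, so hrefs are not double-encoded.
from urllib.parse import quote

path = "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl"
print(quote(path, safe=":%/"))
# -> ../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl
```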
--- .buildkite/scripts/generate-nightly-index.py | 11 +++---- setup.py | 33 ++++++++------------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 90286ad4c6e1..a61f08107647 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -112,12 +112,11 @@ def generate_package_index_and_metadata( relative_path = ( wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename ) - # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B' - # NOTE: this is AWS S3 specific behavior! - file_path_quoted = quote(relative_path.as_posix(), safe=":%/") - href_tags.append(f' {file.filename}
') + href_tags.append( + f' {file.filename}
' + ) file_meta = asdict(file) - file_meta["path"] = file_path_quoted + file_meta["path"] = relative_path.as_posix() metadata.append(file_meta) index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) metadata_str = json.dumps(metadata, indent=2) @@ -186,7 +185,7 @@ def generate_index_and_metadata( "platform_tag": "manylinux2014_aarch64", "variant": "cu129", "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", - "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded + "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL }, ... ] diff --git a/setup.py b/setup.py index 67fbebb1d37b..67226b4447c7 100644 --- a/setup.py +++ b/setup.py @@ -319,17 +319,14 @@ class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" @staticmethod - def extract_precompiled_and_patch_package( - wheel_url_or_path: str, download_filename: str | None - ) -> dict: + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: import tempfile import zipfile temp_dir = None try: if not os.path.isfile(wheel_url_or_path): - # use provided filename first, then derive from URL - wheel_filename = download_filename or wheel_url_or_path.split("/")[-1] + wheel_filename = wheel_url_or_path.split("/")[-1] temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") wheel_path = os.path.join(temp_dir, wheel_filename) print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}") @@ -676,8 +673,7 @@ def _fetch_metadata_for_variant( wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) if wheel_location is not None: wheel_url = wheel_location - download_filename = None - logger.info("Using user-specified precompiled wheel location: %s", wheel_url) + logger.info("Using user-specified precompiled wheel location: {}", wheel_url) else: import platform @@ -690,17 +686,17 @@ def _fetch_metadata_for_variant( precompiled_wheel_utils.get_base_commit_in_main_branch(), ) logger.info( - "Using precompiled wheel commit %s with variant %s", commit, variant + "Using precompiled wheel commit {} with variant {}", commit, variant ) try_default = False - wheels, repo_url, download_filename = None, None, None + wheels, repo_url = None, None try: wheels, repo_url = _fetch_metadata_for_variant(commit, variant) - except Exception: + except Exception as e: logger.warning( - "Failed to fetch precompiled wheel metadata for variant %s", + "Failed to fetch precompiled wheel metadata for variant {}", variant, - exc_info=True, + exc_info=e, ) try_default = True # try outside handler to keep the stacktrace simple if try_default: @@ -721,29 +717,26 @@ def _fetch_metadata_for_variant( "platform_tag": "manylinux1_x86_64", "variant": null, "filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", -"path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" +"path": "../vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" }, ...]""" for wheel in wheels: - # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc) if wheel.get("package_name") == "vllm" and arch in wheel.get( "platform_tag", "" ): - logger.info("Found precompiled wheel metadata: %s", wheel) + logger.info("Found precompiled wheel metadata: {}", wheel) if "path" not in wheel: raise ValueError(f"Wheel metadata missing path: {wheel}") + # TODO: maybe check more compatibility later? 
(python_tag, abi_tag, etc) wheel_url = repo_url + wheel["path"] - download_filename = wheel.get("filename") - logger.info("Using precompiled wheel URL: %s", wheel_url) + logger.info("Using precompiled wheel URL: {}", wheel_url) break else: raise ValueError( f"No precompiled vllm wheel found for architecture {arch} " f"from repo {repo_url}. All available wheels: {wheels}" ) - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( - wheel_url, download_filename - ) + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) From 95b4cdf3f1675300d4574c00e8b1451640b4bc93 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 13:37:43 -0800 Subject: [PATCH 11/24] Revert "[CI] Renovation of nightly wheel build & generation (#29690)" This reverts commit 36db0a35e45f32f7c37f6f1967dc8d6ff301d882. --- .buildkite/generate_index.py | 46 +++ .buildkite/release-pipeline.yaml | 16 +- .buildkite/scripts/generate-nightly-index.py | 368 ------------------ .buildkite/scripts/upload-wheels.sh | 121 +++--- docs/getting_started/installation/cpu.md | 15 +- .../installation/gpu.cuda.inc.md | 73 ++-- docs/getting_started/installation/gpu.md | 2 +- setup.py | 101 ++--- vllm/envs.py | 7 +- 9 files changed, 181 insertions(+), 568 deletions(-) create mode 100644 .buildkite/generate_index.py delete mode 100644 .buildkite/scripts/generate-nightly-index.py diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py new file mode 100644 index 000000000000..bbed80ebe847 --- /dev/null +++ b/.buildkite/generate_index.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import os + +template = """ + + +

+        <h1>Links for vLLM</h1>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+ + +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--wheel", help="The wheel path.", required=True) +args = parser.parse_args() + +filename = os.path.basename(args.wheel) + +with open("index.html", "w") as f: + print(f"Generated index.html for {args.wheel}") + # sync the abi tag with .buildkite/scripts/upload-wheels.sh + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64").replace( + "manylinux1", "manylinux2014" + ) + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64").replace( + "manylinux2014", "manylinux1" + ) + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") + # cloudfront requires escaping the '+' character + f.write( + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) + ) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index fbfc923998f8..38c400ba1faf 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -8,7 +8,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -30,6 +30,19 @@ steps: DOCKER_BUILDKIT: "1" # x86 + CUDA builds + - label: "Build wheel - CUDA 12.8" + depends_on: ~ + id: build-wheel-cuda-12-8 + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + - label: "Build wheel - CUDA 12.9" depends_on: ~ id: build-wheel-cuda-12-9 @@ -96,6 +109,7 @@ steps: - label: "Annotate release workflow" depends_on: - create-multi-arch-manifest + - build-wheel-cuda-12-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py deleted file mode 100644 index a61f08107647..000000000000 --- a/.buildkite/scripts/generate-nightly-index.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# do not complain about line length (for docstring) -# ruff: noqa: E501 - -import argparse -import json -import re -import sys -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any -from urllib.parse import quote - -if not sys.version_info >= (3, 10): - raise RuntimeError("This script requires Python 3.10 or higher.") - -INDEX_HTML_TEMPLATE = """ - - - -{items} - - -""" - - -@dataclass -class WheelFileInfo: - package_name: str - version: str - build_tag: str | None - python_tag: str - abi_tag: str - platform_tag: str - variant: str | None - filename: str - - -def parse_from_filename(file: str) -> WheelFileInfo: - """ - Parse wheel file name to extract metadata. - - The format of wheel names: - {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl - All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). - Example: - vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl - vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl - vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl - vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl - """ - wheel_file_re = re.compile( - r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" - ) - match = wheel_file_re.match(file) - if not match: - raise ValueError(f"Invalid wheel file name: {file}") - - package_name = match.group("package_name") - version = match.group("version") - build_tag = match.group("build_tag") - python_tag = match.group("python_tag") - abi_tag = match.group("abi_tag") - platform_tag = match.group("platform_tag") - - # extract variant from version - variant = None - if "dev" in version: - ver_after_dev = version.split("dev")[-1] - if "." in ver_after_dev: - variant = ver_after_dev.split(".")[-1] - version = version.removesuffix("." + variant) - else: - if "+" in version: - version, variant = version.split("+") - - return WheelFileInfo( - package_name=package_name, - version=version, - build_tag=build_tag, - python_tag=python_tag, - abi_tag=abi_tag, - platform_tag=platform_tag, - variant=variant, - filename=file, - ) - - -def generate_project_list(subdir_names: list[str]) -> str: - """ - Generate project list HTML content linking to each project & variant sub-directory. - """ - href_tags = [] - for name in sorted(subdir_names): - name = name.strip("/").strip(".") - href_tags.append(f' {name}/
') - return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) - - -def generate_package_index_and_metadata( - wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path -) -> tuple[str, str]: - """ - Generate package index HTML content for a specific package, linking to actual wheel files. - """ - href_tags = [] - metadata = [] - for file in sorted(wheel_files, key=lambda x: x.filename): - relative_path = ( - wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename - ) - href_tags.append( - f' {file.filename}
' - ) - file_meta = asdict(file) - file_meta["path"] = relative_path.as_posix() - metadata.append(file_meta) - index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) - metadata_str = json.dumps(metadata, indent=2) - return index_str, metadata_str - - -def generate_index_and_metadata( - whl_files: list[str], - wheel_base_dir: Path, - index_base_dir: Path, - default_variant: str | None = None, - alias_to_default: str | None = None, -): - """ - Generate index for all wheel files. - - Args: - whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). - wheel_base_dir (Path): Base directory for wheel files. - index_base_dir (Path): Base directory to store index files. - default_variant (str | None): The default variant name, if any. - alias_to_default (str | None): Alias variant name for the default variant, if any. - - First, parse all wheel files to extract metadata. - We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). - The index for the default variant (if any) is generated in the root index directory. - - If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index - is purely a copy of the corresponding variant index, with only the links adjusted. - Otherwise, all wheels without variant suffixes are treated as the default variant. - - If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content - as the default variant index, but the links are adjusted accordingly. - - Index directory structure: - index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) - index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories - vllm/ - index.html # package index, pointing to actual files in wheel_base_dir (relative path) - metadata.json # machine-readable metadata for all wheels in this package - cpu/ # cpu variant sub-directory - index.html - vllm/ - index.html - metadata.json - cu129/ # cu129 is actually the alias to default variant - index.html - vllm/ - index.html - metadata.json - cu130/ # cu130 variant sub-directory - index.html - vllm/ - index.html - metadata.json - ... - - metadata.json stores a dump of all wheel files' metadata in a machine-readable format: - [ - { - "package_name": "vllm", - "version": "0.10.2rc2", - "build_tag": null, - "python_tag": "cp38", - "abi_tag": "abi3", - "platform_tag": "manylinux2014_aarch64", - "variant": "cu129", - "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", - "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL - }, - ... - ] - """ - - parsed_files = [parse_from_filename(f) for f in whl_files] - - if not parsed_files: - print("No wheel files found, skipping index generation.") - return - - # Group by variant - variant_to_files: dict[str, list[WheelFileInfo]] = {} - for file in parsed_files: - variant = file.variant or "default" - if variant not in variant_to_files: - variant_to_files[variant] = [] - variant_to_files[variant].append(file) - - print(f"Found variants: {list(variant_to_files.keys())}") - - # sanity check for default variant - if default_variant: - if "default" in variant_to_files: - raise ValueError( - "All wheel files must have variant suffixes when `default_variant` is specified." - ) - if default_variant not in variant_to_files: - raise ValueError( - f"Default variant '{default_variant}' not found among wheel files." 
- ) - - if alias_to_default: - if "default" not in variant_to_files: - # e.g. only some wheels are uploaded to S3 currently - print( - "[WARN] Alias to default variant specified, but no default variant found." - ) - elif alias_to_default in variant_to_files: - raise ValueError( - f"Alias variant name '{alias_to_default}' already exists among wheel files." - ) - else: - variant_to_files[alias_to_default] = variant_to_files["default"].copy() - print(f"Alias variant '{alias_to_default}' created for default variant.") - - # Generate index for each variant - subdir_names = set() - for variant, files in variant_to_files.items(): - if variant == "default": - variant_dir = index_base_dir - else: - variant_dir = index_base_dir / variant - subdir_names.add(variant) - - variant_dir.mkdir(parents=True, exist_ok=True) - - # gather all package names in this variant - packages = set(f.package_name for f in files) - if variant == "default": - # these packages should also appear in the "project list" - # generate after all variants are processed - subdir_names = subdir_names.union(packages) - else: - # generate project list for this variant directly - project_list_str = generate_project_list(sorted(packages)) - with open(variant_dir / "index.html", "w") as f: - f.write(project_list_str) - - for package in packages: - # filter files belonging to this package only - package_files = [f for f in files if f.package_name == package] - package_dir = variant_dir / package - package_dir.mkdir(parents=True, exist_ok=True) - index_str, metadata_str = generate_package_index_and_metadata( - package_files, wheel_base_dir, package_dir - ) - with open(package_dir / "index.html", "w") as f: - f.write(index_str) - with open(package_dir / "metadata.json", "w") as f: - f.write(metadata_str) - - # Generate top-level project list index - project_list_str = generate_project_list(sorted(subdir_names)) - with open(index_base_dir / "index.html", "w") as f: - f.write(project_list_str) - - -if __name__ == "__main__": - """ - Arguments: - --version : version string for the current build (e.g., commit hash) - --current-objects : path to JSON file containing current S3 objects listing in this version directory - --output-dir : directory to store generated index files - --alias-to-default : (optional) alias variant name for the default variant - """ - - parser = argparse.ArgumentParser( - description="Process nightly build wheel files to generate indices." 
- ) - parser.add_argument( - "--version", - type=str, - required=True, - help="Version string for the current build (e.g., commit hash)", - ) - parser.add_argument( - "--current-objects", - type=str, - required=True, - help="Path to JSON file containing current S3 objects listing in this version directory", - ) - parser.add_argument( - "--output-dir", - type=str, - required=True, - help="Directory to store generated index files", - ) - parser.add_argument( - "--alias-to-default", - type=str, - default=None, - help="Alias variant name for the default variant", - ) - - args = parser.parse_args() - - version = args.version - if "/" in version or "\\" in version: - raise ValueError("Version string must not contain slashes.") - current_objects_path = Path(args.current_objects) - output_dir = Path(args.output_dir) - if not output_dir.exists(): - output_dir.mkdir(parents=True, exist_ok=True) - - # Read current objects JSON - with open(current_objects_path) as f: - current_objects: dict[str, list[dict[str, Any]]] = json.load(f) - - # current_objects looks like from list_objects_v2 S3 API: - """ - "Contents": [ - { - "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl", - "LastModified": "2025-11-28T14:00:32+00:00", - "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"", - "ChecksumAlgorithm": [ - "CRC64NVME" - ], - "ChecksumType": "FULL_OBJECT", - "Size": 435649349, - "StorageClass": "STANDARD" - }, - ... - ] - """ - - # Extract wheel file keys - wheel_files = [] - for item in current_objects.get("Contents", []): - key: str = item["Key"] - if key.endswith(".whl"): - wheel_files.append(key.split("/")[-1]) # only the filename is used - - print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - - # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ - # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version - index_base_dir = Path(output_dir) - - generate_index_and_metadata( - whl_files=wheel_files, - wheel_base_dir=wheel_base_dir, - index_base_dir=index_base_dir, - default_variant=None, - alias_to_default=args.alias_to_default, - ) - print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 05accb9cf16d..945c5e48c009 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,28 +2,6 @@ set -ex -# ======== part 0: setup ======== - -BUCKET="vllm-wheels" -INDICES_OUTPUT_DIR="indices" -DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py -PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3 -SUBPATH=$BUILDKITE_COMMIT -S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" - -# detect if python3.10+ is available -has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,10) else 0)") -if [[ "$has_new_python" -eq 0 ]]; then - # use new python from docker - docker pull python:3-slim - PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" -fi - -echo "Using python interpreter: $PYTHON" -echo "Python version: $($PYTHON --version)" - -# ========= part 1: collect, rename & upload the wheel ========== - # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -32,69 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" exit 1 fi + 
+# Get the single wheel file wheel="${wheel_files[0]}" -# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 -# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels -manylinux_version="manylinux_2_31" +# Detect architecture and rename 'linux' to appropriate manylinux version +arch=$(uname -m) +if [[ $arch == "x86_64" ]]; then + manylinux_version="manylinux1" +elif [[ $arch == "aarch64" ]]; then + manylinux_version="manylinux2014" +else + echo "Warning: Unknown architecture $arch, using manylinux1 as default" + manylinux_version="manylinux1" +fi # Rename 'linux' to the appropriate manylinux version in the wheel filename -if [[ "$wheel" != *"linux"* ]]; then - echo "Error: Wheel filename does not contain 'linux': $wheel" - exit 1 -fi new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" -echo "Renamed wheel to: $wheel" # Extract the version from the wheel version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) -echo "Version in wheel: $version" -pure_version="${version%%+*}" -echo "Pure version (without variant): $pure_version" - -# copy wheel to its own bucket -aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" +echo "Version: $version" + +normal_wheel="$wheel" # Save the original wheel filename + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + suffix="${version##*.}" + if [[ $suffix == cu* ]]; then + new_version="1.0.0.dev+${suffix}" + else + new_version="1.0.0.dev" + fi + new_wheel="${wheel/$version/$new_version}" + # use cp to keep both files in the artifacts directory + cp -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi -# ========= part 2: generate and upload indices ========== -# generate indices for all existing wheels in the commit directory -# this script might be run multiple times if there are multiple variants being built -# so we need to guarantee there is little chance for "TOCTOU" issues -# i.e., one process is generating indices while another is uploading a new wheel -# so we need to ensure no time-consuming operations happen below +# Upload the wheel to S3 +python3 .buildkite/generate_index.py --wheel "$normal_wheel" -# list all wheels in the commit directory -echo "Existing wheels on S3:" -aws s3 ls "$S3_COMMIT_PREFIX" -obj_json="objects.json" -aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" -mkdir -p "$INDICES_OUTPUT_DIR" +# generate index for this commit +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -# call script to generate indicies for all existing wheels -# this indices have relative paths that could work as long as it is next to the wheel directory in s3 -# i.e., the wheels are always in s3://vllm-wheels// -# and indices can be placed in //, or /nightly/, or // -if [[ ! 
-z "$DEFAULT_VARIANT_ALIAS" ]]; then - alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" +if [[ $normal_wheel == *"cu129"* ]]; then + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 + aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" + aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" else - alias_arg="" + echo "Skipping index files for non-cu129 wheels" fi -$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg - -# copy indices to // unconditionally -echo "Uploading indices to $S3_COMMIT_PREFIX" -aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" +# generate index for nightly +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -# copy to /nightly/ only if it is on the main branch and not a PR -if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then - echo "Uploading indices to overwrite /nightly/" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" +if [[ $normal_wheel == *"cu129"* ]]; then + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 + aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +else + echo "Skipping index files for non-cu129 wheels" fi -# copy to // only if it does not have "dev" in the version -if [[ "$version" != *"dev"* ]]; then - echo "Uploading indices to overwrite /$pure_version/" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" -fi +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 18dc6d19434b..d1beab7855b1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -46,23 +46,10 @@ vLLM is a Python library that supports the following CPU variants. Select your C ### Pre-built wheels -Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels). - -When specifying the index URL, please make sure to use the `cpu` variant subdirectory. -For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`. +Currently, there are no pre-built CPU wheels. ### Build wheel from source -#### Set up using Python-only build (without compilation) {#python-only-build} - -Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with: - -```bash -VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable . -``` - -#### Full build (with compilation) {#full-build} - === "Intel/AMD x86" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source" diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index ad26672f8092..601d3659af88 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -26,50 +26,43 @@ uv pip install vllm --torch-backend=auto ??? console "pip" ```bash - # Install vLLM with CUDA 12.9. - pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129 + # Install vLLM with CUDA 12.8. 
+ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 ``` -We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first. +We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. -As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```bash -# Install vLLM with a specific CUDA version (e.g., 13.0). +# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') -export CUDA_VERSION=130 # or other -uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} +export CUDA_VERSION=118 # or 126 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} ``` #### Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on . There are multiple indices that could be used: - -* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9. -* `https://wheels.vllm.ai/nightly/`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency. - -To install from nightly index, run: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. ```bash uv pip install -U vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed + --extra-index-url https://wheels.vllm.ai/nightly ``` -!!! warning "`pip` caveat" - - Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). - - If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page). - +??? console "pip" ```bash - pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!) - pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit + pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly ``` + `--pre` is required for `pip` to consider pre-released versions. + ##### Install specific revisions If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: @@ -78,13 +71,33 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch uv pip install vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed + --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` +The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. + +??? note "pip" + If you want to access the wheels for previous commits (e.g. to bisect the behavior change, + performance regression), due to the limitation of `pip`, you have to specify the full URL of the + wheel file by embedding the commit hash in the URL: + + ```bash + export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + ``` + + Note that the wheels are built with Python 3.8 ABI (see [PEP + 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible + with Python 3.8 and later**. 
The version string in the wheel file name (`1.0.0.dev`) is just a + placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in + the wheel metadata (the wheels listed in the extra index url have correct versions). Although we + don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the + wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -#### Set up using Python-only build (without compilation) {#python-only-build} +#### Set up using Python-only build (without compilation) If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: @@ -108,24 +121,18 @@ This command will do the following: In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. ```bash -export VLLM_PRECOMPILED_WHEEL_COMIMT=$(git rev-parse HEAD~1) # or earlier commit on main -export VLLM_USE_PRECOMPILED=1 +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl uv pip install --editable . ``` -There are more environment variables to control the behavior of Python-only build: - -* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped. -* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch. -* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index. - You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code). !!! note There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel. -#### Full build (with compilation) {#full-build} +#### Full build (with compilation) If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index fb750f449985..bc7508b29475 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. 
Select your G --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" -### Pre-built wheels {#pre-built-wheels} +### Pre-built wheels === "NVIDIA CUDA" diff --git a/setup.py b/setup.py index 67226b4447c7..0022e7fe0bf3 100644 --- a/setup.py +++ b/setup.py @@ -310,6 +310,9 @@ def run(self): class precompiled_build_ext(build_ext): """Disables extension building when using precompiled binaries.""" + def run(self) -> None: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + def build_extensions(self) -> None: print("Skipping build_ext: using precompiled extensions.") return @@ -645,97 +648,37 @@ def _read_requirements(filename: str) -> list[str]: ] } - -def _fetch_metadata_for_variant( - commit: str, variant: str | None -) -> tuple[list[dict], str]: - variant_dir = f"{variant}/" if variant is not None else "" - repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/" - meta_url = repo_url + "metadata.json" - logger.info("Trying to fetch metadata from {}", meta_url) - from urllib.request import urlopen - - with urlopen(meta_url) as resp: - # urlopen raises HTTPError on unexpected status code - wheels = json.loads(resp.read().decode("utf-8")) - return wheels, repo_url - - # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: - # Attempts: - # 1. user-specified wheel location (can be either local or remote, via - # VLLM_PRECOMPILED_WHEEL_LOCATION) - # 2. user-specified variant from nightly repo (current main commit via - # VLLM_PRECOMPILED_WHEEL_VARIANT) - # 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo - # 4. the default variant from nightly repo (current main commit) + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) if wheel_location is not None: wheel_url = wheel_location - logger.info("Using user-specified precompiled wheel location: {}", wheel_url) else: import platform arch = platform.machine() - # try to fetch the wheel metadata from the nightly wheel repo - main_variant = envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") - variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant) - commit = os.getenv( - "VLLM_PRECOMPILED_WHEEL_COMMIT", - precompiled_wheel_utils.get_base_commit_in_main_branch(), - ) - logger.info( - "Using precompiled wheel commit {} with variant {}", commit, variant + if arch == "x86_64": + wheel_tag = "manylinux1_x86_64" + elif arch == "aarch64": + wheel_tag = "manylinux2014_aarch64" + else: + raise ValueError(f"Unsupported architecture: {arch}") + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" + nightly_wheel_url = ( + f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" ) - try_default = False - wheels, repo_url = None, None + from urllib.request import urlopen + try: - wheels, repo_url = _fetch_metadata_for_variant(commit, variant) + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = nightly_wheel_url except Exception as e: - logger.warning( - "Failed to fetch precompiled wheel metadata for variant {}", - variant, - exc_info=e, - ) - try_default = True # try outside handler to keep the stacktrace simple - if try_default: - logger.info("Trying the default variant") - wheels, repo_url = _fetch_metadata_for_variant(commit, None) - # if this also fails, then we have nothing more 
to try / cache - assert wheels is not None and repo_url is not None, ( - "Failed to fetch precompiled wheel metadata" - ) - # The metadata.json has the following format: - # see .buildkite/scripts/generate-nightly-index.py for details - """[{ -"package_name": "vllm", -"version": "0.11.2.dev278+gdbc3d9991", -"build_tag": null, -"python_tag": "cp38", -"abi_tag": "abi3", -"platform_tag": "manylinux1_x86_64", -"variant": null, -"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", -"path": "../vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" -}, -...]""" - for wheel in wheels: - if wheel.get("package_name") == "vllm" and arch in wheel.get( - "platform_tag", "" - ): - logger.info("Found precompiled wheel metadata: {}", wheel) - if "path" not in wheel: - raise ValueError(f"Wheel metadata missing path: {wheel}") - # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc) - wheel_url = repo_url + wheel["path"] - logger.info("Using precompiled wheel URL: {}", wheel_url) - break - else: - raise ValueError( - f"No precompiled vllm wheel found for architecture {arch} " - f"from repo {repo_url}. All available wheels: {wheels}" - ) + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = nightly_wheel_url + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) diff --git a/vllm/envs.py b/vllm/envs.py index d0912863e644..46f1aa3222be 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" - VLLM_MAIN_CUDA_VERSION: str = "12.9" + VLLM_MAIN_CUDA_VERSION: str = "12.8" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -445,9 +445,10 @@ def get_vllm_port() -> int | None: # Target device of vLLM, supporting [cuda (by default), # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), - # Main CUDA version of vLLM. This follows PyTorch but can be overridden. + # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], + # 12.8 is the default. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() - or "12.9", + or "12.8", # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), From d4d268cb707b0ecfacaf52dd5a937f810fe6ec98 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 15:24:46 -0800 Subject: [PATCH 12/24] sync Signed-off-by: Kevin H. 
Luu
---
 .buildkite/test_areas/distributed.yaml       |  9 ++++++---
 .buildkite/test_areas/engine.yaml            |  5 +----
 .buildkite/test_areas/misc.yaml              |  4 +++-
 .buildkite/test_areas/models_basic.yaml      |  5 +++--
 .buildkite/test_areas/models_multimodal.yaml | 12 +++++++++++-
 .buildkite/test_areas/quantization.yaml      |  1 +
 6 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index e6ae13b8156d..1328ecec1b16 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -39,6 +39,7 @@ steps:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -84,6 +85,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -101,6 +103,7 @@ steps:
   - popd
 
 - label: Distributed Tests (8 GPUs)(H100)
+  optional: true
   timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
@@ -138,11 +141,11 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index e4d12f3453f1..a028e0e4af4c 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -3,19 +3,16 @@ depends_on:
 - image-build
 steps:
 - label: Engine
-  timeout_in_minutes: 40
+  timeout_in_minutes: 15
   source_file_dependencies:
   - vllm/
   - tests/engine
-  - tests/tokenization
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
   - tests/test_vllm_port
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization
 
 - label: V1 e2e + engine
   timeout_in_minutes: 45
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index ec719825b377..e4182005bb45 100644
---
a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -111,13 +111,14 @@ steps: - label: Async Engine, Inputs, Utils, Worker, Config (CPU) depends_on: ~ - timeout_in_minutes: 10 + timeout_in_minutes: 20 source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ - tests/transformers_utils - tests/config no_gpu: true @@ -126,6 +127,7 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index ceddf841f87a..9b7f574a95c3 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -52,11 +52,12 @@ steps: - label: Transformers Nightly Models working_dir: "/vllm-workspace/" optional: true + soft_fail: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 68e5e485c316..fc24068c20a4 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -13,6 +13,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Processor # 44min timeout_in_minutes: 60 source_file_dependencies: @@ -20,7 +30,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pytest -v -s models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Accuracy Eval (Small Models) # 50min timeout_in_minutes: 70 diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index cff4a7189806..02a836b90bdf 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -17,6 +17,7 @@ steps: # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: Quantized MoE Test (B200) From 1ad5b4dff3c10ca70ea5a5394b3faed20c51e5cc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 15:32:05 -0800 Subject: [PATCH 13/24] sync Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 1328ecec1b16..30a1002b701b 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -39,7 +39,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py From c1629aa6c593dbdfa743324864192939c4cf7045 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 16:54:03 -0800 Subject: [PATCH 14/24] fix long command Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 1157 ++++++++++++-------- .buildkite/test_areas/lora.yaml | 11 +- .buildkite/test_areas/models_basic.yaml | 5 +- .buildkite/test_areas/models_language.yaml | 9 +- 4 files changed, 691 insertions(+), 491 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 97580e597409..cb6abecd38c9 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -40,7 +40,7 @@ steps: DOCKER_BUILDKIT: '1' - block: 'Run :docker: Build CUDA 11.8 image' depends_on: [] - key: block--docker--build-cuda-11.8-image + key: block--docker--build-cuda-11-8-image - label: ':docker: Build CUDA 11.8 image' key: image-build-cu118 agents: @@ -48,7 +48,7 @@ steps: commands: - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: block--docker--build-cuda-11.8-image + depends_on: block--docker--build-cuda-11-8-image soft_fail: false retry: automatic: @@ -96,32 +96,37 @@ steps: steps: - label: V1 attention (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: V1 attention (H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/attention depends_on: - image-build @@ -130,14 +135,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -174,6 +180,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py @@ -183,10 +192,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + 
propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -203,16 +211,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite - bash scripts/run-benchmarks.sh depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -227,16 +237,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s benchmarks/ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -253,6 +265,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - pytest -v -s v1/cudagraph/test_cudagraph_mode.py depends_on: @@ -260,10 +275,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -278,16 +292,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s cuda/test_cuda_context.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -305,116 +321,111 @@ steps: key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: - queue: gpu_4_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - pytest -v -s tests/compile/distributed/test_fusions_e2e.py depends_on: block-fusion-e2e-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Fusion and Compile Tests (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || 
true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k 'True and not +quant_fp8 and not +rms_norm' + -k "True and not +quant_fp8 and not +rms_norm" - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Distributed steps: - label: 2 Node Test (4 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node + test passed" - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' + distributed/test_node_count.py | grep "Node count test passed" - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node + test passed" - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' + distributed/test_node_count.py | grep "Node count test passed" - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code depends_on: - image-build soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - label: Distributed (2 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep 'Same node test passed' + | grep "Same node test passed" - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep 'Same node test passed' + distributed/test_same_node.py | grep "Same node test passed" - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -423,10 +434,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -441,6 +451,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py @@ -450,10 +463,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -468,6 +480,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh depends_on: @@ -475,10 +490,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -494,8 +508,11 @@ steps: key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: - queue: gpu_4_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py @@ -503,32 +520,35 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - 
HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - block: Run Distributed Tests (2 GPUs)(H200) depends_on: image-build key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: - queue: gpu_4_queue + queue: skylab-h200 commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/distributed/test_sequence_parallel.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py + -k "not Llama-4" + - VLLM_TEST_CLEAN_GPU_MEMORY=1pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py @@ -538,24 +558,27 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true gpus: all environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Distributed Tests (4 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -565,6 +588,7 @@ steps: - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py @@ -583,10 +607,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -602,11 +625,14 @@ steps: key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - 
cd /vllm-workspace/tests - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - pytest -v -s -x lora/test_mixtral.py depends_on: block-distributed-tests-4-gpusa100 soft_fail: false @@ -615,14 +641,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -653,28 +680,34 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Distributed Tests (8 GPUs)(H100) + depends_on: image-build + key: block-distributed-tests-8-gpush100 - label: Distributed Tests (8 GPUs)(H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - depends_on: - - image-build + depends_on: block-distributed-tests-8-gpush100 soft_fail: false plugins: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -709,6 +742,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py depends_on: @@ -716,10 +752,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -737,8 +772,11 @@ steps: key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: - queue: gpu_4_queue + queue: 
mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 depends_on: block-deepseek-v2-lite-accuracy @@ -747,14 +785,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -792,15 +831,17 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/run-prime-rl-test.sh depends_on: block-prime-rl-integration-2-gpus soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -816,8 +857,11 @@ steps: key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: - queue: gpu_4_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 depends_on: block-qwen3-30b-a3b-fp8-block-accuracy @@ -826,14 +870,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -870,17 +915,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - - pytest -v -s tokenization depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - 
VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -895,6 +941,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/e2e - pytest -v -s v1/engine depends_on: @@ -902,10 +951,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -922,6 +970,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py @@ -934,10 +985,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -952,6 +1002,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py @@ -961,10 +1014,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -979,6 +1031,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling depends_on: @@ -986,10 +1041,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1004,6 +1058,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling @@ -1012,10 +1069,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ 
-1030,16 +1086,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/entrypoints depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1054,16 +1112,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s entrypoints/openai/correctness/ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1080,16 +1140,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_algo.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1104,6 +1166,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py depends_on: @@ -1111,10 +1176,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1129,16 +1193,19 @@ steps: steps: - label: Kernels (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - python3 examples/offline_inference/basic/chat.py - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k "not num_heads2" - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k "fp8" - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -1155,34 +1222,35 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Kernels Attention Test %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1193,20 +1261,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Kernels Core Operation Test agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/core kernels/test_top_k_per_row.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1219,8 +1290,11 @@ steps: mount_buildkite_agent: true - label: Kernels DeepGEMM Test (H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py @@ -1232,14 +1306,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1274,16 +1349,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/mamba depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - 
propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1298,16 +1375,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1318,20 +1397,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Kernels Quantization Test %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1342,6 +1424,7 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - group: LM Eval steps: - block: Run LM Eval Large Models (4 GPUs)(A100) @@ -1349,8 +1432,11 @@ steps: key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 @@ -1361,14 +1447,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1404,8 +1491,11 @@ steps: key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: - queue: gpu_4_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - export VLLM_USE_DEEP_GEMM=0 - pytest -s -v test_lm_eval_correctness.py 
--config-list-file=configs/models-large-hopper.txt --tp-size=4 @@ -1415,14 +1505,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1457,6 +1548,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 depends_on: @@ -1464,10 +1558,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1483,34 +1576,39 @@ steps: key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 depends_on: block-lm-eval-small-models-b200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: LoRA steps: - label: LoRA %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py @@ -1520,10 +1618,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1534,10 +1631,14 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache 
mount_buildkite_agent: true + parallelism: 4 - label: LoRA TP (Distributed) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py @@ -1549,10 +1650,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1569,17 +1669,19 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s -m 'not cpu_test' multimodal + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "not cpu_test" multimodal - pytest -v -s utils_ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1594,10 +1696,14 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s -m "cpu_test" multimodal + - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config depends_on: @@ -1605,10 +1711,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1623,6 +1728,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/examples - pip install tensorizer - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf @@ -1653,10 +1761,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1672,45 +1779,49 @@ steps: key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: - queue: gpu_1_queue + queue: B200 commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ + - uv pip install --system "gpt-oss[eval]==0.0.5" - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 depends_on: block-gpt-oss-eval-b200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: 
true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Metrics, Tracing (2 GPUs) agents: queue: gpu_4_queue commands: - - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' - 'opentelemetry-semantic-conventions-ai>=0.4.1' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" + "opentelemetry-semantic-conventions-ai>=0.4.1" - pytest -v -s v1/tracing depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1725,16 +1836,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash standalone_tests/python_only_compile.sh depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1749,6 +1862,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install modelscope - pytest -v -s test_regression.py depends_on: @@ -1756,10 +1872,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1774,16 +1889,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s -m "not cpu_test" v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s -m "not cpu_test" v1/kv_connector/unit + - pytest -v -s -m "not cpu_test" v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py - pytest -v -s v1/test_outputs.py @@ -1794,10 +1912,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1812,20 +1929,22 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - pytest -v -s -m 'cpu_test' v1/core + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "cpu_test" v1/core - pytest -v -s v1/structured_output - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics + - pytest -v -s -m "cpu_test" v1/kv_connector/unit + - pytest -v -s -m "cpu_test" v1/metrics depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1842,6 +1961,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor @@ -1851,10 +1973,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1871,16 +1992,18 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_utils.py models/test_vision.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1895,17 +2018,19 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s models/test_initialization.py \ -k "not test_can_initialize_small_subset" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1916,20 +2041,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Basic Models Tests (Initialization) agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset depends_on: - 
image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1944,16 +2072,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_transformers.py models/test_registry.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1971,24 +2101,25 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal - or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR - or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper depends_on: block-transformers-nightly-models - soft_fail: false + soft_fail: true plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2005,22 +2136,24 @@ steps: agents: queue: gpu_4_queue commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - pytest models/test_transformers.py -v -s -m "distributed(num_gpus=2)" + - pytest models/language -v -s -m "distributed(num_gpus=2)" + - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m 'distributed(num_gpus=2)' + -v -s -m "distributed(num_gpus=2)" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2040,17 +2173,19 @@ steps: agents: queue: gpu_1_queue commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" + - pytest -v -s models/language/generation -m "(not core_model) and (not hybrid_model)" depends_on: block-language-models-test-extended-generation soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2068,15 +2203,17 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s models/language/pooling -m "not core_model" depends_on: block-language-models-test-extended-pooling soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2094,15 +2231,17 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/language/pooling_mteb_test depends_on: block-language-models-test-mteb soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2120,15 +2259,17 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/language/generation_ppl_test depends_on: block-language-models-test-ppl soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2143,18 +2284,20 @@ steps: agents: queue: gpu_1_queue commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip freeze | 
grep -E "torch" + - pytest -v -s models/language -m "core_model and slow_test" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2165,12 +2308,16 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Language Models Tests (Hybrid) %N agents: queue: gpu_1_queue commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: @@ -2178,10 +2325,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2192,21 +2338,24 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Language Models Tests (Standard) agents: queue: gpu_1_queue commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip freeze | grep -E "torch" + - pytest -v -s models/language -m "core_model and (not slow_test)" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2226,15 +2375,17 @@ steps: agents: queue: gpu_1_queue commands: - - echo 'Testing custom models...' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - echo "Testing custom models..." 
depends_on: block-custom-models soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2249,6 +2400,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 depends_on: @@ -2256,10 +2410,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2277,17 +2430,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal -m "not core_model" --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing depends_on: block-multi-modal-models-extended-1 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2305,17 +2460,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) - and not core_model' + - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=0) + and not core_model" depends_on: block-multi-modal-models-extended-2 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2333,17 +2490,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) - and not core_model' + - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=1) + and not core_model" depends_on: block-multi-modal-models-extended-3 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - 
NCCL_CUMEM_HOST_ENABLE=0 @@ -2358,8 +2517,11 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' + - pip freeze | grep -E "torch" - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py @@ -2369,10 +2531,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2387,17 +2548,46 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pytest -v -s models/multimodal/processing/test_tensor_schema.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor Test (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2414,6 +2604,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install -e ./plugins/vllm_add_dummy_platform - pytest -v -s plugins_tests/test_platform_plugins.py - pip uninstall vllm_add_dummy_platform -y @@ -2434,10 +2627,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2454,16 +2646,18 @@ steps: agents: queue: gpu_1_queue commands: - - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\; depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2478,18 +2672,20 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 - and not Llama-4' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" + - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 + and not Llama-4" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2504,17 +2700,19 @@ steps: agents: queue: gpu_1_queue commands: - - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec pytest -s -v {} \\; depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2529,16 +2727,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash standalone_tests/pytorch_nightly_dependency.sh depends_on: - image-build soft_fail: true plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2555,17 +2755,20 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2578,34 +2781,39 @@ steps: mount_buildkite_agent: true - label: Quantized MoE Test (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ 
- pytest -s -v tests/quantization/test_blackwell_moe.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Samplers steps: - label: Samplers Test agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers depends_on: @@ -2613,10 +2821,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2633,16 +2840,18 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s -m 'not cpu_test' tool_use + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "not cpu_test" tool_use depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2657,16 +2866,18 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - pytest -v -s -m 'cpu_test' tool_use + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "cpu_test" tool_use depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2686,15 +2897,17 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt depends_on: block-weight-loading-multiple-gpu soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2710,8 +2923,11 @@ steps: key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash 
weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt depends_on: block-weight-loading-multiple-gpu---large-models soft_fail: false @@ -2720,14 +2936,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 45e3af03591d..809b4138f44b 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -8,16 +8,7 @@ steps: - vllm/lora - tests/lora commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py parallelism: 4 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 9b7f574a95c3..39a5d51c4883 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -24,10 +24,7 @@ steps: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above # test.) Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Basic Models Tests (Other) diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index fdf78dc48746..f70192c4ebc0 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -27,9 +27,7 @@ steps: # Shard slow subset of standard language models tests. 
Only run when model # source is modified, or when specified test files are modified - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Language Models Tests (Hybrid) %N @@ -45,10 +43,7 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Language Models Test (Extended Generation) # 80min From 950643d974785b0a25b874f2adb1d1242e253979 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 18:48:18 -0800 Subject: [PATCH 15/24] debug Signed-off-by: Kevin H. Luu --- .buildkite/pipeline.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index cb6abecd38c9..c387bff0a317 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2649,13 +2649,13 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\; + - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; depends_on: - - image-build + - [] soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 always-pull: true propagate-environment: true environment: @@ -2704,13 +2704,13 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec - pytest -s -v {} \\; + pytest -s -v {} \\\\; depends_on: - - image-build + - [] soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 always-pull: true propagate-environment: true environment: From 98a38d1a167874b9c9368fbd66c0462f57387532 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 2 Dec 2025 01:33:58 -0800 Subject: [PATCH 16/24] slashes Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 547 ++++++++++++++++------------- .buildkite/test_areas/pytorch.yaml | 4 +- 2 files changed, 309 insertions(+), 242 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index c387bff0a317..ae55531ecf13 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -21,6 +21,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build CPU image' + depends_on: [] + key: block--docker--build-cpu-image - label: ':docker: Build CPU image' key: image-build-cpu agents: @@ -28,7 +31,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-cpu-image soft_fail: false retry: automatic: @@ -58,6 +61,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build HPU image' + depends_on: [] + key: block--docker--build-hpu-image - label: ':docker: Build HPU image' key: image-build-hpu agents: @@ -65,7 +71,7 @@ steps: commands: - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-hpu-image soft_fail: true retry: automatic: @@ -75,6 +81,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build image' + depends_on: [] + key: block--docker--build-image - label: ':docker: Build image' key: image-build agents: @@ -82,7 +91,7 @@ steps: commands: - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-image soft_fail: false retry: automatic: @@ -94,6 +103,9 @@ steps: DOCKER_BUILDKIT: '1' - group: Attention steps: + - block: Run V1 attention (B200) + depends_on: [] + key: block-v1-attention-b200 - label: V1 attention (B200) agents: queue: B200 @@ -102,8 +114,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - depends_on: - - image-build + depends_on: block-v1-attention-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -120,6 +131,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run V1 attention (H100) + depends_on: [] + key: block-v1-attention-h100 - label: V1 attention (H100) agents: queue: mithril-h100-pool @@ -128,22 +142,13 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s v1/attention - depends_on: - - image-build + depends_on: block-v1-attention-h100 soft_fail: false plugins: - kubernetes: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -176,6 +181,9 @@ steps: type: DirectoryOrCreate - group: Basic Correctness steps: + - block: Run Basic Correctness + depends_on: [] + key: block-basic-correctness - label: Basic Correctness agents: queue: gpu_1_queue @@ -187,8 +195,7 @@ steps: - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s 
basic_correctness/test_cpu_offload.py - depends_on: - - image-build + depends_on: block-basic-correctness soft_fail: false plugins: - docker#v5.2.0: @@ -207,6 +214,9 @@ steps: mount_buildkite_agent: true - group: Benchmarks steps: + - block: Run Benchmarks + depends_on: [] + key: block-benchmarks - label: Benchmarks agents: queue: gpu_1_queue @@ -215,8 +225,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/.buildkite - bash scripts/run-benchmarks.sh - depends_on: - - image-build + depends_on: block-benchmarks soft_fail: false plugins: - docker#v5.2.0: @@ -233,6 +242,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Benchmarks CLI Test + depends_on: [] + key: block-benchmarks-cli-test - label: Benchmarks CLI Test agents: queue: gpu_1_queue @@ -241,8 +253,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s benchmarks/ - depends_on: - - image-build + depends_on: block-benchmarks-cli-test soft_fail: false plugins: - docker#v5.2.0: @@ -261,6 +272,9 @@ steps: mount_buildkite_agent: true - group: CUDA steps: + - block: Run Cudagraph + depends_on: [] + key: block-cudagraph - label: Cudagraph agents: queue: gpu_1_queue @@ -270,8 +284,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - depends_on: - - image-build + depends_on: block-cudagraph soft_fail: false plugins: - docker#v5.2.0: @@ -288,6 +301,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Platform Tests (CUDA) + depends_on: [] + key: block-platform-tests-cuda - label: Platform Tests (CUDA) agents: queue: gpu_1_queue @@ -296,8 +312,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s cuda/test_cuda_context.py - depends_on: - - image-build + depends_on: block-platform-tests-cuda soft_fail: false plugins: - docker#v5.2.0: @@ -317,7 +332,7 @@ steps: - group: Compile steps: - block: Run Fusion E2E (2 GPUs)(B200) - depends_on: image-build + depends_on: [] key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: @@ -345,6 +360,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Fusion and Compile Tests (B200) + depends_on: [] + key: block-fusion-and-compile-tests-b200 - label: Fusion and Compile Tests (B200) agents: queue: B200 @@ -359,8 +377,7 @@ steps: - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k "True and not +quant_fp8 and not +rms_norm" - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - depends_on: - - image-build + depends_on: block-fusion-and-compile-tests-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -379,6 +396,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Distributed steps: + - block: Run 2 Node Test (4 GPUs) + depends_on: [] + key: block-2-node-test-4-gpus - label: 2 Node Test (4 GPUs) agents: queue: gpu_4_queue @@ -404,9 +424,11 @@ steps: - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - depends_on: - - image-build + depends_on: block-2-node-test-4-gpus soft_fail: false + - block: Run Distributed (2 GPUs) + depends_on: [] + key: 
block-distributed-2-gpus - label: Distributed (2 GPUs) agents: queue: gpu_4_queue @@ -429,8 +451,7 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py - depends_on: - - image-build + depends_on: block-distributed-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -447,6 +468,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Comm Ops + depends_on: [] + key: block-distributed-comm-ops - label: Distributed Comm Ops agents: queue: gpu_4_queue @@ -458,8 +482,7 @@ steps: - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py - pytest -v -s distributed/test_shm_storage.py - depends_on: - - image-build + depends_on: block-distributed-comm-ops soft_fail: false plugins: - docker#v5.2.0: @@ -476,6 +499,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed NixlConnector PD accuracy (4 GPUs) + depends_on: [] + key: block-distributed-nixlconnector-pd-accuracy-4-gpus - label: Distributed NixlConnector PD accuracy (4 GPUs) agents: queue: gpu_4_queue @@ -485,8 +511,7 @@ steps: - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - depends_on: - - image-build + depends_on: block-distributed-nixlconnector-pd-accuracy-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -504,7 +529,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Distributed Tests (2 GPUs)(B200) - depends_on: image-build + depends_on: [] key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: @@ -534,7 +559,7 @@ steps: - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm - block: Run Distributed Tests (2 GPUs)(H200) - depends_on: image-build + depends_on: [] key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: @@ -572,6 +597,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Distributed Tests (4 GPUs) + depends_on: [] + key: block-distributed-tests-4-gpus - label: Distributed Tests (4 GPUs) agents: queue: gpu_4_queue @@ -602,8 +630,7 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd - depends_on: - - image-build + depends_on: block-distributed-tests-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -621,7 +648,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Distributed Tests (4 GPUs)(A100) - depends_on: image-build + depends_on: [] key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: @@ -642,14 +669,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 
resources: limits: nvidia.com/gpu: 4 @@ -681,7 +700,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run Distributed Tests (8 GPUs)(H100) - depends_on: image-build + depends_on: [] key: block-distributed-tests-8-gpush100 - label: Distributed Tests (8 GPUs)(H100) agents: @@ -700,14 +719,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -738,6 +749,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Pipeline + Context Parallelism (4 GPUs)) + depends_on: [] + key: block-pipeline---context-parallelism-4-gpus - label: Pipeline + Context Parallelism (4 GPUs)) agents: queue: gpu_4_queue @@ -747,8 +761,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py - depends_on: - - image-build + depends_on: block-pipeline---context-parallelism-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -768,7 +781,7 @@ steps: - group: E2E Integration steps: - block: Run DeepSeek V2-Lite Accuracy - depends_on: image-build + depends_on: [] key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: @@ -786,14 +799,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -825,7 +830,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run Prime-RL Integration (2 GPUs) - depends_on: image-build + depends_on: [] key: block-prime-rl-integration-2-gpus - label: Prime-RL Integration (2 GPUs) agents: @@ -853,7 +858,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Qwen3-30B-A3B-FP8-block Accuracy - depends_on: image-build + depends_on: [] key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: @@ -871,14 +876,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -911,6 +908,9 @@ steps: type: DirectoryOrCreate - group: Engine steps: + - block: Run Engine + depends_on: [] + key: block-engine - label: Engine agents: queue: gpu_1_queue @@ -919,8 +919,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd 
/vllm-workspace/tests - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - depends_on: - - image-build + depends_on: block-engine soft_fail: false plugins: - docker#v5.2.0: @@ -937,6 +936,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 e2e + engine + depends_on: [] + key: block-v1-e2e---engine - label: V1 e2e + engine agents: queue: gpu_1_queue @@ -946,8 +948,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s v1/e2e - pytest -v -s v1/engine - depends_on: - - image-build + depends_on: block-v1-e2e---engine soft_fail: false plugins: - docker#v5.2.0: @@ -966,6 +967,9 @@ steps: mount_buildkite_agent: true - group: Entrypoints steps: + - block: Run Entrypoints Integration (API Server) + depends_on: [] + key: block-entrypoints-integration-api-server - label: Entrypoints Integration (API Server) agents: queue: gpu_1_queue @@ -980,8 +984,7 @@ steps: --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py - depends_on: - - image-build + depends_on: block-entrypoints-integration-api-server soft_fail: false plugins: - docker#v5.2.0: @@ -998,6 +1001,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Integration (LLM) + depends_on: [] + key: block-entrypoints-integration-llm - label: Entrypoints Integration (LLM) agents: queue: gpu_1_queue @@ -1009,8 +1015,7 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py - pytest -v -s entrypoints/offline_mode - depends_on: - - image-build + depends_on: block-entrypoints-integration-llm soft_fail: false plugins: - docker#v5.2.0: @@ -1027,6 +1032,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Integration (Pooling) + depends_on: [] + key: block-entrypoints-integration-pooling - label: Entrypoints Integration (Pooling) agents: queue: gpu_1_queue @@ -1036,8 +1044,7 @@ steps: - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling - depends_on: - - image-build + depends_on: block-entrypoints-integration-pooling soft_fail: false plugins: - docker#v5.2.0: @@ -1054,6 +1061,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Unit Tests + depends_on: [] + key: block-entrypoints-unit-tests - label: Entrypoints Unit Tests agents: queue: gpu_1_queue @@ -1064,8 +1074,7 @@ steps: - pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - depends_on: - - image-build + depends_on: block-entrypoints-unit-tests soft_fail: false plugins: - docker#v5.2.0: @@ -1082,6 +1091,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints V1 + depends_on: [] + key: block-entrypoints-v1 - label: Entrypoints V1 agents: queue: gpu_1_queue @@ -1090,8 +1102,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s v1/entrypoints - depends_on: - - image-build + depends_on: block-entrypoints-v1 soft_fail: false plugins: - docker#v5.2.0: @@ 
-1108,6 +1119,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run OpenAI API Correctness + depends_on: [] + key: block-openai-api-correctness - label: OpenAI API Correctness agents: queue: gpu_1_queue @@ -1116,8 +1130,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -s entrypoints/openai/correctness/ - depends_on: - - image-build + depends_on: block-openai-api-correctness soft_fail: false plugins: - docker#v5.2.0: @@ -1136,6 +1149,9 @@ steps: mount_buildkite_agent: true - group: Expert Parallelism steps: + - block: Run EPLB Algorithm + depends_on: [] + key: block-eplb-algorithm - label: EPLB Algorithm agents: queue: gpu_1_queue @@ -1144,8 +1160,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_algo.py - depends_on: - - image-build + depends_on: block-eplb-algorithm soft_fail: false plugins: - docker#v5.2.0: @@ -1162,6 +1177,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run EPLB Execution + depends_on: [] + key: block-eplb-execution - label: EPLB Execution agents: queue: gpu_4_queue @@ -1171,8 +1189,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py - depends_on: - - image-build + depends_on: block-eplb-execution soft_fail: false plugins: - docker#v5.2.0: @@ -1191,6 +1208,9 @@ steps: mount_buildkite_agent: true - group: Kernels steps: + - block: Run Kernels (B200) + depends_on: [] + key: block-kernels-b200 - label: Kernels (B200) agents: queue: B200 @@ -1217,8 +1237,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - depends_on: - - image-build + depends_on: block-kernels-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1235,6 +1254,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Kernels Attention Test %N + depends_on: [] + key: block-kernels-attention-test-n - label: Kernels Attention Test %N agents: queue: gpu_1_queue @@ -1243,8 +1265,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-attention-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1262,6 +1283,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Kernels Core Operation Test + depends_on: [] + key: block-kernels-core-operation-test - label: Kernels Core Operation Test agents: queue: gpu_1_queue @@ -1270,8 +1294,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/core kernels/test_top_k_per_row.py - depends_on: - - image-build + depends_on: block-kernels-core-operation-test soft_fail: false plugins: - docker#v5.2.0: @@ -1288,6 +1311,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Kernels DeepGEMM Test (H100) + depends_on: [] + key: block-kernels-deepgemm-test-h100 - label: Kernels DeepGEMM Test (H100) agents: queue: mithril-h100-pool @@ -1299,22 +1325,13 @@ steps: - pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s 
kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/attention/test_deepgemm_attention.py - depends_on: - - image-build + depends_on: block-kernels-deepgemm-test-h100 soft_fail: false plugins: - kubernetes: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1345,6 +1362,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Kernels Mamba Test + depends_on: [] + key: block-kernels-mamba-test - label: Kernels Mamba Test agents: queue: gpu_1_queue @@ -1353,8 +1373,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/mamba - depends_on: - - image-build + depends_on: block-kernels-mamba-test soft_fail: false plugins: - docker#v5.2.0: @@ -1371,6 +1390,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Kernels MoE Test %N + depends_on: [] + key: block-kernels-moe-test-n - label: Kernels MoE Test %N agents: queue: gpu_1_queue @@ -1379,8 +1401,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-moe-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1398,6 +1419,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Kernels Quantization Test %N + depends_on: [] + key: block-kernels-quantization-test-n - label: Kernels Quantization Test %N agents: queue: gpu_1_queue @@ -1406,8 +1430,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-quantization-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1428,7 +1451,7 @@ steps: - group: LM Eval steps: - block: Run LM Eval Large Models (4 GPUs)(A100) - depends_on: image-build + depends_on: [] key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: @@ -1448,14 +1471,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1487,7 +1502,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run LM Eval Large Models (4 GPUs)(H100) - depends_on: image-build + depends_on: [] key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: @@ -1506,14 +1521,6 @@ steps: podSpec: containers: - image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1544,6 +1551,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run LM Eval Small Models + depends_on: [] + key: block-lm-eval-small-models - label: LM Eval Small Models agents: queue: gpu_1_queue @@ -1553,8 +1563,7 @@ steps: - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-lm-eval-small-models soft_fail: false plugins: - docker#v5.2.0: @@ -1572,7 +1581,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run LM Eval Small Models (B200) - depends_on: image-build + depends_on: [] key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: @@ -1602,6 +1611,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: LoRA steps: + - block: Run LoRA %N + depends_on: [] + key: block-lora-n - label: LoRA %N agents: queue: gpu_1_queue @@ -1609,12 +1621,11 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py - \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py - \ --ignore=lora/test_qwen3moe_tp.py - depends_on: - - image-build + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py + --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py + --ignore=lora/test_qwen3moe_tp.py + depends_on: block-lora-n soft_fail: false plugins: - docker#v5.2.0: @@ -1632,6 +1643,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 4 + - block: Run LoRA TP (Distributed) + depends_on: [] + key: block-lora-tp-distributed - label: LoRA TP (Distributed) agents: queue: gpu_4_queue @@ -1645,8 +1659,7 @@ steps: - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_gptoss_tp.py - depends_on: - - image-build + depends_on: block-lora-tp-distributed soft_fail: false plugins: - docker#v5.2.0: @@ -1665,6 +1678,9 @@ steps: mount_buildkite_agent: true - group: Miscellaneous steps: + - block: Run Async Engine, Inputs, Utils, Worker + depends_on: [] + key: block-async-engine--inputs--utils--worker - label: Async Engine, Inputs, Utils, Worker agents: queue: gpu_1_queue @@ -1674,8 +1690,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s -m "not cpu_test" multimodal - pytest -v -s utils_ - depends_on: - - image-build + depends_on: block-async-engine--inputs--utils--worker soft_fail: false plugins: - docker#v5.2.0: @@ -1692,6 +1707,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache 
mount_buildkite_agent: true + - block: Run Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: [] + key: block-async-engine--inputs--utils--worker--config-cpu - label: Async Engine, Inputs, Utils, Worker, Config (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1706,8 +1724,7 @@ steps: - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config - depends_on: - - image-build + depends_on: block-async-engine--inputs--utils--worker--config-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -1724,6 +1741,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Examples + depends_on: [] + key: block-examples - label: Examples agents: queue: gpu_1_queue @@ -1756,8 +1776,7 @@ steps: - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - depends_on: - - image-build + depends_on: block-examples soft_fail: false plugins: - docker#v5.2.0: @@ -1775,7 +1794,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run GPT-OSS Eval (B200) - depends_on: image-build + depends_on: [] key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: @@ -1804,6 +1823,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Metrics, Tracing (2 GPUs) + depends_on: [] + key: block-metrics--tracing-2-gpus - label: Metrics, Tracing (2 GPUs) agents: queue: gpu_4_queue @@ -1814,8 +1836,7 @@ steps: - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" "opentelemetry-semantic-conventions-ai>=0.4.1" - pytest -v -s v1/tracing - depends_on: - - image-build + depends_on: block-metrics--tracing-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -1832,6 +1853,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Python-only Installation + depends_on: [] + key: block-python-only-installation - label: Python-only Installation agents: queue: gpu_1_queue @@ -1840,8 +1864,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - bash standalone_tests/python_only_compile.sh - depends_on: - - image-build + depends_on: block-python-only-installation soft_fail: false plugins: - docker#v5.2.0: @@ -1858,6 +1881,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Regression + depends_on: [] + key: block-regression - label: Regression agents: queue: gpu_1_queue @@ -1867,8 +1893,7 @@ steps: - cd /vllm-workspace/tests - pip install modelscope - pytest -v -s test_regression.py - depends_on: - - image-build + depends_on: block-regression soft_fail: false plugins: - docker#v5.2.0: @@ -1885,6 +1910,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 Others + depends_on: [] + key: block-v1-others - label: V1 Others agents: queue: gpu_1_queue @@ -1907,8 +1935,7 @@ steps: - pytest -v -s v1/test_outputs.py - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - depends_on: - - image-build + depends_on: block-v1-others soft_fail: false plugins: - docker#v5.2.0: @@ -1925,6 +1952,9 @@ steps: - 
/dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 Others (CPU) + depends_on: [] + key: block-v1-others-cpu - label: V1 Others (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1937,8 +1967,7 @@ steps: - pytest -v -s v1/test_serial_utils.py - pytest -v -s -m "cpu_test" v1/kv_connector/unit - pytest -v -s -m "cpu_test" v1/metrics - depends_on: - - image-build + depends_on: block-v1-others-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -1957,6 +1986,9 @@ steps: mount_buildkite_agent: true - group: Model Executor steps: + - block: Run Model Executor + depends_on: [] + key: block-model-executor - label: Model Executor agents: queue: gpu_1_queue @@ -1968,8 +2000,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - depends_on: - - image-build + depends_on: block-model-executor soft_fail: false plugins: - docker#v5.2.0: @@ -1988,6 +2019,9 @@ steps: mount_buildkite_agent: true - group: Models - Basic steps: + - block: Run Basic Models Test (Other CPU) + depends_on: [] + key: block-basic-models-test-other-cpu - label: Basic Models Test (Other CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1996,8 +2030,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_utils.py models/test_vision.py - depends_on: - - image-build + depends_on: block-basic-models-test-other-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2014,6 +2047,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Basic Models Tests (Extra Initialization) %N + depends_on: [] + key: block-basic-models-tests-extra-initialization-n - label: Basic Models Tests (Extra Initialization) %N agents: queue: gpu_1_queue @@ -2021,10 +2057,9 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py \ -k "not test_can_initialize_small_subset" - \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/test_initialization.py -k "not test_can_initialize_small_subset" + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-basic-models-tests-extra-initialization-n soft_fail: false plugins: - docker#v5.2.0: @@ -2042,6 +2077,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Basic Models Tests (Initialization) + depends_on: [] + key: block-basic-models-tests-initialization - label: Basic Models Tests (Initialization) agents: queue: gpu_1_queue @@ -2050,8 +2088,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - depends_on: - - image-build + depends_on: block-basic-models-tests-initialization soft_fail: false plugins: - docker#v5.2.0: @@ -2068,6 +2105,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Basic Models Tests (Other) + depends_on: [] + key: block-basic-models-tests-other - label: Basic Models Tests (Other) agents: queue: gpu_1_queue @@ -2076,8 +2116,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_transformers.py models/test_registry.py - depends_on: - - image-build + depends_on: 
block-basic-models-tests-other soft_fail: false plugins: - docker#v5.2.0: @@ -2095,7 +2134,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Transformers Nightly Models - depends_on: image-build + depends_on: [] key: block-transformers-nightly-models - label: Transformers Nightly Models agents: @@ -2132,6 +2171,9 @@ steps: mount_buildkite_agent: true - group: Models - Distributed steps: + - block: Run Distributed Model Tests (2 GPUs) + depends_on: [] + key: block-distributed-model-tests-2-gpus - label: Distributed Model Tests (2 GPUs) agents: queue: gpu_4_queue @@ -2146,8 +2188,7 @@ steps: - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m "distributed(num_gpus=2)" - depends_on: - - image-build + depends_on: block-distributed-model-tests-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -2167,7 +2208,7 @@ steps: - group: Models - Language steps: - block: Run Language Models Test (Extended Generation) - depends_on: image-build + depends_on: [] key: block-language-models-test-extended-generation - label: Language Models Test (Extended Generation) agents: @@ -2197,7 +2238,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (Extended Pooling) - depends_on: image-build + depends_on: [] key: block-language-models-test-extended-pooling - label: Language Models Test (Extended Pooling) agents: @@ -2225,7 +2266,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (MTEB) - depends_on: image-build + depends_on: [] key: block-language-models-test-mteb - label: Language Models Test (MTEB) agents: @@ -2253,7 +2294,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (PPL) - depends_on: image-build + depends_on: [] key: block-language-models-test-ppl - label: Language Models Test (PPL) agents: @@ -2280,6 +2321,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Tests (Extra Standard) %N + depends_on: [] + key: block-language-models-tests-extra-standard-n - label: Language Models Tests (Extra Standard) %N agents: queue: gpu_1_queue @@ -2288,10 +2332,9 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and slow_test" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/language -m "core_model and slow_test" --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-language-models-tests-extra-standard-n soft_fail: false plugins: - docker#v5.2.0: @@ -2309,6 +2352,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Language Models Tests (Hybrid) %N + depends_on: [] + key: block-language-models-tests-hybrid-n - label: Language Models Tests (Hybrid) %N agents: queue: gpu_1_queue @@ -2318,10 +2364,9 @@ steps: - cd /vllm-workspace/tests - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation \ -m hybrid_model \ 
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-language-models-tests-hybrid-n soft_fail: false plugins: - docker#v5.2.0: @@ -2339,6 +2384,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Language Models Tests (Standard) + depends_on: [] + key: block-language-models-tests-standard - label: Language Models Tests (Standard) agents: queue: gpu_1_queue @@ -2348,8 +2396,7 @@ steps: - cd /vllm-workspace/tests - pip freeze | grep -E "torch" - pytest -v -s models/language -m "core_model and (not slow_test)" - depends_on: - - image-build + depends_on: block-language-models-tests-standard soft_fail: false plugins: - docker#v5.2.0: @@ -2369,7 +2416,7 @@ steps: - group: Models - Multimodal steps: - block: Run Custom Models - depends_on: image-build + depends_on: [] key: block-custom-models - label: Custom Models agents: @@ -2396,6 +2443,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Accuracy Eval (Small Models) + depends_on: [] + key: block-multi-modal-accuracy-eval-small-models - label: Multi-Modal Accuracy Eval (Small Models) agents: queue: gpu_1_queue @@ -2405,8 +2455,7 @@ steps: - cd /vllm-workspace/.buildkite/lm-eval-harness - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-multi-modal-accuracy-eval-small-models soft_fail: false plugins: - docker#v5.2.0: @@ -2424,7 +2473,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 1 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-1 - label: Multi-Modal Models (Extended) 1 agents: @@ -2454,7 +2503,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 2 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-2 - label: Multi-Modal Models (Extended) 2 agents: @@ -2484,7 +2533,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 3 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-3 - label: Multi-Modal Models (Extended) 3 agents: @@ -2513,6 +2562,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Standard) + depends_on: [] + key: block-multi-modal-models-standard - label: Multi-Modal Models (Standard) agents: queue: gpu_1_queue @@ -2526,8 +2578,7 @@ steps: --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model - depends_on: - - image-build + depends_on: block-multi-modal-models-standard soft_fail: false plugins: - docker#v5.2.0: @@ -2544,6 +2595,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Processor + depends_on: [] + key: block-multi-modal-processor - label: Multi-Modal Processor agents: queue: gpu_1_queue @@ -2553,8 +2607,7 @@ steps: - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing/test_tensor_schema.py - depends_on: - - image-build + depends_on: block-multi-modal-processor soft_fail: false plugins: - docker#v5.2.0: @@ -2571,6 +2624,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Processor Test (CPU) + depends_on: [] + key: block-multi-modal-processor-test-cpu - label: Multi-Modal Processor Test (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -2580,8 +2636,7 @@ steps: - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - depends_on: - - image-build + depends_on: block-multi-modal-processor-test-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2600,6 +2655,9 @@ steps: mount_buildkite_agent: true - group: Plugins steps: + - block: Run Plugin Tests (2 GPUs) + depends_on: [] + key: block-plugin-tests-2-gpus - label: Plugin Tests (2 GPUs) agents: queue: gpu_4_queue @@ -2622,8 +2680,7 @@ steps: - pytest -v -s entrypoints/openai/test_oot_registration.py - pytest -v -s models/test_oot_registration.py - pytest -v -s plugins/lora_resolvers - depends_on: - - image-build + depends_on: block-plugin-tests-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -2642,6 +2699,9 @@ steps: mount_buildkite_agent: true - group: PyTorch steps: + - block: Run PyTorch Compilation Unit Tests + depends_on: [] + key: block-pytorch-compilation-unit-tests - label: PyTorch Compilation Unit Tests agents: queue: gpu_1_queue @@ -2650,12 +2710,11 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; - depends_on: - - [] + depends_on: block-pytorch-compilation-unit-tests soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 always-pull: true propagate-environment: true environment: @@ -2668,6 +2727,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run PyTorch Fullgraph + depends_on: [] + key: block-pytorch-fullgraph - label: PyTorch Fullgraph agents: queue: gpu_1_queue @@ -2678,8 +2740,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 and not Llama-4" - depends_on: - - image-build + depends_on: block-pytorch-fullgraph soft_fail: false plugins: - docker#v5.2.0: @@ -2696,6 +2757,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run PyTorch Fullgraph Smoke Test + depends_on: [] + key: block-pytorch-fullgraph-smoke-test - label: PyTorch Fullgraph Smoke Test agents: queue: gpu_1_queue @@ 
-2705,12 +2769,11 @@ steps: - cd /vllm-workspace/tests - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec pytest -s -v {} \\\\; - depends_on: - - [] + depends_on: block-pytorch-fullgraph-smoke-test soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 always-pull: true propagate-environment: true environment: @@ -2723,6 +2786,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Pytorch Nightly Dependency Override Check + depends_on: [] + key: block-pytorch-nightly-dependency-override-check - label: Pytorch Nightly Dependency Override Check agents: queue: gpu_1_queue @@ -2731,8 +2797,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - bash standalone_tests/pytorch_nightly_dependency.sh - depends_on: - - image-build + depends_on: block-pytorch-nightly-dependency-override-check soft_fail: true plugins: - docker#v5.2.0: @@ -2751,6 +2816,9 @@ steps: mount_buildkite_agent: true - group: Quantization steps: + - block: Run Quantization + depends_on: [] + key: block-quantization - label: Quantization agents: queue: gpu_1_queue @@ -2761,8 +2829,7 @@ steps: - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - depends_on: - - image-build + depends_on: block-quantization soft_fail: false plugins: - docker#v5.2.0: @@ -2779,6 +2846,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Quantized MoE Test (B200) + depends_on: [] + key: block-quantized-moe-test-b200 - label: Quantized MoE Test (B200) agents: queue: B200 @@ -2787,8 +2857,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/ - pytest -s -v tests/quantization/test_blackwell_moe.py - depends_on: - - image-build + depends_on: block-quantized-moe-test-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -2807,6 +2876,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Samplers steps: + - block: Run Samplers Test + depends_on: [] + key: block-samplers-test - label: Samplers Test agents: queue: gpu_1_queue @@ -2816,8 +2888,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - depends_on: - - image-build + depends_on: block-samplers-test soft_fail: false plugins: - docker#v5.2.0: @@ -2836,6 +2907,9 @@ steps: mount_buildkite_agent: true - group: Tool use steps: + - block: Run OpenAI-Compatible Tool Use + depends_on: [] + key: block-openai-compatible-tool-use - label: OpenAI-Compatible Tool Use agents: queue: gpu_1_queue @@ -2844,8 +2918,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s -m "not cpu_test" tool_use - depends_on: - - image-build + depends_on: block-openai-compatible-tool-use soft_fail: false plugins: - docker#v5.2.0: @@ -2862,6 +2935,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run OpenAI-Compatible Tool Use (CPU) + depends_on: [] + key: block-openai-compatible-tool-use-cpu - label: OpenAI-Compatible Tool Use (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -2870,8 +2946,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd 
/vllm-workspace/tests - pytest -v -s -m "cpu_test" tool_use - depends_on: - - image-build + depends_on: block-openai-compatible-tool-use-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2891,7 +2966,7 @@ steps: - group: Weight Loading steps: - block: Run Weight Loading Multiple GPU - depends_on: image-build + depends_on: [] key: block-weight-loading-multiple-gpu - label: Weight Loading Multiple GPU agents: @@ -2919,7 +2994,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Weight Loading Multiple GPU - Large Models - depends_on: image-build + depends_on: [] key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: @@ -2937,14 +3012,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index dab6e674990b..c158b4ecbfcf 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,7 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\\\\\;" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 @@ -25,7 +25,7 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\\\\\;" - label: PyTorch Fullgraph timeout_in_minutes: 40 From a020a18c40fa776bd67fb4c673d5f49e6fbf7079 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 2 Dec 2025 01:58:10 -0800 Subject: [PATCH 17/24] slashes Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/pytorch.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index c158b4ecbfcf..703c82eb1a91 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,7 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\\\\\;" + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 @@ -25,7 +25,7 @@ steps: # as it is a heavy test that is covered in other steps. 
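# Editor's note (not part of the surrounding hunks): a sketch of how the backslash
# escaping in the find/-exec command resolves, assuming the command stays a YAML
# double-quoted scalar as shown in these hunks. In double-quoted YAML, "\\;" parses
# to the two characters \; — the shell then strips the backslash and hands find the
# bare ";" that terminates -exec. With four or eight backslashes, the shell instead
# sees an unescaped ";" as a command separator, so find never receives its -exec
# terminator and the step fails. The form the later "slashes" patch settles on:
commands:
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
# after YAML parsing, the shell runs: find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;
# after shell quoting, find's -exec argument list ends with: pytest -s -v {} ;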
# Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\\\\\;" + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" - label: PyTorch Fullgraph timeout_in_minutes: 40 From f2e32c9a3964d41cf2f02110b378403f25314acf Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 3 Dec 2025 00:57:44 -0800 Subject: [PATCH 18/24] 2node test Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 30a1002b701b..e57414ba0c28 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -173,16 +173,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 
--rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 From e35d711b1936e763df9ba8b6da52286f7b201885 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 3 Dec 2025 14:57:04 -0800 Subject: [PATCH 19/24] switch pushd to cd Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index e57414ba0c28..57756aae4808 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -97,10 +97,9 @@ steps: - pytest -v -s distributed/test_symm_mem_allreduce.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - - pushd ../examples/offline_inference + - cd ../examples/offline_inference - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - label: Distributed Tests (8 GPUs)(H100) optional: true From 54cb6029de1329661f41602777b1cf4c241dbebf Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 4 Dec 2025 02:52:15 -0800 Subject: [PATCH 20/24] remove old file Signed-off-by: Kevin H. Luu --- .buildkite/pipeline.yaml | 3044 -------------------------------------- 1 file changed, 3044 deletions(-) delete mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml deleted file mode 100644 index ae55531ecf13..000000000000 --- a/.buildkite/pipeline.yaml +++ /dev/null @@ -1,3044 +0,0 @@ -steps: -- group: Abuild - steps: - - block: 'Run :docker: Build CPU arm64 image' - depends_on: [] - key: block--docker--build-cpu-arm64-image - - label: ':docker: Build CPU arm64 image' - key: cpu-arm64-image-build - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cpu-arm64-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build CPU image' - depends_on: [] - key: block--docker--build-cpu-image - - label: ':docker: Build CPU image' - key: image-build-cpu - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cpu-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build CUDA 11.8 image' - depends_on: [] - key: block--docker--build-cuda-11-8-image - - label: ':docker: Build CUDA 11.8 image' - key: image-build-cu118 - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cuda-11-8-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build HPU image' - depends_on: [] - key: 
block--docker--build-hpu-image - - label: ':docker: Build HPU image' - key: image-build-hpu - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-hpu-image - soft_fail: true - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build image' - depends_on: [] - key: block--docker--build-image - - label: ':docker: Build image' - key: image-build - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' -- group: Attention - steps: - - block: Run V1 attention (B200) - depends_on: [] - key: block-v1-attention-b200 - - label: V1 attention (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - depends_on: block-v1-attention-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run V1 attention (H100) - depends_on: [] - key: block-v1-attention-h100 - - label: V1 attention (H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/attention - depends_on: block-v1-attention-h100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate -- group: Basic Correctness - steps: - - block: Run Basic Correctness - depends_on: [] - key: block-basic-correctness - - label: Basic Correctness - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - depends_on: block-basic-correctness - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - 
HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Benchmarks - steps: - - block: Run Benchmarks - depends_on: [] - key: block-benchmarks - - label: Benchmarks - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite - - bash scripts/run-benchmarks.sh - depends_on: block-benchmarks - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Benchmarks CLI Test - depends_on: [] - key: block-benchmarks-cli-test - - label: Benchmarks CLI Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s benchmarks/ - depends_on: block-benchmarks-cli-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: CUDA - steps: - - block: Run Cudagraph - depends_on: [] - key: block-cudagraph - - label: Cudagraph - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - depends_on: block-cudagraph - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Platform Tests (CUDA) - depends_on: [] - key: block-platform-tests-cuda - - label: Platform Tests (CUDA) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s cuda/test_cuda_context.py - depends_on: block-platform-tests-cuda - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Compile - steps: - - block: Run Fusion E2E (2 GPUs)(B200) - depends_on: [] - key: block-fusion-e2e-2-gpusb200 - - label: Fusion E2E (2 GPUs)(B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - depends_on: block-fusion-e2e-2-gpusb200 - soft_fail: false - plugins: - - 
docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Fusion and Compile Tests (B200) - depends_on: [] - key: block-fusion-and-compile-tests-b200 - - label: Fusion and Compile Tests (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k "True and not +quant_fp8 and not +rms_norm" - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - depends_on: block-fusion-and-compile-tests-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: Distributed - steps: - - block: Run 2 Node Test (4 GPUs) - depends_on: [] - key: block-2-node-test-4-gpus - - label: 2 Node Test (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node - test passed" - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep "Node count test passed" - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node - test passed" - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep "Node count test passed" - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - depends_on: block-2-node-test-4-gpus - soft_fail: false - - block: Run Distributed (2 GPUs) - depends_on: [] - key: block-distributed-2-gpus - - label: Distributed (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s 
v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep "Same node test passed" - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep "Same node test passed" - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - depends_on: block-distributed-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Comm Ops - depends_on: [] - key: block-distributed-comm-ops - - label: Distributed Comm Ops - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - depends_on: block-distributed-comm-ops - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed NixlConnector PD accuracy (4 GPUs) - depends_on: [] - key: block-distributed-nixlconnector-pd-accuracy-4-gpus - - label: Distributed NixlConnector PD accuracy (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - depends_on: block-distributed-nixlconnector-pd-accuracy-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Tests (2 GPUs)(B200) - depends_on: [] - key: block-distributed-tests-2-gpusb200 - - label: Distributed Tests (2 GPUs)(B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s 
tests/v1/distributed/test_dbo.py - depends_on: block-distributed-tests-2-gpusb200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Distributed Tests (2 GPUs)(H200) - depends_on: [] - key: block-distributed-tests-2-gpush200 - - label: Distributed Tests (2 GPUs)(H200) - agents: - queue: skylab-h200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py - -k "not Llama-4" - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py - --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: block-distributed-tests-2-gpush200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Distributed Tests (4 GPUs) - depends_on: [] - key: block-distributed-tests-4-gpus - - label: Distributed Tests (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - 
pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - depends_on: block-distributed-tests-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Tests (4 GPUs)(A100) - depends_on: [] - key: block-distributed-tests-4-gpusa100 - - label: Distributed Tests (4 GPUs)(A100) - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - - pytest -v -s -x lora/test_mixtral.py - depends_on: block-distributed-tests-4-gpusa100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Distributed Tests (8 GPUs)(H100) - depends_on: [] - key: block-distributed-tests-8-gpush100 - - label: Distributed Tests (8 GPUs)(H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py - --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - depends_on: block-distributed-tests-8-gpush100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Pipeline + Context Parallelism (4 GPUs) - depends_on: [] - key: block-pipeline---context-parallelism-4-gpus - - label: Pipeline + Context Parallelism (4 GPUs) - agents: - queue: gpu_4_queue 
- commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - depends_on: block-pipeline---context-parallelism-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: E2E Integration - steps: - - block: Run DeepSeek V2-Lite Accuracy - depends_on: [] - key: block-deepseek-v2-lite-accuracy - - label: DeepSeek V2-Lite Accuracy - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh - 0.25 200 8010 - depends_on: block-deepseek-v2-lite-accuracy - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Prime-RL Integration (2 GPUs) - depends_on: [] - key: block-prime-rl-integration-2-gpus - - label: Prime-RL Integration (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/run-prime-rl-test.sh - depends_on: block-prime-rl-integration-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Qwen3-30B-A3B-FP8-block Accuracy - depends_on: [] - key: block-qwen3-30b-a3b-fp8-block-accuracy - - label: Qwen3-30B-A3B-FP8-block Accuracy - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh - 0.8 200 8020 - depends_on: block-qwen3-30b-a3b-fp8-block-accuracy - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - 
secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate -- group: Engine - steps: - - block: Run Engine - depends_on: [] - key: block-engine - - label: Engine - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - depends_on: block-engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 e2e + engine - depends_on: [] - key: block-v1-e2e---engine - - label: V1 e2e + engine - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - depends_on: block-v1-e2e---engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Entrypoints - steps: - - block: Run Entrypoints Integration (API Server) - depends_on: [] - key: block-entrypoints-integration-api-server - - label: Entrypoints Integration (API Server) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py - --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py - --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - --ignore=entrypoints/openai/tool_parsers/ - - pytest -v -s entrypoints/test_chat_utils.py - depends_on: block-entrypoints-integration-api-server - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Integration (LLM) - depends_on: [] - key: block-entrypoints-integration-llm - - label: Entrypoints Integration (LLM) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - 
- pytest -v -s entrypoints/offline_mode - depends_on: block-entrypoints-integration-llm - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Integration (Pooling) - depends_on: [] - key: block-entrypoints-integration-pooling - - label: Entrypoints Integration (Pooling) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - depends_on: block-entrypoints-integration-pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Unit Tests - depends_on: [] - key: block-entrypoints-unit-tests - - label: Entrypoints Unit Tests - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai - --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - depends_on: block-entrypoints-unit-tests - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints V1 - depends_on: [] - key: block-entrypoints-v1 - - label: Entrypoints V1 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/entrypoints - depends_on: block-entrypoints-v1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run OpenAI API Correctness - depends_on: [] - key: block-openai-api-correctness - - label: OpenAI API Correctness - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s entrypoints/openai/correctness/ - depends_on: block-openai-api-correctness - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - 
HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Expert Parallelism - steps: - - block: Run EPLB Algorithm - depends_on: [] - key: block-eplb-algorithm - - label: EPLB Algorithm - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_eplb_algo.py - depends_on: block-eplb-algorithm - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run EPLB Execution - depends_on: [] - key: block-eplb-execution - - label: EPLB Execution - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - depends_on: block-eplb-execution - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Kernels - steps: - - block: Run Kernels (B200) - depends_on: [] - key: block-kernels-b200 - - label: Kernels (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k "not num_heads2" - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k "fp8" - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - depends_on: block-kernels-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - 
/data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Kernels Attention Test %N - depends_on: [] - key: block-kernels-attention-test-n - - label: Kernels Attention Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-attention-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Kernels Core Operation Test - depends_on: [] - key: block-kernels-core-operation-test - - label: Kernels Core Operation Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - depends_on: block-kernels-core-operation-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Kernels DeepGEMM Test (H100) - depends_on: [] - key: block-kernels-deepgemm-test-h100 - - label: Kernels DeepGEMM Test (H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - depends_on: block-kernels-deepgemm-test-h100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Kernels Mamba Test - depends_on: [] - key: block-kernels-mamba-test - - label: Kernels Mamba Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/mamba - depends_on: block-kernels-mamba-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - 
- HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Kernels MoE Test %N - depends_on: [] - key: block-kernels-moe-test-n - - label: Kernels MoE Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-moe-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Kernels Quantization Test %N - depends_on: [] - key: block-kernels-quantization-test-n - - label: Kernels Quantization Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-quantization-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 -- group: LM Eval - steps: - - block: Run LM Eval Large Models (4 GPUs)(A100) - depends_on: [] - key: block-lm-eval-large-models-4-gpusa100 - - label: LM Eval Large Models (4 GPUs)(A100) - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - depends_on: block-lm-eval-large-models-4-gpusa100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run LM Eval Large Models (4 GPUs)(H100) - depends_on: [] - key: block-lm-eval-large-models-4-gpush100 - - label: LM Eval Large Models (4 GPUs)(H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py 
--config-list-file=configs/models-large-hopper.txt - --tp-size=4 - depends_on: block-lm-eval-large-models-4-gpush100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run LM Eval Small Models - depends_on: [] - key: block-lm-eval-small-models - - label: LM Eval Small Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - --tp-size=1 - depends_on: block-lm-eval-small-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run LM Eval Small Models (B200) - depends_on: [] - key: block-lm-eval-small-models-b200 - - label: LM Eval Small Models (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - --tp-size=1 - depends_on: block-lm-eval-small-models-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: LoRA - steps: - - block: Run LoRA %N - depends_on: [] - key: block-lora-n - - label: LoRA %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py - --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py - --ignore=lora/test_qwen3moe_tp.py - depends_on: block-lora-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 4 - - block: Run 
LoRA TP (Distributed) - depends_on: [] - key: block-lora-tp-distributed - - label: LoRA TP (Distributed) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - depends_on: block-lora-tp-distributed - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Miscellaneous - steps: - - block: Run Async Engine, Inputs, Utils, Worker - depends_on: [] - key: block-async-engine--inputs--utils--worker - - label: Async Engine, Inputs, Utils, Worker - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "not cpu_test" multimodal - - pytest -v -s utils_ - depends_on: block-async-engine--inputs--utils--worker - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Async Engine, Inputs, Utils, Worker, Config (CPU) - depends_on: [] - key: block-async-engine--inputs--utils--worker--config-cpu - - label: Async Engine, Inputs, Utils, Worker, Config (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s -m "cpu_test" multimodal - - pytest -v -s tokenizers_ - - pytest -v -s transformers_utils - - pytest -v -s config - depends_on: block-async-engine--inputs--utils--worker--config-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Examples - depends_on: [] - key: block-examples - - label: Examples - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/examples - - pip install tensorizer - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf - --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 
offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory - /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m - deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper - --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - depends_on: block-examples - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run GPT-OSS Eval (B200) - depends_on: [] - key: block-gpt-oss-eval-b200 - - label: GPT-OSS Eval (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - uv pip install --system "gpt-oss[eval]==0.0.5" - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b - --metric 0.58 - depends_on: block-gpt-oss-eval-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Metrics, Tracing (2 GPUs) - depends_on: [] - key: block-metrics--tracing-2-gpus - - label: Metrics, Tracing (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" - "opentelemetry-semantic-conventions-ai>=0.4.1" - - pytest -v -s v1/tracing - depends_on: block-metrics--tracing-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Python-only Installation - depends_on: [] - key: block-python-only-installation - - label: Python-only Installation - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - 
- export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash standalone_tests/python_only_compile.sh - depends_on: block-python-only-installation - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Regression - depends_on: [] - key: block-regression - - label: Regression - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install modelscope - - pytest -v -s test_regression.py - depends_on: block-regression - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 Others - depends_on: [] - key: block-v1-others - - label: V1 Others - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m "not cpu_test" v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m "not cpu_test" v1/kv_connector/unit - - pytest -v -s -m "not cpu_test" v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - depends_on: block-v1-others - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 Others (CPU) - depends_on: [] - key: block-v1-others-cpu - - label: V1 Others (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "cpu_test" v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m "cpu_test" v1/kv_connector/unit - - pytest -v -s -m "cpu_test" v1/metrics - depends_on: block-v1-others-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Model Executor - steps: 
- - block: Run Model Executor - depends_on: [] - key: block-model-executor - - label: Model Executor - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - depends_on: block-model-executor - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Basic - steps: - - block: Run Basic Models Test (Other CPU) - depends_on: [] - key: block-basic-models-test-other-cpu - - label: Basic Models Test (Other CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_utils.py models/test_vision.py - depends_on: block-basic-models-test-other-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Basic Models Tests (Extra Initialization) %N - depends_on: [] - key: block-basic-models-tests-extra-initialization-n - - label: Basic Models Tests (Extra Initialization) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py -k "not test_can_initialize_small_subset" - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-basic-models-tests-extra-initialization-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Basic Models Tests (Initialization) - depends_on: [] - key: block-basic-models-tests-initialization - - label: Basic Models Tests (Initialization) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - depends_on: block-basic-models-tests-initialization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Basic Models Tests (Other) - 
depends_on: [] - key: block-basic-models-tests-other - - label: Basic Models Tests (Other) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_transformers.py models/test_registry.py - depends_on: block-basic-models-tests-other - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Transformers Nightly Models - depends_on: [] - key: block-transformers-nightly-models - - label: Transformers Nightly Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py - --model-type whisper - depends_on: block-transformers-nightly-models - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Distributed - steps: - - block: Run Distributed Model Tests (2 GPUs) - depends_on: [] - key: block-distributed-model-tests-2-gpus - - label: Distributed Model Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m "distributed(num_gpus=2)" - - pytest models/language -v -s -m "distributed(num_gpus=2)" - - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m "distributed(num_gpus=2)" - depends_on: block-distributed-model-tests-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Language - steps: - - block: Run Language Models Test (Extended Generation) - depends_on: [] - key: block-language-models-test-extended-generation - - label: Language Models Test (Extended Generation) - 
agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation -m "(not core_model) and (not hybrid_model)" - depends_on: block-language-models-test-extended-generation - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (Extended Pooling) - depends_on: [] - key: block-language-models-test-extended-pooling - - label: Language Models Test (Extended Pooling) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/pooling -m "not core_model" - depends_on: block-language-models-test-extended-pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (MTEB) - depends_on: [] - key: block-language-models-test-mteb - - label: Language Models Test (MTEB) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/pooling_mteb_test - depends_on: block-language-models-test-mteb - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (PPL) - depends_on: [] - key: block-language-models-test-ppl - - label: Language Models Test (PPL) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/generation_ppl_test - depends_on: block-language-models-test-ppl - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Tests (Extra Standard) %N - depends_on: [] - key: block-language-models-tests-extra-standard-n - - label: Language Models Tests (Extra Standard) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd 
/vllm-workspace/tests - - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and slow_test" --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-language-models-tests-extra-standard-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Language Models Tests (Hybrid) %N - depends_on: [] - key: block-language-models-tests-hybrid-n - - label: Language Models Tests (Hybrid) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-language-models-tests-hybrid-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Language Models Tests (Standard) - depends_on: [] - key: block-language-models-tests-standard - - label: Language Models Tests (Standard) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and (not slow_test)" - depends_on: block-language-models-tests-standard - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Multimodal - steps: - - block: Run Custom Models - depends_on: [] - key: block-custom-models - - label: Custom Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - echo "Testing custom models..." 
- depends_on: block-custom-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Accuracy Eval (Small Models) - depends_on: [] - key: block-multi-modal-accuracy-eval-small-models - - label: Multi-Modal Accuracy Eval (Small Models) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - --tp-size=1 - depends_on: block-multi-modal-accuracy-eval-small-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 1 - depends_on: [] - key: block-multi-modal-models-extended-1 - - label: Multi-Modal Models (Extended) 1 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m "not core_model" --ignore models/multimodal/generation/test_common.py - --ignore models/multimodal/processing - depends_on: block-multi-modal-models-extended-1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 2 - depends_on: [] - key: block-multi-modal-models-extended-2 - - label: Multi-Modal Models (Extended) 2 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=0) - and not core_model" - depends_on: block-multi-modal-models-extended-2 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 3 - depends_on: [] - key: block-multi-modal-models-extended-3 - - label: Multi-Modal Models (Extended) 3 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s 
models/multimodal/generation/test_common.py -m "split(group=1) - and not core_model" - depends_on: block-multi-modal-models-extended-3 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Standard) - depends_on: [] - key: block-multi-modal-models-standard - - label: Multi-Modal Models (Standard) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E "torch" - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py - --ignore models/multimodal/processing - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py - -m core_model - depends_on: block-multi-modal-models-standard - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Processor - depends_on: [] - key: block-multi-modal-processor - - label: Multi-Modal Processor - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing/test_tensor_schema.py - depends_on: block-multi-modal-processor - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Processor Test (CPU) - depends_on: [] - key: block-multi-modal-processor-test-cpu - - label: Multi-Modal Processor Test (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - depends_on: block-multi-modal-processor-test-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Plugins - steps: - - block: Run Plugin Tests (2 GPUs) - depends_on: [] - key: block-plugin-tests-2-gpus - - label: Plugin Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - 
(command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py - - pytest -v -s models/test_oot_registration.py - - pytest -v -s plugins/lora_resolvers - depends_on: block-plugin-tests-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: PyTorch - steps: - - block: Run PyTorch Compilation Unit Tests - depends_on: [] - key: block-pytorch-compilation-unit-tests - - label: PyTorch Compilation Unit Tests - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; - depends_on: block-pytorch-compilation-unit-tests - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run PyTorch Fullgraph - depends_on: [] - key: block-pytorch-fullgraph - - label: PyTorch Fullgraph - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" - - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 - and not Llama-4" - depends_on: block-pytorch-fullgraph - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run PyTorch Fullgraph Smoke Test - depends_on: [] - key: block-pytorch-fullgraph-smoke-test - - label: PyTorch Fullgraph Smoke Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec - pytest -s -v {} \\\\; - depends_on: block-pytorch-fullgraph-smoke-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Pytorch Nightly Dependency Override Check - depends_on: [] - key: block-pytorch-nightly-dependency-override-check - - label: Pytorch Nightly Dependency Override Check - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash standalone_tests/pytorch_nightly_dependency.sh - depends_on: block-pytorch-nightly-dependency-override-check - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Quantization - steps: - - block: Run Quantization - depends_on: [] - key: block-quantization - - label: Quantization - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - depends_on: block-quantization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Quantized MoE Test (B200) - depends_on: [] - key: block-quantized-moe-test-b200 - - label: Quantized MoE Test (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pytest -s -v tests/quantization/test_blackwell_moe.py - depends_on: block-quantized-moe-test-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: Samplers - steps: - - block: Run Samplers Test - depends_on: [] - key: block-samplers-test - - label: Samplers Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - depends_on: block-samplers-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN 
- - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Tool use - steps: - - block: Run OpenAI-Compatible Tool Use - depends_on: [] - key: block-openai-compatible-tool-use - - label: OpenAI-Compatible Tool Use - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "not cpu_test" tool_use - depends_on: block-openai-compatible-tool-use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run OpenAI-Compatible Tool Use (CPU) - depends_on: [] - key: block-openai-compatible-tool-use-cpu - - label: OpenAI-Compatible Tool Use (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "cpu_test" tool_use - depends_on: block-openai-compatible-tool-use-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Weight Loading - steps: - - block: Run Weight Loading Multiple GPU - depends_on: [] - key: block-weight-loading-multiple-gpu - - label: Weight Loading Multiple GPU - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - depends_on: block-weight-loading-multiple-gpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Weight Loading Multiple GPU - Large Models - depends_on: [] - key: block-weight-loading-multiple-gpu---large-models - - label: Weight Loading Multiple GPU - Large Models - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - depends_on: block-weight-loading-multiple-gpu---large-models - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret 
- key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate

From cde1d84b5bc1aea310c8416cb6f38e140eb9215e Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Fri, 5 Dec 2025 02:30:15 -0800
Subject: [PATCH 21/24] build

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml                   |  2 +-
 .buildkite/image_build/image_build.sh       | 36 +++++++++++++++------
 .buildkite/image_build/image_build.yaml     | 16 +--------
 .buildkite/image_build/image_build_cu118.sh | 36 ---------------------
 4 files changed, 29 insertions(+), 61 deletions(-)
 delete mode 100755 .buildkite/image_build/image_build_cu118.sh

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index 5b00e1cab6c7..d85a8517e0c9 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,4 +1,4 @@
-name: ci
+name: vllm_ci
 job_dirs:
 - ".buildkite/test_areas"
 - ".buildkite/image_build"
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
index 87e35acd5e84..9a2384e524b6 100755
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,17 +1,28 @@
 #!/bin/bash
 set -e
 
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 "
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 "
   exit 1
 fi
 
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
 
 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
@@ -21,18 +32,25 @@ else
   exit 0
 fi
 
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
+fi
+
 # build
-docker build --file docker/Dockerfile \
+docker buildx build --file docker/Dockerfile \
   --build-arg max_jobs=16 \
   --build-arg buildkite_commit=$BUILDKITE_COMMIT \
   --build-arg USE_SCCACHE=1 \
   --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
   --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
   --target test \
   --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT
-docker tag $REGISTRY/$REPO:$BUILDKITE_COMMIT $REGISTRY/$REPO:latest
-docker push $REGISTRY/$REPO:latest
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
index af23621a598c..2632634922a5 100644
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -4,7 +4,7 @@ steps:
   key: image-build
   depends_on: []
   commands:
-  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
   env:
     DOCKER_BUILDKIT: "1"
   retry:
@@ -27,20 +27,6 @@ steps:
       limit: 2
     - exit_status: -10 # Agent was lost
       limit: 2
-
-- label: ":docker: Build CUDA 11.8 image"
-  key: image-build-cu118
-  optional: true
-  commands:
-  - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-  env:
-    DOCKER_BUILDKIT: "1"
-  retry:
-    automatic:
-    - exit_status: -1 # Agent was lost
-      limit: 2
-    - exit_status: -10 # Agent was lost
-      limit: 2
 
 - label: ":docker: Build HPU image"
   soft_fail: true
diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh
deleted file mode 100755
index 699cef2ad60f..000000000000
--- a/.buildkite/image_build/image_build_cu118.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 "
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build \
-  --file docker/Dockerfile \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg CUDA_VERSION=11.8.0 \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 \
-  --target test \
-  --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118

From 89a0c2918a5e2d83dd3713989482e22d123449d0 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Fri, 5 Dec 2025 03:34:57 -0800
Subject: [PATCH 22/24] run all patterns

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index d85a8517e0c9..199c33159fde 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -3,9 +3,21 @@ job_dirs:
 - ".buildkite/test_areas"
 - ".buildkite/image_build"
 run_all_patterns:
-- ".*"
+- "docker/Dockerfile"
+- "CMakeLists.txt"
+- "requirements/common.txt"
+- "requirements/cuda.txt"
+- "requirements/build.txt"
+- "requirements/test.txt"
+- "setup.py"
+- "csrc/"
+- "cmake/"
 run_all_exclude_patterns:
-- ".*"
+- "docker/Dockerfile."
+- "csrc/cpu/"
+- "csrc/rocm/"
+- "cmake/hipify.py"
+- "cmake/cpu_extension.cmake"
 registries: public.ecr.aws/q9t5s3a7
 repositories:
   main: "vllm-ci-postmerge-repo"

From a303afd41869094deae2cc3c6a71eefc6130b403 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Mon, 8 Dec 2025 13:20:52 -0800
Subject: [PATCH 23/24] sync

Signed-off-by: Kevin H. Luu
---
 .buildkite/test_areas/e2e_integration.yaml | 18 +++++++++++++
 .buildkite/test_areas/misc.yaml            | 31 +++++++++++++++++-----
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 817b995574bc..3a33ee71e275 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -30,3 +30,21 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index e4182005bb45..3d1dbc98a1e7 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -55,23 +55,29 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  - python3 offline_inference/basic/chat.py # for basic
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  # for multi-modal models
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-  - python3 offline_inference/basic/classify.py
-  - python3 offline_inference/basic/embed.py
-  - python3 offline_inference/basic/score.py
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
   - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -144,3 +150,16 @@ steps:
   commands:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
   - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - vllm/model_executor/layers
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file

From d490e8ec245a388085719de518c64a4a3ef45c96 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Mon, 8 Dec 2025 13:38:45 -0800
Subject: [PATCH 24/24] remove buildkit env

Signed-off-by: Kevin H. Luu
---
 .buildkite/image_build/image_build.yaml    | 2 --
 .buildkite/test_areas/distributed.yaml     | 3 +--
 .buildkite/test_areas/e2e_integration.yaml | 9 +++++++++
 .buildkite/test_areas/misc.yaml            | 2 --
 .buildkite/test_areas/quantization.yaml    | 8 ++++++++
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
index 2632634922a5..d01c71dd9bec 100644
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -5,8 +5,6 @@ steps:
   depends_on: []
   commands:
   - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
-  env:
-    DOCKER_BUILDKIT: "1"
   retry:
     automatic:
     - exit_status: -1 # Agent was lost
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 57756aae4808..2cc90698d916 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -102,7 +102,6 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 
 - label: Distributed Tests (8 GPUs)(H100)
-  optional: true
   timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
@@ -144,7 +143,7 @@ steps:
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-  - VLLM_TEST_CLEAN_GPU_MEMORY=1pytest -v -s tests/distributed/test_sequence_parallel.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 3a33ee71e275..93d389815eda 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -20,6 +20,15 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
 - label: Prime-RL Integration (2 GPUs)
   timeout_in_minutes: 30
   optional: true
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index 3d1dbc98a1e7..072bccadb726 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -66,8 +66,6 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   # for multi-modal models
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
index 02a836b90bdf..6e89d6af3b8d 100644
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -36,3 +36,11 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
   - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+  - pytest -v -s models/quantization