From a7f11ca733519f489ac4e091adc8585633484d95 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 24 Nov 2025 12:42:54 -0800 Subject: [PATCH 01/24] p Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/attention.yaml | 19 ++ .buildkite/test_areas/basic_correctness.yaml | 14 ++ .buildkite/test_areas/benchmarks.yaml | 17 ++ .buildkite/test_areas/compile.yaml | 55 +++++ .buildkite/test_areas/cuda.yaml | 20 ++ .buildkite/test_areas/distributed.yaml | 205 ++++++++++++++++++ .buildkite/test_areas/e2e_integration.yaml | 19 ++ .buildkite/test_areas/engine.yaml | 27 +++ .buildkite/test_areas/entrypoints.yaml | 66 ++++++ .buildkite/test_areas/expert_parallelism.yaml | 21 ++ .buildkite/test_areas/kernels.yaml | 115 ++++++++++ .buildkite/test_areas/lm_eval.yaml | 44 ++++ .buildkite/test_areas/lora.yaml | 38 ++++ .buildkite/test_areas/misc.yaml | 150 +++++++++++++ .buildkite/test_areas/model_executor.yaml | 15 ++ .buildkite/test_areas/models_basic.yaml | 65 ++++++ .buildkite/test_areas/models_distributed.yaml | 20 ++ .buildkite/test_areas/models_language.yaml | 94 ++++++++ .buildkite/test_areas/models_multimodal.yaml | 67 ++++++ .buildkite/test_areas/plugins.yaml | 32 +++ .buildkite/test_areas/pytorch.yaml | 48 ++++ .buildkite/test_areas/quantization.yaml | 35 +++ .buildkite/test_areas/samplers.yaml | 12 + .buildkite/test_areas/tool_use.yaml | 20 ++ .buildkite/test_areas/weight_loading.yaml | 25 +++ 25 files changed, 1243 insertions(+) create mode 100644 .buildkite/test_areas/attention.yaml create mode 100644 .buildkite/test_areas/basic_correctness.yaml create mode 100644 .buildkite/test_areas/benchmarks.yaml create mode 100644 .buildkite/test_areas/compile.yaml create mode 100644 .buildkite/test_areas/cuda.yaml create mode 100644 .buildkite/test_areas/distributed.yaml create mode 100644 .buildkite/test_areas/e2e_integration.yaml create mode 100644 .buildkite/test_areas/engine.yaml create mode 100644 .buildkite/test_areas/entrypoints.yaml create mode 100644 .buildkite/test_areas/expert_parallelism.yaml create mode 100644 .buildkite/test_areas/kernels.yaml create mode 100644 .buildkite/test_areas/lm_eval.yaml create mode 100644 .buildkite/test_areas/lora.yaml create mode 100644 .buildkite/test_areas/misc.yaml create mode 100644 .buildkite/test_areas/model_executor.yaml create mode 100644 .buildkite/test_areas/models_basic.yaml create mode 100644 .buildkite/test_areas/models_distributed.yaml create mode 100644 .buildkite/test_areas/models_language.yaml create mode 100644 .buildkite/test_areas/models_multimodal.yaml create mode 100644 .buildkite/test_areas/plugins.yaml create mode 100644 .buildkite/test_areas/pytorch.yaml create mode 100644 .buildkite/test_areas/quantization.yaml create mode 100644 .buildkite/test_areas/samplers.yaml create mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 .buildkite/test_areas/weight_loading.yaml diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml new file mode 100644 index 000000000000..af57cc6681b4 --- /dev/null +++ b/.buildkite/test_areas/attention.yaml @@ -0,0 +1,19 @@ +group: Attention +steps: +- label: V1 attention (H100) + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 attention (B200) + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill 
is bugged and causes incorrectness, fix this diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml new file mode 100644 index 000000000000..27c4d96aeb8c --- /dev/null +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -0,0 +1,14 @@ +group: Basic Correctness +steps: +- label: Basic Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml new file mode 100644 index 000000000000..c48c72fb405b --- /dev/null +++ b/.buildkite/test_areas/benchmarks.yaml @@ -0,0 +1,17 @@ +group: Benchmarks +steps: +- label: Benchmarks + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml new file mode 100644 index 000000000000..4b05bd8976e4 --- /dev/null +++ b/.buildkite/test_areas/compile.yaml @@ -0,0 +1,55 @@ +group: Compile +steps: +- label: Fusion and Compile Tests (B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Fusion E2E (2 GPUs)(B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - 
vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml new file mode 100644 index 000000000000..6c8ff70ba45a --- /dev/null +++ b/.buildkite/test_areas/cuda.yaml @@ -0,0 +1,20 @@ +group: CUDA +steps: +- label: Platform Tests (CUDA) + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Cudagraph + timeout_in_minutes: 20 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py \ No newline at end of file diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml new file mode 100644 index 000000000000..56dee5c31389 --- /dev/null +++ b/.buildkite/test_areas/distributed.yaml @@ -0,0 +1,205 @@ +group: Distributed +steps: +- label: Distributed Comm Ops + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: Distributed (2 GPUs) + timeout_in_minutes: 90 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Tests (4 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - 
examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: Distributed Tests (8 GPUs)(H100) + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: Distributed Tests (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: Distributed Tests (2 GPUs)(H200) + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s 
tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: Distributed Tests (2 GPUs)(B200) + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: 2 Node Test (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed NixlConnector PD accuracy (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs)) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py \ No 
newline at end of file diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml new file mode 100644 index 000000000000..dca7c1fcdf31 --- /dev/null +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -0,0 +1,19 @@ +group: E2E Integration +steps: +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml new file mode 100644 index 000000000000..be099758eb88 --- /dev/null +++ b/.buildkite/test_areas/engine.yaml @@ -0,0 +1,27 @@ +group: Engine +steps: +- label: Engine + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 e2e + engine + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml new file mode 100644 index 000000000000..adbd6e96291e --- /dev/null +++ b/.buildkite/test_areas/entrypoints.yaml @@ -0,0 +1,66 @@ +group: Entrypoints +steps: +- label: Entrypoints Unit Tests + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration (LLM) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration (API Server) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py 
--ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + + +- label: Entrypoints Integration (Pooling) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + + +- label: Entrypoints V1 + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: OpenAI API Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml new file mode 100644 index 000000000000..a1316f289d59 --- /dev/null +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -0,0 +1,21 @@ +group: Expert Parallelism +steps: +- label: EPLB Algorithm + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml new file mode 100644 index 000000000000..91c682ca9546 --- /dev/null +++ b/.buildkite/test_areas/kernels.yaml @@ -0,0 +1,115 @@ +group: Kernels +steps: +- label: Kernels Core Operation Test + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N + timeout_in_minutes: 35 + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N + timeout_in_minutes: 60 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test + timeout_in_minutes: 45 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) + 
timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml new file mode 100644 index 000000000000..c6498c032440 --- /dev/null +++ b/.buildkite/test_areas/lm_eval.yaml @@ -0,0 +1,44 @@ +group: LM Eval +steps: +- label: LM Eval Small Models + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - 
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (4 GPUs)(H100) + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: LM Eval Small Models (B200) + timeout_in_minutes: 120 + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml new file mode 100644 index 000000000000..3f41b5ff0f8f --- /dev/null +++ b/.buildkite/test_areas/lora.yaml @@ -0,0 +1,38 @@ +group: LoRA +steps: +- label: LoRA %N + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + + +- label: LoRA TP (Distributed) + timeout_in_minutes: 30 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml new file mode 100644 index 000000000000..ddca24efed1f --- /dev/null +++ b/.buildkite/test_areas/misc.yaml @@ -0,0 +1,150 @@ +group: Miscellaneous +steps: +- label: V1 Others + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Others (CPU) + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: Regression + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Metrics, Tracing (2 GPUs) + timeout_in_minutes: 20 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Python-only Installation + timeout_in_minutes: 20 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Async Engine, 
Inputs, Utils, Worker + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: GPT-OSS Eval (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 \ No newline at end of file diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml new file mode 100644 index 000000000000..c2d52654f0d2 --- /dev/null +++ b/.buildkite/test_areas/model_executor.yaml @@ -0,0 +1,15 @@ +group: Model Executor +steps: +- label: Model Executor + timeout_in_minutes: 35 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml new file mode 100644 index 000000000000..9506a613790c --- /dev/null +++ b/.buildkite/test_areas/models_basic.yaml @@ -0,0 +1,65 @@ +group: Models - Basic +steps: +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) 
Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Transformers Nightly Models + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml new file mode 100644 index 000000000000..ea38fdb12d2e --- /dev/null +++ b/.buildkite/test_areas/models_distributed.yaml @@ -0,0 +1,20 @@ +group: Models - Distributed +steps: +- label: Distributed Model Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml new file mode 100644 index 000000000000..65303f049613 --- /dev/null +++ b/.buildkite/test_areas/models_language.yaml @@ -0,0 +1,94 @@ +group: Models - Language +steps: +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s 
models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml new file mode 100644 index 000000000000..5d31192d169a --- /dev/null +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -0,0 +1,67 @@ +group: Models - Multimodal +steps: +- label: Multi-Modal Models (Standard) # 60min + timeout_in_minutes: 80 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore 
models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Processor # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models (Extended) 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml new file mode 100644 index 000000000000..f922d5c919f8 --- /dev/null +++ b/.buildkite/test_areas/plugins.yaml @@ -0,0 +1,32 @@ +group: Plugins +steps: +- label: Plugin Tests (2 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml new file mode 100644 index 000000000000..34c0c87fb2c6 --- /dev/null +++ b/.buildkite/test_areas/pytorch.yaml @@ -0,0 +1,48 @@ +group: PyTorch +steps: +- label: PyTorch Compilation Unit Tests + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Smoke Test + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh \ No newline at end of file diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml new file mode 100644 index 000000000000..554d6447d791 --- /dev/null +++ b/.buildkite/test_areas/quantization.yaml @@ -0,0 +1,35 @@ +group: Quantization +steps: +- label: Quantization + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: Quantized MoE Test (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml new file mode 100644 index 000000000000..0d26ffbd00ac --- /dev/null +++ b/.buildkite/test_areas/samplers.yaml @@ -0,0 +1,12 @@ +group: Samplers +steps: +- label: Samplers Test + timeout_in_minutes: 75 + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml new file mode 100644 
index 000000000000..328158d0a948 --- /dev/null +++ b/.buildkite/test_areas/tool_use.yaml @@ -0,0 +1,20 @@ +group: Tool use +steps: +- label: OpenAI-Compatible Tool Use + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml new file mode 100644 index 000000000000..98ac8ef2ec2a --- /dev/null +++ b/.buildkite/test_areas/weight_loading.yaml @@ -0,0 +1,25 @@ +group: Weight Loading +steps: +- label: Weight Loading Multiple GPU # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU - Large Models # optional + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt From 27a893bb054b0988fcb733476d606a9c27db6dc5 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 25 Nov 2025 01:58:45 -0800 Subject: [PATCH 02/24] move primerl Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/e2e_integration.yaml | 13 ++++++++++++- .buildkite/test_areas/misc.yaml | 13 +------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index dca7c1fcdf31..b7255737f889 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -16,4 +16,15 @@ steps: num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index ddca24efed1f..ef57557b568f 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -47,17 +47,6 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Prime-RL Integration (2 GPUs) - timeout_in_minutes: 30 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh - commands: - - bash .buildkite/scripts/run-prime-rl-test.sh - - label: Examples timeout_in_minutes: 45 working_dir: "/vllm-workspace/examples" @@ -147,4 +136,4 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 
\ No newline at end of file
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

From 77011542b42a43b643a47ed42b84333bf191e822 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Wed, 26 Nov 2025 02:05:22 -0800
Subject: [PATCH 03/24] test

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml              |   10 +
 .buildkite/test_areas/distributed.yaml |    4 +-
 buildkite_steps.yaml                   | 2212 ++++++++++++++++++++++++
 3 files changed, 2224 insertions(+), 2 deletions(-)
 create mode 100644 .buildkite/ci_config.yaml
 create mode 100644 buildkite_steps.yaml

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 000000000000..40e923a24b71
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,10 @@
+name: ci
+job_dirs:
+  - ".buildkite/test_areas"
+run_all_patterns:
+  - ".*"
+run_all_exclude_patterns:
+  - ".*"
+registries:
+  main: "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo"
+  premerge: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo"
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 56dee5c31389..67d7527e36c1 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -168,13 +168,13 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
diff --git a/buildkite_steps.yaml b/buildkite_steps.yaml
new file mode 100644
index 000000000000..7b489a91ef01
--- /dev/null
+++ b/buildkite_steps.yaml
@@ -0,0 +1,2212 @@
+steps:
+- group: Attention
+  steps:
+  - label: V1 attention (B200)
+    agents:
+      queue: gpu_1_queue
+    commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
+    soft_fail: false
+    plugins:
+    - docker#v5.2.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None
+        always_pull: true
+        propagate_environment: true
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - 
NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 attention (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/attention + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Basic Correctness + steps: + - label: Basic Correctness + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Benchmarks + steps: + - label: Benchmarks + agents: + queue: gpu_1_queue + commands: + - bash scripts/run-benchmarks.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Benchmarks CLI Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s benchmarks/ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: CUDA + steps: + - label: Cudagraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Platform Tests (CUDA) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s cuda/test_cuda_context.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Compile + steps: + - label: Fusion 
E2E (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Fusion and Compile Tests (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + -k 'True and not +quant_fp8 and not +rms_norm' + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Distributed + steps: + - label: 2 Node Test (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s 
entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 + distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Comm Ops + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed NixlConnector PD accuracy (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(H200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 + VLLM_LOGGING_LEVEL=DEBUG 
python3 examples/offline_inference/data_parallel.py + --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (8 GPUs)(H100) + agents: + queue: gpu_1_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pipeline + Context Parallelism (4 GPUs)) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: E2E Integration + steps: + - label: DeepSeek V2-Lite Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh + 0.25 200 8010 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Prime-RL Integration (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Qwen3-30B-A3B-FP8-block Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh + 0.8 200 8020 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Engine + steps: + - label: Engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s tokenization + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 e2e + engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Entrypoints + steps: + - label: Entrypoints Integration (API Server) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py + --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py + --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (LLM) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (Pooling) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Unit Tests + agents: + queue: gpu_1_queue + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai + --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints V1 + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/entrypoints + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI API Correctness + agents: + queue: gpu_1_queue + commands: + - pytest -s entrypoints/openai/correctness/ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Expert Parallelism + steps: + - label: EPLB Algorithm + agents: + queue: gpu_1_queue + commands: + - pytest -v -s distributed/test_eplb_algo.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: EPLB Execution + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Kernels + steps: + - label: Kernels (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: 
Kernels Attention Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Core Operation Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels DeepGEMM Test (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Mamba Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/mamba + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels MoE Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Quantization Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LM Eval + steps: + - label: LM Eval Large Models (4 
GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Large Models (4 GPUs)(H100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LoRA + steps: + - label: LoRA %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py + \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py + \ --ignore=lora/test_qwen3moe_tp.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LoRA TP (Distributed) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - 
pytest -v -s -x lora/test_gptoss_tp.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Miscellaneous + steps: + - label: Async Engine, Inputs, Utils, Worker + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Examples + agents: + queue: gpu_1_queue + commands: + - pip install tensorizer + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf + --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory + /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m + deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper + --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: 
true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: GPT-OSS Eval (B200) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b + --metric 0.58 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Metrics, Tracing (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' + 'opentelemetry-semantic-conventions-ai>=0.4.1' + - pytest -v -s v1/tracing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Python-only Installation + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/python_only_compile.sh + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Regression + agents: + queue: gpu_1_queue + commands: + - pip install modelscope + - pytest -v -s test_regression.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others + agents: + queue: gpu_1_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - 
VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Model Executor + steps: + - label: Model Executor + agents: + queue: gpu_1_queue + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Basic + steps: + - label: Basic Models Test (Other CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Extra Initialization) %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Initialization) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Other) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_transformers.py 
models/test_registry.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Transformers Nightly Models + agents: + queue: gpu_1_queue + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal + or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR + or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py + --model-type whisper + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Distributed + steps: + - label: Distributed Model Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py + -v -s -m 'distributed(num_gpus=2)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Language + steps: + - label: Language Models Test (Extended Generation) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (Extended Pooling) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s 
models/language/pooling -m 'not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (MTEB) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling_mteb_test + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (PPL) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/generation_ppl_test + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Extra Standard) %N + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Hybrid) %N + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Standard) + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + 
mount_buildkite_agent: true +- group: Models - Multimodal + steps: + - label: Custom Models + agents: + queue: gpu_1_queue + commands: + - echo 'Testing custom models...' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Accuracy Eval (Small Models) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + --tp-size=1 + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 1 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + --ignore models/multimodal/processing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 2 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) + and not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 3 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) + and not core_model' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Standard) + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py + --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py + -m core_model + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Plugins + steps: + - label: Plugin Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: PyTorch + steps: + - label: PyTorch Compilation Unit Tests + agents: + queue: gpu_1_queue + commands: + - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 + and not Llama-4' + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - 
/fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph Smoke Test + agents: + queue: gpu_1_queue + commands: + - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + pytest -s -v {} \\; + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pytorch Nightly Dependency Override Check + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + soft_fail: true + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Quantization + steps: + - label: Quantization + agents: + queue: gpu_1_queue + commands: + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Quantized MoE Test (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Samplers + steps: + - label: Samplers Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Tool use + steps: + - label: OpenAI-Compatible Tool Use + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' tool_use + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: 
OpenAI-Compatible Tool Use (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' tool_use + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Weight Loading + steps: + - label: Weight Loading Multiple GPU + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Weight Loading Multiple GPU - Large Models + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true From 0a7642e28828600763418730efee543427fcdddc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:41:13 -0800 Subject: [PATCH 04/24] update pipeline yaml Signed-off-by: Kevin H. 
Luu --- .buildkite/ci_config.yaml | 8 +- .buildkite/pipeline.yaml | 2700 +++++++++++++++++ .buildkite/test_areas/attention.yaml | 2 + .buildkite/test_areas/basic_correctness.yaml | 2 + .buildkite/test_areas/benchmarks.yaml | 2 + .buildkite/test_areas/compile.yaml | 2 + .buildkite/test_areas/cuda.yaml | 2 + .buildkite/test_areas/distributed.yaml | 2 + .buildkite/test_areas/e2e_integration.yaml | 2 + .buildkite/test_areas/engine.yaml | 2 + .buildkite/test_areas/entrypoints.yaml | 2 + .buildkite/test_areas/expert_parallelism.yaml | 2 + .buildkite/test_areas/kernels.yaml | 2 + .buildkite/test_areas/lm_eval.yaml | 2 + .buildkite/test_areas/lora.yaml | 2 + .buildkite/test_areas/misc.yaml | 5 + .buildkite/test_areas/model_executor.yaml | 2 + .buildkite/test_areas/models_basic.yaml | 5 +- .buildkite/test_areas/models_distributed.yaml | 2 + .buildkite/test_areas/models_language.yaml | 2 + .buildkite/test_areas/models_multimodal.yaml | 2 + .buildkite/test_areas/plugins.yaml | 2 + .buildkite/test_areas/pytorch.yaml | 2 + .buildkite/test_areas/quantization.yaml | 2 + .buildkite/test_areas/samplers.yaml | 2 + .buildkite/test_areas/tool_use.yaml | 3 + .buildkite/test_areas/weight_loading.yaml | 4 +- 27 files changed, 2759 insertions(+), 8 deletions(-) create mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index 40e923a24b71..2b0908bd3bd7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -1,10 +1,12 @@ name: ci job_dirs: - ".buildkite/test_areas" + - ".buildkite/build" run_all_patterns: - ".*" run_all_exclude_patterns: - ".*" -registries: - main: "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo" - premerge: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo" +registries: public.ecr.aws/q9t5s3a7 +repositories: + main: "vllm-ci-postmerge-repo" + premerge: "vllm-ci-test-repo" diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml new file mode 100644 index 000000000000..818400f36234 --- /dev/null +++ b/.buildkite/pipeline.yaml @@ -0,0 +1,2700 @@ +steps: +- group: Abuild + steps: + - label: ':docker: Build CPU arm64 image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + - label: ':docker: Build CPU image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build CUDA 11.8 image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build HPU image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 + soft_fail: true + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' + - label: ':docker: Build image' + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - .buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + 
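As a sketch only (the helper name and layout below are assumptions, not code from this patch): the build steps above pass the registry, repository, and tag to the image build scripts as three separate arguments, e.g. ".buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123", matching the registries/repositories split introduced in ci_config.yaml. A pullable Docker reference joins registry and repository with "/" and appends the tag after ":", so the downstream test steps would pull something of the form registry/repository:tag.

# Hypothetical helper (not part of this patch): compose the image reference that
# test steps pull, from the split registries/repositories fields in
# .buildkite/ci_config.yaml plus the build number used as the tag.
def image_reference(registry: str, repository: str, tag: str) -> str:
    # Standard Docker form: <registry>/<repository>:<tag>,
    # e.g. public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123
    return f"{registry}/{repository}:{tag}"

if __name__ == "__main__":
    # Values taken from the build commands above; "123" stands in for the real build number.
    print(image_reference("public.ecr.aws/q9t5s3a7", "vllm-ci-test-repo", "123"))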
DOCKER_BUILDKIT: '1' +- group: Attention + steps: + - label: V1 attention (B200) + agents: + queue: gpu_1_queue + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 attention (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/attention + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate +- group: Basic Correctness + steps: + - label: Basic Correctness + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Benchmarks + steps: + - label: Benchmarks + agents: + queue: gpu_1_queue + commands: + - bash scripts/run-benchmarks.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Benchmarks CLI Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s benchmarks/ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - 
/fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: CUDA + steps: + - label: Cudagraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Platform Tests (CUDA) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s cuda/test_cuda_context.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Compile + steps: + - label: Fusion E2E (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Fusion and Compile Tests (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + -k 'True and not +quant_fp8 and not +rms_norm' + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Distributed + steps: + - label: 2 Node Test (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s 
distributed/test_pipeline_parallel.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node + test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 + distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 + --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 + --enforce-eager --trust-remote-code + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 + distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Comm Ops + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed NixlConnector PD accuracy (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - 
NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(B200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (2 GPUs)(H200) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 + VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py + --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs) + agents: + queue: gpu_4_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 
RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Distributed Tests (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Distributed Tests (8 GPUs)(H100) + agents: + queue: gpu_1_queue + commands: + - export NCCL_CUMEM_HOST_ENABLE=0 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Pipeline + Context Parallelism (4 GPUs)) + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: E2E Integration + steps: + - label: DeepSeek V2-Lite Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh + 0.25 200 8010 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Prime-RL Integration (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Qwen3-30B-A3B-FP8-block Accuracy + agents: + queue: gpu_4_queue + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh + 0.8 200 8020 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate +- group: Engine + steps: + - label: Engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s engine test_sequence.py 
test_config.py test_logger.py test_vllm_port.py + - pytest -v -s tokenization + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 e2e + engine + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Entrypoints + steps: + - label: Entrypoints Integration (API Server) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py + --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py + --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (LLM) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Integration (Pooling) + agents: + queue: gpu_1_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints Unit Tests + agents: + queue: gpu_1_queue + commands: + 
- pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai + --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Entrypoints V1 + agents: + queue: gpu_1_queue + commands: + - pytest -v -s v1/entrypoints + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI API Correctness + agents: + queue: gpu_1_queue + commands: + - pytest -s entrypoints/openai/correctness/ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Expert Parallelism + steps: + - label: EPLB Algorithm + agents: + queue: gpu_1_queue + commands: + - pytest -v -s distributed/test_eplb_algo.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: EPLB Execution + agents: + queue: gpu_4_queue + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Kernels + steps: + - label: Kernels (B200) + agents: + queue: gpu_1_queue + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Attention Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Core Operation Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels DeepGEMM Test (H100) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: 
devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: Kernels Mamba Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/mamba + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels MoE Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Kernels Quantization Test %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LM Eval + steps: + - label: LM Eval Large Models (4 GPUs)(A100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: LM Eval Large Models (4 GPUs)(H100) + agents: + queue: gpu_4_queue + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + 
podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: LM Eval Small Models + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LM Eval Small Models (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: LoRA + steps: + - label: LoRA %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py + \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py + \ --ignore=lora/test_qwen3moe_tp.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: LoRA TP (Distributed) + agents: + queue: gpu_4_queue + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Miscellaneous + steps: + - label: Async Engine, Inputs, Utils, Worker + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Examples + agents: + queue: gpu_1_queue + commands: + - pip install tensorizer + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf + --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory + /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m + deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper + --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens + 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp + 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: 
all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: GPT-OSS Eval (B200) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b + --metric 0.58 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Metrics, Tracing (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' + 'opentelemetry-semantic-conventions-ai>=0.4.1' + - pytest -v -s v1/tracing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Python-only Installation + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/python_only_compile.sh + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Regression + agents: + queue: gpu_1_queue + commands: + - pip install modelscope + - pytest -v -s test_regression.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others + agents: + queue: gpu_1_queue + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: V1 Others (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Model Executor + steps: + - label: Model Executor + agents: + queue: gpu_1_queue + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Basic + steps: + - label: Basic Models Test (Other CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Extra Initialization) %N + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Initialization) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - 
HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Basic Models Tests (Other) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Transformers Nightly Models + agents: + queue: gpu_1_queue + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal + or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR + or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py + --model-type whisper + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Distributed + steps: + - label: Distributed Model Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py + -v -s -m 'distributed(num_gpus=2)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Language + steps: + - label: Language Models Test (Extended Generation) + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + 
propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (Extended Pooling) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (MTEB) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/pooling_mteb_test + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Test (PPL) + agents: + queue: gpu_1_queue + commands: + - pytest -v -s models/language/generation_ppl_test + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Extra Standard) %N + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests (Hybrid) %N + agents: + queue: gpu_1_queue + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + \ --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Language Models Tests 
(Standard) + agents: + queue: gpu_1_queue + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Models - Multimodal + steps: + - label: Custom Models + agents: + queue: gpu_1_queue + commands: + - echo 'Testing custom models...' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Accuracy Eval (Small Models) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + --tp-size=1 + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 1 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + --ignore models/multimodal/processing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 2 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) + and not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Extended) 3 + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) + and not core_model' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + 
always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Models (Standard) + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py + --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py + -m core_model + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor + agents: + queue: gpu_1_queue + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Plugins + steps: + - label: Plugin Tests (2 GPUs) + agents: + queue: gpu_4_queue + commands: + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: PyTorch + steps: + - label: PyTorch Compilation Unit Tests + agents: + queue: gpu_1_queue + commands: + - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - 
HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph + agents: + queue: gpu_1_queue + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 + and not Llama-4' + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: PyTorch Fullgraph Smoke Test + agents: + queue: gpu_1_queue + commands: + - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + pytest -s -v {} \\; + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Pytorch Nightly Dependency Override Check + agents: + queue: gpu_1_queue + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + depends_on: + - image-build + soft_fail: true + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Quantization + steps: + - label: Quantization + agents: + queue: gpu_1_queue + commands: + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Quantized MoE Test (B200) + agents: + queue: gpu_1_queue + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Samplers + steps: + - label: Samplers Test + agents: + queue: gpu_1_queue + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + depends_on: + - image-build + soft_fail: false + plugins: + - 
docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Tool use + steps: + - label: OpenAI-Compatible Tool Use + agents: + queue: gpu_1_queue + commands: + - pytest -v -s -m 'not cpu_test' tool_use + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: OpenAI-Compatible Tool Use (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - pytest -v -s -m 'cpu_test' tool_use + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true +- group: Weight Loading + steps: + - label: Weight Loading Multiple GPU + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + always_pull: true + propagate_environment: true + gpus: all + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Weight Loading Multiple GPU - Large Models + agents: + queue: gpu_4_queue + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + depends_on: + - image-build + soft_fail: false + plugins: + - kubernetes: + kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index af57cc6681b4..6e444eae14c7 100644 --- 
a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -1,4 +1,6 @@ group: Attention +depends_on: + - image-build steps: - label: V1 attention (H100) timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml index 27c4d96aeb8c..759d2b535871 100644 --- a/.buildkite/test_areas/basic_correctness.yaml +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -1,4 +1,6 @@ group: Basic Correctness +depends_on: + - image-build steps: - label: Basic Correctness timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index c48c72fb405b..574b642d407b 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -1,4 +1,6 @@ group: Benchmarks +depends_on: + - image-build steps: - label: Benchmarks timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 4b05bd8976e4..0ba00925a483 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -1,4 +1,6 @@ group: Compile +depends_on: + - image-build steps: - label: Fusion and Compile Tests (B200) timeout_in_minutes: 40 diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml index 6c8ff70ba45a..50c0c338c243 100644 --- a/.buildkite/test_areas/cuda.yaml +++ b/.buildkite/test_areas/cuda.yaml @@ -1,4 +1,6 @@ group: CUDA +depends_on: + - image-build steps: - label: Platform Tests (CUDA) timeout_in_minutes: 15 diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 67d7527e36c1..e6ae13b8156d 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -1,4 +1,6 @@ group: Distributed +depends_on: + - image-build steps: - label: Distributed Comm Ops timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index b7255737f889..817b995574bc 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -1,4 +1,6 @@ group: E2E Integration +depends_on: + - image-build steps: - label: DeepSeek V2-Lite Accuracy timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index be099758eb88..e4d12f3453f1 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -1,4 +1,6 @@ group: Engine +depends_on: + - image-build steps: - label: Engine timeout_in_minutes: 40 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index adbd6e96291e..0a789be943f3 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -1,4 +1,6 @@ group: Entrypoints +depends_on: + - image-build steps: - label: Entrypoints Unit Tests timeout_in_minutes: 10 diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml index a1316f289d59..feb8252148c7 100644 --- a/.buildkite/test_areas/expert_parallelism.yaml +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -1,4 +1,6 @@ group: Expert Parallelism +depends_on: + - image-build steps: - label: EPLB Algorithm timeout_in_minutes: 15 diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 91c682ca9546..7ca099516d64 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -1,4 +1,6 @@ group: Kernels 
+depends_on: + - image-build steps: - label: Kernels Core Operation Test timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index c6498c032440..9af43e0c375a 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -1,4 +1,6 @@ group: LM Eval +depends_on: + - image-build steps: - label: LM Eval Small Models timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 3f41b5ff0f8f..45e3af03591d 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -1,4 +1,6 @@ group: LoRA +depends_on: + - image-build steps: - label: LoRA %N timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index ef57557b568f..ec719825b377 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -1,4 +1,6 @@ group: Miscellaneous +depends_on: + - image-build steps: - label: V1 Others timeout_in_minutes: 60 @@ -25,6 +27,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: V1 Others (CPU) + depends_on: ~ source_file_dependencies: - vllm/ - tests/v1 @@ -88,6 +91,7 @@ steps: - pytest -v -s v1/tracing - label: Python-only Installation + depends_on: ~ timeout_in_minutes: 20 source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -106,6 +110,7 @@ steps: - pytest -v -s utils_ - label: Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: ~ timeout_in_minutes: 10 source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml index c2d52654f0d2..996c8bb8b780 100644 --- a/.buildkite/test_areas/model_executor.yaml +++ b/.buildkite/test_areas/model_executor.yaml @@ -1,4 +1,6 @@ group: Model Executor +depends_on: + - image-build steps: - label: Model Executor timeout_in_minutes: 35 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 9506a613790c..ceddf841f87a 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -1,4 +1,6 @@ group: Models - Basic +depends_on: + - image-build steps: - label: Basic Models Tests (Initialization) timeout_in_minutes: 45 @@ -30,8 +32,6 @@ steps: - label: Basic Models Tests (Other) timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true source_file_dependencies: - vllm/ - tests/models/test_transformers.py @@ -41,7 +41,6 @@ steps: - label: Basic Models Test (Other CPU) # 5min timeout_in_minutes: 10 - torch_nightly: true source_file_dependencies: - vllm/ - tests/models/test_utils.py diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml index ea38fdb12d2e..b6bfbf2ddab4 100644 --- a/.buildkite/test_areas/models_distributed.yaml +++ b/.buildkite/test_areas/models_distributed.yaml @@ -1,4 +1,6 @@ group: Models - Distributed +depends_on: + - image-build steps: - label: Distributed Model Tests (2 GPUs) timeout_in_minutes: 50 diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 65303f049613..fdf78dc48746 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -1,4 +1,6 @@ group: Models - Language +depends_on: + - image-build steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 diff --git 
a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 5d31192d169a..68e5e485c316 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -1,4 +1,6 @@ group: Models - Multimodal +depends_on: + - image-build steps: - label: Multi-Modal Models (Standard) # 60min timeout_in_minutes: 80 diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index f922d5c919f8..60c179aa098e 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -1,4 +1,6 @@ group: Plugins +depends_on: + - image-build steps: - label: Plugin Tests (2 GPUs) timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index 34c0c87fb2c6..dab6e674990b 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -1,4 +1,6 @@ group: PyTorch +depends_on: + - image-build steps: - label: PyTorch Compilation Unit Tests timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 554d6447d791..cff4a7189806 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -1,4 +1,6 @@ group: Quantization +depends_on: + - image-build steps: - label: Quantization timeout_in_minutes: 90 diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 0d26ffbd00ac..ad377148fd07 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -1,4 +1,6 @@ group: Samplers +depends_on: + - image-build steps: - label: Samplers Test timeout_in_minutes: 75 diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml index 328158d0a948..7040cd1d253b 100644 --- a/.buildkite/test_areas/tool_use.yaml +++ b/.buildkite/test_areas/tool_use.yaml @@ -1,4 +1,6 @@ group: Tool use +depends_on: + - image-build steps: - label: OpenAI-Compatible Tool Use timeout_in_minutes: 35 @@ -11,6 +13,7 @@ steps: - pytest -v -s -m 'not cpu_test' tool_use - label: OpenAI-Compatible Tool Use (CPU) + depends_on: ~ timeout_in_minutes: 10 source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml index 98ac8ef2ec2a..cfc5bb20fe7a 100644 --- a/.buildkite/test_areas/weight_loading.yaml +++ b/.buildkite/test_areas/weight_loading.yaml @@ -1,8 +1,9 @@ group: Weight Loading +depends_on: + - image-build steps: - label: Weight Loading Multiple GPU # 33min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true @@ -13,7 +14,6 @@ steps: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - label: Weight Loading Multiple GPU - Large Models # optional - mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 gpu: a100 From 265bf9f6e24be205a87474f5e654e0877cc8c636 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:44:01 -0800 Subject: [PATCH 05/24] key Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 818400f36234..91a9208ebedb 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2,6 +2,7 @@ steps: - group: Abuild steps: - label: ':docker: Build CPU arm64 image' + key: image-build-cpu-arm64 agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -9,6 +10,7 @@ steps: 123 soft_fail: false - label: ':docker: Build CPU image' + key: image-build-cpu agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -24,6 +26,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build CUDA 11.8 image' + key: image-build-cu118 agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -39,6 +42,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build HPU image' + key: image-build-hpu agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -54,6 +58,7 @@ steps: env: DOCKER_BUILDKIT: '1' - label: ':docker: Build image' + key: image-build agents: queue: cpu_queue_premerge_us_east_1 commands: From 0784707b3fc7abbc55a3fcc3fb8b0eac5cc11b5c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:49:06 -0800 Subject: [PATCH 06/24] build files Signed-off-by: Kevin H. Luu --- .buildkite/ci_config.yaml | 2 +- .buildkite/image_build/image_build.sh | 38 ++++++++++++ .buildkite/image_build/image_build.yaml | 61 +++++++++++++++++++ .buildkite/image_build/image_build_cpu.sh | 36 +++++++++++ .../image_build/image_build_cpu_arm64.sh | 33 ++++++++++ .buildkite/image_build/image_build_cu118.sh | 36 +++++++++++ .buildkite/image_build/image_build_hpu.sh | 34 +++++++++++ .buildkite/pipeline.yaml | 11 ++-- 8 files changed, 245 insertions(+), 6 deletions(-) create mode 100644 .buildkite/image_build/image_build.sh create mode 100644 .buildkite/image_build/image_build.yaml create mode 100644 .buildkite/image_build/image_build_cpu.sh create mode 100644 .buildkite/image_build/image_build_cpu_arm64.sh create mode 100644 .buildkite/image_build/image_build_cu118.sh create mode 100644 .buildkite/image_build/image_build_hpu.sh diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml index 2b0908bd3bd7..5b00e1cab6c7 100644 --- a/.buildkite/ci_config.yaml +++ b/.buildkite/ci_config.yaml @@ -1,7 +1,7 @@ name: ci job_dirs: - ".buildkite/test_areas" - - ".buildkite/build" + - ".buildkite/image_build" run_all_patterns: - ".*" run_all_exclude_patterns: diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh new file mode 100644 index 000000000000..87e35acd5e84 --- /dev/null +++ b/.buildkite/image_build/image_build.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \ + --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT \ + --target test \ + --progress plain . 
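+
+# Hedged usage sketch (not part of the CI flow): pipeline.yaml calls this script as
+#   .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo $BUILDKITE_COMMIT
+# so a local dry run with the same arguments would build and push
+# <registry>/<repo>:<commit> and retag it as :latest, as the push step below shows.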
+ +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT +docker tag $REGISTRY/$REPO:$BUILDKITE_COMMIT $REGISTRY/$REPO:latest +docker push $REGISTRY/$REPO:latest diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml new file mode 100644 index 000000000000..4b2c1da458af --- /dev/null +++ b/.buildkite/image_build/image_build.yaml @@ -0,0 +1,61 @@ +group: Abuild +steps: + - label: ":docker: Build image" + key: image-build + commands: + - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU image" + key: image-build-cpu + commands: + - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CUDA 11.8 image" + key: image-build-cu118 + commands: + - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build HPU image" + soft_fail: true + key: image-build-hpu + commands: + - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU arm64 image" + key: image-build-cpu-arm64 + optional: true + commands: + - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh new file mode 100644 index 000000000000..a69732f43098 --- /dev/null +++ b/.buildkite/image_build/image_build_cpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg VLLM_CPU_AVX512BF16=true \ + --build-arg VLLM_CPU_AVX512VNNI=true \ + --build-arg VLLM_CPU_AMXBF16=true \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . 
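+
+# Hedged example invocation, mirroring how pipeline.yaml drives this script:
+#   .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo $BUILDKITE_COMMIT
+# which produces the CPU test image <registry>/<repo>:<commit>-cpu pushed below.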
+ +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh new file mode 100644 index 000000000000..615298b6555b --- /dev/null +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh new file mode 100644 index 000000000000..699cef2ad60f --- /dev/null +++ b/.buildkite/image_build/image_build_cu118.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg CUDA_VERSION=11.8.0 \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 \ + --target test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh new file mode 100644 index 000000000000..192447ef4577 --- /dev/null +++ b/.buildkite/image_build/image_build_hpu.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then + echo "Image not found, proceeding with build..." 
+else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ + --progress plain \ + https://github.com/vllm-project/vllm-gaudi.git + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 91a9208ebedb..b4646952fa18 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -6,7 +6,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false - label: ':docker: Build CPU image' @@ -14,7 +14,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false retry: @@ -30,7 +30,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: false retry: @@ -46,7 +46,7 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 soft_fail: true retry: @@ -62,7 +62,8 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - .buildkite/build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo + 123 soft_fail: false retry: automatic: From c8707ff0f92086d563a2efeb36a587640ff797f8 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 01:51:31 -0800 Subject: [PATCH 07/24] permission Signed-off-by: Kevin H. 
Luu --- .buildkite/image_build/image_build.sh | 0 .buildkite/image_build/image_build_cpu.sh | 0 .buildkite/image_build/image_build_cpu_arm64.sh | 0 .buildkite/image_build/image_build_cu118.sh | 0 .buildkite/image_build/image_build_hpu.sh | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .buildkite/image_build/image_build.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cpu.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cpu_arm64.sh mode change 100644 => 100755 .buildkite/image_build/image_build_cu118.sh mode change 100644 => 100755 .buildkite/image_build/image_build_hpu.sh diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh old mode 100644 new mode 100755 diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh old mode 100644 new mode 100755 From b9a6433cd91d1f94f1753690074d3fd9f16bdccc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 03:09:13 -0800 Subject: [PATCH 08/24] key change Signed-off-by: Kevin H. Luu --- .buildkite/image_build/image_build.yaml | 10 +- .buildkite/pipeline.yaml | 835 +++++---- buildkite_steps.yaml | 2212 ----------------------- 3 files changed, 446 insertions(+), 2611 deletions(-) delete mode 100644 buildkite_steps.yaml diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 4b2c1da458af..26ec10bc8f8d 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -2,6 +2,7 @@ group: Abuild steps: - label: ":docker: Build image" key: image-build + depends_on: ~ commands: - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -54,8 +55,15 @@ steps: limit: 2 - label: ":docker: Build CPU arm64 image" - key: image-build-cpu-arm64 + key: cpu-arm64-image-build optional: true commands: - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index b4646952fa18..471f24d74b75 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -1,6 +1,9 @@ steps: - group: Abuild steps: + - block: 'Run :docker: Build CPU arm64 image' + depends_on: image-build + key: block--docker--build-cpu-arm64-image - label: ':docker: Build CPU arm64 image' key: image-build-cpu-arm64 agents: @@ -8,6 +11,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: block--docker--build-cpu-arm64-image soft_fail: false - label: ':docker: Build CPU image' key: image-build-cpu @@ -109,47 +113,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export 
VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - group: Basic Correctness steps: - label: Basic Correctness @@ -282,14 +285,16 @@ steps: mount_buildkite_agent: true - group: Compile steps: + - block: Run Fusion E2E (2 GPUs)(B200) + depends_on: image-build + key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: queue: gpu_4_queue commands: - nvidia-smi - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - depends_on: - - image-build + depends_on: block-fusion-e2e-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: @@ -469,6 +474,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (2 GPUs)(B200) + depends_on: image-build + key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: queue: gpu_4_queue @@ -476,8 +484,7 @@ steps: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: - - image-build + depends_on: block-distributed-tests-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: @@ -495,6 +502,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (2 GPUs)(H200) + depends_on: image-build + key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: queue: gpu_4_queue @@ -509,8 +519,7 @@ steps: VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: - - image-build + depends_on: block-distributed-tests-2-gpush200 soft_fail: false plugins: - docker#v5.2.0: @@ -573,6 +582,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Tests (4 
GPUs)(A100) + depends_on: image-build + key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: queue: gpu_4_queue @@ -581,53 +593,51 @@ steps: - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - depends_on: - - image-build + depends_on: block-distributed-tests-4-gpusa100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Distributed Tests (8 GPUs)(H100) agents: queue: gpu_1_queue @@ -640,47 +650,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + 
command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Pipeline + Context Parallelism (4 GPUs)) agents: queue: gpu_4_queue @@ -708,65 +717,68 @@ steps: mount_buildkite_agent: true - group: E2E Integration steps: + - block: Run DeepSeek V2-Lite Accuracy + depends_on: image-build + key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - depends_on: - - image-build + depends_on: block-deepseek-v2-lite-accuracy soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - block: Run Prime-RL Integration (2 GPUs) + depends_on: image-build + key: 
block-prime-rl-integration-2-gpus - label: Prime-RL Integration (2 GPUs) agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/run-prime-rl-test.sh - depends_on: - - image-build + depends_on: block-prime-rl-integration-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -784,58 +796,59 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Qwen3-30B-A3B-FP8-block Accuracy + depends_on: image-build + key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: queue: gpu_4_queue commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - depends_on: - - image-build + depends_on: block-qwen3-30b-a3b-fp8-block-accuracy soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - group: Engine steps: - label: Engine @@ -1202,47 +1215,46 @@ steps: soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - 
valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: Kernels Mamba Test agents: queue: gpu_1_queue @@ -1317,6 +1329,9 @@ steps: mount_buildkite_agent: true - group: LM Eval steps: + - block: Run LM Eval Large Models (4 GPUs)(A100) + depends_on: image-build + key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: queue: gpu_4_queue @@ -1324,53 +1339,54 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - depends_on: - - image-build + depends_on: block-lm-eval-large-models-4-gpusa100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + 
secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - block: Run LM Eval Large Models (4 GPUs)(H100) + depends_on: image-build + key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: queue: gpu_4_queue @@ -1378,52 +1394,50 @@ steps: - export VLLM_USE_DEEP_GEMM=0 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - depends_on: - - image-build + depends_on: block-lm-eval-large-models-4-gpush100 soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: LM Eval Small Models agents: queue: gpu_1_queue @@ -1449,14 +1463,16 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run LM Eval Small Models (B200) + depends_on: image-build + key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: queue: gpu_1_queue commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-lm-eval-small-models-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1636,6 +1652,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run GPT-OSS Eval (B200) + depends_on: image-build + key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: queue: gpu_1_queue @@ -1643,8 +1662,7 @@ steps: - uv pip install --system 
'gpt-oss[eval]==0.0.5' - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - depends_on: - - image-build + depends_on: block-gpt-oss-eval-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1931,6 +1949,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Transformers Nightly Models + depends_on: image-build + key: block-transformers-nightly-models - label: Transformers Nightly Models agents: queue: gpu_1_queue @@ -1945,8 +1966,7 @@ steps: - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - depends_on: - - image-build + depends_on: block-transformers-nightly-models soft_fail: false plugins: - docker#v5.2.0: @@ -1998,6 +2018,9 @@ steps: mount_buildkite_agent: true - group: Models - Language steps: + - block: Run Language Models Test (Extended Generation) + depends_on: image-build + key: block-language-models-test-extended-generation - label: Language Models Test (Extended Generation) agents: queue: gpu_1_queue @@ -2005,8 +2028,7 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - depends_on: - - image-build + depends_on: block-language-models-test-extended-generation soft_fail: false plugins: - docker#v5.2.0: @@ -2024,13 +2046,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (Extended Pooling) + depends_on: image-build + key: block-language-models-test-extended-pooling - label: Language Models Test (Extended Pooling) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/pooling -m 'not core_model' - depends_on: - - image-build + depends_on: block-language-models-test-extended-pooling soft_fail: false plugins: - docker#v5.2.0: @@ -2048,13 +2072,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (MTEB) + depends_on: image-build + key: block-language-models-test-mteb - label: Language Models Test (MTEB) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/pooling_mteb_test - depends_on: - - image-build + depends_on: block-language-models-test-mteb soft_fail: false plugins: - docker#v5.2.0: @@ -2072,13 +2098,15 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Test (PPL) + depends_on: image-build + key: block-language-models-test-ppl - label: Language Models Test (PPL) agents: queue: gpu_1_queue commands: - pytest -v -s models/language/generation_ppl_test - depends_on: - - image-build + depends_on: block-language-models-test-ppl soft_fail: false plugins: - docker#v5.2.0: @@ -2176,13 +2204,15 @@ steps: mount_buildkite_agent: true - group: Models - Multimodal steps: + - block: Run Custom Models + depends_on: image-build + key: block-custom-models - label: Custom Models agents: queue: gpu_1_queue commands: - echo 'Testing custom models...' 
- depends_on: - - image-build + depends_on: block-custom-models soft_fail: false plugins: - docker#v5.2.0: @@ -2225,6 +2255,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 1 + depends_on: image-build + key: block-multi-modal-models-extended-1 - label: Multi-Modal Models (Extended) 1 agents: queue: gpu_1_queue @@ -2232,8 +2265,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-1 soft_fail: false plugins: - docker#v5.2.0: @@ -2251,6 +2283,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 2 + depends_on: image-build + key: block-multi-modal-models-extended-2 - label: Multi-Modal Models (Extended) 2 agents: queue: gpu_1_queue @@ -2258,8 +2293,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-2 soft_fail: false plugins: - docker#v5.2.0: @@ -2277,6 +2311,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Extended) 3 + depends_on: image-build + key: block-multi-modal-models-extended-3 - label: Multi-Modal Models (Extended) 3 agents: queue: gpu_1_queue @@ -2284,8 +2321,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - depends_on: - - image-build + depends_on: block-multi-modal-models-extended-3 soft_fail: false plugins: - docker#v5.2.0: @@ -2628,13 +2664,15 @@ steps: mount_buildkite_agent: true - group: Weight Loading steps: + - block: Run Weight Loading Multiple GPU + depends_on: image-build + key: block-weight-loading-multiple-gpu - label: Weight Loading Multiple GPU agents: queue: gpu_4_queue commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - depends_on: - - image-build + depends_on: block-weight-loading-multiple-gpu soft_fail: false plugins: - docker#v5.2.0: @@ -2652,55 +2690,56 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Weight Loading Multiple GPU - Large Models + depends_on: image-build + key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: queue: gpu_4_queue commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - depends_on: - - image-build + depends_on: block-weight-loading-multiple-gpu---large-models soft_fail: false plugins: - kubernetes: - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: 
/root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: + podSpec: + priorityClassName: ci + containers: + - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + command: + - bash + - -c + - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn + && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt + --tp-size=4 + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: - name: devshm - emptyDir: - medium: Memory + mountPath: /dev/shm - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: NCCL_CUMEM_HOST_ENABLE + value: '0' + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/buildkite_steps.yaml b/buildkite_steps.yaml deleted file mode 100644 index 7b489a91ef01..000000000000 --- a/buildkite_steps.yaml +++ /dev/null @@ -1,2212 +0,0 @@ -steps: -- group: Attention - steps: - - label: V1 attention (B200) - agents: - queue: gpu_1_queue - commands: - - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 attention (H100) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/attention - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Basic Correctness - steps: - - label: Basic Correctness - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Benchmarks - steps: - - label: Benchmarks - agents: - queue: gpu_1_queue - commands: - - bash scripts/run-benchmarks.sh - soft_fail: false - plugins: - - docker#v5.2.0: 
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Benchmarks CLI Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s benchmarks/ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: CUDA - steps: - - label: Cudagraph - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Platform Tests (CUDA) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s cuda/test_cuda_context.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Compile - steps: - - label: Fusion E2E (2 GPUs)(B200) - agents: - queue: gpu_4_queue - commands: - - nvidia-smi - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Fusion and Compile Tests (B200) - agents: - queue: gpu_1_queue - commands: - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k 'True and not +quant_fp8 and not +rms_norm' - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Distributed - steps: - - label: 2 Node Test (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - 
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Comm Ops - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - 
/dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed NixlConnector PD accuracy (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (2 GPUs)(B200) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (2 GPUs)(H200) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/distributed/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py - --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s 
v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (4 GPUs)(A100) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Distributed Tests (8 GPUs)(H100) - agents: - queue: gpu_1_queue - commands: - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py - --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Pipeline + Context Parallelism (4 GPUs)) - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: E2E Integration - steps: - - label: DeepSeek V2-Lite Accuracy - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh - 0.25 200 8010 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Prime-RL 
Integration (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/run-prime-rl-test.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Qwen3-30B-A3B-FP8-block Accuracy - agents: - queue: gpu_4_queue - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh - 0.8 200 8020 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Engine - steps: - - label: Engine - agents: - queue: gpu_1_queue - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - - pytest -v -s tokenization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 e2e + engine - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Entrypoints - steps: - - label: Entrypoints Integration (API Server) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py - --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py - --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - --ignore=entrypoints/openai/tool_parsers/ - - pytest -v -s entrypoints/test_chat_utils.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Integration (LLM) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - - 
pytest -v -s entrypoints/offline_mode - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Integration (Pooling) - agents: - queue: gpu_1_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints Unit Tests - agents: - queue: gpu_1_queue - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai - --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Entrypoints V1 - agents: - queue: gpu_1_queue - commands: - - pytest -v -s v1/entrypoints - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: OpenAI API Correctness - agents: - queue: gpu_1_queue - commands: - - pytest -s entrypoints/openai/correctness/ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Expert Parallelism - steps: - - label: EPLB Algorithm - agents: - queue: gpu_1_queue - commands: - - pytest -v -s distributed/test_eplb_algo.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: EPLB Execution - agents: - queue: gpu_4_queue - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: 
true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Kernels - steps: - - label: Kernels (B200) - agents: - queue: gpu_1_queue - commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Attention Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Core Operation Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels DeepGEMM Test (H100) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - 
propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Mamba Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/mamba - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels MoE Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Kernels Quantization Test %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: LM Eval - steps: - - label: LM Eval Large Models (4 GPUs)(A100) - agents: - queue: gpu_4_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Large Models (4 GPUs)(H100) - agents: - queue: gpu_4_queue - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Small Models - agents: - queue: gpu_1_queue - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: 
true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LM Eval Small Models (B200) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: LoRA - steps: - - label: LoRA %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py - \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py - \ --ignore=lora/test_qwen3moe_tp.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: LoRA TP (Distributed) - agents: - queue: gpu_4_queue - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Miscellaneous - steps: - - label: Async Engine, Inputs, Utils, Worker - agents: - queue: gpu_1_queue - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Async Engine, Inputs, Utils, Worker, Config (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s transformers_utils - - pytest -v -s config - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - 
- NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Examples - agents: - queue: gpu_1_queue - commands: - - pip install tensorizer - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf - --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory - /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m - deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper - --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: GPT-OSS Eval (B200) - agents: - queue: gpu_1_queue - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b - --metric 0.58 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Metrics, Tracing (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' - 'opentelemetry-semantic-conventions-ai>=0.4.1' - - pytest -v -s v1/tracing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Python-only Installation - 
agents: - queue: gpu_1_queue - commands: - - bash standalone_tests/python_only_compile.sh - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Regression - agents: - queue: gpu_1_queue - commands: - - pip install modelscope - - pytest -v -s test_regression.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 Others - agents: - queue: gpu_1_queue - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: V1 Others (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Model Executor - steps: - - label: Model Executor - agents: - queue: gpu_1_queue - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - 
mount_buildkite_agent: true -- group: Models - Basic - steps: - - label: Basic Models Test (Other CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Extra Initialization) %N - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' - \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Initialization) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Basic Models Tests (Other) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/test_transformers.py models/test_registry.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Transformers Nightly Models - agents: - queue: gpu_1_queue - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal - or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR - or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py - --model-type whisper - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Distributed - steps: - - label: Distributed 
Model Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m 'distributed(num_gpus=2)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Language - steps: - - label: Language Models Test (Extended Generation) - agents: - queue: gpu_1_queue - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (Extended Pooling) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (MTEB) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/pooling_mteb_test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Test (PPL) - agents: - queue: gpu_1_queue - commands: - - pytest -v -s models/language/generation_ppl_test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Extra Standard) %N - agents: - queue: gpu_1_queue - commands: - - pip 
freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Hybrid) %N - agents: - queue: gpu_1_queue - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Language Models Tests (Standard) - agents: - queue: gpu_1_queue - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Multimodal - steps: - - label: Custom Models - agents: - queue: gpu_1_queue - commands: - - echo 'Testing custom models...' 
- soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Accuracy Eval (Small Models) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - --tp-size=1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 1 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py - --ignore models/multimodal/processing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 2 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) - and not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Extended) 3 - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) - and not core_model' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Models (Standard) - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py - --ignore models/multimodal/processing - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py - -m core_model - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Multi-Modal Processor - agents: - queue: gpu_1_queue - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Plugins - steps: - - label: Plugin Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py - - pytest -v -s models/test_oot_registration.py - - pytest -v -s plugins/lora_resolvers - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: PyTorch - steps: - - label: PyTorch Compilation Unit Tests - agents: - queue: gpu_1_queue - commands: - - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: PyTorch Fullgraph - agents: - queue: gpu_1_queue - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 - and not Llama-4' - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - 
/fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: PyTorch Fullgraph Smoke Test - agents: - queue: gpu_1_queue - commands: - - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec - pytest -s -v {} \\; - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Pytorch Nightly Dependency Override Check - agents: - queue: gpu_1_queue - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Quantization - steps: - - label: Quantization - agents: - queue: gpu_1_queue - commands: - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Quantized MoE Test (B200) - agents: - queue: gpu_1_queue - commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Samplers - steps: - - label: Samplers Test - agents: - queue: gpu_1_queue - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Tool use - steps: - - label: OpenAI-Compatible Tool Use - agents: - queue: gpu_1_queue - commands: - - pytest -v -s -m 'not cpu_test' tool_use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: 
OpenAI-Compatible Tool Use (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - pytest -v -s -m 'cpu_test' tool_use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Weight Loading - steps: - - label: Weight Loading Multiple GPU - agents: - queue: gpu_4_queue - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - label: Weight Loading Multiple GPU - Large Models - agents: - queue: gpu_4_queue - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:None - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true From ac8206580de8ec1310eb82f46a23708ff61f1771 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 03:21:49 -0800 Subject: [PATCH 09/24] depends_on for build job Signed-off-by: Kevin H. 
Luu --- .buildkite/image_build/image_build.yaml | 6 +++++- .buildkite/pipeline.yaml | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 26ec10bc8f8d..af23621a598c 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -2,7 +2,7 @@ group: Abuild steps: - label: ":docker: Build image" key: image-build - depends_on: ~ + depends_on: [] commands: - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -16,6 +16,7 @@ steps: - label: ":docker: Build CPU image" key: image-build-cpu + depends_on: [] commands: - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -29,6 +30,7 @@ steps: - label: ":docker: Build CUDA 11.8 image" key: image-build-cu118 + optional: true commands: - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT env: @@ -42,6 +44,7 @@ steps: - label: ":docker: Build HPU image" soft_fail: true + depends_on: [] key: image-build-hpu commands: - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT @@ -56,6 +59,7 @@ steps: - label: ":docker: Build CPU arm64 image" key: cpu-arm64-image-build + depends_on: [] optional: true commands: - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 471f24d74b75..97580e597409 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2,10 +2,10 @@ steps: - group: Abuild steps: - block: 'Run :docker: Build CPU arm64 image' - depends_on: image-build + depends_on: [] key: block--docker--build-cpu-arm64-image - label: ':docker: Build CPU arm64 image' - key: image-build-cpu-arm64 + key: cpu-arm64-image-build agents: queue: cpu_queue_premerge_us_east_1 commands: @@ -13,6 +13,14 @@ steps: 123 depends_on: block--docker--build-cpu-arm64-image soft_fail: false + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + env: + DOCKER_BUILDKIT: '1' - label: ':docker: Build CPU image' key: image-build-cpu agents: @@ -20,6 +28,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: false retry: automatic: @@ -29,6 +38,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build CUDA 11.8 image' + depends_on: [] + key: block--docker--build-cuda-11.8-image - label: ':docker: Build CUDA 11.8 image' key: image-build-cu118 agents: @@ -36,6 +48,7 @@ steps: commands: - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: block--docker--build-cuda-11.8-image soft_fail: false retry: automatic: @@ -52,6 +65,7 @@ steps: commands: - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: true retry: automatic: @@ -68,6 +82,7 @@ steps: commands: - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 + depends_on: [] soft_fail: false retry: automatic: From 8b886aa54d00956688c1146c66999dadd17d598c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 13:37:33 -0800 Subject: [PATCH 10/24] Revert "[CI] fix url-encoding behavior in nightly metadata generation (#29787)" This reverts commit 37593deb02423826e9206ff28e77f57a0ff8a0b0. 
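For context on what this revert removes: the url-encoding change percent-encoded the '+' in wheel filenames when emitting index hrefs for S3/CloudFront hosting, while leaving '/' and already-encoded '%2B' untouched. A minimal Python sketch of that behavior (the filename below is only an example, not a real artifact):

```python
# Illustrative only: the encoding applied to wheel paths by the reverted change.
# quote() with safe=":%/" escapes '+' as '%2B' but leaves '/' and any
# pre-encoded '%2B' alone, so hrefs are not double-encoded.
from urllib.parse import quote

path = "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl"
print(quote(path, safe=":%/"))
# -> ../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl
```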
--- .buildkite/scripts/generate-nightly-index.py | 11 +++---- setup.py | 33 ++++++++------------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 90286ad4c6e1..a61f08107647 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -112,12 +112,11 @@ def generate_package_index_and_metadata( relative_path = ( wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename ) - # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B' - # NOTE: this is AWS S3 specific behavior! - file_path_quoted = quote(relative_path.as_posix(), safe=":%/") - href_tags.append(f' {file.filename}
') + href_tags.append( + f' {file.filename}
' + ) file_meta = asdict(file) - file_meta["path"] = file_path_quoted + file_meta["path"] = relative_path.as_posix() metadata.append(file_meta) index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) metadata_str = json.dumps(metadata, indent=2) @@ -186,7 +185,7 @@ def generate_index_and_metadata( "platform_tag": "manylinux2014_aarch64", "variant": "cu129", "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", - "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded + "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL }, ... ] diff --git a/setup.py b/setup.py index 67fbebb1d37b..67226b4447c7 100644 --- a/setup.py +++ b/setup.py @@ -319,17 +319,14 @@ class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" @staticmethod - def extract_precompiled_and_patch_package( - wheel_url_or_path: str, download_filename: str | None - ) -> dict: + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: import tempfile import zipfile temp_dir = None try: if not os.path.isfile(wheel_url_or_path): - # use provided filename first, then derive from URL - wheel_filename = download_filename or wheel_url_or_path.split("/")[-1] + wheel_filename = wheel_url_or_path.split("/")[-1] temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") wheel_path = os.path.join(temp_dir, wheel_filename) print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}") @@ -676,8 +673,7 @@ def _fetch_metadata_for_variant( wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) if wheel_location is not None: wheel_url = wheel_location - download_filename = None - logger.info("Using user-specified precompiled wheel location: %s", wheel_url) + logger.info("Using user-specified precompiled wheel location: {}", wheel_url) else: import platform @@ -690,17 +686,17 @@ def _fetch_metadata_for_variant( precompiled_wheel_utils.get_base_commit_in_main_branch(), ) logger.info( - "Using precompiled wheel commit %s with variant %s", commit, variant + "Using precompiled wheel commit {} with variant {}", commit, variant ) try_default = False - wheels, repo_url, download_filename = None, None, None + wheels, repo_url = None, None try: wheels, repo_url = _fetch_metadata_for_variant(commit, variant) - except Exception: + except Exception as e: logger.warning( - "Failed to fetch precompiled wheel metadata for variant %s", + "Failed to fetch precompiled wheel metadata for variant {}", variant, - exc_info=True, + exc_info=e, ) try_default = True # try outside handler to keep the stacktrace simple if try_default: @@ -721,29 +717,26 @@ def _fetch_metadata_for_variant( "platform_tag": "manylinux1_x86_64", "variant": null, "filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", -"path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" +"path": "../vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" }, ...]""" for wheel in wheels: - # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc) if wheel.get("package_name") == "vllm" and arch in wheel.get( "platform_tag", "" ): - logger.info("Found precompiled wheel metadata: %s", wheel) + logger.info("Found precompiled wheel metadata: {}", wheel) if "path" not in wheel: raise ValueError(f"Wheel metadata missing path: {wheel}") + # TODO: maybe check more compatibility later? 
(python_tag, abi_tag, etc) wheel_url = repo_url + wheel["path"] - download_filename = wheel.get("filename") - logger.info("Using precompiled wheel URL: %s", wheel_url) + logger.info("Using precompiled wheel URL: {}", wheel_url) break else: raise ValueError( f"No precompiled vllm wheel found for architecture {arch} " f"from repo {repo_url}. All available wheels: {wheels}" ) - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( - wheel_url, download_filename - ) + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) From 95b4cdf3f1675300d4574c00e8b1451640b4bc93 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 13:37:43 -0800 Subject: [PATCH 11/24] Revert "[CI] Renovation of nightly wheel build & generation (#29690)" This reverts commit 36db0a35e45f32f7c37f6f1967dc8d6ff301d882. --- .buildkite/generate_index.py | 46 +++ .buildkite/release-pipeline.yaml | 16 +- .buildkite/scripts/generate-nightly-index.py | 368 ------------------ .buildkite/scripts/upload-wheels.sh | 121 +++--- docs/getting_started/installation/cpu.md | 15 +- .../installation/gpu.cuda.inc.md | 73 ++-- docs/getting_started/installation/gpu.md | 2 +- setup.py | 101 ++--- vllm/envs.py | 7 +- 9 files changed, 181 insertions(+), 568 deletions(-) create mode 100644 .buildkite/generate_index.py delete mode 100644 .buildkite/scripts/generate-nightly-index.py diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py new file mode 100644 index 000000000000..bbed80ebe847 --- /dev/null +++ b/.buildkite/generate_index.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import os + +template = """ + + +

+        <h1>Links for vLLM</h1>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+ + +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--wheel", help="The wheel path.", required=True) +args = parser.parse_args() + +filename = os.path.basename(args.wheel) + +with open("index.html", "w") as f: + print(f"Generated index.html for {args.wheel}") + # sync the abi tag with .buildkite/scripts/upload-wheels.sh + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64").replace( + "manylinux1", "manylinux2014" + ) + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64").replace( + "manylinux2014", "manylinux1" + ) + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") + # cloudfront requires escaping the '+' character + f.write( + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) + ) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index fbfc923998f8..38c400ba1faf 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -8,7 +8,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -30,6 +30,19 @@ steps: DOCKER_BUILDKIT: "1" # x86 + CUDA builds + - label: "Build wheel - CUDA 12.8" + depends_on: ~ + id: build-wheel-cuda-12-8 + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + - label: "Build wheel - CUDA 12.9" depends_on: ~ id: build-wheel-cuda-12-9 @@ -96,6 +109,7 @@ steps: - label: "Annotate release workflow" depends_on: - create-multi-arch-manifest + - build-wheel-cuda-12-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py deleted file mode 100644 index a61f08107647..000000000000 --- a/.buildkite/scripts/generate-nightly-index.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# do not complain about line length (for docstring) -# ruff: noqa: E501 - -import argparse -import json -import re -import sys -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any -from urllib.parse import quote - -if not sys.version_info >= (3, 10): - raise RuntimeError("This script requires Python 3.10 or higher.") - -INDEX_HTML_TEMPLATE = """ - - - -{items} - - -""" - - -@dataclass -class WheelFileInfo: - package_name: str - version: str - build_tag: str | None - python_tag: str - abi_tag: str - platform_tag: str - variant: str | None - filename: str - - -def parse_from_filename(file: str) -> WheelFileInfo: - """ - Parse wheel file name to extract metadata. - - The format of wheel names: - {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl - All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). - Example: - vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl - vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl - vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl - vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl - """ - wheel_file_re = re.compile( - r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" - ) - match = wheel_file_re.match(file) - if not match: - raise ValueError(f"Invalid wheel file name: {file}") - - package_name = match.group("package_name") - version = match.group("version") - build_tag = match.group("build_tag") - python_tag = match.group("python_tag") - abi_tag = match.group("abi_tag") - platform_tag = match.group("platform_tag") - - # extract variant from version - variant = None - if "dev" in version: - ver_after_dev = version.split("dev")[-1] - if "." in ver_after_dev: - variant = ver_after_dev.split(".")[-1] - version = version.removesuffix("." + variant) - else: - if "+" in version: - version, variant = version.split("+") - - return WheelFileInfo( - package_name=package_name, - version=version, - build_tag=build_tag, - python_tag=python_tag, - abi_tag=abi_tag, - platform_tag=platform_tag, - variant=variant, - filename=file, - ) - - -def generate_project_list(subdir_names: list[str]) -> str: - """ - Generate project list HTML content linking to each project & variant sub-directory. - """ - href_tags = [] - for name in sorted(subdir_names): - name = name.strip("/").strip(".") - href_tags.append(f' {name}/
') - return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) - - -def generate_package_index_and_metadata( - wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path -) -> tuple[str, str]: - """ - Generate package index HTML content for a specific package, linking to actual wheel files. - """ - href_tags = [] - metadata = [] - for file in sorted(wheel_files, key=lambda x: x.filename): - relative_path = ( - wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename - ) - href_tags.append( - f' {file.filename}
' - ) - file_meta = asdict(file) - file_meta["path"] = relative_path.as_posix() - metadata.append(file_meta) - index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) - metadata_str = json.dumps(metadata, indent=2) - return index_str, metadata_str - - -def generate_index_and_metadata( - whl_files: list[str], - wheel_base_dir: Path, - index_base_dir: Path, - default_variant: str | None = None, - alias_to_default: str | None = None, -): - """ - Generate index for all wheel files. - - Args: - whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). - wheel_base_dir (Path): Base directory for wheel files. - index_base_dir (Path): Base directory to store index files. - default_variant (str | None): The default variant name, if any. - alias_to_default (str | None): Alias variant name for the default variant, if any. - - First, parse all wheel files to extract metadata. - We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). - The index for the default variant (if any) is generated in the root index directory. - - If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index - is purely a copy of the corresponding variant index, with only the links adjusted. - Otherwise, all wheels without variant suffixes are treated as the default variant. - - If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content - as the default variant index, but the links are adjusted accordingly. - - Index directory structure: - index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) - index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories - vllm/ - index.html # package index, pointing to actual files in wheel_base_dir (relative path) - metadata.json # machine-readable metadata for all wheels in this package - cpu/ # cpu variant sub-directory - index.html - vllm/ - index.html - metadata.json - cu129/ # cu129 is actually the alias to default variant - index.html - vllm/ - index.html - metadata.json - cu130/ # cu130 variant sub-directory - index.html - vllm/ - index.html - metadata.json - ... - - metadata.json stores a dump of all wheel files' metadata in a machine-readable format: - [ - { - "package_name": "vllm", - "version": "0.10.2rc2", - "build_tag": null, - "python_tag": "cp38", - "abi_tag": "abi3", - "platform_tag": "manylinux2014_aarch64", - "variant": "cu129", - "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", - "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL - }, - ... - ] - """ - - parsed_files = [parse_from_filename(f) for f in whl_files] - - if not parsed_files: - print("No wheel files found, skipping index generation.") - return - - # Group by variant - variant_to_files: dict[str, list[WheelFileInfo]] = {} - for file in parsed_files: - variant = file.variant or "default" - if variant not in variant_to_files: - variant_to_files[variant] = [] - variant_to_files[variant].append(file) - - print(f"Found variants: {list(variant_to_files.keys())}") - - # sanity check for default variant - if default_variant: - if "default" in variant_to_files: - raise ValueError( - "All wheel files must have variant suffixes when `default_variant` is specified." - ) - if default_variant not in variant_to_files: - raise ValueError( - f"Default variant '{default_variant}' not found among wheel files." 
- ) - - if alias_to_default: - if "default" not in variant_to_files: - # e.g. only some wheels are uploaded to S3 currently - print( - "[WARN] Alias to default variant specified, but no default variant found." - ) - elif alias_to_default in variant_to_files: - raise ValueError( - f"Alias variant name '{alias_to_default}' already exists among wheel files." - ) - else: - variant_to_files[alias_to_default] = variant_to_files["default"].copy() - print(f"Alias variant '{alias_to_default}' created for default variant.") - - # Generate index for each variant - subdir_names = set() - for variant, files in variant_to_files.items(): - if variant == "default": - variant_dir = index_base_dir - else: - variant_dir = index_base_dir / variant - subdir_names.add(variant) - - variant_dir.mkdir(parents=True, exist_ok=True) - - # gather all package names in this variant - packages = set(f.package_name for f in files) - if variant == "default": - # these packages should also appear in the "project list" - # generate after all variants are processed - subdir_names = subdir_names.union(packages) - else: - # generate project list for this variant directly - project_list_str = generate_project_list(sorted(packages)) - with open(variant_dir / "index.html", "w") as f: - f.write(project_list_str) - - for package in packages: - # filter files belonging to this package only - package_files = [f for f in files if f.package_name == package] - package_dir = variant_dir / package - package_dir.mkdir(parents=True, exist_ok=True) - index_str, metadata_str = generate_package_index_and_metadata( - package_files, wheel_base_dir, package_dir - ) - with open(package_dir / "index.html", "w") as f: - f.write(index_str) - with open(package_dir / "metadata.json", "w") as f: - f.write(metadata_str) - - # Generate top-level project list index - project_list_str = generate_project_list(sorted(subdir_names)) - with open(index_base_dir / "index.html", "w") as f: - f.write(project_list_str) - - -if __name__ == "__main__": - """ - Arguments: - --version : version string for the current build (e.g., commit hash) - --current-objects : path to JSON file containing current S3 objects listing in this version directory - --output-dir : directory to store generated index files - --alias-to-default : (optional) alias variant name for the default variant - """ - - parser = argparse.ArgumentParser( - description="Process nightly build wheel files to generate indices." 
- ) - parser.add_argument( - "--version", - type=str, - required=True, - help="Version string for the current build (e.g., commit hash)", - ) - parser.add_argument( - "--current-objects", - type=str, - required=True, - help="Path to JSON file containing current S3 objects listing in this version directory", - ) - parser.add_argument( - "--output-dir", - type=str, - required=True, - help="Directory to store generated index files", - ) - parser.add_argument( - "--alias-to-default", - type=str, - default=None, - help="Alias variant name for the default variant", - ) - - args = parser.parse_args() - - version = args.version - if "/" in version or "\\" in version: - raise ValueError("Version string must not contain slashes.") - current_objects_path = Path(args.current_objects) - output_dir = Path(args.output_dir) - if not output_dir.exists(): - output_dir.mkdir(parents=True, exist_ok=True) - - # Read current objects JSON - with open(current_objects_path) as f: - current_objects: dict[str, list[dict[str, Any]]] = json.load(f) - - # current_objects looks like from list_objects_v2 S3 API: - """ - "Contents": [ - { - "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl", - "LastModified": "2025-11-28T14:00:32+00:00", - "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"", - "ChecksumAlgorithm": [ - "CRC64NVME" - ], - "ChecksumType": "FULL_OBJECT", - "Size": 435649349, - "StorageClass": "STANDARD" - }, - ... - ] - """ - - # Extract wheel file keys - wheel_files = [] - for item in current_objects.get("Contents", []): - key: str = item["Key"] - if key.endswith(".whl"): - wheel_files.append(key.split("/")[-1]) # only the filename is used - - print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - - # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ - # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version - index_base_dir = Path(output_dir) - - generate_index_and_metadata( - whl_files=wheel_files, - wheel_base_dir=wheel_base_dir, - index_base_dir=index_base_dir, - default_variant=None, - alias_to_default=args.alias_to_default, - ) - print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 05accb9cf16d..945c5e48c009 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,28 +2,6 @@ set -ex -# ======== part 0: setup ======== - -BUCKET="vllm-wheels" -INDICES_OUTPUT_DIR="indices" -DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py -PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3 -SUBPATH=$BUILDKITE_COMMIT -S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" - -# detect if python3.10+ is available -has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,10) else 0)") -if [[ "$has_new_python" -eq 0 ]]; then - # use new python from docker - docker pull python:3-slim - PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" -fi - -echo "Using python interpreter: $PYTHON" -echo "Python version: $($PYTHON --version)" - -# ========= part 1: collect, rename & upload the wheel ========== - # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -32,69 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" exit 1 fi + 
+# Get the single wheel file wheel="${wheel_files[0]}" -# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 -# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels -manylinux_version="manylinux_2_31" +# Detect architecture and rename 'linux' to appropriate manylinux version +arch=$(uname -m) +if [[ $arch == "x86_64" ]]; then + manylinux_version="manylinux1" +elif [[ $arch == "aarch64" ]]; then + manylinux_version="manylinux2014" +else + echo "Warning: Unknown architecture $arch, using manylinux1 as default" + manylinux_version="manylinux1" +fi # Rename 'linux' to the appropriate manylinux version in the wheel filename -if [[ "$wheel" != *"linux"* ]]; then - echo "Error: Wheel filename does not contain 'linux': $wheel" - exit 1 -fi new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" -echo "Renamed wheel to: $wheel" # Extract the version from the wheel version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) -echo "Version in wheel: $version" -pure_version="${version%%+*}" -echo "Pure version (without variant): $pure_version" - -# copy wheel to its own bucket -aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" +echo "Version: $version" + +normal_wheel="$wheel" # Save the original wheel filename + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + suffix="${version##*.}" + if [[ $suffix == cu* ]]; then + new_version="1.0.0.dev+${suffix}" + else + new_version="1.0.0.dev" + fi + new_wheel="${wheel/$version/$new_version}" + # use cp to keep both files in the artifacts directory + cp -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi -# ========= part 2: generate and upload indices ========== -# generate indices for all existing wheels in the commit directory -# this script might be run multiple times if there are multiple variants being built -# so we need to guarantee there is little chance for "TOCTOU" issues -# i.e., one process is generating indices while another is uploading a new wheel -# so we need to ensure no time-consuming operations happen below +# Upload the wheel to S3 +python3 .buildkite/generate_index.py --wheel "$normal_wheel" -# list all wheels in the commit directory -echo "Existing wheels on S3:" -aws s3 ls "$S3_COMMIT_PREFIX" -obj_json="objects.json" -aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" -mkdir -p "$INDICES_OUTPUT_DIR" +# generate index for this commit +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -# call script to generate indicies for all existing wheels -# this indices have relative paths that could work as long as it is next to the wheel directory in s3 -# i.e., the wheels are always in s3://vllm-wheels// -# and indices can be placed in //, or /nightly/, or // -if [[ ! 
-z "$DEFAULT_VARIANT_ALIAS" ]]; then - alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" +if [[ $normal_wheel == *"cu129"* ]]; then + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 + aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" + aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" else - alias_arg="" + echo "Skipping index files for non-cu129 wheels" fi -$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg - -# copy indices to // unconditionally -echo "Uploading indices to $S3_COMMIT_PREFIX" -aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" +# generate index for nightly +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -# copy to /nightly/ only if it is on the main branch and not a PR -if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then - echo "Uploading indices to overwrite /nightly/" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" +if [[ $normal_wheel == *"cu129"* ]]; then + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 + aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +else + echo "Skipping index files for non-cu129 wheels" fi -# copy to // only if it does not have "dev" in the version -if [[ "$version" != *"dev"* ]]; then - echo "Uploading indices to overwrite /$pure_version/" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" -fi +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 18dc6d19434b..d1beab7855b1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -46,23 +46,10 @@ vLLM is a Python library that supports the following CPU variants. Select your C ### Pre-built wheels -Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels). - -When specifying the index URL, please make sure to use the `cpu` variant subdirectory. -For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`. +Currently, there are no pre-built CPU wheels. ### Build wheel from source -#### Set up using Python-only build (without compilation) {#python-only-build} - -Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with: - -```bash -VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable . -``` - -#### Full build (with compilation) {#full-build} - === "Intel/AMD x86" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source" diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index ad26672f8092..601d3659af88 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -26,50 +26,43 @@ uv pip install vllm --torch-backend=auto ??? console "pip" ```bash - # Install vLLM with CUDA 12.9. - pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129 + # Install vLLM with CUDA 12.8. 
+ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 ``` -We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first. +We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. -As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```bash -# Install vLLM with a specific CUDA version (e.g., 13.0). +# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') -export CUDA_VERSION=130 # or other -uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} +export CUDA_VERSION=118 # or 126 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} ``` #### Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on . There are multiple indices that could be used: - -* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9. -* `https://wheels.vllm.ai/nightly/`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency. - -To install from nightly index, run: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. ```bash uv pip install -U vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed + --extra-index-url https://wheels.vllm.ai/nightly ``` -!!! warning "`pip` caveat" - - Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). - - If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page). - +??? console "pip" ```bash - pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!) - pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit + pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly ``` + `--pre` is required for `pip` to consider pre-released versions. + ##### Install specific revisions If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: @@ -78,13 +71,33 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch uv pip install vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed + --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` +The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. + +??? note "pip" + If you want to access the wheels for previous commits (e.g. to bisect the behavior change, + performance regression), due to the limitation of `pip`, you have to specify the full URL of the + wheel file by embedding the commit hash in the URL: + + ```bash + export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + ``` + + Note that the wheels are built with Python 3.8 ABI (see [PEP + 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible + with Python 3.8 and later**. 
The version string in the wheel file name (`1.0.0.dev`) is just a + placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in + the wheel metadata (the wheels listed in the extra index url have correct versions). Although we + don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the + wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -#### Set up using Python-only build (without compilation) {#python-only-build} +#### Set up using Python-only build (without compilation) If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: @@ -108,24 +121,18 @@ This command will do the following: In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. ```bash -export VLLM_PRECOMPILED_WHEEL_COMIMT=$(git rev-parse HEAD~1) # or earlier commit on main -export VLLM_USE_PRECOMPILED=1 +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl uv pip install --editable . ``` -There are more environment variables to control the behavior of Python-only build: - -* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped. -* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch. -* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index. - You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code). !!! note There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel. -#### Full build (with compilation) {#full-build} +#### Full build (with compilation) If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index fb750f449985..bc7508b29475 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. 
Select your G --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" -### Pre-built wheels {#pre-built-wheels} +### Pre-built wheels === "NVIDIA CUDA" diff --git a/setup.py b/setup.py index 67226b4447c7..0022e7fe0bf3 100644 --- a/setup.py +++ b/setup.py @@ -310,6 +310,9 @@ def run(self): class precompiled_build_ext(build_ext): """Disables extension building when using precompiled binaries.""" + def run(self) -> None: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + def build_extensions(self) -> None: print("Skipping build_ext: using precompiled extensions.") return @@ -645,97 +648,37 @@ def _read_requirements(filename: str) -> list[str]: ] } - -def _fetch_metadata_for_variant( - commit: str, variant: str | None -) -> tuple[list[dict], str]: - variant_dir = f"{variant}/" if variant is not None else "" - repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/" - meta_url = repo_url + "metadata.json" - logger.info("Trying to fetch metadata from {}", meta_url) - from urllib.request import urlopen - - with urlopen(meta_url) as resp: - # urlopen raises HTTPError on unexpected status code - wheels = json.loads(resp.read().decode("utf-8")) - return wheels, repo_url - - # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: - # Attempts: - # 1. user-specified wheel location (can be either local or remote, via - # VLLM_PRECOMPILED_WHEEL_LOCATION) - # 2. user-specified variant from nightly repo (current main commit via - # VLLM_PRECOMPILED_WHEEL_VARIANT) - # 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo - # 4. the default variant from nightly repo (current main commit) + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) if wheel_location is not None: wheel_url = wheel_location - logger.info("Using user-specified precompiled wheel location: {}", wheel_url) else: import platform arch = platform.machine() - # try to fetch the wheel metadata from the nightly wheel repo - main_variant = envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") - variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant) - commit = os.getenv( - "VLLM_PRECOMPILED_WHEEL_COMMIT", - precompiled_wheel_utils.get_base_commit_in_main_branch(), - ) - logger.info( - "Using precompiled wheel commit {} with variant {}", commit, variant + if arch == "x86_64": + wheel_tag = "manylinux1_x86_64" + elif arch == "aarch64": + wheel_tag = "manylinux2014_aarch64" + else: + raise ValueError(f"Unsupported architecture: {arch}") + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" + nightly_wheel_url = ( + f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" ) - try_default = False - wheels, repo_url = None, None + from urllib.request import urlopen + try: - wheels, repo_url = _fetch_metadata_for_variant(commit, variant) + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = nightly_wheel_url except Exception as e: - logger.warning( - "Failed to fetch precompiled wheel metadata for variant {}", - variant, - exc_info=e, - ) - try_default = True # try outside handler to keep the stacktrace simple - if try_default: - logger.info("Trying the default variant") - wheels, repo_url = _fetch_metadata_for_variant(commit, None) - # if this also fails, then we have nothing more 
to try / cache - assert wheels is not None and repo_url is not None, ( - "Failed to fetch precompiled wheel metadata" - ) - # The metadata.json has the following format: - # see .buildkite/scripts/generate-nightly-index.py for details - """[{ -"package_name": "vllm", -"version": "0.11.2.dev278+gdbc3d9991", -"build_tag": null, -"python_tag": "cp38", -"abi_tag": "abi3", -"platform_tag": "manylinux1_x86_64", -"variant": null, -"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", -"path": "../vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" -}, -...]""" - for wheel in wheels: - if wheel.get("package_name") == "vllm" and arch in wheel.get( - "platform_tag", "" - ): - logger.info("Found precompiled wheel metadata: {}", wheel) - if "path" not in wheel: - raise ValueError(f"Wheel metadata missing path: {wheel}") - # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc) - wheel_url = repo_url + wheel["path"] - logger.info("Using precompiled wheel URL: {}", wheel_url) - break - else: - raise ValueError( - f"No precompiled vllm wheel found for architecture {arch} " - f"from repo {repo_url}. All available wheels: {wheels}" - ) + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = nightly_wheel_url + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) diff --git a/vllm/envs.py b/vllm/envs.py index d0912863e644..46f1aa3222be 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" - VLLM_MAIN_CUDA_VERSION: str = "12.9" + VLLM_MAIN_CUDA_VERSION: str = "12.8" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -445,9 +445,10 @@ def get_vllm_port() -> int | None: # Target device of vLLM, supporting [cuda (by default), # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), - # Main CUDA version of vLLM. This follows PyTorch but can be overridden. + # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], + # 12.8 is the default. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() - or "12.9", + or "12.8", # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), From d4d268cb707b0ecfacaf52dd5a937f810fe6ec98 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 15:24:46 -0800 Subject: [PATCH 12/24] sync Signed-off-by: Kevin H. 
Luu
---
 .buildkite/test_areas/distributed.yaml       |  9 ++++++---
 .buildkite/test_areas/engine.yaml            |  5 +----
 .buildkite/test_areas/misc.yaml              |  4 +++-
 .buildkite/test_areas/models_basic.yaml      |  5 +++--
 .buildkite/test_areas/models_multimodal.yaml | 12 +++++++++++-
 .buildkite/test_areas/quantization.yaml      |  1 +
 6 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index e6ae13b8156d..1328ecec1b16 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -39,6 +39,7 @@ steps:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -84,6 +85,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -101,6 +103,7 @@ steps:
   - popd
 
 - label: Distributed Tests (8 GPUs)(H100)
+  optional: true
   timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
@@ -138,11 +141,11 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index e4d12f3453f1..a028e0e4af4c 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -3,19 +3,16 @@ depends_on:
 - image-build
 steps:
 - label: Engine
-  timeout_in_minutes: 40
+  timeout_in_minutes: 15
   source_file_dependencies:
   - vllm/
   - tests/engine
-  - tests/tokenization
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
   - tests/test_vllm_port
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization
 
 - label: V1 e2e + engine
   timeout_in_minutes: 45
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index ec719825b377..e4182005bb45 100644
---
a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -111,13 +111,14 @@ steps: - label: Async Engine, Inputs, Utils, Worker, Config (CPU) depends_on: ~ - timeout_in_minutes: 10 + timeout_in_minutes: 20 source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ - tests/transformers_utils - tests/config no_gpu: true @@ -126,6 +127,7 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index ceddf841f87a..9b7f574a95c3 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -52,11 +52,12 @@ steps: - label: Transformers Nightly Models working_dir: "/vllm-workspace/" optional: true + soft_fail: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 68e5e485c316..fc24068c20a4 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -13,6 +13,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Processor # 44min timeout_in_minutes: 60 source_file_dependencies: @@ -20,7 +30,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pytest -v -s models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Accuracy Eval (Small Models) # 50min timeout_in_minutes: 70 diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index cff4a7189806..02a836b90bdf 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -17,6 +17,7 @@ steps: # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: Quantized MoE Test (B200) From 1ad5b4dff3c10ca70ea5a5394b3faed20c51e5cc Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 15:32:05 -0800 Subject: [PATCH 13/24] sync Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 1328ecec1b16..30a1002b701b 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -39,7 +39,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py From c1629aa6c593dbdfa743324864192939c4cf7045 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 16:54:03 -0800 Subject: [PATCH 14/24] fix long command Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 1157 ++++++++++++-------- .buildkite/test_areas/lora.yaml | 11 +- .buildkite/test_areas/models_basic.yaml | 5 +- .buildkite/test_areas/models_language.yaml | 9 +- 4 files changed, 691 insertions(+), 491 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index 97580e597409..cb6abecd38c9 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -40,7 +40,7 @@ steps: DOCKER_BUILDKIT: '1' - block: 'Run :docker: Build CUDA 11.8 image' depends_on: [] - key: block--docker--build-cuda-11.8-image + key: block--docker--build-cuda-11-8-image - label: ':docker: Build CUDA 11.8 image' key: image-build-cu118 agents: @@ -48,7 +48,7 @@ steps: commands: - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: block--docker--build-cuda-11.8-image + depends_on: block--docker--build-cuda-11-8-image soft_fail: false retry: automatic: @@ -96,32 +96,37 @@ steps: steps: - label: V1 attention (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: V1 attention (H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/attention depends_on: - image-build @@ -130,14 +135,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -174,6 +180,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py @@ -183,10 +192,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + 
propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -203,16 +211,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite - bash scripts/run-benchmarks.sh depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -227,16 +237,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s benchmarks/ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -253,6 +265,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - pytest -v -s v1/cudagraph/test_cudagraph_mode.py depends_on: @@ -260,10 +275,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -278,16 +292,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s cuda/test_cuda_context.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -305,116 +321,111 @@ steps: key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: - queue: gpu_4_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - pytest -v -s tests/compile/distributed/test_fusions_e2e.py depends_on: block-fusion-e2e-2-gpusb200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Fusion and Compile Tests (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || 
true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k 'True and not +quant_fp8 and not +rms_norm' + -k "True and not +quant_fp8 and not +rms_norm" - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Distributed steps: - label: 2 Node Test (4 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node + test passed" - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' + distributed/test_node_count.py | grep "Node count test passed" - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node - test passed' + --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node + test passed" - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep 'Node count test passed' + distributed/test_node_count.py | grep "Node count test passed" - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code depends_on: - image-build soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - label: Distributed (2 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep 'Same node test passed' + | grep "Same node test passed" - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep 'Same node test passed' + distributed/test_same_node.py | grep "Same node test passed" - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -423,10 +434,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -441,6 +451,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py @@ -450,10 +463,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -468,6 +480,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh depends_on: @@ -475,10 +490,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -494,8 +508,11 @@ steps: key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: - queue: gpu_4_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py @@ -503,32 +520,35 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - 
HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - block: Run Distributed Tests (2 GPUs)(H200) depends_on: image-build key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: - queue: gpu_4_queue + queue: skylab-h200 commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/distributed/test_sequence_parallel.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py + -k "not Llama-4" + - VLLM_TEST_CLEAN_GPU_MEMORY=1pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py @@ -538,24 +558,27 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true gpus: all environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Distributed Tests (4 GPUs) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -565,6 +588,7 @@ steps: - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py @@ -583,10 +607,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -602,11 +625,14 @@ steps: key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - 
cd /vllm-workspace/tests - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - pytest -v -s -x lora/test_mixtral.py depends_on: block-distributed-tests-4-gpusa100 soft_fail: false @@ -615,14 +641,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -653,28 +680,34 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Distributed Tests (8 GPUs)(H100) + depends_on: image-build + key: block-distributed-tests-8-gpush100 - label: Distributed Tests (8 GPUs)(H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export NCCL_CUMEM_HOST_ENABLE=0 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - depends_on: - - image-build + depends_on: block-distributed-tests-8-gpush100 soft_fail: false plugins: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -709,6 +742,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py depends_on: @@ -716,10 +752,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -737,8 +772,11 @@ steps: key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: - queue: gpu_4_queue + queue: 
mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 depends_on: block-deepseek-v2-lite-accuracy @@ -747,14 +785,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -792,15 +831,17 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/run-prime-rl-test.sh depends_on: block-prime-rl-integration-2-gpus soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -816,8 +857,11 @@ steps: key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: - queue: gpu_4_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 depends_on: block-qwen3-30b-a3b-fp8-block-accuracy @@ -826,14 +870,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -870,17 +915,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - - pytest -v -s tokenization depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - 
VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -895,6 +941,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/e2e - pytest -v -s v1/engine depends_on: @@ -902,10 +951,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -922,6 +970,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py @@ -934,10 +985,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -952,6 +1002,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py @@ -961,10 +1014,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -979,6 +1031,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling depends_on: @@ -986,10 +1041,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1004,6 +1058,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling @@ -1012,10 +1069,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ 
-1030,16 +1086,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s v1/entrypoints depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1054,16 +1112,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s entrypoints/openai/correctness/ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1080,16 +1140,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_algo.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1104,6 +1166,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py depends_on: @@ -1111,10 +1176,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1129,16 +1193,19 @@ steps: steps: - label: Kernels (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - nvidia-smi - python3 examples/offline_inference/basic/chat.py - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k "not num_heads2" - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k "fp8" - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -1155,34 +1222,35 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Kernels Attention Test %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1193,20 +1261,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Kernels Core Operation Test agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/core kernels/test_top_k_per_row.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1219,8 +1290,11 @@ steps: mount_buildkite_agent: true - label: Kernels DeepGEMM Test (H100) agents: - queue: gpu_1_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py @@ -1232,14 +1306,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1274,16 +1349,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/mamba depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - 
propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1298,16 +1375,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1318,20 +1397,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Kernels Quantization Test %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1342,6 +1424,7 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - group: LM Eval steps: - block: Run LM Eval Large Models (4 GPUs)(A100) @@ -1349,8 +1432,11 @@ steps: key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 @@ -1361,14 +1447,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1404,8 +1491,11 @@ steps: key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: - queue: gpu_4_queue + queue: mithril-h100-pool commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - export VLLM_USE_DEEP_GEMM=0 - pytest -s -v test_lm_eval_correctness.py 
--config-list-file=configs/models-large-hopper.txt --tp-size=4 @@ -1415,14 +1505,15 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_USE_DEEP_GEMM=0 - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1457,6 +1548,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 depends_on: @@ -1464,10 +1558,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1483,34 +1576,39 @@ steps: key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 depends_on: block-lm-eval-small-models-b200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: LoRA steps: - label: LoRA %N agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py @@ -1520,10 +1618,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1534,10 +1631,14 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache 
mount_buildkite_agent: true + parallelism: 4 - label: LoRA TP (Distributed) agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py @@ -1549,10 +1650,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1569,17 +1669,19 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s -m 'not cpu_test' multimodal + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "not cpu_test" multimodal - pytest -v -s utils_ depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1594,10 +1696,14 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s -m "cpu_test" multimodal + - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config depends_on: @@ -1605,10 +1711,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1623,6 +1728,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/examples - pip install tensorizer - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf @@ -1653,10 +1761,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1672,45 +1779,49 @@ steps: key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: - queue: gpu_1_queue + queue: B200 commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ + - uv pip install --system "gpt-oss[eval]==0.0.5" - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 depends_on: block-gpt-oss-eval-b200 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: 
true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - label: Metrics, Tracing (2 GPUs) agents: queue: gpu_4_queue commands: - - pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' 'opentelemetry-exporter-otlp>=1.26.0' - 'opentelemetry-semantic-conventions-ai>=0.4.1' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" + "opentelemetry-semantic-conventions-ai>=0.4.1" - pytest -v -s v1/tracing depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1725,16 +1836,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash standalone_tests/python_only_compile.sh depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1749,6 +1862,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install modelscope - pytest -v -s test_regression.py depends_on: @@ -1756,10 +1872,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1774,16 +1889,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s -m "not cpu_test" v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s -m "not cpu_test" v1/kv_connector/unit + - pytest -v -s -m "not cpu_test" v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py - pytest -v -s v1/test_outputs.py @@ -1794,10 +1912,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1812,20 +1929,22 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - pytest -v -s -m 'cpu_test' v1/core + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "cpu_test" v1/core - pytest -v -s v1/structured_output - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics + - pytest -v -s -m "cpu_test" v1/kv_connector/unit + - pytest -v -s -m "cpu_test" v1/metrics depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1842,6 +1961,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor @@ -1851,10 +1973,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1871,16 +1992,18 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_utils.py models/test_vision.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1895,17 +2018,19 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s models/test_initialization.py \ -k 'not test_can_initialize_small_subset' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s models/test_initialization.py \ -k "not test_can_initialize_small_subset" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1916,20 +2041,23 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Basic Models Tests (Initialization) agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset depends_on: - 
image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1944,16 +2072,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/test_transformers.py models/test_registry.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -1971,24 +2101,25 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal - or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR - or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper depends_on: block-transformers-nightly-models - soft_fail: false + soft_fail: true plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2005,22 +2136,24 @@ steps: agents: queue: gpu_4_queue commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - pytest models/test_transformers.py -v -s -m "distributed(num_gpus=2)" + - pytest models/language -v -s -m "distributed(num_gpus=2)" + - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m 'distributed(num_gpus=2)' + -v -s -m "distributed(num_gpus=2)" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2040,17 +2173,19 @@ steps: agents: queue: gpu_1_queue commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" + - pytest -v -s models/language/generation -m "(not core_model) and (not hybrid_model)" depends_on: block-language-models-test-extended-generation soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2068,15 +2203,17 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s models/language/pooling -m "not core_model" depends_on: block-language-models-test-extended-pooling soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2094,15 +2231,17 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/language/pooling_mteb_test depends_on: block-language-models-test-mteb soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2120,15 +2259,17 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s models/language/generation_ppl_test depends_on: block-language-models-test-ppl soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2143,18 +2284,20 @@ steps: agents: queue: gpu_1_queue commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip freeze | 
grep -E "torch" + - pytest -v -s models/language -m "core_model and slow_test" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2165,12 +2308,16 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Language Models Tests (Hybrid) %N agents: queue: gpu_1_queue commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - pytest -v -s models/language/generation \ -m hybrid_model \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB depends_on: @@ -2178,10 +2325,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2192,21 +2338,24 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + parallelism: 2 - label: Language Models Tests (Standard) agents: queue: gpu_1_queue commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip freeze | grep -E "torch" + - pytest -v -s models/language -m "core_model and (not slow_test)" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2226,15 +2375,17 @@ steps: agents: queue: gpu_1_queue commands: - - echo 'Testing custom models...' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - echo "Testing custom models..." 
depends_on: block-custom-models soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2249,6 +2400,9 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/.buildkite/lm-eval-harness - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 depends_on: @@ -2256,10 +2410,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2277,17 +2430,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal -m "not core_model" --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing depends_on: block-multi-modal-models-extended-1 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2305,17 +2460,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) - and not core_model' + - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=0) + and not core_model" depends_on: block-multi-modal-models-extended-2 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2333,17 +2490,19 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) - and not core_model' + - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=1) + and not core_model" depends_on: block-multi-modal-models-extended-3 soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - 
NCCL_CUMEM_HOST_ENABLE=0 @@ -2358,8 +2517,11 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' + - pip freeze | grep -E "torch" - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py @@ -2369,10 +2531,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2387,17 +2548,46 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pytest -v -s models/multimodal/processing/test_tensor_schema.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true + environment: + - VLLM_USAGE_SOURCE=ci-test + - NCCL_CUMEM_HOST_ENABLE=0 + - HF_HOME=/fsx/hf_cache + - HF_TOKEN + - CODECOV_TOKEN + volumes: + - /dev/shm:/dev/shm + - /fsx/hf_cache:/fsx/hf_cache + mount_buildkite_agent: true + - label: Multi-Modal Processor Test (CPU) + agents: + queue: cpu_queue_premerge_us_east_1 + commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + depends_on: + - image-build + soft_fail: false + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2414,6 +2604,9 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pip install -e ./plugins/vllm_add_dummy_platform - pytest -v -s plugins_tests/test_platform_plugins.py - pip uninstall vllm_add_dummy_platform -y @@ -2434,10 +2627,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2454,16 +2646,18 @@ steps: agents: queue: gpu_1_queue commands: - - find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\; + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\; depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2478,18 +2672,20 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - - pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 - and not Llama-4' + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" + - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 + and not Llama-4" depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2504,17 +2700,19 @@ steps: agents: queue: gpu_1_queue commands: - - find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec pytest -s -v {} \\; depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2529,16 +2727,18 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash standalone_tests/pytorch_nightly_dependency.sh depends_on: - image-build soft_fail: true plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2555,17 +2755,20 @@ steps: agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2578,34 +2781,39 @@ steps: mount_buildkite_agent: true - label: Quantized MoE Test (B200) agents: - queue: gpu_1_queue + queue: B200 commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/ 
- pytest -s -v tests/quantization/test_blackwell_moe.py depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache + - HF_HOME=/benchmark-hf-cache - HF_TOKEN - CODECOV_TOKEN volumes: - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true + - /data/benchmark-hf-cache:/benchmark-hf-cache + - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Samplers steps: - label: Samplers Test agents: queue: gpu_1_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers depends_on: @@ -2613,10 +2821,9 @@ steps: soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2633,16 +2840,18 @@ steps: agents: queue: gpu_1_queue commands: - - pytest -v -s -m 'not cpu_test' tool_use + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "not cpu_test" tool_use depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2657,16 +2866,18 @@ steps: agents: queue: cpu_queue_premerge_us_east_1 commands: - - pytest -v -s -m 'cpu_test' tool_use + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests + - pytest -v -s -m "cpu_test" tool_use depends_on: - image-build soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2686,15 +2897,17 @@ steps: agents: queue: gpu_4_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt depends_on: block-weight-loading-multiple-gpu soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 - always_pull: true - propagate_environment: true - gpus: all + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + always-pull: true + propagate-environment: true environment: - VLLM_USAGE_SOURCE=ci-test - NCCL_CUMEM_HOST_ENABLE=0 @@ -2710,8 +2923,11 @@ steps: key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: - queue: gpu_4_queue + queue: a100_queue commands: + - (command nvidia-smi || true) + - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 + - cd /vllm-workspace/tests - bash 
weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt depends_on: block-weight-loading-multiple-gpu---large-models soft_fail: false @@ -2720,14 +2936,15 @@ steps: podSpec: priorityClassName: ci containers: - - image: public.ecr.aws/q9t5s3a7:vllm-ci-test-repo:123 + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 command: - bash - -c - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && export VLLM_WORKER_MULTIPROC_METHOD=spawn - && pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 + && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi + || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness + && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py + --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 45e3af03591d..809b4138f44b 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -8,16 +8,7 @@ steps: - vllm/lora - tests/lora commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py parallelism: 4 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 9b7f574a95c3..39a5d51c4883 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -24,10 +24,7 @@ steps: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above # test.) Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Basic Models Tests (Other) diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index fdf78dc48746..f70192c4ebc0 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -27,9 +27,7 @@ steps: # Shard slow subset of standard language models tests. 
Only run when model # source is modified, or when specified test files are modified - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Language Models Tests (Hybrid) %N @@ -45,10 +43,7 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - label: Language Models Test (Extended Generation) # 80min From 950643d974785b0a25b874f2adb1d1242e253979 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 1 Dec 2025 18:48:18 -0800 Subject: [PATCH 15/24] debug Signed-off-by: Kevin H. Luu --- .buildkite/pipeline.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index cb6abecd38c9..c387bff0a317 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -2649,13 +2649,13 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\; + - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; depends_on: - - image-build + - [] soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 always-pull: true propagate-environment: true environment: @@ -2704,13 +2704,13 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec - pytest -s -v {} \\; + pytest -s -v {} \\\\; depends_on: - - image-build + - [] soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 always-pull: true propagate-environment: true environment: From 98a38d1a167874b9c9368fbd66c0462f57387532 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 2 Dec 2025 01:33:58 -0800 Subject: [PATCH 16/24] slashes Signed-off-by: Kevin H. 
Luu --- .buildkite/pipeline.yaml | 547 ++++++++++++++++------------- .buildkite/test_areas/pytorch.yaml | 4 +- 2 files changed, 309 insertions(+), 242 deletions(-) diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index c387bff0a317..ae55531ecf13 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -21,6 +21,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build CPU image' + depends_on: [] + key: block--docker--build-cpu-image - label: ':docker: Build CPU image' key: image-build-cpu agents: @@ -28,7 +31,7 @@ steps: commands: - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-cpu-image soft_fail: false retry: automatic: @@ -58,6 +61,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build HPU image' + depends_on: [] + key: block--docker--build-hpu-image - label: ':docker: Build HPU image' key: image-build-hpu agents: @@ -65,7 +71,7 @@ steps: commands: - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-hpu-image soft_fail: true retry: automatic: @@ -75,6 +81,9 @@ steps: limit: 2 env: DOCKER_BUILDKIT: '1' + - block: 'Run :docker: Build image' + depends_on: [] + key: block--docker--build-image - label: ':docker: Build image' key: image-build agents: @@ -82,7 +91,7 @@ steps: commands: - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo 123 - depends_on: [] + depends_on: block--docker--build-image soft_fail: false retry: automatic: @@ -94,6 +103,9 @@ steps: DOCKER_BUILDKIT: '1' - group: Attention steps: + - block: Run V1 attention (B200) + depends_on: [] + key: block-v1-attention-b200 - label: V1 attention (B200) agents: queue: B200 @@ -102,8 +114,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - depends_on: - - image-build + depends_on: block-v1-attention-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -120,6 +131,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run V1 attention (H100) + depends_on: [] + key: block-v1-attention-h100 - label: V1 attention (H100) agents: queue: mithril-h100-pool @@ -128,22 +142,13 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s v1/attention - depends_on: - - image-build + depends_on: block-v1-attention-h100 soft_fail: false plugins: - kubernetes: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -176,6 +181,9 @@ steps: type: DirectoryOrCreate - group: Basic Correctness steps: + - block: Run Basic Correctness + depends_on: [] + key: block-basic-correctness - label: Basic Correctness agents: queue: gpu_1_queue @@ -187,8 +195,7 @@ steps: - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s 
basic_correctness/test_cpu_offload.py - depends_on: - - image-build + depends_on: block-basic-correctness soft_fail: false plugins: - docker#v5.2.0: @@ -207,6 +214,9 @@ steps: mount_buildkite_agent: true - group: Benchmarks steps: + - block: Run Benchmarks + depends_on: [] + key: block-benchmarks - label: Benchmarks agents: queue: gpu_1_queue @@ -215,8 +225,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/.buildkite - bash scripts/run-benchmarks.sh - depends_on: - - image-build + depends_on: block-benchmarks soft_fail: false plugins: - docker#v5.2.0: @@ -233,6 +242,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Benchmarks CLI Test + depends_on: [] + key: block-benchmarks-cli-test - label: Benchmarks CLI Test agents: queue: gpu_1_queue @@ -241,8 +253,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s benchmarks/ - depends_on: - - image-build + depends_on: block-benchmarks-cli-test soft_fail: false plugins: - docker#v5.2.0: @@ -261,6 +272,9 @@ steps: mount_buildkite_agent: true - group: CUDA steps: + - block: Run Cudagraph + depends_on: [] + key: block-cudagraph - label: Cudagraph agents: queue: gpu_1_queue @@ -270,8 +284,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - depends_on: - - image-build + depends_on: block-cudagraph soft_fail: false plugins: - docker#v5.2.0: @@ -288,6 +301,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Platform Tests (CUDA) + depends_on: [] + key: block-platform-tests-cuda - label: Platform Tests (CUDA) agents: queue: gpu_1_queue @@ -296,8 +312,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s cuda/test_cuda_context.py - depends_on: - - image-build + depends_on: block-platform-tests-cuda soft_fail: false plugins: - docker#v5.2.0: @@ -317,7 +332,7 @@ steps: - group: Compile steps: - block: Run Fusion E2E (2 GPUs)(B200) - depends_on: image-build + depends_on: [] key: block-fusion-e2e-2-gpusb200 - label: Fusion E2E (2 GPUs)(B200) agents: @@ -345,6 +360,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Fusion and Compile Tests (B200) + depends_on: [] + key: block-fusion-and-compile-tests-b200 - label: Fusion and Compile Tests (B200) agents: queue: B200 @@ -359,8 +377,7 @@ steps: - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k "True and not +quant_fp8 and not +rms_norm" - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - depends_on: - - image-build + depends_on: block-fusion-and-compile-tests-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -379,6 +396,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Distributed steps: + - block: Run 2 Node Test (4 GPUs) + depends_on: [] + key: block-2-node-test-4-gpus - label: 2 Node Test (4 GPUs) agents: queue: gpu_4_queue @@ -404,9 +424,11 @@ steps: - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - depends_on: - - image-build + depends_on: block-2-node-test-4-gpus soft_fail: false + - block: Run Distributed (2 GPUs) + depends_on: [] + key: 
block-distributed-2-gpus - label: Distributed (2 GPUs) agents: queue: gpu_4_queue @@ -429,8 +451,7 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py - depends_on: - - image-build + depends_on: block-distributed-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -447,6 +468,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed Comm Ops + depends_on: [] + key: block-distributed-comm-ops - label: Distributed Comm Ops agents: queue: gpu_4_queue @@ -458,8 +482,7 @@ steps: - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py - pytest -v -s distributed/test_shm_storage.py - depends_on: - - image-build + depends_on: block-distributed-comm-ops soft_fail: false plugins: - docker#v5.2.0: @@ -476,6 +499,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Distributed NixlConnector PD accuracy (4 GPUs) + depends_on: [] + key: block-distributed-nixlconnector-pd-accuracy-4-gpus - label: Distributed NixlConnector PD accuracy (4 GPUs) agents: queue: gpu_4_queue @@ -485,8 +511,7 @@ steps: - cd /vllm-workspace/tests - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - depends_on: - - image-build + depends_on: block-distributed-nixlconnector-pd-accuracy-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -504,7 +529,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Distributed Tests (2 GPUs)(B200) - depends_on: image-build + depends_on: [] key: block-distributed-tests-2-gpusb200 - label: Distributed Tests (2 GPUs)(B200) agents: @@ -534,7 +559,7 @@ steps: - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm - block: Run Distributed Tests (2 GPUs)(H200) - depends_on: image-build + depends_on: [] key: block-distributed-tests-2-gpush200 - label: Distributed Tests (2 GPUs)(H200) agents: @@ -572,6 +597,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Distributed Tests (4 GPUs) + depends_on: [] + key: block-distributed-tests-4-gpus - label: Distributed Tests (4 GPUs) agents: queue: gpu_4_queue @@ -602,8 +630,7 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd - depends_on: - - image-build + depends_on: block-distributed-tests-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -621,7 +648,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Distributed Tests (4 GPUs)(A100) - depends_on: image-build + depends_on: [] key: block-distributed-tests-4-gpusa100 - label: Distributed Tests (4 GPUs)(A100) agents: @@ -642,14 +669,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 
resources: limits: nvidia.com/gpu: 4 @@ -681,7 +700,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run Distributed Tests (8 GPUs)(H100) - depends_on: image-build + depends_on: [] key: block-distributed-tests-8-gpush100 - label: Distributed Tests (8 GPUs)(H100) agents: @@ -700,14 +719,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -738,6 +749,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Pipeline + Context Parallelism (4 GPUs)) + depends_on: [] + key: block-pipeline---context-parallelism-4-gpus - label: Pipeline + Context Parallelism (4 GPUs)) agents: queue: gpu_4_queue @@ -747,8 +761,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py - depends_on: - - image-build + depends_on: block-pipeline---context-parallelism-4-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -768,7 +781,7 @@ steps: - group: E2E Integration steps: - block: Run DeepSeek V2-Lite Accuracy - depends_on: image-build + depends_on: [] key: block-deepseek-v2-lite-accuracy - label: DeepSeek V2-Lite Accuracy agents: @@ -786,14 +799,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -825,7 +830,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run Prime-RL Integration (2 GPUs) - depends_on: image-build + depends_on: [] key: block-prime-rl-integration-2-gpus - label: Prime-RL Integration (2 GPUs) agents: @@ -853,7 +858,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Qwen3-30B-A3B-FP8-block Accuracy - depends_on: image-build + depends_on: [] key: block-qwen3-30b-a3b-fp8-block-accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy agents: @@ -871,14 +876,6 @@ steps: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -911,6 +908,9 @@ steps: type: DirectoryOrCreate - group: Engine steps: + - block: Run Engine + depends_on: [] + key: block-engine - label: Engine agents: queue: gpu_1_queue @@ -919,8 +919,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd 
/vllm-workspace/tests - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - depends_on: - - image-build + depends_on: block-engine soft_fail: false plugins: - docker#v5.2.0: @@ -937,6 +936,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 e2e + engine + depends_on: [] + key: block-v1-e2e---engine - label: V1 e2e + engine agents: queue: gpu_1_queue @@ -946,8 +948,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s v1/e2e - pytest -v -s v1/engine - depends_on: - - image-build + depends_on: block-v1-e2e---engine soft_fail: false plugins: - docker#v5.2.0: @@ -966,6 +967,9 @@ steps: mount_buildkite_agent: true - group: Entrypoints steps: + - block: Run Entrypoints Integration (API Server) + depends_on: [] + key: block-entrypoints-integration-api-server - label: Entrypoints Integration (API Server) agents: queue: gpu_1_queue @@ -980,8 +984,7 @@ steps: --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py - depends_on: - - image-build + depends_on: block-entrypoints-integration-api-server soft_fail: false plugins: - docker#v5.2.0: @@ -998,6 +1001,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Integration (LLM) + depends_on: [] + key: block-entrypoints-integration-llm - label: Entrypoints Integration (LLM) agents: queue: gpu_1_queue @@ -1009,8 +1015,7 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py - pytest -v -s entrypoints/offline_mode - depends_on: - - image-build + depends_on: block-entrypoints-integration-llm soft_fail: false plugins: - docker#v5.2.0: @@ -1027,6 +1032,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Integration (Pooling) + depends_on: [] + key: block-entrypoints-integration-pooling - label: Entrypoints Integration (Pooling) agents: queue: gpu_1_queue @@ -1036,8 +1044,7 @@ steps: - cd /vllm-workspace/tests - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling - depends_on: - - image-build + depends_on: block-entrypoints-integration-pooling soft_fail: false plugins: - docker#v5.2.0: @@ -1054,6 +1061,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints Unit Tests + depends_on: [] + key: block-entrypoints-unit-tests - label: Entrypoints Unit Tests agents: queue: gpu_1_queue @@ -1064,8 +1074,7 @@ steps: - pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - depends_on: - - image-build + depends_on: block-entrypoints-unit-tests soft_fail: false plugins: - docker#v5.2.0: @@ -1082,6 +1091,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Entrypoints V1 + depends_on: [] + key: block-entrypoints-v1 - label: Entrypoints V1 agents: queue: gpu_1_queue @@ -1090,8 +1102,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s v1/entrypoints - depends_on: - - image-build + depends_on: block-entrypoints-v1 soft_fail: false plugins: - docker#v5.2.0: @@ 
-1108,6 +1119,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run OpenAI API Correctness + depends_on: [] + key: block-openai-api-correctness - label: OpenAI API Correctness agents: queue: gpu_1_queue @@ -1116,8 +1130,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -s entrypoints/openai/correctness/ - depends_on: - - image-build + depends_on: block-openai-api-correctness soft_fail: false plugins: - docker#v5.2.0: @@ -1136,6 +1149,9 @@ steps: mount_buildkite_agent: true - group: Expert Parallelism steps: + - block: Run EPLB Algorithm + depends_on: [] + key: block-eplb-algorithm - label: EPLB Algorithm agents: queue: gpu_1_queue @@ -1144,8 +1160,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_algo.py - depends_on: - - image-build + depends_on: block-eplb-algorithm soft_fail: false plugins: - docker#v5.2.0: @@ -1162,6 +1177,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run EPLB Execution + depends_on: [] + key: block-eplb-execution - label: EPLB Execution agents: queue: gpu_4_queue @@ -1171,8 +1189,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py - depends_on: - - image-build + depends_on: block-eplb-execution soft_fail: false plugins: - docker#v5.2.0: @@ -1191,6 +1208,9 @@ steps: mount_buildkite_agent: true - group: Kernels steps: + - block: Run Kernels (B200) + depends_on: [] + key: block-kernels-b200 - label: Kernels (B200) agents: queue: B200 @@ -1217,8 +1237,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - depends_on: - - image-build + depends_on: block-kernels-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -1235,6 +1254,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Kernels Attention Test %N + depends_on: [] + key: block-kernels-attention-test-n - label: Kernels Attention Test %N agents: queue: gpu_1_queue @@ -1243,8 +1265,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-attention-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1262,6 +1283,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Kernels Core Operation Test + depends_on: [] + key: block-kernels-core-operation-test - label: Kernels Core Operation Test agents: queue: gpu_1_queue @@ -1270,8 +1294,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/core kernels/test_top_k_per_row.py - depends_on: - - image-build + depends_on: block-kernels-core-operation-test soft_fail: false plugins: - docker#v5.2.0: @@ -1288,6 +1311,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Kernels DeepGEMM Test (H100) + depends_on: [] + key: block-kernels-deepgemm-test-h100 - label: Kernels DeepGEMM Test (H100) agents: queue: mithril-h100-pool @@ -1299,22 +1325,13 @@ steps: - pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s 
kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/attention/test_deepgemm_attention.py - depends_on: - - image-build + depends_on: block-kernels-deepgemm-test-h100 soft_fail: false plugins: - kubernetes: podSpec: containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1345,6 +1362,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run Kernels Mamba Test + depends_on: [] + key: block-kernels-mamba-test - label: Kernels Mamba Test agents: queue: gpu_1_queue @@ -1353,8 +1373,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/mamba - depends_on: - - image-build + depends_on: block-kernels-mamba-test soft_fail: false plugins: - docker#v5.2.0: @@ -1371,6 +1390,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Kernels MoE Test %N + depends_on: [] + key: block-kernels-moe-test-n - label: Kernels MoE Test %N agents: queue: gpu_1_queue @@ -1379,8 +1401,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-moe-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1398,6 +1419,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Kernels Quantization Test %N + depends_on: [] + key: block-kernels-quantization-test-n - label: Kernels Quantization Test %N agents: queue: gpu_1_queue @@ -1406,8 +1430,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: - - image-build + depends_on: block-kernels-quantization-test-n soft_fail: false plugins: - docker#v5.2.0: @@ -1428,7 +1451,7 @@ steps: - group: LM Eval steps: - block: Run LM Eval Large Models (4 GPUs)(A100) - depends_on: image-build + depends_on: [] key: block-lm-eval-large-models-4-gpusa100 - label: LM Eval Large Models (4 GPUs)(A100) agents: @@ -1448,14 +1471,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1487,7 +1502,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - block: Run LM Eval Large Models (4 GPUs)(H100) - depends_on: image-build + depends_on: [] key: block-lm-eval-large-models-4-gpush100 - label: LM Eval Large Models (4 GPUs)(H100) agents: @@ -1506,14 +1521,6 @@ steps: podSpec: containers: - image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_USE_DEEP_GEMM=0 && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large-hopper.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 @@ -1544,6 +1551,9 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + - block: Run LM Eval Small Models + depends_on: [] + key: block-lm-eval-small-models - label: LM Eval Small Models agents: queue: gpu_1_queue @@ -1553,8 +1563,7 @@ steps: - cd /vllm-workspace/tests - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-lm-eval-small-models soft_fail: false plugins: - docker#v5.2.0: @@ -1572,7 +1581,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run LM Eval Small Models (B200) - depends_on: image-build + depends_on: [] key: block-lm-eval-small-models-b200 - label: LM Eval Small Models (B200) agents: @@ -1602,6 +1611,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: LoRA steps: + - block: Run LoRA %N + depends_on: [] + key: block-lora-n - label: LoRA %N agents: queue: gpu_1_queue @@ -1609,12 +1621,11 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - pytest -v -s lora \ --shard-id=$$BUILDKITE_PARALLEL_JOB \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ --ignore=lora/test_llm_with_multi_loras.py - \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss_tp.py - \ --ignore=lora/test_qwen3moe_tp.py - depends_on: - - image-build + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py + --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py + --ignore=lora/test_qwen3moe_tp.py + depends_on: block-lora-n soft_fail: false plugins: - docker#v5.2.0: @@ -1632,6 +1643,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 4 + - block: Run LoRA TP (Distributed) + depends_on: [] + key: block-lora-tp-distributed - label: LoRA TP (Distributed) agents: queue: gpu_4_queue @@ -1645,8 +1659,7 @@ steps: - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_gptoss_tp.py - depends_on: - - image-build + depends_on: block-lora-tp-distributed soft_fail: false plugins: - docker#v5.2.0: @@ -1665,6 +1678,9 @@ steps: mount_buildkite_agent: true - group: Miscellaneous steps: + - block: Run Async Engine, Inputs, Utils, Worker + depends_on: [] + key: block-async-engine--inputs--utils--worker - label: Async Engine, Inputs, Utils, Worker agents: queue: gpu_1_queue @@ -1674,8 +1690,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s -m "not cpu_test" multimodal - pytest -v -s utils_ - depends_on: - - image-build + depends_on: block-async-engine--inputs--utils--worker soft_fail: false plugins: - docker#v5.2.0: @@ -1692,6 +1707,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache 
mount_buildkite_agent: true + - block: Run Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: [] + key: block-async-engine--inputs--utils--worker--config-cpu - label: Async Engine, Inputs, Utils, Worker, Config (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1706,8 +1724,7 @@ steps: - pytest -v -s tokenizers_ - pytest -v -s transformers_utils - pytest -v -s config - depends_on: - - image-build + depends_on: block-async-engine--inputs--utils--worker--config-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -1724,6 +1741,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Examples + depends_on: [] + key: block-examples - label: Examples agents: queue: gpu_1_queue @@ -1756,8 +1776,7 @@ steps: - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - depends_on: - - image-build + depends_on: block-examples soft_fail: false plugins: - docker#v5.2.0: @@ -1775,7 +1794,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run GPT-OSS Eval (B200) - depends_on: image-build + depends_on: [] key: block-gpt-oss-eval-b200 - label: GPT-OSS Eval (B200) agents: @@ -1804,6 +1823,9 @@ steps: - /dev/shm:/dev/shm - /data/benchmark-hf-cache:/benchmark-hf-cache - /data/benchmark-vllm-cache:/root/.cache/vllm + - block: Run Metrics, Tracing (2 GPUs) + depends_on: [] + key: block-metrics--tracing-2-gpus - label: Metrics, Tracing (2 GPUs) agents: queue: gpu_4_queue @@ -1814,8 +1836,7 @@ steps: - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" "opentelemetry-semantic-conventions-ai>=0.4.1" - pytest -v -s v1/tracing - depends_on: - - image-build + depends_on: block-metrics--tracing-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -1832,6 +1853,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Python-only Installation + depends_on: [] + key: block-python-only-installation - label: Python-only Installation agents: queue: gpu_1_queue @@ -1840,8 +1864,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - bash standalone_tests/python_only_compile.sh - depends_on: - - image-build + depends_on: block-python-only-installation soft_fail: false plugins: - docker#v5.2.0: @@ -1858,6 +1881,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Regression + depends_on: [] + key: block-regression - label: Regression agents: queue: gpu_1_queue @@ -1867,8 +1893,7 @@ steps: - cd /vllm-workspace/tests - pip install modelscope - pytest -v -s test_regression.py - depends_on: - - image-build + depends_on: block-regression soft_fail: false plugins: - docker#v5.2.0: @@ -1885,6 +1910,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 Others + depends_on: [] + key: block-v1-others - label: V1 Others agents: queue: gpu_1_queue @@ -1907,8 +1935,7 @@ steps: - pytest -v -s v1/test_outputs.py - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - depends_on: - - image-build + depends_on: block-v1-others soft_fail: false plugins: - docker#v5.2.0: @@ -1925,6 +1952,9 @@ steps: - 
/dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run V1 Others (CPU) + depends_on: [] + key: block-v1-others-cpu - label: V1 Others (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1937,8 +1967,7 @@ steps: - pytest -v -s v1/test_serial_utils.py - pytest -v -s -m "cpu_test" v1/kv_connector/unit - pytest -v -s -m "cpu_test" v1/metrics - depends_on: - - image-build + depends_on: block-v1-others-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -1957,6 +1986,9 @@ steps: mount_buildkite_agent: true - group: Model Executor steps: + - block: Run Model Executor + depends_on: [] + key: block-model-executor - label: Model Executor agents: queue: gpu_1_queue @@ -1968,8 +2000,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - depends_on: - - image-build + depends_on: block-model-executor soft_fail: false plugins: - docker#v5.2.0: @@ -1988,6 +2019,9 @@ steps: mount_buildkite_agent: true - group: Models - Basic steps: + - block: Run Basic Models Test (Other CPU) + depends_on: [] + key: block-basic-models-test-other-cpu - label: Basic Models Test (Other CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -1996,8 +2030,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_utils.py models/test_vision.py - depends_on: - - image-build + depends_on: block-basic-models-test-other-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2014,6 +2047,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Basic Models Tests (Extra Initialization) %N + depends_on: [] + key: block-basic-models-tests-extra-initialization-n - label: Basic Models Tests (Extra Initialization) %N agents: queue: gpu_1_queue @@ -2021,10 +2057,9 @@ steps: - (command nvidia-smi || true) - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py \ -k "not test_can_initialize_small_subset" - \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/test_initialization.py -k "not test_can_initialize_small_subset" + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-basic-models-tests-extra-initialization-n soft_fail: false plugins: - docker#v5.2.0: @@ -2042,6 +2077,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Basic Models Tests (Initialization) + depends_on: [] + key: block-basic-models-tests-initialization - label: Basic Models Tests (Initialization) agents: queue: gpu_1_queue @@ -2050,8 +2088,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - depends_on: - - image-build + depends_on: block-basic-models-tests-initialization soft_fail: false plugins: - docker#v5.2.0: @@ -2068,6 +2105,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Basic Models Tests (Other) + depends_on: [] + key: block-basic-models-tests-other - label: Basic Models Tests (Other) agents: queue: gpu_1_queue @@ -2076,8 +2116,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s models/test_transformers.py models/test_registry.py - depends_on: - - image-build + depends_on: 
block-basic-models-tests-other soft_fail: false plugins: - docker#v5.2.0: @@ -2095,7 +2134,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Transformers Nightly Models - depends_on: image-build + depends_on: [] key: block-transformers-nightly-models - label: Transformers Nightly Models agents: @@ -2132,6 +2171,9 @@ steps: mount_buildkite_agent: true - group: Models - Distributed steps: + - block: Run Distributed Model Tests (2 GPUs) + depends_on: [] + key: block-distributed-model-tests-2-gpus - label: Distributed Model Tests (2 GPUs) agents: queue: gpu_4_queue @@ -2146,8 +2188,7 @@ steps: - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m "distributed(num_gpus=2)" - depends_on: - - image-build + depends_on: block-distributed-model-tests-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -2167,7 +2208,7 @@ steps: - group: Models - Language steps: - block: Run Language Models Test (Extended Generation) - depends_on: image-build + depends_on: [] key: block-language-models-test-extended-generation - label: Language Models Test (Extended Generation) agents: @@ -2197,7 +2238,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (Extended Pooling) - depends_on: image-build + depends_on: [] key: block-language-models-test-extended-pooling - label: Language Models Test (Extended Pooling) agents: @@ -2225,7 +2266,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (MTEB) - depends_on: image-build + depends_on: [] key: block-language-models-test-mteb - label: Language Models Test (MTEB) agents: @@ -2253,7 +2294,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Language Models Test (PPL) - depends_on: image-build + depends_on: [] key: block-language-models-test-ppl - label: Language Models Test (PPL) agents: @@ -2280,6 +2321,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Language Models Tests (Extra Standard) %N + depends_on: [] + key: block-language-models-tests-extra-standard-n - label: Language Models Tests (Extra Standard) %N agents: queue: gpu_1_queue @@ -2288,10 +2332,9 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and slow_test" \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/language -m "core_model and slow_test" --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-language-models-tests-extra-standard-n soft_fail: false plugins: - docker#v5.2.0: @@ -2309,6 +2352,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Language Models Tests (Hybrid) %N + depends_on: [] + key: block-language-models-tests-hybrid-n - label: Language Models Tests (Hybrid) %N agents: queue: gpu_1_queue @@ -2318,10 +2364,9 @@ steps: - cd /vllm-workspace/tests - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation \ -m hybrid_model \ 
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - \ --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: - - image-build + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + --shard-id=$$BUILDKITE_PARALLEL_JOB + depends_on: block-language-models-tests-hybrid-n soft_fail: false plugins: - docker#v5.2.0: @@ -2339,6 +2384,9 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true parallelism: 2 + - block: Run Language Models Tests (Standard) + depends_on: [] + key: block-language-models-tests-standard - label: Language Models Tests (Standard) agents: queue: gpu_1_queue @@ -2348,8 +2396,7 @@ steps: - cd /vllm-workspace/tests - pip freeze | grep -E "torch" - pytest -v -s models/language -m "core_model and (not slow_test)" - depends_on: - - image-build + depends_on: block-language-models-tests-standard soft_fail: false plugins: - docker#v5.2.0: @@ -2369,7 +2416,7 @@ steps: - group: Models - Multimodal steps: - block: Run Custom Models - depends_on: image-build + depends_on: [] key: block-custom-models - label: Custom Models agents: @@ -2396,6 +2443,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Accuracy Eval (Small Models) + depends_on: [] + key: block-multi-modal-accuracy-eval-small-models - label: Multi-Modal Accuracy Eval (Small Models) agents: queue: gpu_1_queue @@ -2405,8 +2455,7 @@ steps: - cd /vllm-workspace/.buildkite/lm-eval-harness - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - depends_on: - - image-build + depends_on: block-multi-modal-accuracy-eval-small-models soft_fail: false plugins: - docker#v5.2.0: @@ -2424,7 +2473,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 1 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-1 - label: Multi-Modal Models (Extended) 1 agents: @@ -2454,7 +2503,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 2 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-2 - label: Multi-Modal Models (Extended) 2 agents: @@ -2484,7 +2533,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Multi-Modal Models (Extended) 3 - depends_on: image-build + depends_on: [] key: block-multi-modal-models-extended-3 - label: Multi-Modal Models (Extended) 3 agents: @@ -2513,6 +2562,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Models (Standard) + depends_on: [] + key: block-multi-modal-models-standard - label: Multi-Modal Models (Standard) agents: queue: gpu_1_queue @@ -2526,8 +2578,7 @@ steps: --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model - depends_on: - - image-build + depends_on: block-multi-modal-models-standard soft_fail: false plugins: - docker#v5.2.0: @@ -2544,6 +2595,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Processor + depends_on: [] + key: block-multi-modal-processor - label: Multi-Modal Processor agents: queue: gpu_1_queue @@ -2553,8 +2607,7 @@ steps: - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing/test_tensor_schema.py - depends_on: - - image-build + depends_on: block-multi-modal-processor soft_fail: false plugins: - docker#v5.2.0: @@ -2571,6 +2624,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Multi-Modal Processor Test (CPU) + depends_on: [] + key: block-multi-modal-processor-test-cpu - label: Multi-Modal Processor Test (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -2580,8 +2636,7 @@ steps: - cd /vllm-workspace/tests - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - depends_on: - - image-build + depends_on: block-multi-modal-processor-test-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2600,6 +2655,9 @@ steps: mount_buildkite_agent: true - group: Plugins steps: + - block: Run Plugin Tests (2 GPUs) + depends_on: [] + key: block-plugin-tests-2-gpus - label: Plugin Tests (2 GPUs) agents: queue: gpu_4_queue @@ -2622,8 +2680,7 @@ steps: - pytest -v -s entrypoints/openai/test_oot_registration.py - pytest -v -s models/test_oot_registration.py - pytest -v -s plugins/lora_resolvers - depends_on: - - image-build + depends_on: block-plugin-tests-2-gpus soft_fail: false plugins: - docker#v5.2.0: @@ -2642,6 +2699,9 @@ steps: mount_buildkite_agent: true - group: PyTorch steps: + - block: Run PyTorch Compilation Unit Tests + depends_on: [] + key: block-pytorch-compilation-unit-tests - label: PyTorch Compilation Unit Tests agents: queue: gpu_1_queue @@ -2650,12 +2710,11 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; - depends_on: - - [] + depends_on: block-pytorch-compilation-unit-tests soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 always-pull: true propagate-environment: true environment: @@ -2668,6 +2727,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run PyTorch Fullgraph + depends_on: [] + key: block-pytorch-fullgraph - label: PyTorch Fullgraph agents: queue: gpu_1_queue @@ -2678,8 +2740,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 and not Llama-4" - depends_on: - - image-build + depends_on: block-pytorch-fullgraph soft_fail: false plugins: - docker#v5.2.0: @@ -2696,6 +2757,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run PyTorch Fullgraph Smoke Test + depends_on: [] + key: block-pytorch-fullgraph-smoke-test - label: PyTorch Fullgraph Smoke Test agents: queue: gpu_1_queue @@ 
-2705,12 +2769,11 @@ steps: - cd /vllm-workspace/tests - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec pytest -s -v {} \\\\; - depends_on: - - [] + depends_on: block-pytorch-fullgraph-smoke-test soft_fail: false plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:52ef28ad15b328f5b6b3edb3f8b8904528a183f6 + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 always-pull: true propagate-environment: true environment: @@ -2723,6 +2786,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Pytorch Nightly Dependency Override Check + depends_on: [] + key: block-pytorch-nightly-dependency-override-check - label: Pytorch Nightly Dependency Override Check agents: queue: gpu_1_queue @@ -2731,8 +2797,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - bash standalone_tests/pytorch_nightly_dependency.sh - depends_on: - - image-build + depends_on: block-pytorch-nightly-dependency-override-check soft_fail: true plugins: - docker#v5.2.0: @@ -2751,6 +2816,9 @@ steps: mount_buildkite_agent: true - group: Quantization steps: + - block: Run Quantization + depends_on: [] + key: block-quantization - label: Quantization agents: queue: gpu_1_queue @@ -2761,8 +2829,7 @@ steps: - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - depends_on: - - image-build + depends_on: block-quantization soft_fail: false plugins: - docker#v5.2.0: @@ -2779,6 +2846,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run Quantized MoE Test (B200) + depends_on: [] + key: block-quantized-moe-test-b200 - label: Quantized MoE Test (B200) agents: queue: B200 @@ -2787,8 +2857,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/ - pytest -s -v tests/quantization/test_blackwell_moe.py - depends_on: - - image-build + depends_on: block-quantized-moe-test-b200 soft_fail: false plugins: - docker#v5.2.0: @@ -2807,6 +2876,9 @@ steps: - /data/benchmark-vllm-cache:/root/.cache/vllm - group: Samplers steps: + - block: Run Samplers Test + depends_on: [] + key: block-samplers-test - label: Samplers Test agents: queue: gpu_1_queue @@ -2816,8 +2888,7 @@ steps: - cd /vllm-workspace/tests - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - depends_on: - - image-build + depends_on: block-samplers-test soft_fail: false plugins: - docker#v5.2.0: @@ -2836,6 +2907,9 @@ steps: mount_buildkite_agent: true - group: Tool use steps: + - block: Run OpenAI-Compatible Tool Use + depends_on: [] + key: block-openai-compatible-tool-use - label: OpenAI-Compatible Tool Use agents: queue: gpu_1_queue @@ -2844,8 +2918,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd /vllm-workspace/tests - pytest -v -s -m "not cpu_test" tool_use - depends_on: - - image-build + depends_on: block-openai-compatible-tool-use soft_fail: false plugins: - docker#v5.2.0: @@ -2862,6 +2935,9 @@ steps: - /dev/shm:/dev/shm - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true + - block: Run OpenAI-Compatible Tool Use (CPU) + depends_on: [] + key: block-openai-compatible-tool-use-cpu - label: OpenAI-Compatible Tool Use (CPU) agents: queue: cpu_queue_premerge_us_east_1 @@ -2870,8 +2946,7 @@ steps: - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - cd 
/vllm-workspace/tests - pytest -v -s -m "cpu_test" tool_use - depends_on: - - image-build + depends_on: block-openai-compatible-tool-use-cpu soft_fail: false plugins: - docker#v5.2.0: @@ -2891,7 +2966,7 @@ steps: - group: Weight Loading steps: - block: Run Weight Loading Multiple GPU - depends_on: image-build + depends_on: [] key: block-weight-loading-multiple-gpu - label: Weight Loading Multiple GPU agents: @@ -2919,7 +2994,7 @@ steps: - /fsx/hf_cache:/fsx/hf_cache mount_buildkite_agent: true - block: Run Weight Loading Multiple GPU - Large Models - depends_on: image-build + depends_on: [] key: block-weight-loading-multiple-gpu---large-models - label: Weight Loading Multiple GPU - Large Models agents: @@ -2937,14 +3012,6 @@ steps: priorityClassName: ci containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - command: - - bash - - -c - - (command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - && cd /vllm-workspace/.buildkite/lm-eval-harness && (command nvidia-smi - || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd /vllm-workspace/.buildkite/lm-eval-harness - && export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -s -v test_lm_eval_correctness.py - --config-list-file=configs/models-large.txt --tp-size=4 resources: limits: nvidia.com/gpu: 4 diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index dab6e674990b..c158b4ecbfcf 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,7 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\\\\\;" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 @@ -25,7 +25,7 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\\\\\;" - label: PyTorch Fullgraph timeout_in_minutes: 40 From a020a18c40fa776bd67fb4c673d5f49e6fbf7079 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 2 Dec 2025 01:58:10 -0800 Subject: [PATCH 17/24] slashes Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/pytorch.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index c158b4ecbfcf..703c82eb1a91 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,7 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\\\\\;" + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 @@ -25,7 +25,7 @@ steps: # as it is a heavy test that is covered in other steps. 
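# Editor's note (not part of the surrounding hunks): a sketch of how the backslash
# escaping in the find/-exec command resolves, assuming the command stays a YAML
# double-quoted scalar as shown in these hunks. In double-quoted YAML, "\\;" parses
# to the two characters \; — the shell then strips the backslash and hands find the
# bare ";" that terminates -exec. With four or eight backslashes, the shell instead
# sees an unescaped ";" as a command separator, so find never receives its -exec
# terminator and the step fails. The form the later "slashes" patch settles on:
commands:
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
# after YAML parsing, the shell runs: find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;
# after shell quoting, find's -exec argument list ends with: pytest -s -v {} ;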
# Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\\\\\;" + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" - label: PyTorch Fullgraph timeout_in_minutes: 40 From f2e32c9a3964d41cf2f02110b378403f25314acf Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 3 Dec 2025 00:57:44 -0800 Subject: [PATCH 18/24] 2node test Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 30a1002b701b..e57414ba0c28 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -173,16 +173,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 
--rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 From e35d711b1936e763df9ba8b6da52286f7b201885 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 3 Dec 2025 14:57:04 -0800 Subject: [PATCH 19/24] switch pushd to cd Signed-off-by: Kevin H. Luu --- .buildkite/test_areas/distributed.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index e57414ba0c28..57756aae4808 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -97,10 +97,9 @@ steps: - pytest -v -s distributed/test_symm_mem_allreduce.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - - pushd ../examples/offline_inference + - cd ../examples/offline_inference - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - label: Distributed Tests (8 GPUs)(H100) optional: true From 54cb6029de1329661f41602777b1cf4c241dbebf Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 4 Dec 2025 02:52:15 -0800 Subject: [PATCH 20/24] remove old file Signed-off-by: Kevin H. Luu --- .buildkite/pipeline.yaml | 3044 -------------------------------------- 1 file changed, 3044 deletions(-) delete mode 100644 .buildkite/pipeline.yaml diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml deleted file mode 100644 index ae55531ecf13..000000000000 --- a/.buildkite/pipeline.yaml +++ /dev/null @@ -1,3044 +0,0 @@ -steps: -- group: Abuild - steps: - - block: 'Run :docker: Build CPU arm64 image' - depends_on: [] - key: block--docker--build-cpu-arm64-image - - label: ':docker: Build CPU arm64 image' - key: cpu-arm64-image-build - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cpu_arm64.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cpu-arm64-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build CPU image' - depends_on: [] - key: block--docker--build-cpu-image - - label: ':docker: Build CPU image' - key: image-build-cpu - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cpu-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build CUDA 11.8 image' - depends_on: [] - key: block--docker--build-cuda-11-8-image - - label: ':docker: Build CUDA 11.8 image' - key: image-build-cu118 - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_cu118.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-cuda-11-8-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build HPU image' - depends_on: [] - key: 
block--docker--build-hpu-image - - label: ':docker: Build HPU image' - key: image-build-hpu - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build_hpu.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-hpu-image - soft_fail: true - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' - - block: 'Run :docker: Build image' - depends_on: [] - key: block--docker--build-image - - label: ':docker: Build image' - key: image-build - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - .buildkite/image_build/image_build.sh public.ecr.aws/q9t5s3a7 vllm-ci-test-repo - 123 - depends_on: block--docker--build-image - soft_fail: false - retry: - automatic: - - exit_status: -1 - limit: 2 - - exit_status: -10 - limit: 2 - env: - DOCKER_BUILDKIT: '1' -- group: Attention - steps: - - block: Run V1 attention (B200) - depends_on: [] - key: block-v1-attention-b200 - - label: V1 attention (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention - depends_on: block-v1-attention-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run V1 attention (H100) - depends_on: [] - key: block-v1-attention-h100 - - label: V1 attention (H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/attention - depends_on: block-v1-attention-h100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate -- group: Basic Correctness - steps: - - block: Run Basic Correctness - depends_on: [] - key: block-basic-correctness - - label: Basic Correctness - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - depends_on: block-basic-correctness - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - 
HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Benchmarks - steps: - - block: Run Benchmarks - depends_on: [] - key: block-benchmarks - - label: Benchmarks - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite - - bash scripts/run-benchmarks.sh - depends_on: block-benchmarks - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Benchmarks CLI Test - depends_on: [] - key: block-benchmarks-cli-test - - label: Benchmarks CLI Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s benchmarks/ - depends_on: block-benchmarks-cli-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: CUDA - steps: - - block: Run Cudagraph - depends_on: [] - key: block-cudagraph - - label: Cudagraph - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - depends_on: block-cudagraph - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Platform Tests (CUDA) - depends_on: [] - key: block-platform-tests-cuda - - label: Platform Tests (CUDA) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s cuda/test_cuda_context.py - depends_on: block-platform-tests-cuda - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Compile - steps: - - block: Run Fusion E2E (2 GPUs)(B200) - depends_on: [] - key: block-fusion-e2e-2-gpusb200 - - label: Fusion E2E (2 GPUs)(B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - depends_on: block-fusion-e2e-2-gpusb200 - soft_fail: false - plugins: - - 
docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Fusion and Compile Tests (B200) - depends_on: [] - key: block-fusion-and-compile-tests-b200 - - label: Fusion and Compile Tests (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - -k "True and not +quant_fp8 and not +rms_norm" - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - depends_on: block-fusion-and-compile-tests-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: Distributed - steps: - - block: Run 2 Node Test (4 GPUs) - depends_on: [] - key: block-2-node-test-4-gpus - - label: 2 Node Test (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node - test passed" - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep "Node count test passed" - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d - --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep "Same node - test passed" - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 - distributed/test_node_count.py | grep "Node count test passed" - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 - --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 - --enforce-eager --trust-remote-code - depends_on: block-2-node-test-4-gpus - soft_fail: false - - block: Run Distributed (2 GPUs) - depends_on: [] - key: block-distributed-2-gpus - - label: Distributed (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s 
v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - | grep "Same node test passed" - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 - distributed/test_same_node.py | grep "Same node test passed" - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - depends_on: block-distributed-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Comm Ops - depends_on: [] - key: block-distributed-comm-ops - - label: Distributed Comm Ops - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - depends_on: block-distributed-comm-ops - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed NixlConnector PD accuracy (4 GPUs) - depends_on: [] - key: block-distributed-nixlconnector-pd-accuracy-4-gpus - - label: Distributed NixlConnector PD accuracy (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - depends_on: block-distributed-nixlconnector-pd-accuracy-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Tests (2 GPUs)(B200) - depends_on: [] - key: block-distributed-tests-2-gpusb200 - - label: Distributed Tests (2 GPUs)(B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s 
tests/v1/distributed/test_dbo.py - depends_on: block-distributed-tests-2-gpusb200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Distributed Tests (2 GPUs)(H200) - depends_on: [] - key: block-distributed-tests-2-gpush200 - - label: Distributed Tests (2 GPUs)(H200) - agents: - queue: skylab-h200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py - -k "not Llama-4" - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py - --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py - depends_on: block-distributed-tests-2-gpush200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - gpus: all - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Distributed Tests (4 GPUs) - depends_on: [] - key: block-distributed-tests-4-gpus - - label: Distributed Tests (4 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - 
pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - depends_on: block-distributed-tests-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Distributed Tests (4 GPUs)(A100) - depends_on: [] - key: block-distributed-tests-4-gpusa100 - - label: Distributed Tests (4 GPUs)(A100) - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - - pytest -v -s -x lora/test_mixtral.py - depends_on: block-distributed-tests-4-gpusa100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Distributed Tests (8 GPUs)(H100) - depends_on: [] - key: block-distributed-tests-8-gpush100 - - label: Distributed Tests (8 GPUs)(H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export NCCL_CUMEM_HOST_ENABLE=0 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py - --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - depends_on: block-distributed-tests-8-gpush100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Pipeline + Context Parallelism (4 GPUs) - depends_on: [] - key: block-pipeline---context-parallelism-4-gpus - - label: Pipeline + Context Parallelism (4 GPUs) - agents: - queue: gpu_4_queue 
- commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - depends_on: block-pipeline---context-parallelism-4-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: E2E Integration - steps: - - block: Run DeepSeek V2-Lite Accuracy - depends_on: [] - key: block-deepseek-v2-lite-accuracy - - label: DeepSeek V2-Lite Accuracy - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh - 0.25 200 8010 - depends_on: block-deepseek-v2-lite-accuracy - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Prime-RL Integration (2 GPUs) - depends_on: [] - key: block-prime-rl-integration-2-gpus - - label: Prime-RL Integration (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/run-prime-rl-test.sh - depends_on: block-prime-rl-integration-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Qwen3-30B-A3B-FP8-block Accuracy - depends_on: [] - key: block-qwen3-30b-a3b-fp8-block-accuracy - - label: Qwen3-30B-A3B-FP8-block Accuracy - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh - 0.8 200 8020 - depends_on: block-qwen3-30b-a3b-fp8-block-accuracy - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - 
secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate -- group: Engine - steps: - - block: Run Engine - depends_on: [] - key: block-engine - - label: Engine - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - depends_on: block-engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 e2e + engine - depends_on: [] - key: block-v1-e2e---engine - - label: V1 e2e + engine - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - depends_on: block-v1-e2e---engine - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Entrypoints - steps: - - block: Run Entrypoints Integration (API Server) - depends_on: [] - key: block-entrypoints-integration-api-server - - label: Entrypoints Integration (API Server) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py - --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py - --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - --ignore=entrypoints/openai/tool_parsers/ - - pytest -v -s entrypoints/test_chat_utils.py - depends_on: block-entrypoints-integration-api-server - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Integration (LLM) - depends_on: [] - key: block-entrypoints-integration-llm - - label: Entrypoints Integration (LLM) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py - 
- pytest -v -s entrypoints/offline_mode - depends_on: block-entrypoints-integration-llm - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Integration (Pooling) - depends_on: [] - key: block-entrypoints-integration-pooling - - label: Entrypoints Integration (Pooling) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - depends_on: block-entrypoints-integration-pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints Unit Tests - depends_on: [] - key: block-entrypoints-unit-tests - - label: Entrypoints Unit Tests - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai - --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - depends_on: block-entrypoints-unit-tests - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Entrypoints V1 - depends_on: [] - key: block-entrypoints-v1 - - label: Entrypoints V1 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s v1/entrypoints - depends_on: block-entrypoints-v1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run OpenAI API Correctness - depends_on: [] - key: block-openai-api-correctness - - label: OpenAI API Correctness - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s entrypoints/openai/correctness/ - depends_on: block-openai-api-correctness - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - 
HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Expert Parallelism - steps: - - block: Run EPLB Algorithm - depends_on: [] - key: block-eplb-algorithm - - label: EPLB Algorithm - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_eplb_algo.py - depends_on: block-eplb-algorithm - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run EPLB Execution - depends_on: [] - key: block-eplb-execution - - label: EPLB Execution - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - depends_on: block-eplb-execution - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Kernels - steps: - - block: Run Kernels (B200) - depends_on: [] - key: block-kernels-b200 - - label: Kernels (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k "not num_heads2" - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k "fp8" - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - depends_on: block-kernels-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - 
/data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Kernels Attention Test %N - depends_on: [] - key: block-kernels-attention-test-n - - label: Kernels Attention Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-attention-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Kernels Core Operation Test - depends_on: [] - key: block-kernels-core-operation-test - - label: Kernels Core Operation Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - depends_on: block-kernels-core-operation-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Kernels DeepGEMM Test (H100) - depends_on: [] - key: block-kernels-deepgemm-test-h100 - - label: Kernels DeepGEMM Test (H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - depends_on: block-kernels-deepgemm-test-h100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run Kernels Mamba Test - depends_on: [] - key: block-kernels-mamba-test - - label: Kernels Mamba Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/mamba - depends_on: block-kernels-mamba-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - 
- HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Kernels MoE Test %N - depends_on: [] - key: block-kernels-moe-test-n - - label: Kernels MoE Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-moe-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Kernels Quantization Test %N - depends_on: [] - key: block-kernels-quantization-test-n - - label: Kernels Quantization Test %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - depends_on: block-kernels-quantization-test-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 -- group: LM Eval - steps: - - block: Run LM Eval Large Models (4 GPUs)(A100) - depends_on: [] - key: block-lm-eval-large-models-4-gpusa100 - - label: LM Eval Large Models (4 GPUs)(A100) - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt - --tp-size=4 - depends_on: block-lm-eval-large-models-4-gpusa100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run LM Eval Large Models (4 GPUs)(H100) - depends_on: [] - key: block-lm-eval-large-models-4-gpush100 - - label: LM Eval Large Models (4 GPUs)(H100) - agents: - queue: mithril-h100-pool - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py 
--config-list-file=configs/models-large-hopper.txt - --tp-size=4 - depends_on: block-lm-eval-large-models-4-gpush100 - soft_fail: false - plugins: - - kubernetes: - podSpec: - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - block: Run LM Eval Small Models - depends_on: [] - key: block-lm-eval-small-models - - label: LM Eval Small Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - --tp-size=1 - depends_on: block-lm-eval-small-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run LM Eval Small Models (B200) - depends_on: [] - key: block-lm-eval-small-models-b200 - - label: LM Eval Small Models (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - --tp-size=1 - depends_on: block-lm-eval-small-models-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: LoRA - steps: - - block: Run LoRA %N - depends_on: [] - key: block-lora-n - - label: LoRA %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py - --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py - --ignore=lora/test_qwen3moe_tp.py - depends_on: block-lora-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 4 - - block: Run 
LoRA TP (Distributed) - depends_on: [] - key: block-lora-tp-distributed - - label: LoRA TP (Distributed) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - depends_on: block-lora-tp-distributed - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Miscellaneous - steps: - - block: Run Async Engine, Inputs, Utils, Worker - depends_on: [] - key: block-async-engine--inputs--utils--worker - - label: Async Engine, Inputs, Utils, Worker - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "not cpu_test" multimodal - - pytest -v -s utils_ - depends_on: block-async-engine--inputs--utils--worker - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Async Engine, Inputs, Utils, Worker, Config (CPU) - depends_on: [] - key: block-async-engine--inputs--utils--worker--config-cpu - - label: Async Engine, Inputs, Utils, Worker, Config (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s -m "cpu_test" multimodal - - pytest -v -s tokenizers_ - - pytest -v -s transformers_utils - - pytest -v -s config - depends_on: block-async-engine--inputs--utils--worker--config-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Examples - depends_on: [] - key: block-examples - - label: Examples - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/examples - - pip install tensorizer - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf - --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 
offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory - /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m - deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper - --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens - 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp - 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - depends_on: block-examples - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run GPT-OSS Eval (B200) - depends_on: [] - key: block-gpt-oss-eval-b200 - - label: GPT-OSS Eval (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - uv pip install --system "gpt-oss[eval]==0.0.5" - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b - --metric 0.58 - depends_on: block-gpt-oss-eval-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm - - block: Run Metrics, Tracing (2 GPUs) - depends_on: [] - key: block-metrics--tracing-2-gpus - - label: Metrics, Tracing (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install "opentelemetry-sdk>=1.26.0" "opentelemetry-api>=1.26.0" "opentelemetry-exporter-otlp>=1.26.0" - "opentelemetry-semantic-conventions-ai>=0.4.1" - - pytest -v -s v1/tracing - depends_on: block-metrics--tracing-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Python-only Installation - depends_on: [] - key: block-python-only-installation - - label: Python-only Installation - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - 
- export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash standalone_tests/python_only_compile.sh - depends_on: block-python-only-installation - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Regression - depends_on: [] - key: block-regression - - label: Regression - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install modelscope - - pytest -v -s test_regression.py - depends_on: block-regression - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 Others - depends_on: [] - key: block-v1-others - - label: V1 Others - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m "not cpu_test" v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m "not cpu_test" v1/kv_connector/unit - - pytest -v -s -m "not cpu_test" v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - depends_on: block-v1-others - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run V1 Others (CPU) - depends_on: [] - key: block-v1-others-cpu - - label: V1 Others (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "cpu_test" v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m "cpu_test" v1/kv_connector/unit - - pytest -v -s -m "cpu_test" v1/metrics - depends_on: block-v1-others-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Model Executor - steps: 
- - block: Run Model Executor - depends_on: [] - key: block-model-executor - - label: Model Executor - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - depends_on: block-model-executor - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Basic - steps: - - block: Run Basic Models Test (Other CPU) - depends_on: [] - key: block-basic-models-test-other-cpu - - label: Basic Models Test (Other CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_utils.py models/test_vision.py - depends_on: block-basic-models-test-other-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Basic Models Tests (Extra Initialization) %N - depends_on: [] - key: block-basic-models-tests-extra-initialization-n - - label: Basic Models Tests (Extra Initialization) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py -k "not test_can_initialize_small_subset" - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-basic-models-tests-extra-initialization-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Basic Models Tests (Initialization) - depends_on: [] - key: block-basic-models-tests-initialization - - label: Basic Models Tests (Initialization) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - depends_on: block-basic-models-tests-initialization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Basic Models Tests (Other) - 
depends_on: [] - key: block-basic-models-tests-other - - label: Basic Models Tests (Other) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/test_transformers.py models/test_registry.py - depends_on: block-basic-models-tests-other - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Transformers Nightly Models - depends_on: [] - key: block-transformers-nightly-models - - label: Transformers Nightly Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py - --model-type whisper - depends_on: block-transformers-nightly-models - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Distributed - steps: - - block: Run Distributed Model Tests (2 GPUs) - depends_on: [] - key: block-distributed-model-tests-2-gpus - - label: Distributed Model Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m "distributed(num_gpus=2)" - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - - pytest models/test_transformers.py -v -s -m "distributed(num_gpus=2)" - - pytest models/language -v -s -m "distributed(num_gpus=2)" - - pytest models/multimodal -v -s -m "distributed(num_gpus=2)" --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py - -v -s -m "distributed(num_gpus=2)" - depends_on: block-distributed-model-tests-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Language - steps: - - block: Run Language Models Test (Extended Generation) - depends_on: [] - key: block-language-models-test-extended-generation - - label: Language Models Test (Extended Generation) - 
agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation -m "(not core_model) and (not hybrid_model)" - depends_on: block-language-models-test-extended-generation - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (Extended Pooling) - depends_on: [] - key: block-language-models-test-extended-pooling - - label: Language Models Test (Extended Pooling) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/pooling -m "not core_model" - depends_on: block-language-models-test-extended-pooling - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (MTEB) - depends_on: [] - key: block-language-models-test-mteb - - label: Language Models Test (MTEB) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/pooling_mteb_test - depends_on: block-language-models-test-mteb - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Test (PPL) - depends_on: [] - key: block-language-models-test-ppl - - label: Language Models Test (PPL) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s models/language/generation_ppl_test - depends_on: block-language-models-test-ppl - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Language Models Tests (Extra Standard) %N - depends_on: [] - key: block-language-models-tests-extra-standard-n - - label: Language Models Tests (Extra Standard) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd 
/vllm-workspace/tests - - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and slow_test" --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-language-models-tests-extra-standard-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Language Models Tests (Hybrid) %N - depends_on: [] - key: block-language-models-tests-hybrid-n - - label: Language Models Tests (Hybrid) %N - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - - uv pip install --system --no-build-isolation "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2" - - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - --shard-id=$$BUILDKITE_PARALLEL_JOB - depends_on: block-language-models-tests-hybrid-n - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - parallelism: 2 - - block: Run Language Models Tests (Standard) - depends_on: [] - key: block-language-models-tests-standard - - label: Language Models Tests (Standard) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip freeze | grep -E "torch" - - pytest -v -s models/language -m "core_model and (not slow_test)" - depends_on: block-language-models-tests-standard - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Models - Multimodal - steps: - - block: Run Custom Models - depends_on: [] - key: block-custom-models - - label: Custom Models - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - echo "Testing custom models..." 
- depends_on: block-custom-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Accuracy Eval (Small Models) - depends_on: [] - key: block-multi-modal-accuracy-eval-small-models - - label: Multi-Modal Accuracy Eval (Small Models) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/.buildkite/lm-eval-harness - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - --tp-size=1 - depends_on: block-multi-modal-accuracy-eval-small-models - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 1 - depends_on: [] - key: block-multi-modal-models-extended-1 - - label: Multi-Modal Models (Extended) 1 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m "not core_model" --ignore models/multimodal/generation/test_common.py - --ignore models/multimodal/processing - depends_on: block-multi-modal-models-extended-1 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 2 - depends_on: [] - key: block-multi-modal-models-extended-2 - - label: Multi-Modal Models (Extended) 2 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m "split(group=0) - and not core_model" - depends_on: block-multi-modal-models-extended-2 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Extended) 3 - depends_on: [] - key: block-multi-modal-models-extended-3 - - label: Multi-Modal Models (Extended) 3 - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s 
models/multimodal/generation/test_common.py -m "split(group=1) - and not core_model" - depends_on: block-multi-modal-models-extended-3 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Models (Standard) - depends_on: [] - key: block-multi-modal-models-standard - - label: Multi-Modal Models (Standard) - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E "torch" - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py - --ignore models/multimodal/processing - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py - -m core_model - depends_on: block-multi-modal-models-standard - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Processor - depends_on: [] - key: block-multi-modal-processor - - label: Multi-Modal Processor - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing/test_tensor_schema.py - depends_on: block-multi-modal-processor - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Multi-Modal Processor Test (CPU) - depends_on: [] - key: block-multi-modal-processor-test-cpu - - label: Multi-Modal Processor Test (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - depends_on: block-multi-modal-processor-test-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Plugins - steps: - - block: Run Plugin Tests (2 GPUs) - depends_on: [] - key: block-plugin-tests-2-gpus - - label: Plugin Tests (2 GPUs) - agents: - queue: gpu_4_queue - commands: - - 
(command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py - - pytest -v -s models/test_oot_registration.py - - pytest -v -s plugins/lora_resolvers - depends_on: block-plugin-tests-2-gpus - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: PyTorch - steps: - - block: Run PyTorch Compilation Unit Tests - depends_on: [] - key: block-pytorch-compilation-unit-tests - - label: PyTorch Compilation Unit Tests - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - find compile/ -maxdepth 1 -name "test_*.py" -exec pytest -s -v {} \\\\; - depends_on: block-pytorch-compilation-unit-tests - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run PyTorch Fullgraph - depends_on: [] - key: block-pytorch-fullgraph - - label: PyTorch Fullgraph - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s compile/fullgraph/test_full_graph.py -k "not test_fp8_kv_scale_compile" - - pytest -v -s compile/distributed/test_fusions_e2e.py -k "TRITON and not +quant_fp8 - and not Llama-4" - depends_on: block-pytorch-fullgraph - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run PyTorch Fullgraph Smoke Test - depends_on: [] - key: block-pytorch-fullgraph-smoke-test - - label: PyTorch Fullgraph Smoke Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - find compile/fullgraph/ -name "test_*.py" -not -name "test_full_graph.py" -exec - pytest -s -v {} \\\\; - depends_on: block-pytorch-fullgraph-smoke-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: 
public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Pytorch Nightly Dependency Override Check - depends_on: [] - key: block-pytorch-nightly-dependency-override-check - - label: Pytorch Nightly Dependency Override Check - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash standalone_tests/pytorch_nightly_dependency.sh - depends_on: block-pytorch-nightly-dependency-override-check - soft_fail: true - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Quantization - steps: - - block: Run Quantization - depends_on: [] - key: block-quantization - - label: Quantization - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - depends_on: block-quantization - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Quantized MoE Test (B200) - depends_on: [] - key: block-quantized-moe-test-b200 - - label: Quantized MoE Test (B200) - agents: - queue: B200 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/ - - pytest -s -v tests/quantization/test_blackwell_moe.py - depends_on: block-quantized-moe-test-b200 - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/benchmark-hf-cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /data/benchmark-hf-cache:/benchmark-hf-cache - - /data/benchmark-vllm-cache:/root/.cache/vllm -- group: Samplers - steps: - - block: Run Samplers Test - depends_on: [] - key: block-samplers-test - - label: Samplers Test - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - depends_on: block-samplers-test - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN 
- - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Tool use - steps: - - block: Run OpenAI-Compatible Tool Use - depends_on: [] - key: block-openai-compatible-tool-use - - label: OpenAI-Compatible Tool Use - agents: - queue: gpu_1_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "not cpu_test" tool_use - depends_on: block-openai-compatible-tool-use - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run OpenAI-Compatible Tool Use (CPU) - depends_on: [] - key: block-openai-compatible-tool-use-cpu - - label: OpenAI-Compatible Tool Use (CPU) - agents: - queue: cpu_queue_premerge_us_east_1 - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - pytest -v -s -m "cpu_test" tool_use - depends_on: block-openai-compatible-tool-use-cpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true -- group: Weight Loading - steps: - - block: Run Weight Loading Multiple GPU - depends_on: [] - key: block-weight-loading-multiple-gpu - - label: Weight Loading Multiple GPU - agents: - queue: gpu_4_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - depends_on: block-weight-loading-multiple-gpu - soft_fail: false - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - always-pull: true - propagate-environment: true - environment: - - VLLM_USAGE_SOURCE=ci-test - - NCCL_CUMEM_HOST_ENABLE=0 - - HF_HOME=/fsx/hf_cache - - HF_TOKEN - - CODECOV_TOKEN - volumes: - - /dev/shm:/dev/shm - - /fsx/hf_cache:/fsx/hf_cache - mount_buildkite_agent: true - - block: Run Weight Loading Multiple GPU - Large Models - depends_on: [] - key: block-weight-loading-multiple-gpu---large-models - - label: Weight Loading Multiple GPU - Large Models - agents: - queue: a100_queue - commands: - - (command nvidia-smi || true) - - export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 - - cd /vllm-workspace/tests - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - depends_on: block-weight-loading-multiple-gpu---large-models - soft_fail: false - plugins: - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123 - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: NCCL_CUMEM_HOST_ENABLE - value: '0' - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret 
- key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate

From cde1d84b5bc1aea310c8416cb6f38e140eb9215e Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Fri, 5 Dec 2025 02:30:15 -0800
Subject: [PATCH 21/24] build

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml                   |  2 +-
 .buildkite/image_build/image_build.sh       | 36 +++++++++++++++------
 .buildkite/image_build/image_build.yaml     | 16 +--------
 .buildkite/image_build/image_build_cu118.sh | 36 ---------------------
 4 files changed, 29 insertions(+), 61 deletions(-)
 delete mode 100755 .buildkite/image_build/image_build_cu118.sh

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index 5b00e1cab6c7..d85a8517e0c9 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,4 +1,4 @@
-name: ci
+name: vllm_ci
 job_dirs:
 - ".buildkite/test_areas"
 - ".buildkite/image_build"
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
index 87e35acd5e84..9a2384e524b6 100755
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,17 +1,28 @@
 #!/bin/bash
 set -e
 
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 "
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 "
   exit 1
 fi
 
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
 
 # authenticate with AWS ECR
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
 
 # skip build if image already exists
 if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
@@ -21,18 +32,25 @@ else
   exit 0
 fi
 
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
+fi
+
 # build
-docker build --file docker/Dockerfile \
+docker buildx build --file docker/Dockerfile \
   --build-arg max_jobs=16 \
   --build-arg buildkite_commit=$BUILDKITE_COMMIT \
   --build-arg USE_SCCACHE=1 \
   --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
   --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
   --target test \
   --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT
-docker tag $REGISTRY/$REPO:$BUILDKITE_COMMIT $REGISTRY/$REPO:latest
-docker push $REGISTRY/$REPO:latest
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
index af23621a598c..2632634922a5 100644
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -4,7 +4,7 @@ steps:
   key: image-build
   depends_on: []
   commands:
-  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
   env:
     DOCKER_BUILDKIT: "1"
   retry:
@@ -27,20 +27,6 @@ steps:
       limit: 2
     - exit_status: -10 # Agent was lost
       limit: 2
-
-- label: ":docker: Build CUDA 11.8 image"
-  key: image-build-cu118
-  optional: true
-  commands:
-  - .buildkite/image_build/image_build_cu118.sh $REGISTRY $REPO $BUILDKITE_COMMIT
-  env:
-    DOCKER_BUILDKIT: "1"
-  retry:
-    automatic:
-    - exit_status: -1 # Agent was lost
-      limit: 2
-    - exit_status: -10 # Agent was lost
-      limit: 2
 
 - label: ":docker: Build HPU image"
   soft_fail: true
diff --git a/.buildkite/image_build/image_build_cu118.sh b/.buildkite/image_build/image_build_cu118.sh
deleted file mode 100755
index 699cef2ad60f..000000000000
--- a/.buildkite/image_build/image_build_cu118.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ $# -lt 3 ]]; then
-  echo "Usage: $0 "
-  exit 1
-fi
-
-REGISTRY=$1
-REPO=$2
-BUILDKITE_COMMIT=$3
-
-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-
-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
-fi
-
-# build
-docker build \
-  --file docker/Dockerfile \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg CUDA_VERSION=11.8.0 \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118 \
-  --target test \
-  --progress plain .
-
-# push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cu118

From 89a0c2918a5e2d83dd3713989482e22d123449d0 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Fri, 5 Dec 2025 03:34:57 -0800
Subject: [PATCH 22/24] run all patterns

Signed-off-by: Kevin H. Luu
---
 .buildkite/ci_config.yaml | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
index d85a8517e0c9..199c33159fde 100644
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -3,9 +3,21 @@ job_dirs:
 - ".buildkite/test_areas"
 - ".buildkite/image_build"
 run_all_patterns:
-- ".*"
+- "docker/Dockerfile"
+- "CMakeLists.txt"
+- "requirements/common.txt"
+- "requirements/cuda.txt"
+- "requirements/build.txt"
+- "requirements/test.txt"
+- "setup.py"
+- "csrc/"
+- "cmake/"
 run_all_exclude_patterns:
-- ".*"
+- "docker/Dockerfile."
+- "csrc/cpu/"
+- "csrc/rocm/"
+- "cmake/hipify.py"
+- "cmake/cpu_extension.cmake"
 registries: public.ecr.aws/q9t5s3a7
 repositories:
   main: "vllm-ci-postmerge-repo"

From a303afd41869094deae2cc3c6a71eefc6130b403 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Mon, 8 Dec 2025 13:20:52 -0800
Subject: [PATCH 23/24] sync

Signed-off-by: Kevin H. Luu
---
 .buildkite/test_areas/e2e_integration.yaml | 18 +++++++++++++
 .buildkite/test_areas/misc.yaml            | 31 +++++++++++++++++-----
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 817b995574bc..3a33ee71e275 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -30,3 +30,21 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index e4182005bb45..3d1dbc98a1e7 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -55,23 +55,29 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  - python3 offline_inference/basic/chat.py # for basic
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  # for multi-modal models
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-  - python3 offline_inference/basic/classify.py
-  - python3 offline_inference/basic/embed.py
-  - python3 offline_inference/basic/score.py
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
   - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -144,3 +150,16 @@ steps:
   commands:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
   - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - vllm/model_executor/layers
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file

From d490e8ec245a388085719de518c64a4a3ef45c96 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Mon, 8 Dec 2025 13:38:45 -0800
Subject: [PATCH 24/24] remove buildkit env

Signed-off-by: Kevin H. Luu
---
 .buildkite/image_build/image_build.yaml    | 2 --
 .buildkite/test_areas/distributed.yaml     | 3 +--
 .buildkite/test_areas/e2e_integration.yaml | 9 +++++++++
 .buildkite/test_areas/misc.yaml            | 2 --
 .buildkite/test_areas/quantization.yaml    | 8 ++++++++
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
index 2632634922a5..d01c71dd9bec 100644
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -5,8 +5,6 @@ steps:
   depends_on: []
   commands:
   - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
-  env:
-    DOCKER_BUILDKIT: "1"
   retry:
     automatic:
     - exit_status: -1 # Agent was lost
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 57756aae4808..2cc90698d916 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -102,7 +102,6 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 
 - label: Distributed Tests (8 GPUs)(H100)
-  optional: true
   timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
@@ -144,7 +143,7 @@ steps:
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-  - VLLM_TEST_CLEAN_GPU_MEMORY=1pytest -v -s tests/distributed/test_sequence_parallel.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 3a33ee71e275..93d389815eda 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -20,6 +20,15 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
 - label: Prime-RL Integration (2 GPUs)
   timeout_in_minutes: 30
   optional: true
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index 3d1dbc98a1e7..072bccadb726 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -66,8 +66,6 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   # for multi-modal models
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
index 02a836b90bdf..6e89d6af3b8d 100644
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -36,3 +36,11 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
   - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+  - pytest -v -s models/quantization