Commit 8bbcf8b

Authored by louie-tsai, with gemini-code-assist[bot] and bigPYJ1151

[vLLM Benchmark Suite] Add default parameters section and update CPU benchmark cases (#29381)

Signed-off-by: Tsai, Louie <[email protected]>
Signed-off-by: Louie Tsai <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Li, Jiang <[email protected]>
1 parent: 70fb77b

File tree: 6 files changed, +361 −1898 lines


.buildkite/performance-benchmarks/README.md

Lines changed: 59 additions & 0 deletions
```diff
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with the key `defaults`. Parameters defined in this field are applied globally to all serving tests and can be overridden in individual test cases. Here is an example:
+
+<details>
+<summary> An example of the default parameters field </summary>
+
+```json
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "server_environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+        },
+        "server_parameters": {
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "block_size": 128,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "num_prompts": 200,
+            "ignore-eos": ""
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_llama3B_tp2_random_128_128",
+            "server_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct",
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct"
+            }
+        },
+        {
+            "test_name": "serving_qwen3_tp4_random_128_128",
+            "server_parameters": {
+                "model": "Qwen/Qwen3-14B",
+                "tensor_parallel_size": 4
+            },
+            "client_parameters": {
+                "model": "Qwen/Qwen3-14B"
+            }
+        }
+    ]
+}
+```
+
+</details>
+
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
```
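Since the merge is performed by a jq filter in `run-performance-benchmarks.sh` (see the script diff below), you can preview the fully resolved test cases before launching a run. A minimal sketch, assuming the example above is saved as `serving-tests.json` and reusing the same filter logic:

```bash
# Preview the merged test cases without running any benchmarks.
# Uses the same defaults-merging logic as run-performance-benchmarks.sh.
jq '
  . as $root
  | ($root.defaults // {}) as $d
  | ($root.tests // [])[]
  | .qps_list = (.qps_list // $d.qps_list)
  | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
  | .server_environment_variables = (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
  | .server_parameters = (($d.server_parameters // {}) + (.server_parameters // {}))
  | .client_parameters = (($d.client_parameters // {}) + (.client_parameters // {}))
' serving-tests.json
```

For `serving_llama3B_tp2_random_128_128`, `server_parameters` resolves to the defaults with `tensor_parallel_size` overridden to 2.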

.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 43 additions & 5 deletions
```diff
@@ -110,7 +110,8 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
```
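The function acts as a guard between launching the server and starting the client. A minimal, hypothetical sketch of the call pattern (the actual launch command and bookkeeping in the script may differ):

```bash
# Hypothetical call site for wait_for_server; the launch line below is
# for illustration only, not the script's actual invocation.
vllm serve "$model" &
server_pid=$!

if wait_for_server; then
  echo "vLLM server is up; starting the benchmark client."
else
  echo "vLLM server did not come up within the timeout." >&2
  kill "$server_pid" 2>/dev/null
  exit 1
fi
```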
```diff
@@ -316,12 +317,44 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: an array of test cases
+      .[]
+    elif (type == "object" and has("tests")) then
+      # Merge the default parameters into each test case
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
```
```diff
@@ -335,20 +368,25 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments
+    # get client and server arguments (after merging the default parameters)
     server_params=$(echo "$params" | jq -r '.server_parameters')
     server_envs=$(echo "$params" | jq -r '.server_environment_variables')
     client_params=$(echo "$params" | jq -r '.client_parameters')
+
     server_args=$(json2args "$server_params")
     server_envs=$(json2envs "$server_envs")
     client_args=$(json2args "$client_params")
+
+    # qps_list
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
+
+    # max_concurrency_list (fallback to num_prompts if missing)
     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-    num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-    max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
     fi
     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
     echo "Running over max concurrency list $max_concurrency_list"
```
