
Commit bc33944

Merge remote-tracking branch 'happyamazonian/main' into restart
2 parents ce6bada + 6af70e1 commit bc33944

File tree

1,213 files changed: +61,417 / −26,874 lines


.buildkite/generate_index.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -8,3 +8,4 @@ tasks:
     value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
+rtol: 0.05
```
Lines changed: 1 addition & 0 deletions (new file)

```diff
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
```
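
The `rtol: 0.05` line added above gives this config its own accuracy tolerance; the harness change in `test_lm_eval_correctness.py` below reads it with `eval_config.get("rtol", DEFAULT_RTOL)`. A minimal Python sketch of that lookup and check, with illustrative ground-truth/measured values rather than real CI numbers:

```python
import numpy as np

DEFAULT_RTOL = 0.08  # harness-wide fallback used when a config omits "rtol"

# Illustrative values, not taken from a real CI run.
eval_config = {"rtol": 0.05}
ground_truth = 0.80
measured_value = 0.78

# The per-config tolerance wins over the default.
rtol = eval_config.get("rtol", DEFAULT_RTOL)
ok = np.isclose(ground_truth, measured_value, rtol=rtol)
print(f"rtol={rtol} -> within tolerance: {ok}")
```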

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 55 additions & 20 deletions
```diff
@@ -9,11 +9,40 @@
 --tp-size=1
 """
 
+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml
 
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    if not new_env:
+        # Fast path: nothing to do
+        yield
+        return
+
+    old_values = {}
+    new_keys = []
+
+    try:
+        for key, value in new_env.items():
+            if key in os.environ:
+                old_values[key] = os.environ[key]
+            else:
+                new_keys.append(key)
+            os.environ[key] = str(value)
+        yield
+    finally:
+        # Restore / clean up
+        for key, value in old_values.items():
+            os.environ[key] = value
+        for key in new_keys:
+            os.environ.pop(key, None)
 
 
 def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
-    results = lm_eval.simple_evaluate(
-        model=backend,
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm, or explicitly set
-        apply_chat_template=eval_config.get(
-            "apply_chat_template", backend == "vllm-vlm"
-        ),
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-        gen_kwargs=eval_config.get("gen_kwargs"),
-        batch_size=batch_size,
-    )
+
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
+        results = lm_eval.simple_evaluate(
+            model=backend,
+            model_args=model_args,
+            tasks=[task["name"] for task in eval_config["tasks"]],
+            num_fewshot=eval_config["num_fewshot"],
+            limit=eval_config["limit"],
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            # text models. however, this is regressing measured strict-match for
+            # existing text models in CI, so only apply it for mm, or explicitly set
+            apply_chat_template=eval_config.get(
+                "apply_chat_template", backend == "vllm-vlm"
+            ),
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+            gen_kwargs=eval_config.get("gen_kwargs"),
+            batch_size=batch_size,
+        )
     return results
 
 
@@ -57,15 +89,18 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 
     assert success
```
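
With this change, an eval config can also carry an `env_vars` mapping; `scoped_env_vars` applies those variables only for the duration of the `lm_eval.simple_evaluate` call and then restores the previous environment. Below is a condensed, standalone sketch of the same set-then-restore pattern (the `VLLM_EXAMPLE_FLAG` name is made up for illustration, and the helper here is a shortened equivalent of the one added above, not the exact code):

```python
import os
from contextlib import contextmanager


@contextmanager
def scoped_env_vars(new_env):
    # Same idea as the helper above: apply new_env, then restore on exit.
    if not new_env:
        yield
        return
    old = {k: os.environ.get(k) for k in new_env}
    try:
        os.environ.update({k: str(v) for k, v in new_env.items()})
        yield
    finally:
        for k, v in old.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v


# "VLLM_EXAMPLE_FLAG" is a made-up variable name, purely for illustration.
print("before:", os.environ.get("VLLM_EXAMPLE_FLAG"))
with scoped_env_vars({"VLLM_EXAMPLE_FLAG": "1"}):
    print("inside:", os.environ.get("VLLM_EXAMPLE_FLAG"))
print("after:", os.environ.get("VLLM_EXAMPLE_FLAG"))
```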

.buildkite/performance-benchmarks/README.md

Lines changed: 59 additions & 0 deletions
````diff
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchmarks
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with the key `defaults`. Parameters defined in this field are applied globally to all serving tests and can be overridden by individual test cases. Here is an example:
+
+<details>
+<summary>An example of a default parameters field</summary>
+
+```json
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "server_environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+        },
+        "server_parameters": {
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "block_size": 128,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "num_prompts": 200,
+            "ignore-eos": ""
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_llama3B_tp2_random_128_128",
+            "server_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct",
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct"
+            }
+        },
+        {
+            "test_name": "serving_qwen3_tp4_random_128_128",
+            "server_parameters": {
+                "model": "Qwen/Qwen3-14B",
+                "tensor_parallel_size": 4
+            },
+            "client_parameters": {
+                "model": "Qwen/Qwen3-14B"
+            }
+        }
+    ]
+}
+```
+
+</details>
+
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
````
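
The intended precedence is that per-test keys win over `defaults`, and the parameter objects (`server_environment_variables`, `server_parameters`, `client_parameters`) are merged key-by-key rather than replaced wholesale; the jq filter in `run-performance-benchmarks.sh` below implements this. A rough Python sketch of that precedence, using values trimmed from the example above:

```python
# Rough Python model of the defaults merge performed by the jq filter
# in run-performance-benchmarks.sh; values trimmed from the README example.
defaults = {
    "qps_list": ["inf"],
    "server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"},
    "client_parameters": {"backend": "vllm", "num_prompts": 200},
}
test = {
    "test_name": "serving_llama3B_tp2_random_128_128",
    "server_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct",
                          "tensor_parallel_size": 2},
    "client_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct"},
}

merged = dict(test)
# Scalar/list fields fall back to defaults only when missing from the test.
merged.setdefault("qps_list", defaults.get("qps_list"))
# Parameter objects: start from defaults, let the test override per key.
for key in ("server_environment_variables", "server_parameters", "client_parameters"):
    merged[key] = {**defaults.get(key, {}), **test.get(key, {})}

print(merged["server_parameters"])
# {'tensor_parallel_size': 2, 'dtype': 'bfloat16', 'model': 'meta-llama/Llama-3.2-3B-Instruct'}
```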

.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 43 additions & 5 deletions
```diff
@@ -110,7 +110,8 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test case
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments
+    # get client and server arguments (after merging the default parameters)
     server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
+
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
+
+    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
+
+    # max_concurrency_list (fallback to num_prompts if missing)
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"
```
