Commit 1ab8fc8

Make PyTorch profiler gzip and CUDA time dump configurable (#29568)
Signed-off-by: Yifei Zhang <[email protected]>
1 parent f72a817 commit 1ab8fc8

5 files changed (+35 -13 lines changed)

docs/contributing/profiling.md

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
 - `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
 - `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
 - `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
+- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
+- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default

 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
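Both new variables default to on, so a launcher only needs to set them when turning a feature off. A minimal sketch of preparing the environment from Python before starting the server (the trace directory and the choice to disable both features are purely illustrative):

```python
import os

# Illustrative values: VLLM_TORCH_PROFILER_DIR enables trace collection,
# and the two new flags are set to "0" here only to show how to opt out.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_traces"
os.environ["VLLM_TORCH_PROFILER_USE_GZIP"] = "0"              # write uncompressed trace files
os.environ["VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL"] = "0"  # skip the CUDA self-time table
# ...then start the OpenAI-compatible server in this environment.
```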

vllm/envs.py

Lines changed: 13 additions & 0 deletions
@@ -100,6 +100,8 @@
     VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
     VLLM_PROFILER_DELAY_ITERS: int = 0
     VLLM_PROFILER_MAX_ITERS: int = 0
+    VLLM_TORCH_PROFILER_USE_GZIP: bool = True
+    VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False

@@ -890,6 +892,17 @@ def get_vllm_port() -> int | None:
     # Maximum number of iterations to profile when using the torch/torch CUDA profiler.
     # If set to 0, will not limit the number of iterations.
     "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
+    # Control whether torch profiler gzip-compresses profiling files.
+    # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default).
+    "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
+    ),
+    # Control whether torch profiler dumps the self_cuda_time_total table.
+    # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping
+    # (enabled by default).
+    "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0"
+    ),
     # If set, vLLM will use Triton implementations of AWQ.
     "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
     # If set, allow loading or unloading lora adapters in runtime,
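The new entries use the same `!= "0"` convention as the surrounding boolean knobs: only the literal string "0" disables a flag, and an unset variable falls back to the default "1". A standalone sketch of that parsing, using a hypothetical `_flag_enabled` helper rather than vLLM code:

```python
import os

def _flag_enabled(name: str, default: str = "1") -> bool:
    # Hypothetical helper mirroring the lambdas above: only the literal
    # string "0" disables the flag; unset or any other value enables it.
    return os.getenv(name, default) != "0"

os.environ["VLLM_TORCH_PROFILER_USE_GZIP"] = "0"
assert _flag_enabled("VLLM_TORCH_PROFILER_USE_GZIP") is False

# An unset variable falls back to the default "1", i.e. enabled.
assert _flag_enabled("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") is True

# Values such as "false" do NOT disable the flag; only "0" does.
os.environ["VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL"] = "false"
assert _flag_enabled("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") is True
```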

vllm/profiler/gpu_profiler.py

Lines changed: 14 additions & 11 deletions
@@ -162,7 +162,9 @@ def __init__(self, worker_name: str, local_rank: int) -> None:
             with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
             with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                torch_profiler_trace_dir,
+                worker_name=worker_name,
+                use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
             ),
         )

@@ -174,18 +176,19 @@ def _start(self) -> None:
     def _stop(self) -> None:
         self.profiler.stop()

-        rank = self.local_rank
-        profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
-        profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
-        sort_key = "self_cuda_time_total"
-        table = self.profiler.key_averages().table(sort_by=sort_key)
+        if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL:
+            rank = self.local_rank
+            profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
+            profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
+            sort_key = "self_cuda_time_total"
+            table = self.profiler.key_averages().table(sort_by=sort_key)

-        with open(profiler_out_file, "w") as f:
-            print(table, file=f)
+            with open(profiler_out_file, "w") as f:
+                print(table, file=f)

-        # only print profiler results on rank 0
-        if rank == 0:
-            print(table)
+            # only print profiler results on rank 0
+            if rank == 0:
+                print(table)

     @override
     def annotate_context_manager(self, name: str):
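The block that `_stop()` now guards is plain `torch.profiler` usage: aggregate recorded events with `key_averages()` and render a table sorted by self CUDA time. A self-contained sketch of that output path (the filename mirrors `profiler_out_{rank}.txt` with rank 0 assumed; the profiled workload is illustrative):

```python
import torch
from torch.profiler import ProfilerActivity, profile

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities) as prof:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    x = torch.randn(512, 512, device=device)
    (x @ x).sum().item()

# Sort by self CUDA time when CUDA events were recorded, else fall back to CPU time.
sort_key = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
table = prof.key_averages().table(sort_by=sort_key)

with open("profiler_out_0.txt", "w") as f:  # mirrors profiler_out_{rank}.txt for rank 0
    print(table, file=f)
print(table)  # the diff prints this only on rank 0, and only when the flag is on
```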

vllm/v1/engine/async_llm.py

Lines changed: 3 additions & 1 deletion
@@ -190,7 +190,9 @@ def __init__(
                 ],
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    envs.VLLM_TORCH_PROFILER_DIR, worker_name=worker_name, use_gzip=True
+                    envs.VLLM_TORCH_PROFILER_DIR,
+                    worker_name=worker_name,
+                    use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
                 ),
             )
         else:
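The same substitution is applied here: the previously hard-coded `use_gzip=True` now follows the env-derived flag. A minimal sketch of `tensorboard_trace_handler` driven by such a flag (the directory, worker name, and workload are illustrative; the local flag stands in for `envs.VLLM_TORCH_PROFILER_USE_GZIP`):

```python
import torch
from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler

use_gzip = True  # stand-in for envs.VLLM_TORCH_PROFILER_USE_GZIP

prof = profile(
    activities=[ProfilerActivity.CPU],
    with_stack=True,
    on_trace_ready=tensorboard_trace_handler(
        "/tmp/vllm_traces",         # stand-in for envs.VLLM_TORCH_PROFILER_DIR
        worker_name="async_llm_0",  # illustrative worker name
        use_gzip=use_gzip,          # gzip-compress the emitted trace when True
    ),
)
with prof:
    (torch.randn(256, 256) @ torch.randn(256, 256)).sum()
```

When gzip is enabled, the handler typically appends a `.gz` suffix to the trace file it writes into the trace directory, which is exactly the behaviour the new flag toggles.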

vllm/v1/worker/xpu_worker.py

Lines changed: 3 additions & 1 deletion
@@ -64,7 +64,9 @@ def __init__(
             with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
             with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
+                torch_profiler_trace_dir,
+                worker_name=worker_name,
+                use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
             ),
         )
     else:
