
Commit 7c30d39

TroyGarden authored and meta-codesync[bot] committed

add base pipeline light (#3580)
Summary:
Pull Request resolved: #3580

# context

* create a lightweight base pipeline example
* modify the `GPUMemoryStats` to include `free` and `total` CUDA memory info.
* trace {F1983819268}
* snapshot {F1983819284}

| short name | GPU Runtime (P90) | CPU Runtime (P90) | GPU Peak Mem alloc (P90) | GPU Peak Mem reserved (P90) | GPU Mem used (P90) | Malloc retries (P50/P90/P100) | CPU Peak RSS (P90) |
|--|--|--|--|--|--|--|--|
| base_pipeline_light | 2749.21 ms | 1919.97 ms | 11.84 GB | 17.33 GB | 18.32 GB | 0.0 / 0.0 / 0.0 | 4.09 GB |
| sparse_data_dist_light | 1870.40 ms | 1809.62 ms | 12.47 GB | 18.24 GB | 19.24 GB | 0.0 / 0.0 / 0.0 | 4.11 GB |
| sparse_data_dist_base | 8377.98 ms | 8118.92 ms | 35.94 GB | 57.06 GB | 58.11 GB | 0.0 / 0.0 / 0.0 | 29.68 GB |

Reviewed By: spmex

Differential Revision: D87951519

fbshipit-source-id: b9ac12ef7c9f7ca82286652ae41efecd547b4095
1 parent 051ceae commit 7c30d39
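The new `free`/`total` fields come from `torch.cuda.mem_get_info`, which reports free and total device memory in bytes. A minimal sketch of that call (requires a CUDA device):

```python
# Minimal sketch: torch.cuda.mem_get_info returns (free, total) bytes
# for the given device, which the commit converts to megabytes.
import torch

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info(0)
    print(f"free={free // 1024 // 1024}mb, total={total // 1024 // 1024}mb")
```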

File tree

3 files changed: +124, -18 lines changed

torchrec/distributed/benchmark/base.py

Lines changed: 82 additions & 17 deletions
```diff
@@ -75,7 +75,7 @@
 ]
 
 EMBEDDING_DIM: int = 128
-MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT = 100_000
+MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT = 1_000_000
 
 
 class CompileMode(Enum):
```
```diff
@@ -89,22 +89,33 @@ class GPUMemoryStats:
     malloc_retries: int
     max_mem_allocated_mbs: int
     max_mem_reserved_mbs: int
+    free_mbs: int
+    total_mbs: int
 
     @classmethod
     def for_device(cls, rank: int) -> "GPUMemoryStats":
         stats = torch.cuda.memory_stats(rank)
         alloc_retries = stats.get("num_alloc_retries", 0)
         max_allocated = stats.get("allocated_bytes.all.peak", 0)
         max_reserved = stats.get("reserved_bytes.all.peak", 0)
+
+        free, total = torch.cuda.mem_get_info(rank)
         return cls(
             rank,
             alloc_retries,
             max_allocated // 1024 // 1024,
             max_reserved // 1024 // 1024,
+            free // 1024 // 1024,
+            total // 1024 // 1024,
         )
 
     def __str__(self) -> str:
-        return f"Rank {self.rank}: retries={self.malloc_retries}, allocated={self.max_mem_allocated_mbs:7}mb, reserved={self.max_mem_reserved_mbs:7}mb"
+        return (
+            f"GPUMemoryStats: Rank {self.rank}: retries={self.malloc_retries}, "
+            + f"allocated={self.max_mem_allocated_mbs:6}mb, reserved={self.max_mem_reserved_mbs:6}mb, "
+            + f"free={self.free_mbs:6}mb, total={self.total_mbs:6}mb, used={self.total_mbs - self.free_mbs:6}mb, "
+            + f"overhead={self.total_mbs - self.free_mbs - self.max_mem_reserved_mbs:6}mb"
+        )
 
 
 @dataclass
```
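A short usage sketch of the extended stats class, assuming `GPUMemoryStats` is importable from `torchrec.distributed.benchmark.base` as shown in this diff (requires a CUDA device):

```python
# Hedged usage sketch: snapshot and print GPU memory stats for rank 0.
import torch
from torchrec.distributed.benchmark.base import GPUMemoryStats

if torch.cuda.is_available():
    stats = GPUMemoryStats.for_device(rank=0)
    # With this change the summary line also reports free, total, used,
    # and overhead (used minus peak reserved) in megabytes.
    print(stats)
```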
```diff
@@ -136,21 +147,55 @@ class BenchmarkResult:
 
     def __str__(self) -> str:
         gpu_runtime = (
-            f"GPU Runtime (P90): {self.runtime_percentile(90, device='gpu'):.2f} ms"
+            "GPU Runtime (P90)",
+            f"{self.runtime_percentile(90, device='gpu'):.2f} ms",
         )
         cpu_runtime = (
-            f"CPU Runtime (P90): {self.runtime_percentile(90, device='cpu'):.2f} ms"
+            "CPU Runtime (P90)",
+            f"{self.runtime_percentile(90, device='cpu'):.2f} ms",
         )
-        cpu_mem = f"CPU Peak RSS (P90): {self.cpu_mem_percentile(90)/1000:.2f} GB"
+        cpu_mem = "CPU Peak RSS (P90)", f"{self.cpu_mem_percentile(90)/1000:.2f} GB"
 
-        if len(self.gpu_mem_stats) == 0:
-            return (
-                f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime} | {cpu_mem}"
+        short_name_length = 35
+
+        if len(self.gpu_mem_stats) > 0:
+            mem_used = (
+                "GPU Mem used (P90)",
+                f"{self.device_mem_used(90)/1000:.2f} GB",
+            )
+            mem_alloc = (
+                "GPU Peak Mem alloc (P90)",
+                f"{self.max_mem_alloc_percentile(90)/1000:.2f} GB",
             )
-        mem_alloc = f"GPU Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
-        mem_reserved = f"GPU Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
-        malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}"
-        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved} | {cpu_mem}"
+            mem_reserved = (
+                "GPU Peak Mem reserved (P90)",
+                f"{self.max_mem_reserved_percentile(90)/1000:.2f} GB",
+            )
+            malloc_retries = (
+                "Malloc retries (P50/P90/P100)",
+                f"{self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}",
+            )
+        else:
+            mem_used = mem_alloc = mem_reserved = malloc_retries = ("", "")
+        head = "|short name" + " " * (short_name_length - len("short name")) + "|"
+        split = "|--|"
+        content = f"|{self.short_name: <{35}}|"
+        for h, c in [
+            gpu_runtime,
+            cpu_runtime,
+            mem_alloc,
+            mem_reserved,
+            mem_used,
+            malloc_retries,
+            cpu_mem,
+        ]:
+            if len(h) == 0:
+                continue
+            length = max(len(h), len(c))
+            head += f"{h: <{length}}|"
+            split += "-" * 2 + "|"
+            content += f"{c: <{length}}|"
+        return head + "\n" + split + "\n" + content + "\n"
 
     def runtime_percentile(
         self,
```
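The rewritten `__str__` emits a small markdown table (header, separator, one content row) instead of a single pipe-separated line. A self-contained sketch of the same layout logic, with made-up metric values:

```python
# Self-contained sketch of the markdown-table layout in BenchmarkResult.__str__
# (metric values are made up; each column is padded to max(header, cell) width).
columns = [
    ("GPU Runtime (P90)", "2749.21 ms"),
    ("CPU Runtime (P90)", "1919.97 ms"),
    ("CPU Peak RSS (P90)", "4.09 GB"),
]
short_name = "base_pipeline_light"
head = "|short name" + " " * (35 - len("short name")) + "|"
split = "|--|"
content = f"|{short_name: <35}|"
for h, c in columns:
    length = max(len(h), len(c))
    head += f"{h: <{length}}|"
    split += "--|"
    content += f"{c: <{length}}|"
print(head, split, content, sep="\n")
```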
```diff
@@ -172,6 +217,13 @@ def runtime_percentile(
             interpolation=interpolation,
         )
 
+    def device_mem_used(
+        self, percentile: int = 50, interpolation: str = "nearest"
+    ) -> torch.Tensor:
+        return self._mem_percentile(
+            lambda m: m.total_mbs - m.free_mbs, percentile, interpolation
+        )
+
     def max_mem_alloc_percentile(
         self, percentile: int = 50, interpolation: str = "nearest"
     ) -> torch.Tensor:
```
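`device_mem_used` reduces per-rank used memory, `total_mbs - free_mbs`, to a percentile via the existing `_mem_percentile` helper. A self-contained sketch of that reduction with made-up per-rank numbers, assuming the helper is backed by `torch.quantile`:

```python
# Made-up per-rank numbers; mirrors device_mem_used's (total - free) percentile.
import torch

free_mbs = torch.tensor([62_000.0, 61_400.0])
total_mbs = torch.tensor([81_920.0, 81_920.0])
used_p90 = torch.quantile(total_mbs - free_mbs, 0.90, interpolation="nearest")
print(f"GPU Mem used (P90): {used_p90 / 1000:.2f} GB")
```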
```diff
@@ -310,7 +362,9 @@ def setUp() -> None:
         short_name=benchmark_res_per_rank[0].short_name,
         gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
         cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
-        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        gpu_mem_stats=[
+            GPUMemoryStats(rank, 0, 0, 0, 0, 0) for rank in range(world_size)
+        ],
         cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
         rank=0,
     )
```
```diff
@@ -688,10 +742,15 @@ def _reset_memory_stats() -> None:
         cpu_times_active_ns.append(cpu_end_active_ns - cpu_start_active_ns)
 
     # Convert to milliseconds and drop the first iteration
+
     cpu_elapsed_time = (
         torch.tensor([t / 1e6 for t in cpu_times_active_ns[1:]], dtype=torch.float)
-        if num_benchmarks >= 2
-        else torch.zeros(1, dtype=torch.float)
+        if num_benchmarks >= 5  # count from 2nd iteration to remove outliers
+        else (
+            torch.tensor([t / 1e6 for t in cpu_times_active_ns], dtype=torch.float)
+            if num_benchmarks > 0
+            else torch.zeros(1, dtype=torch.float)
+        )
     )
 
     # Make sure all kernels are finished before reading timers / stats
```
```diff
@@ -705,8 +764,14 @@ def _reset_memory_stats() -> None:
             torch.tensor(
                 [s.elapsed_time(e) for s, e in zip(start_events[1:], end_events[1:])]
             )
-            if num_benchmarks >= 2
-            else torch.zeros(1, dtype=torch.float)
+            if num_benchmarks >= 5  # count from 2nd iteration to remove outliers
+            else (
+                torch.tensor(
+                    [s.elapsed_time(e) for s, e in zip(start_events, end_events)]
+                )
+                if num_benchmarks > 0
+                else torch.zeros(1, dtype=torch.float)
+            )
         )
     else:
         # For CPU-only benchmarks we fall back to wall-clock timing via ``timeit``.
```
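Both timing paths now drop the first measurement only when at least five benchmark iterations exist, so short runs keep all samples. A self-contained sketch of the CUDA-event pattern used here, with a stand-in matmul workload (requires a GPU):

```python
# CUDA-event timing sketch: record start/end events per iteration, sync once,
# then drop the first (warmup) measurement when enough iterations exist.
import torch

num_benchmarks = 5
start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
x = torch.randn(2048, 2048, device="cuda")
for s, e in zip(start_events, end_events):
    s.record()
    y = x @ x  # stand-in workload
    e.record()
torch.cuda.synchronize()  # all kernels must finish before reading timers
gpu_elapsed_time = (
    torch.tensor([s.elapsed_time(e) for s, e in zip(start_events[1:], end_events[1:])])
    if num_benchmarks >= 5  # count from 2nd iteration to remove outliers
    else torch.tensor([s.elapsed_time(e) for s, e in zip(start_events, end_events)])
)
```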

torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -247,7 +247,9 @@ def run_pipeline(
         short_name=benchmark_res_per_rank[0].short_name,
         gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
         cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
-        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        gpu_mem_stats=[
+            GPUMemoryStats(rank, 0, 0, 0, 0, 0) for rank in range(world_size)
+        ],
         cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
         rank=0,
     )
```
Lines changed: 39 additions & 0 deletions (new file)

```yaml
# this is a very basic sparse data dist config
# runs on 2 ranks, showing traces with reasonable workloads
RunOptions:
  world_size: 2
  num_batches: 10
  num_benchmarks: 1
  num_profiles: 1
  sharding_type: table_wise
  profile_dir: "."
  name: "base_pipeline_light"
  # export_stacks: True # enable this to export stack traces
  loglevel: "info"
PipelineConfig:
  pipeline: "base"
ModelInputConfig:
  feature_pooling_avg: 10
EmbeddingTablesConfig:
  num_unweighted_features: 20
  num_weighted_features: 20
  embedding_feature_dim: 256
  additional_tables:
    - - name: FP16_table
        embedding_dim: 512
        num_embeddings: 100_000
        feature_names: ["additional_0_0"]
        data_type: FP16
      - name: large_table
        embedding_dim: 2048
        num_embeddings: 1_000_000
        feature_names: ["additional_0_1"]
    - []
    - - name: skipped_table
        embedding_dim: 128
        num_embeddings: 100_000
        feature_names: ["additional_2_1"]
PlannerConfig:
  additional_constraints:
    large_table:
      sharding_types: [column_wise]
```
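Saved to a YAML file, this config can drive the benchmark runner. A hypothetical launcher sketch; the `--yaml_config` flag and the file name `base_pipeline_light.yml` are assumptions, so check the script's actual CLI:

```python
# Hypothetical invocation sketch; flag name and file name are assumptions.
import subprocess

subprocess.run(
    [
        "python",
        "-m",
        "torchrec.distributed.benchmark.benchmark_train_pipeline",
        "--yaml_config=base_pipeline_light.yml",
    ],
    check=True,
)
```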
