7575]
7676
# Width of each embedding vector used by the benchmark models.
EMBEDDING_DIM: int = 128
# Cap on memory-profiler events recorded per snapshot; raised from 100_000 so
# longer benchmark runs don't truncate the memory trace (see commit diff).
MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT = 1_000_000
7979
8080
8181class CompileMode (Enum ):
@@ -89,22 +89,33 @@ class GPUMemoryStats:
8989 malloc_retries : int
9090 max_mem_allocated_mbs : int
9191 max_mem_reserved_mbs : int
92+ free_mbs : int
93+ total_mbs : int
9294
    @classmethod
    def for_device(cls, rank: int) -> "GPUMemoryStats":
        """Snapshot current CUDA allocator statistics for ``rank``'s device.

        Peak allocated/reserved come from the caching allocator
        (``torch.cuda.memory_stats``); free/total come from the driver
        (``torch.cuda.mem_get_info``). All byte counts are converted to MB.
        """
        stats = torch.cuda.memory_stats(rank)
        # Missing keys default to 0 (e.g. before any allocation happened).
        alloc_retries = stats.get("num_alloc_retries", 0)
        max_allocated = stats.get("allocated_bytes.all.peak", 0)
        max_reserved = stats.get("reserved_bytes.all.peak", 0)

        # Driver-level view of the whole device, not just this process.
        free, total = torch.cuda.mem_get_info(rank)
        return cls(
            rank,
            alloc_retries,
            max_allocated // 1024 // 1024,
            max_reserved // 1024 // 1024,
            free // 1024 // 1024,
            total // 1024 // 1024,
        )
105111
106112 def __str__ (self ) -> str :
107- return f"Rank { self .rank } : retries={ self .malloc_retries } , allocated={ self .max_mem_allocated_mbs :7} mb, reserved={ self .max_mem_reserved_mbs :7} mb"
113+ return (
114+ f"GPUMemoryStats: Rank { self .rank } : retries={ self .malloc_retries } , "
115+ + f"allocated={ self .max_mem_allocated_mbs :6} mb, reserved={ self .max_mem_reserved_mbs :6} mb, "
116+ + f"free={ self .free_mbs :6} mb, total={ self .total_mbs :6} mb, used={ self .total_mbs - self .free_mbs :6} mb"
117+ + f"overhead={ self .total_mbs - self .free_mbs - self .max_mem_reserved_mbs :6} mb"
118+ )
108119
109120
110121@dataclass
@@ -136,21 +147,55 @@ class BenchmarkResult:
136147
    def __str__(self) -> str:
        """Render this result as a three-line markdown-style table
        (header row, separator row, content row)."""
        # Each metric is a (column-header, formatted-value) pair; pairs with
        # an empty header are skipped when the table is rendered, which makes
        # the GPU columns disappear for CPU-only runs.
        gpu_runtime = (
            "GPU Runtime (P90)",
            f"{self.runtime_percentile(90, device='gpu'):.2f} ms",
        )
        cpu_runtime = (
            "CPU Runtime (P90)",
            f"{self.runtime_percentile(90, device='cpu'):.2f} ms",
        )
        # NOTE(review): the value f-string appears to start with a literal
        # space in the original — preserved here; confirm it is intentional.
        cpu_mem = "CPU Peak RSS (P90)", f" {self.cpu_mem_percentile(90)/1000:.2f} GB"

        # Fixed width of the leading "short name" column.
        short_name_length = 35

        if len(self.gpu_mem_stats) > 0:
            mem_used = (
                "GPU Mem used (P90)",
                f"{self.device_mem_used(90)/1000:.2f} GB",
            )
            mem_alloc = (
                "GPU Peak Mem alloc (P90)",
                f"{self.max_mem_alloc_percentile(90)/1000:.2f} GB",
            )
            mem_reserved = (
                "GPU Peak Mem reserved (P90)",
                f"{self.max_mem_reserved_percentile(90)/1000:.2f} GB",
            )
            malloc_retries = (
                "Malloc retries (P50/P90/P100)",
                f"{self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)} ",
            )
        else:
            # No GPU stats collected: blank pairs so the columns are skipped.
            mem_used = mem_alloc = mem_reserved = malloc_retries = ("", "")
        head = "|short name" + " " * (short_name_length - len("short name")) + "|"
        split = "|--|"
        content = f"|{self.short_name: <{35}} |"
        for h, c in [
            gpu_runtime,
            cpu_runtime,
            mem_alloc,
            mem_reserved,
            mem_used,
            malloc_retries,
            cpu_mem,
        ]:
            if len(h) == 0:
                continue
            # Pad header and value to a common width so the columns line up.
            length = max(len(h), len(c))
            head += f"{h: <{length}} |"
            split += "-" * 2 + "|"
            content += f"{c: <{length}} |"
        return head + "\n" + split + "\n" + content + "\n"
154199
155200 def runtime_percentile (
156201 self ,
@@ -172,6 +217,13 @@ def runtime_percentile(
172217 interpolation = interpolation ,
173218 )
174219
220+ def device_mem_used (
221+ self , percentile : int = 50 , interpolation : str = "nearest"
222+ ) -> torch .Tensor :
223+ return self ._mem_percentile (
224+ lambda m : m .total_mbs - m .free_mbs , percentile , interpolation
225+ )
226+
175227 def max_mem_alloc_percentile (
176228 self , percentile : int = 50 , interpolation : str = "nearest"
177229 ) -> torch .Tensor :
@@ -310,7 +362,9 @@ def setUp() -> None:
310362 short_name = benchmark_res_per_rank [0 ].short_name ,
311363 gpu_elapsed_time = benchmark_res_per_rank [0 ].gpu_elapsed_time ,
312364 cpu_elapsed_time = benchmark_res_per_rank [0 ].cpu_elapsed_time ,
313- gpu_mem_stats = [GPUMemoryStats (rank , 0 , 0 , 0 ) for rank in range (world_size )],
365+ gpu_mem_stats = [
366+ GPUMemoryStats (rank , 0 , 0 , 0 , 0 , 0 ) for rank in range (world_size )
367+ ],
314368 cpu_mem_stats = [CPUMemoryStats (rank , 0 ) for rank in range (world_size )],
315369 rank = 0 ,
316370 )
@@ -688,10 +742,15 @@ def _reset_memory_stats() -> None:
688742 cpu_times_active_ns .append (cpu_end_active_ns - cpu_start_active_ns )
689743
690744 # Convert to milliseconds and drop the first iteration
745+
691746 cpu_elapsed_time = (
692747 torch .tensor ([t / 1e6 for t in cpu_times_active_ns [1 :]], dtype = torch .float )
693- if num_benchmarks >= 2
694- else torch .zeros (1 , dtype = torch .float )
748+ if num_benchmarks >= 5 # count from 2nd iteration to remove outliers
749+ else (
750+ torch .tensor ([t / 1e6 for t in cpu_times_active_ns ], dtype = torch .float )
751+ if num_benchmarks > 0
752+ else torch .zeros (1 , dtype = torch .float )
753+ )
695754 )
696755
697756 # Make sure all kernels are finished before reading timers / stats
@@ -705,8 +764,14 @@ def _reset_memory_stats() -> None:
705764 torch .tensor (
706765 [s .elapsed_time (e ) for s , e in zip (start_events [1 :], end_events [1 :])]
707766 )
708- if num_benchmarks >= 2
709- else torch .zeros (1 , dtype = torch .float )
767+ if num_benchmarks >= 5 # count from 2nd iteration to remove outliers
768+ else (
769+ torch .tensor (
770+ [s .elapsed_time (e ) for s , e in zip (start_events , end_events )]
771+ )
772+ if num_benchmarks > 0
773+ else torch .zeros (1 , dtype = torch .float )
774+ )
710775 )
711776 else :
712777 # For CPU-only benchmarks we fall back to wall-clock timing via ``timeit``.
0 commit comments