@@ -106,6 +106,14 @@ def set_determinism(
     # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
     os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
+    # Ensure flex_attention is compiled without max-autotune. This is needed to ensure
+    # reproducibility, since the autotune results may not be deterministic.
+    from torch.nn.attention.flex_attention import flex_attention
+
+    from torchtitan.models.attention import FlexAttentionWrapper
+
+    FlexAttentionWrapper._compiled_flex_attn = torch.compile(flex_attention)
+
     if not world_mesh:
         if seed is not None:
             torch.manual_seed(seed)
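The added block above compiles flex_attention with the default torch.compile mode because, as the new comment notes, autotune results may not be deterministic. The sketch below only contrasts the two compile modes; the names `deterministic_flex` and `autotuned_flex` are illustrative and not part of torchtitan.

import torch
from torch.nn.attention.flex_attention import flex_attention

# Default mode: no autotuning sweep, so the chosen kernel (and its numerics) stays the same across runs.
deterministic_flex = torch.compile(flex_attention)

# "max-autotune" benchmarks several candidate kernels and keeps the fastest; the winner can vary
# between runs, which is why the patch avoids it when determinism is requested.
autotuned_flex = torch.compile(flex_attention, mode="max-autotune")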
@@ -185,28 +193,14 @@ def create_context_parallel_ctx(
     )
 
 
-def get_train_context(
-    enable_loss_parallel: bool, enable_compiled_autograd: bool
-) -> Generator[None, None, None]:
+def get_train_context(enable_loss_parallel: bool) -> Generator[None, None, None]:
     @contextlib.contextmanager
     def context(cp_context: Generator[None, None, None] | None = None):
         with contextlib.ExitStack() as stack:
             if enable_loss_parallel:
                 stack.enter_context(torch.distributed.tensor.parallel.loss_parallel())
 
-            if enable_compiled_autograd:
-                stack.enter_context(
-                    torch._dynamo.utils.maybe_enable_compiled_autograd(True)
-                )
-
-            if cp_context is not None:
-                from torch.nn.attention import SDPBackend
-
-                from torchtitan.models.attention import ScaledDotProductAttention
-
-                if SDPBackend.MATH in ScaledDotProductAttention.backends:
-                    ScaledDotProductAttention.backends.remove(SDPBackend.MATH)
-
+            if cp_context:
                 stack.enter_context(cp_context)
 
             yield
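After this simplification, callers only pass the loss-parallel flag and optionally hand a context-parallel context to the returned manager. A minimal usage sketch, assuming a trainer loop outside this diff (the variable names are illustrative, not the exact torchtitan call site):

train_context = get_train_context(enable_loss_parallel=True)

maybe_cp_context = None  # e.g. produced by create_context_parallel_ctx(...) when CP is enabled
with train_context(maybe_cp_context):
    # forward, loss, and backward run under loss_parallel() and, if provided, the CP context;
    # the enable_compiled_autograd toggle no longer exists on this path.
    ...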
@@ -274,13 +268,7 @@ def _get_distributed_backend(enable_cpu_backend):
     if comm_config.trace_buf_size > 0:
         # dump on timeout by default if trace buffer is enabled
         _warn_overwrite_env(DUMP_ON_TIMEOUT, "1")
-        # ROCm runner doesn't have write permissions for current working directory.
-        # Hence, using HOME directory to save results.
-        if base_folder and os.access(base_folder, os.W_OK):
-            dump_base = base_folder
-        else:
-            dump_base = os.path.expanduser("~")
-        dump_dir = os.path.join(dump_base, comm_config.save_traces_folder)
+        dump_dir = os.path.join(base_folder, comm_config.save_traces_folder)
         prefix = comm_config.save_traces_file_prefix
         os.makedirs(dump_dir, exist_ok=True)
         _warn_overwrite_env(TRACE_FILE, f"{dump_dir}/{prefix}")
@@ -455,9 +443,3 @@ def _clip_grad_norm_with_ep(
     torch.nn.utils.clip_grads_with_norm_(non_ep_params, max_norm, total_norm, foreach)
 
     return total_norm
-
-
-def _round_up(x: int, y: int) -> int:
-    """Round up x to the nearest multiple of y."""
-    x_ceil_div_y = (x + y - 1) // y
-    return x_ceil_div_y * y