Fix debug dir bugs and minifier output directories (#87682)
Fixes pytorch/torchdynamo#1758, pytorch/torchdynamo#1752

- minifier_launcher.py now dumps checkpoints to <cwd>/checkpoints when run (see the path sketch after this list)
- a single debug directory is created per script invocation, so assertion failures caused by a missing directory no longer occur
- torchinductor debug tracing now dumps to the debug directory correctly, since no prior setup is needed (previously the directory was only initialized during dynamo tracing)
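
For orientation, here is a minimal sketch of the output locations implied by the diff below. It is illustrative only: `num_nodes` stands in for `len(gm.graph.nodes)` at a given minification step, and the placeholder strings stand in for `config.debug_dir_root` and the per-run timestamp.

import os

cwd = os.getcwd()
num_nodes = 8  # placeholder for len(gm.graph.nodes) at a given minification step

# Minifier checkpoints land under <cwd>/checkpoints, one file per step
checkpoint = os.path.join(cwd, "checkpoints", f"minified_{num_nodes}_nodes.py")

# The latest checkpoint is also copied to <cwd>/repro.py for convenience
latest_repro = os.path.join(cwd, "repro.py")

# One debug directory per script invocation, shared by dynamo and inductor tracing
debug_root = "<config.debug_dir_root>"  # placeholder for the configured root
debug_dir = os.path.join(debug_root, "run_<timestamp>")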

cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: #87682
Approved by: https://github.com/ezyang
mlazos authored and pytorchmergebot committed Oct 25, 2022
1 parent ff2569b commit 44d7ba7
Showing 5 changed files with 12 additions and 146 deletions.
96 changes: 0 additions & 96 deletions test/dynamo/test_debug_dir.py

This file was deleted.

2 changes: 0 additions & 2 deletions test/dynamo/test_minifier.py
@@ -43,10 +43,8 @@ def tearDownClass(cls):

def setUp(self):
super().setUp()
torch._dynamo.utils.debug_dir.setup()

def tearDown(self):
torch._dynamo.utils.debug_dir.clear()
super().tearDown()

def test_after_dynamo(self):
20 changes: 5 additions & 15 deletions torch/_dynamo/debug_utils.py
@@ -240,7 +240,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
def isolate_fails(fx_g, args, compiler_name: str, env=None):
if env is None:
env = {}
subdir = f"{minifier_dir()}/isolate"
subdir = os.path.join(os.getcwd(), "isolate")
if not os.path.exists(subdir):
os.makedirs(subdir, exist_ok=True)
file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
@@ -600,10 +600,11 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
"""
Saves the repro to a repro.py file
"""
subdir = os.path.join(minifier_dir())
curdir = os.getcwd()
subdir = os.path.join(os.getcwd(), "checkpoints")
if not os.path.exists(subdir):
os.makedirs(subdir, exist_ok=True)
file_name = os.path.join(subdir, f"{len(gm.graph.nodes)}.py")
file_name = os.path.join(subdir, f"minified_{len(gm.graph.nodes)}_nodes.py")
log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")

model_str = NNModuleToString.convert(gm)
@@ -613,19 +614,10 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
model_str, args, compiler_name, check_accuracy
)
)
latest_repro = os.path.join(subdir, "repro.py")
latest_repro = os.path.join(curdir, "repro.py")
log.warning(f"Copying {file_name} to {latest_repro} for convenience")
shutil.copyfile(file_name, latest_repro)

local_path = os.path.join(config.base_dir, "repro.py")
try:
shutil.copyfile(file_name, local_path)
log.warning(
f"Copying minified repro from {file_name} to {local_path} for convenience"
)
except OSError:
log.warning("No write permissions for {local_path}")


# TODO - Commented because we are assuming that nn.Modules can be safely repr'd
# If that does not work, we might have to bring this code back. So, keeping it
@@ -748,8 +740,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
from {config.dynamo_import}.optimizations.backends import BACKENDS
from {config.dynamo_import}.testing import rand_strided
{config.dynamo_import}.config.repro_dir = \"{minifier_dir()}\"
args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
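
One more relocation from the hunks above: isolate_fails now writes its per-attempt scripts under an `isolate` directory in the current working directory instead of under the minifier directory. A minimal sketch of just the path construction (only the path lines from the hunk are reproduced here):

import os
import uuid

subdir = os.path.join(os.getcwd(), "isolate")
os.makedirs(subdir, exist_ok=True)  # created on demand
file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")  # short random name per attempt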
4 changes: 0 additions & 4 deletions torch/_dynamo/eval_frame.py
@@ -103,14 +103,12 @@ def __enter__(self):
"Please refer to https://github.com/pytorch/torchdynamo#usage-example "
"to use torchdynamo.optimize(...) as an annotation/decorator. "
)
utils.debug_dir.setup()
self.on_enter()
self.prior = set_eval_frame(self.callback)
self.backend_ctx = self.extra_ctx_ctor()
self.backend_ctx.__enter__()

def __exit__(self, exc_type, exc_val, exc_tb):
utils.debug_dir.clear()
set_eval_frame(self.prior)
self.prior = unset
self.backend_ctx.__exit__(exc_type, exc_val, exc_tb)
@@ -152,14 +150,12 @@ def __call__(self, *args, **kwargs):
@functools.wraps(fn)
def _fn(*args, **kwargs):
on_enter()
utils.debug_dir.setup()
prior = set_eval_frame(callback)
backend_ctx = backend_ctx_ctor()
backend_ctx.__enter__()
try:
return fn(*args, **kwargs)
finally:
utils.debug_dir.clear()
set_eval_frame(prior)
backend_ctx.__exit__(None, None, None)

36 changes: 7 additions & 29 deletions torch/_dynamo/utils.py
@@ -975,35 +975,13 @@ def recompile_reasons(code):
return rpt


class DebugDir:
def __init__(self):
self.num_setup_calls = 0
self.debug_path = None

def setup(self):
assert self.num_setup_calls >= 0
if self.num_setup_calls == 0:
debug_root = config.debug_dir_root
dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
self.debug_path = os.path.join(debug_root, dir_name)

self.num_setup_calls += 1

def clear(self):
assert self.num_setup_calls >= 0
if self.num_setup_calls == 1:
self.debug_path = None

self.num_setup_calls -= 1
assert self.num_setup_calls >= 0

def get(self):
assert self.debug_path is not None
return self.debug_path


debug_dir = DebugDir()
# return same dir unless user changes config between calls
@functools.lru_cache(None)
def _get_debug_dir(root_dir):
dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
return os.path.join(root_dir, dir_name)


def get_debug_dir():
return debug_dir.get()
debug_root = config.debug_dir_root
return _get_debug_dir(debug_root)
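
A brief usage note on the helper added above: because _get_debug_dir is memoized with functools.lru_cache, repeated get_debug_dir() calls within one process return the same run_<timestamp> directory as long as config.debug_dir_root is unchanged, which is what lets inductor debug tracing find the directory without any prior dynamo setup. A minimal sketch, assuming the module paths shown in this diff:

from torch._dynamo import utils

first = utils.get_debug_dir()
second = utils.get_debug_dir()
assert first == second  # cached: one run_<timestamp> directory per invocation

If config.debug_dir_root is changed between calls, the cache key changes and a fresh run directory is returned.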
