Fix debug dir bugs and minifier output directories (#87682)
Fixes pytorch/torchdynamo#1758, pytorch/torchdynamo#1752

- minifier_launcher.py now dumps checkpoints to <cwd>/checkpoints when run (see the path sketch after this list)
- a single debug directory is created per script invocation, so assertion failures caused by a missing directory no longer occur
- torchinductor debug tracing now dumps to the debug directory correctly, since no prior setup is needed (previously the directory was only initialized during dynamo tracing)
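
For orientation, here is a minimal sketch of the output locations implied by the diff below. It is illustrative only: `num_nodes` stands in for `len(gm.graph.nodes)` at a given minification step, and the placeholder strings stand in for `config.debug_dir_root` and the per-run timestamp.

import os

cwd = os.getcwd()
num_nodes = 8  # placeholder for len(gm.graph.nodes) at a given minification step

# Minifier checkpoints land under <cwd>/checkpoints, one file per step
checkpoint = os.path.join(cwd, "checkpoints", f"minified_{num_nodes}_nodes.py")

# The latest checkpoint is also copied to <cwd>/repro.py for convenience
latest_repro = os.path.join(cwd, "repro.py")

# One debug directory per script invocation, shared by dynamo and inductor tracing
debug_root = "<config.debug_dir_root>"  # placeholder for the configured root
debug_dir = os.path.join(debug_root, "run_<timestamp>")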

cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: #87682
Approved by: https://github.com/ezyang
mlazos authored and pytorchmergebot committed Oct 25, 2022
1 parent ff2569b commit 44d7ba7
Showing 5 changed files with 12 additions and 146 deletions.
96 changes: 0 additions & 96 deletions test/dynamo/test_debug_dir.py

This file was deleted.

2 changes: 0 additions & 2 deletions test/dynamo/test_minifier.py
@@ -43,10 +43,8 @@ def tearDownClass(cls):

def setUp(self):
super().setUp()
torch._dynamo.utils.debug_dir.setup()

def tearDown(self):
torch._dynamo.utils.debug_dir.clear()
super().tearDown()

def test_after_dynamo(self):
20 changes: 5 additions & 15 deletions torch/_dynamo/debug_utils.py
@@ -240,7 +240,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
def isolate_fails(fx_g, args, compiler_name: str, env=None):
if env is None:
env = {}
subdir = f"{minifier_dir()}/isolate"
subdir = os.path.join(os.getcwd(), "isolate")
if not os.path.exists(subdir):
os.makedirs(subdir, exist_ok=True)
file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
@@ -600,10 +600,11 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
"""
Saves the repro to a repro.py file
"""
subdir = os.path.join(minifier_dir())
curdir = os.getcwd()
subdir = os.path.join(os.getcwd(), "checkpoints")
if not os.path.exists(subdir):
os.makedirs(subdir, exist_ok=True)
file_name = os.path.join(subdir, f"{len(gm.graph.nodes)}.py")
file_name = os.path.join(subdir, f"minified_{len(gm.graph.nodes)}_nodes.py")
log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")

model_str = NNModuleToString.convert(gm)
@@ -613,19 +614,10 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
model_str, args, compiler_name, check_accuracy
)
)
latest_repro = os.path.join(subdir, "repro.py")
latest_repro = os.path.join(curdir, "repro.py")
log.warning(f"Copying {file_name} to {latest_repro} for convenience")
shutil.copyfile(file_name, latest_repro)

local_path = os.path.join(config.base_dir, "repro.py")
try:
shutil.copyfile(file_name, local_path)
log.warning(
f"Copying minified repro from {file_name} to {local_path} for convenience"
)
except OSError:
log.warning("No write permissions for {local_path}")


# TODO - Commented because we are assuming that nn.Modules can be safely repr'd
# If that does not work, we might have to bring this code back. So, keeping it
@@ -748,8 +740,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
from {config.dynamo_import}.optimizations.backends import BACKENDS
from {config.dynamo_import}.testing import rand_strided
{config.dynamo_import}.config.repro_dir = \"{minifier_dir()}\"
args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
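
One more relocation from the hunks above: isolate_fails now writes its per-attempt scripts under an `isolate` directory in the current working directory instead of under the minifier directory. A minimal sketch of just the path construction (only the path lines from the hunk are reproduced here):

import os
import uuid

subdir = os.path.join(os.getcwd(), "isolate")
os.makedirs(subdir, exist_ok=True)  # created on demand
file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")  # short random name per attempt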
4 changes: 0 additions & 4 deletions torch/_dynamo/eval_frame.py
@@ -103,14 +103,12 @@ def __enter__(self):
"Please refer to https://github.com/pytorch/torchdynamo#usage-example "
"to use torchdynamo.optimize(...) as an annotation/decorator. "
)
utils.debug_dir.setup()
self.on_enter()
self.prior = set_eval_frame(self.callback)
self.backend_ctx = self.extra_ctx_ctor()
self.backend_ctx.__enter__()

def __exit__(self, exc_type, exc_val, exc_tb):
utils.debug_dir.clear()
set_eval_frame(self.prior)
self.prior = unset
self.backend_ctx.__exit__(exc_type, exc_val, exc_tb)
@@ -152,14 +150,12 @@ def __call__(self, *args, **kwargs):
@functools.wraps(fn)
def _fn(*args, **kwargs):
on_enter()
utils.debug_dir.setup()
prior = set_eval_frame(callback)
backend_ctx = backend_ctx_ctor()
backend_ctx.__enter__()
try:
return fn(*args, **kwargs)
finally:
utils.debug_dir.clear()
set_eval_frame(prior)
backend_ctx.__exit__(None, None, None)

36 changes: 7 additions & 29 deletions torch/_dynamo/utils.py
@@ -975,35 +975,13 @@ def recompile_reasons(code):
return rpt


class DebugDir:
def __init__(self):
self.num_setup_calls = 0
self.debug_path = None

def setup(self):
assert self.num_setup_calls >= 0
if self.num_setup_calls == 0:
debug_root = config.debug_dir_root
dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
self.debug_path = os.path.join(debug_root, dir_name)

self.num_setup_calls += 1

def clear(self):
assert self.num_setup_calls >= 0
if self.num_setup_calls == 1:
self.debug_path = None

self.num_setup_calls -= 1
assert self.num_setup_calls >= 0

def get(self):
assert self.debug_path is not None
return self.debug_path


debug_dir = DebugDir()
# return same dir unless user changes config between calls
@functools.lru_cache(None)
def _get_debug_dir(root_dir):
dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
return os.path.join(root_dir, dir_name)


def get_debug_dir():
return debug_dir.get()
debug_root = config.debug_dir_root
return _get_debug_dir(debug_root)
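
A brief usage note on the helper added above: because _get_debug_dir is memoized with functools.lru_cache, repeated get_debug_dir() calls within one process return the same run_<timestamp> directory as long as config.debug_dir_root is unchanged, which is what lets inductor debug tracing find the directory without any prior dynamo setup. A minimal sketch, assuming the module paths shown in this diff:

from torch._dynamo import utils

first = utils.get_debug_dir()
second = utils.get_debug_dir()
assert first == second  # cached: one run_<timestamp> directory per invocation

If config.debug_dir_root is changed between calls, the cache key changes and a fresh run directory is returned.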
