Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix finalization of ProcessSetTable and some test flakiness with PyTorch 1.10.1 #3351

Merged
3 commits merged on Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions horovod/common/process_set.cc
Expand Up @@ -265,6 +265,11 @@ void ProcessSetTable::Finalize_(const Context& context, const Status& status) {
LOG(TRACE, Get(0).controller->GetRank())
<< "Finalizing ProcessSetTable, global process set id 0";
id_to_process_set_[0].Finalize(status);

next_id_ = 1;
while (!free_ids_.empty()) free_ids_.pop(); // Clear queue to be sure.
assert(ids_.size() == 1);
assert(id_to_process_set_.size() == 1);
}

#if HAVE_MPI
Expand Down
5 changes: 3 additions & 2 deletions test/parallel/test_torch.py
Expand Up @@ -71,6 +71,7 @@ def setup(self):
def tearDown(self):
    # Runs after each test. Only shuts Horovod down when it was actually
    # initialized and we are running under Gloo (HOROVOD_RANK is set by the
    # Gloo launcher; -1 means it is absent). Skipped on macOS (_is_mac) —
    # presumably due to platform-specific shutdown flakiness; confirm with
    # the test suite's maintainers.
    gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
    if hvd.is_initialized() and not _is_mac and gloo_rank != -1:
        # Synchronize all workers before shutdown so no rank tears down
        # the process set while another is still finishing collective work
        # (this barrier is the flakiness fix from this change).
        hvd.barrier()
        hvd.shutdown()

def convert_cpu_fp16_to_fp32(self, *values):
Expand Down Expand Up @@ -2158,14 +2159,14 @@ def get_optimizer_param_values(optimizer):

model_param_values = get_model_param_values(model)
for name, model_param_value in model_param_values:
hvd.broadcast_(model_param_value, root_rank=0)
hvd.broadcast_(model_param_value, root_rank=0, name=name)

opt_param_values_updated = []
opt_param_values = get_optimizer_param_values(optimizer)
for name, opt_param_value in opt_param_values:
is_tensor = torch.is_tensor(opt_param_value)
if is_tensor:
hvd.broadcast_(opt_param_value, root_rank=0)
hvd.broadcast_(opt_param_value, root_rank=0, name=f"{name}_tensor")
else:
opt_param_value = hvd.broadcast_object(opt_param_value, name=name)
opt_param_values_updated.append((name, opt_param_value))
Expand Down