Commit 533d06a

test new code
moved _setup call to init()

Signed-off-by: TJ <tix@uber.com>
TJ committed Nov 4, 2021
1 parent 9e730d8 commit 533d06a
Showing 6 changed files with 26 additions and 23 deletions.
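
All six files follow one pattern: rather than each remote trainer manually re-running `_setup` after the shared-library path may have been overwritten, every framework's `init()` now re-runs process-set setup itself. Below is a minimal, self-contained sketch of that pattern; the `_Basics` class and module layout are illustrative stand-ins, not Horovod's actual internals.

```python
class _Basics:
    """Holds the loaded shared library, like Horovod's _HorovodBasics."""
    def __init__(self, lib_path):
        self.MPI_LIB_CTYPES = lib_path  # handle to the native library

    def init(self, *args, **kwargs):
        pass  # native initialization would happen here


_process_set_basics = None  # models horovod.common.process_sets._basics


def _setup_process_sets(basics):
    # Point the shared process-set module at this framework's basics object.
    global _process_set_basics
    _process_set_basics = basics


_basics = _Basics('mpi_lib.so')


def init(*args, **kwargs):
    _basics.init(*args, **kwargs)
    # Re-running setup on every init() keeps the process-set module pointed
    # at the basics object that was just initialized.
    _setup_process_sets(_basics)


init()  # after this, _process_set_basics is _basics and stays in sync
```

Because the re-sync now happens inside `init()`, every caller gets it for free, which is why the per-trainer `_setup` calls in the diffs below can be deleted.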
horovod/mxnet/mpi_ops.py (5 additions, 1 deletion)
```diff
@@ -39,7 +39,6 @@
 check_installed_version('mxnet', mx.__version__)
 
 # import basic methods
-init = _basics.init
 shutdown = _basics.shutdown
 is_initialized = _basics.is_initialized
 start_timeline = _basics.start_timeline
@@ -61,6 +60,11 @@
 cuda_built = _basics.cuda_built
 rocm_built = _basics.rocm_built
 
+def init(*args, **kwargs):
+    _basics.init(*args, **kwargs)
+    # Call setup again to make sure the process sets are in sync with _basics
+    _setup_process_sets(_basics)
+
 dll_path = os.path.join(os.path.dirname(__file__),
                         'mpi_lib' + get_ext_suffix())
 MPI_MXNET_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL)
```
horovod/spark/keras/remote.py (4 additions, 6 deletions)
```diff
@@ -109,6 +109,7 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
 
         hvd = get_horovod()
         hvd.init()
+
         pin_gpu(hvd, tf, k)
 
         if not user_shuffle_buffer_size:
@@ -129,6 +130,9 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
         # Verbose mode 1 will print a progress bar
         verbose = user_verbose if hvd.rank() == 0 else 0
 
+        if verbose:
+            print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
+
         transform_spec = None
         if transformation:
             transform_spec = TransformSpec(transformation)
@@ -227,12 +231,6 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
             reader_factory = make_batch_reader
             is_batch_reader = True
 
-        # Call _setup again in process set module to point shared lib to tensorflow's module
-        # since the lib path might be overwritten in remote trainer.
-        _horovod.common.process_sets._setup(_horovod.tensorflow.mpi_ops._basics)
-        if verbose:
-            print(f"Set shared lib path to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
-
         with reader_factory(remote_store.train_data_path,
                             num_epochs=1,
                             cur_shard=hvd.rank(),
```
horovod/spark/lightning/remote.py (5 additions, 7 deletions)
```diff
@@ -97,10 +97,14 @@ def RemoteTrainer(estimator, metadata, ckpt_bytes, run_id, dataset_idx, train_ro
 
     def train(serialized_model):
         import horovod.torch as hvd
-        import horovod as _horovod
 
         # Horovod: initialize library.
         hvd.init()
+
+        if verbose:
+            import horovod as _horovod
+            print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
+
         _checkpoint_callback = None
         require_checkpoint = False
 
@@ -218,12 +222,6 @@ def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
 
         print(f"pytorch_lightning version={pl.__version__}")
 
-        # Call _setup again in process set module to point shared lib to torch's module
-        # since the lib path might be overwritten in remote trainer.
-        _horovod.common.process_sets._setup(_horovod.torch.mpi_ops._basics)
-        if verbose:
-            print(f"Set shared lib path to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
-
         dataset = data_module(train_dir=remote_store.train_data_path,
                               val_dir=remote_store.val_data_path,
                               num_train_epochs=epochs,
```
horovod/spark/torch/remote.py (4 additions, 7 deletions)
```diff
@@ -103,7 +103,6 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
         from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader
         import torch
         import horovod.torch as hvd
-        import horovod as _horovod
 
         # Deserializing objects
         model_opt_state = torch.load(model_opt_state_serialized)
@@ -118,6 +117,10 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
         # Horovod: initialize library.
         hvd.init()
 
+        if user_verbose:
+            import horovod as _horovod
+            print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
+
         if not user_shuffle_buffer_size:
             shuffle_buffer_size = \
                 calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
@@ -228,12 +231,6 @@ def save_checkpoint():
         else:
             reader_factory = make_batch_reader
 
-        # Call _setup again in process set module to point shared lib to torch's module
-        # since the lib path might be overwritten in remote trainer.
-        _horovod.common.process_sets._setup(_horovod.torch.mpi_ops._basics)
-        if user_verbose:
-            print(f"Set shared lib path to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
-
         # Petastorm: read data from the store with the correct shard for this rank
         # setting num_epochs=None will cause an infinite iterator
         # and enables ranks to perform training and validation with
```
horovod/tensorflow/mpi_ops.py (5 additions, 1 deletion)
```diff
@@ -57,7 +57,6 @@ def _load_library(name):
 _basics = _HorovodBasics(__file__, 'mpi_lib')
 
 # import basic methods
-init = _basics.init
 shutdown = _basics.shutdown
 is_initialized = _basics.is_initialized
 start_timeline = _basics.start_timeline
@@ -84,6 +83,11 @@ def _load_library(name):
 Sum = _basics.Sum
 Adasum = _basics.Adasum
 
+def init(*args, **kwargs):
+    _basics.init(*args, **kwargs)
+    # Call setup again to make sure the process sets are in sync with _basics
+    _setup_process_sets(_basics)
+
 is_homogeneous = _basics.is_homogeneous
 
 handle_average_backwards_compatibility = get_average_backwards_compatibility_fun(_basics)
```
horovod/torch/mpi_ops.py (3 additions, 1 deletion)
```diff
@@ -69,7 +69,9 @@ def shutdown(*args, **kwargs):
 def init(*args, **kwargs):
     global _handle_map
     _handle_map = {}
-    return _basics.init(*args, **kwargs)
+    _basics.init(*args, **kwargs)
+    # Call setup again to make sure the process sets are in sync with _basics
+    _setup_process_sets(_basics)
 
 # import reduction op values
 Average = _basics.Average
```
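
With the re-sync folded into `init()`, the remote trainers above reduce to the following call pattern. This is a hedged usage sketch, where `verbose` stands in for the trainers' `verbose`/`user_verbose` flags:

```python
import horovod.torch as hvd  # or horovod.tensorflow / horovod.mxnet

hvd.init()  # process-set setup now runs inside init(); no manual _setup call

verbose = True  # stand-in for the trainers' verbose/user_verbose flags
if verbose:
    import horovod as _horovod
    # Diagnostic print kept from the diffs above: shows which shared lib
    # the process-set module is pointing at after init().
    print(f"Shared lib path is pointing to: "
          f"{_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
```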
