Skip to content

Commit

Permalink
Fixing GPU and CPU TF head CI failures (#3431)
Browse files Browse the repository at this point in the history
Signed-off-by: TJ <tix@uber.com>
  • Loading branch information
TJ Xu committed Feb 28, 2022
1 parent 79ded4b commit 71e10b4
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 11 deletions.
7 changes: 4 additions & 3 deletions horovod/tensorflow/mpi_ops.cc
Expand Up @@ -1137,13 +1137,14 @@ class HorovodJoinOp : public AsyncOpKernel {
}
};

+ #if HOROVOD_GPU_ALLREDUCE
  REGISTER_KERNEL_BUILDER(Name("HorovodJoin")
- .Device(DEVICE_CPU)
+ .Device(DEVICE_GPU)
  .HostMemory("output"),
  HorovodJoinOp);
- #if HOROVOD_GPU_ALLREDUCE
+ #else
  REGISTER_KERNEL_BUILDER(Name("HorovodJoin")
- .Device(DEVICE_GPU)
+ .Device(DEVICE_CPU)
  .HostMemory("output"),
  HorovodJoinOp);
  #endif
Expand Down
12 changes: 4 additions & 8 deletions test/parallel/test_tensorflow.py
Expand Up @@ -4051,8 +4051,6 @@ def test_horovod_join_allreduce(self):
self.assertSequenceEqual(ret_values, [ret] * size,
msg="hvd.join() did not return the same value on each rank")

- @pytest.mark.skipif(LooseVersion(tf.__version__) >=
- LooseVersion('2.9.0'), reason='https://github.com/horovod/horovod/issues/3422')
def test_horovod_syncbn_gpu(self):
"""Test that the SyncBatchNormalization implementation is correct on GPU."""
# Only do this test if there are GPUs available.
Expand Down Expand Up @@ -4085,8 +4083,8 @@ def test_horovod_syncbn_gpu(self):
for x in x_list:
bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
sync_bn = hvd.SyncBatchNormalization(axis=1)
- bn_func = bn.apply(x, training=True)
- sync_bn_func = sync_bn.apply(tf.expand_dims(x[hvd.rank()], 0), training=True)
+ bn_func = bn(x, training=True)
+ sync_bn_func = sync_bn(tf.expand_dims(x[hvd.rank()], 0), training=True)

try:
init = tf.global_variables_initializer()
Expand All @@ -4100,8 +4098,6 @@ def test_horovod_syncbn_gpu(self):
self.assertAllClose(self.evaluate(sync_bn.moving_mean), self.evaluate(bn.moving_mean))
self.assertAllClose(self.evaluate(sync_bn.moving_variance), self.evaluate(bn.moving_variance))

- @pytest.mark.skipif(LooseVersion(tf.__version__) >=
- LooseVersion('2.9.0'), reason='https://github.com/horovod/horovod/issues/3422')
def test_horovod_syncbn_cpu(self):
"""Test that the SyncBatchNormalization implementation is correct on CPU."""

Expand Down Expand Up @@ -4131,8 +4127,8 @@ def test_horovod_syncbn_cpu(self):
for x in x_list:
bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
sync_bn = hvd.SyncBatchNormalization(axis=1)
- bn_func = bn.apply(x, training=True)
- sync_bn_func = sync_bn.apply(tf.expand_dims(x[hvd.rank()], 0), training=True)
+ bn_func = bn(x, training=True)
+ sync_bn_func = sync_bn(tf.expand_dims(x[hvd.rank()], 0), training=True)

try:
init = tf.global_variables_initializer()
Expand Down

0 comments on commit 71e10b4

Please sign in to comment.