Fixing GPU and CPU tf head failures #3431

Merged · 1 commit · Feb 28, 2022
7 changes: 4 additions & 3 deletions horovod/tensorflow/mpi_ops.cc
@@ -1137,13 +1137,14 @@ class HorovodJoinOp : public AsyncOpKernel {
   }
 };
 
+#if HOROVOD_GPU_ALLREDUCE
 REGISTER_KERNEL_BUILDER(Name("HorovodJoin")
-                            .Device(DEVICE_CPU)
+                            .Device(DEVICE_GPU)
                             .HostMemory("output"),
                         HorovodJoinOp);
-#if HOROVOD_GPU_ALLREDUCE
+#else
 REGISTER_KERNEL_BUILDER(Name("HorovodJoin")
-                            .Device(DEVICE_GPU)
+                            .Device(DEVICE_CPU)
                             .HostMemory("output"),
                         HorovodJoinOp);
 #endif
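
Net effect of this hunk: previously the HorovodJoin kernel was registered for DEVICE_CPU unconditionally, with a second DEVICE_GPU registration layered on top in HOROVOD_GPU_ALLREDUCE builds; after the fix the two registrations are mutually exclusive, so a GPU-allreduce build registers only the GPU kernel and a CPU build only the CPU kernel. A minimal usage sketch of the op behind this registration (not part of this PR; assumes a working Horovod + TensorFlow install):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()
    # ... rank-local work; ranks may process different numbers of batches ...
    # hvd.join() blocks until every rank has called it and, per the test
    # assertion below, returns the same value on every rank.
    ret = hvd.join()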
12 changes: 4 additions & 8 deletions test/parallel/test_tensorflow.py
@@ -4051,8 +4051,6 @@ def test_horovod_join_allreduce(self):
         self.assertSequenceEqual(ret_values, [ret] * size,
                                  msg="hvd.join() did not return the same value on each rank")
 
-    @pytest.mark.skipif(LooseVersion(tf.__version__) >=
-                        LooseVersion('2.9.0'), reason='https://github.com/horovod/horovod/issues/3422')
     def test_horovod_syncbn_gpu(self):
         """Test that the SyncBatchNormalization implementation is correct on GPU."""
         # Only do this test if there are GPUs available.
@@ -4085,8 +4083,8 @@ def test_horovod_syncbn_gpu(self):
         for x in x_list:
             bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
             sync_bn = hvd.SyncBatchNormalization(axis=1)
-            bn_func = bn.apply(x, training=True)
-            sync_bn_func = sync_bn.apply(tf.expand_dims(x[hvd.rank()], 0), training=True)
+            bn_func = bn(x, training=True)
+            sync_bn_func = sync_bn(tf.expand_dims(x[hvd.rank()], 0), training=True)
 
             try:
                 init = tf.global_variables_initializer()
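
This change (and its CPU twin below) is why the version-gated skips referencing issue #3422 can be dropped: tf.keras.layers.Layer.apply was a deprecated alias of Layer.__call__ and is removed from TF head (2.9+), so the layers are now invoked directly. A minimal sketch of the difference, assuming TF 2.x eager mode:

    import tensorflow as tf

    bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
    x = tf.random.normal([4, 3, 8, 8])

    y = bn(x, training=True)          # Layer.__call__: works on all TF 2.x
    # y = bn.apply(x, training=True)  # AttributeError on TF >= 2.9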
@@ -4100,8 +4098,6 @@ def test_horovod_syncbn_gpu(self):
             self.assertAllClose(self.evaluate(sync_bn.moving_mean), self.evaluate(bn.moving_mean))
             self.assertAllClose(self.evaluate(sync_bn.moving_variance), self.evaluate(bn.moving_variance))
 
-    @pytest.mark.skipif(LooseVersion(tf.__version__) >=
-                        LooseVersion('2.9.0'), reason='https://github.com/horovod/horovod/issues/3422')
     def test_horovod_syncbn_cpu(self):
         """Test that the SyncBatchNormalization implementation is correct on CPU."""
 
@@ -4131,8 +4127,8 @@ def test_horovod_syncbn_cpu(self):
         for x in x_list:
             bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
             sync_bn = hvd.SyncBatchNormalization(axis=1)
-            bn_func = bn.apply(x, training=True)
-            sync_bn_func = sync_bn.apply(tf.expand_dims(x[hvd.rank()], 0), training=True)
+            bn_func = bn(x, training=True)
+            sync_bn_func = sync_bn(tf.expand_dims(x[hvd.rank()], 0), training=True)
 
             try:
                 init = tf.global_variables_initializer()
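
What these tests assert is that Horovod's synchronized batch norm matches stock batch normalization once each rank's shard is accounted for. A condensed single-process sketch of that equivalence (hypothetical, not from this PR; assumes eager mode and world size 1, so the allreduce inside SyncBatchNormalization is effectively a no-op):

    import numpy as np
    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()  # single process => size 1
    x = tf.constant(np.random.rand(2, 4, 8), dtype=tf.float32)

    bn = tf.keras.layers.BatchNormalization(axis=1, fused=False)
    sync_bn = hvd.SyncBatchNormalization(axis=1)

    # With one rank, sync batch norm must reproduce plain batch norm.
    np.testing.assert_allclose(
        sync_bn(x, training=True).numpy(),
        bn(x, training=True).numpy(),
        rtol=1e-5, atol=1e-5)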