Skip to content

Commit

Permalink
Limit build and test to TF 2.6+
Browse files Browse the repository at this point in the history
Signed-off-by: Max H. Gerlach <git@maxgerlach.de>
  • Loading branch information
maxhgerlach committed Aug 27, 2021
1 parent 16c6a5a commit 7970416
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 24 deletions.
27 changes: 15 additions & 12 deletions horovod/tensorflow/CMakeLists.txt
Expand Up @@ -61,23 +61,26 @@ set(Tensorflow_CXX11 ${Tensorflow_CXX11} PARENT_SCOPE)
list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/mpi_ops.cc")
list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/xla_mpi_ops.cc")

list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/dense_update_functor.cc"
"${PROJECT_SOURCE_DIR}/horovod/tensorflow/training_op_helpers.cc")
if(VERSION_DEC GREATER 2006000000 OR VERSION_DEC EQUAL 2006000000)
# dependencies for inplace broadcast with resource variables
list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/dense_update_functor.cc"
"${PROJECT_SOURCE_DIR}/horovod/tensorflow/training_op_helpers.cc")

if(HAVE_CUDA OR HAVE_SUB_PROJECT_CUDA)
list(APPEND TF_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/dense_update_functor_gpu.cc.cu")
# dense_update_functor_gpu.cu.cc from TensorFlow source had to be renamed because it was not compiled via nvcc otherwise
if(HAVE_CUDA OR HAVE_SUB_PROJECT_CUDA)
list(APPEND TF_CUDA_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/dense_update_functor_gpu.cc.cu")
# dense_update_functor_gpu.cu.cc from TensorFlow source had to be renamed because it was not compiled via nvcc otherwise

set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})

set(ENV{PYTHONPATH} "${PROJECT_SOURCE_DIR}/cmake:$ENV{PYTHONPATH}")
execute_process(COMMAND ${PY_EXE} -c "import build_utils; print(' '.join(build_utils.get_nvcc_flags()))"
OUTPUT_VARIABLE HVD_NVCC_COMPILE_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE)
set(ENV{PYTHONPATH} "${PROJECT_SOURCE_DIR}/cmake:$ENV{PYTHONPATH}")
execute_process(COMMAND ${PY_EXE} -c "import build_utils; print(' '.join(build_utils.get_nvcc_flags()))"
OUTPUT_VARIABLE HVD_NVCC_COMPILE_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE)

list(APPEND CUDA_NVCC_FLAGS "${HVD_NVCC_COMPILE_FLAGS}")
list(APPEND CUDA_NVCC_FLAGS "${HVD_NVCC_COMPILE_FLAGS}")

cuda_add_library(tensorflow_cuda_kernels ${TF_CUDA_SOURCES} STATIC OPTIONS -DGOOGLE_CUDA=1)
list(APPEND TF_LINKER_LIBS tensorflow_cuda_kernels)
cuda_add_library(tensorflow_cuda_kernels ${TF_CUDA_SOURCES} STATIC OPTIONS -DGOOGLE_CUDA=1)
list(APPEND TF_LINKER_LIBS tensorflow_cuda_kernels)
endif()
endif()

# Create library
Expand Down
8 changes: 4 additions & 4 deletions horovod/tensorflow/functions.py
Expand Up @@ -68,10 +68,10 @@ def broadcast_variables(variables, root_rank, process_set=global_process_set, in
Broadcasts variables from root rank to all other processes
in a process set (defaults to all Horovod processes).
Optionally, the broadcast may be performed in-place. Some restrictions apply then:
Reference variables (default in TensorFlow 1, legacy in TensorFlow 2) must all be
of the same data type. Resource variables (default in TensorFlow 2) are not supported
with TensorFlow 2.3, 2.4, or 2.5.
Optionally, the broadcast may be performed in-place. This is only supported with
TensorFlow 2.6 or later. Reference variables (legacy support in TF 2) must all
be of the same data type. There is no such restriction for resource variables
(default in TF 2).
Arguments:
variables: variables for broadcast
Expand Down
5 changes: 5 additions & 0 deletions horovod/tensorflow/mpi_ops.cc
Expand Up @@ -31,9 +31,12 @@
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/common_shape_fns.h"

#if TENSORFLOW_VERSION >= 2006000000
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/resource_var.h"
#include "tensorflow/core/kernels/training_op_helpers.h"
#endif // TENSORFLOW_VERSION >= 2006000000

#include "../common/common.h"

Expand Down Expand Up @@ -839,6 +842,7 @@ Output
`tensor` on root rank.
)doc");

#if TENSORFLOW_VERSION >= 2006000000
namespace {
std::string NormalizeNameForTensorFlow(const std::string& name) {
static const std::regex normalize_re(R"regex([^a-zA-Z0-9_])regex");
Expand Down Expand Up @@ -1087,6 +1091,7 @@ Input
resources: Variables to broadcast. They will be updated in-place
to the values from the root rank.
)doc");
#endif // TENSORFLOW_VERSION >= 2006000000

class HorovodJoinOp : public AsyncOpKernel {
public:
Expand Down
7 changes: 5 additions & 2 deletions horovod/tensorflow/mpi_ops.py
Expand Up @@ -305,12 +305,15 @@ def broadcast_(variables, root_rank, name=None, process_set=global_process_set):
are ready to send and receive all variables. In each process all variables need
to be located on the same device (CPU or GPU).
Note: Due to a bug in TensorFlow this will not work with resource variables in
TF 2.3, 2.4, or 2.5. Versions 1.15 and 2.6 should be fine, however.
Note: This is only supported with TensorFlow 2.6 or later.
Returns:
The tensor values of the updated `variables` as broadcasted from root rank.
"""
from distutils.version import LooseVersion
if LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
raise NotImplementedError("In-place broadcasts are only supported with TensorFlow 2.6 or later")

from tensorflow.python.ops import resource_variable_ops
if all(resource_variable_ops.is_resource_variable(var) for var in variables):
with tf.control_dependencies(
Expand Down
12 changes: 6 additions & 6 deletions test/parallel/test_tensorflow.py
Expand Up @@ -2227,8 +2227,8 @@ def test_horovod_broadcast_gpu(self):

def test_horovod_broadcast_inplace_cpu(self):
"""Test that the inplace broadcast correctly broadcasts 1D, 2D, 3D variables on CPU."""
if LooseVersion('2.3.0') <= LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables are known to be broken with TensorFlow 2.3, 2.4, and 2.5")
if LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables only work with TF 2.6+")

hvd.init()
rank = hvd.rank()
Expand Down Expand Up @@ -2271,8 +2271,8 @@ def test_horovod_broadcast_inplace_cpu(self):

def test_horovod_broadcast_inplace_gpu(self):
"""Test that the inplace broadcast correctly broadcasts 1D, 2D, 3D variables on GPU."""
if LooseVersion('2.3.0') <= LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables are known to be broken with TensorFlow 2.3, 2.4, and 2.5")
if LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables only work with TF 2.6+")

if not tf.test.is_gpu_available(cuda_only=True):
self.skipTest("No GPUs available")
Expand Down Expand Up @@ -2321,8 +2321,8 @@ def test_horovod_broadcast_inplace_gpu(self):

def test_horovod_broadcast_inplace_multiple(self):
"""Test that the inplace broadcast correctly broadcasts multiple variables on CPU."""
if LooseVersion('2.3.0') <= LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables are known to be broken with TensorFlow 2.3, 2.4, and 2.5")
if LooseVersion(tf.__version__) < LooseVersion('2.6.0'):
self.skipTest("Custom Ops using resource variables only work with TF 2.6+")

hvd.init()
rank = hvd.rank()
Expand Down

0 comments on commit 7970416

Please sign in to comment.