diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 767efa845cb1..58d6a0053f9b 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1616,6 +1616,18 @@ def comm_fn(tensor, group=None):
         self._test_work_wait(tensor, comm_fn=comm_fn)
 
 
+class ReduceOpTest(TestCase):
+
+    def test_op_isinstance_of_reduceop(self):
+        for reduce_op in (
+            c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
+            c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
+        ):
+            self.assertTrue(isinstance(reduce_op, c10d.ReduceOp))
+        for scale in ([torch.tensor(1.0)], 2.0):
+            self.assertTrue(isinstance(dist._make_nccl_premul_sum(scale), c10d.ReduceOp))
+
+
 if __name__ == "__main__":
     assert (
         not torch.cuda._initialized
diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index aad37d6a8c5a..bdf0166b8daa 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -63,7 +63,6 @@ class DebugLevel(Enum):
     DETAIL = ...
 
 class ReduceOp:
-    # note(crcrpar): These values are populated from Kind
     SUM = ...
     PRODUCT = ...
     MIN = ...
@@ -74,7 +73,7 @@ class ReduceOp:
     PREMUL_SUM = ...
     UNUSED = ...
 
-    class Kind(Enum): ...
+    class RedOpType(Enum): ...
 
 class BroadcastOptions:
     rootRank: int
diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp
index 4d928976d87e..64fbc45c6588 100644
--- a/torch/csrc/distributed/c10d/Types.hpp
+++ b/torch/csrc/distributed/c10d/Types.hpp
@@ -29,6 +29,7 @@ struct NCCLPreMulSumSupplement : _SupplementBase {
 // Other ReduceOps that need different supplementary data can also
 // derive from _SupplementBase.
 struct TORCH_API ReduceOp : torch::CustomClassHolder {
+  // note(crcrpar): RedOpType could be defined outside of `ReduceOp`
   enum RedOpType : uint8_t {
     SUM = 0,
     AVG = 1,
@@ -46,7 +47,9 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
 
   ReduceOp(RedOpType op) : op_(op) {
     TORCH_INTERNAL_ASSERT(
-        op_ != PREMUL_SUM, "PREMUL_SUM requires a scale factor tensor or scalar argument");
+        op_ != PREMUL_SUM,
+        "Use `torch.distributed._make_nccl_premul_sum` to create an instance of ReduceOp with PREMUL_SUM"
+    );
   }
 
   ReduceOp(RedOpType op, c10::intrusive_ptr<_SupplementBase> optional_supplement) {
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 99d1affa611c..8e70c5334cf7 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -515,10 +515,14 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_COMPRESS``.
       R"(Sets the debug level of the torch.distributed package from the
 ``TORCH_DISTRIBUTED_DEBUG`` environment variable.)");
 
+  // TODO(crcrpar): Harden `ReduceOp`.
+  // While keeping most op types as enum values,
+  // making `PREMUL_SUM` callable, i.e., allowing
+  // `ReduceOp.PREMUL_SUM(scale)`, might be better, as per @wanchaol.
   // https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
   py::class_<::c10d::ReduceOp> reduce_op(module, "ReduceOp", R"(
 An enum-like class for available reduction operations: ``SUM``, ``PRODUCT``,
-``MIN``, ``MAX``, ``BAND``, ``BOR``, and ``BXOR``.
+``MIN``, ``MAX``, ``BAND``, ``BOR``, ``BXOR``, and ``PREMUL_SUM``.
 
 ``BAND``, ``BOR``, and ``BXOR`` reductions are not available when using
 the ``NCCL`` backend.
@@ -529,13 +533,16 @@ and only for NCCL versions 2.10 or later.
 
 ``PREMUL_SUM`` multiplies inputs by a given scalar locally before reduction.
 ``PREMUL_SUM`` is only available with the ``NCCL`` backend,
-and only available for NCCL versions 2.11 or later.
+and only available for NCCL versions 2.11 or later. Users are supposed to
+use ``torch.distributed._make_nccl_premul_sum``.
 
 Additionally, ``MAX``, ``MIN`` and ``PRODUCT`` are not supported for complex tensors.
 
 The values of this class can be accessed as attributes, e.g., ``ReduceOp.SUM``.
 They are used in specifying strategies for reduction collectives, e.g.,
-:func:`reduce`, :func:`all_reduce_multigpu`, etc.)");
+:func:`reduce`, :func:`all_reduce_multigpu`, etc.
+
+This class does not support the ``__members__`` property.)");
 
   reduce_op.def(py::init<::c10d::ReduceOp::RedOpType>())
       .def_readwrite("op", &::c10d::ReduceOp::op_);
@@ -555,8 +562,14 @@ They are used in specifying strategies for reduction collectives, e.g.,
           [](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) {
             return self == other.op_;
           })
-      .def("__hash__", [](const ::c10d::ReduceOp& self) { return self.op_; });
-
+      .def("__hash__", [](const ::c10d::ReduceOp& self) {
+        return static_cast<uint8_t>(self.op_);
+      });
+
+  // note(crcrpar): Deliberately skip
+  // [`export_values`](https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types)
+  // here and manually set the values on the Python side instead. See the note
+  // "ReduceOp static class attributes to support `isinstance`".
   py::enum_<::c10d::ReduceOp::RedOpType>(reduce_op, "RedOpType")
       .value("SUM", ::c10d::ReduceOp::RedOpType::SUM)
       .value("AVG", ::c10d::ReduceOp::RedOpType::AVG)
@@ -566,10 +579,10 @@ They are used in specifying strategies for reduction collectives, e.g.,
       .value("BAND", ::c10d::ReduceOp::RedOpType::BAND)
       .value("BOR", ::c10d::ReduceOp::RedOpType::BOR)
       .value("BXOR", ::c10d::ReduceOp::RedOpType::BXOR)
-      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM)
-      .export_values();
+      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM);
 
-  // Ref: [Implicit
+  // note(crcrpar): This could be removed because users will not pass
+  // `RedOpType` to reduce collective ops. Ref: [Implicit
   // conversions](https://pybind11.readthedocs.io/en/stable/advanced/classes.html#implicit-conversions)
   // Let us skip the explicit construction of `c10d::ReduceOp` from
   // `c10d::ReduceOp::RedOpType` in Python.
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 906302f7acad..27bc3b615d48 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -233,6 +233,17 @@ def register_backend(cls, name, func, extended_api=False):
 
 dist_backend = Backend
 
+# NOTE(crcrpar): [ReduceOp static class attributes to support `isinstance`]
+# A ReduceOp instance of `PREMUL_SUM` is supposed to be created via `_make_nccl_premul_sum`,
+# while the other `op`s (i.e., RedOpType members) can be passed directly to c10d reduce collectives.
+# I changed `ReduceOp` from an enum class to a struct and introduced the RedOpType enum class for
+# PREMUL_SUM, which broke the implicit contract that ReduceOp is enum-like and that users can
+# apply isinstance to an `op`, e.g., `isinstance(ReduceOp.SUM, ReduceOp)`: https://github.com/pytorch/pytorch/issues/87191
+DENY_LIST = ("PREMUL_SUM",)
+for _red_op_name, _red_op_value in ReduceOp.RedOpType.__members__.items():
+    setattr(ReduceOp, _red_op_name, _red_op_value if _red_op_name in DENY_LIST else ReduceOp(_red_op_value))
+
+
 class _reduce_op(object):
     r"""
     Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``,
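Usage sketch (editor's addition, not part of the patch): the snippet below illustrates the contract the diff establishes, namely that the plain reduction ops are `ReduceOp` instances again (so `isinstance` works, per https://github.com/pytorch/pytorch/issues/87191), that `PREMUL_SUM` is deliberately left as a bare `RedOpType` member, and that premul-sum instances come from the `_make_nccl_premul_sum` helper. It assumes a PyTorch build with distributed support that includes this patch, and that `_make_nccl_premul_sum` is reachable under the `torch.distributed` namespace, as in the test above.

import torch
import torch.distributed as dist

# Plain ops are ReduceOp instances after the patch, so isinstance works.
assert isinstance(dist.ReduceOp.SUM, dist.ReduceOp)
assert isinstance(dist.ReduceOp.BXOR, dist.ReduceOp)

# PREMUL_SUM stays a bare RedOpType member (it is on the DENY_LIST);
# constructing ReduceOp(PREMUL_SUM) directly would trip the
# TORCH_INTERNAL_ASSERT added in Types.hpp.
assert isinstance(dist.ReduceOp.PREMUL_SUM, dist.ReduceOp.RedOpType)

# The supported path: the scale may be a Python scalar or a list of tensors.
for scale in (2.0, [torch.tensor(1.0)]):
    premul_sum = dist._make_nccl_premul_sum(scale)
    assert isinstance(premul_sum, dist.ReduceOp)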