Skip to content

Commit

Permalink
Reenable isinstance with torch.distributed.ReduceOp (#87303) (#87463)

Browse files Browse the repository at this point in the history

tentatively marking as draft as I haven't gotten a comprehensive list of side effects...

Ref: https://stackoverflow.com/questions/40244413/python-static-class-attribute-of-the-class-itself
Rel: #87191

cc @kwen2501
Pull Request resolved: #87303
Approved by: https://github.com/wanchaol

Co-authored-by: Masaki Kozuki <mkozuki@nvidia.com>
  • Loading branch information
atalman and crcrpar committed Oct 21, 2022
1 parent 51fa4fa commit f6c42ae
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 11 deletions.
12 changes: 12 additions & 0 deletions test/distributed/test_c10d_common.py
Expand Up @@ -1616,6 +1616,18 @@ def comm_fn(tensor, group=None):
self._test_work_wait(tensor, comm_fn=comm_fn)


class ReduceOpTest(TestCase):
    """Regression tests for ``isinstance(..., c10d.ReduceOp)`` (see gh-87191)."""

    def test_op_isinstance_of_reduceop(self):
        # Each enum-like static attribute on ReduceOp must itself be a
        # ReduceOp instance, not a bare RedOpType member.
        builtin_ops = (
            c10d.ReduceOp.SUM,
            c10d.ReduceOp.AVG,
            c10d.ReduceOp.PRODUCT,
            c10d.ReduceOp.MIN,
            c10d.ReduceOp.MAX,
            c10d.ReduceOp.BAND,
            c10d.ReduceOp.BOR,
            c10d.ReduceOp.BXOR,
        )
        for op in builtin_ops:
            self.assertTrue(isinstance(op, c10d.ReduceOp))
        # PREMUL_SUM is only constructible through the factory helper; both a
        # tensor scale and a scalar scale must produce ReduceOp instances.
        for factor in ([torch.tensor(1.0)], 2.0):
            self.assertTrue(isinstance(dist._make_nccl_premul_sum(factor), c10d.ReduceOp))


if __name__ == "__main__":
assert (
not torch.cuda._initialized
Expand Down
3 changes: 1 addition & 2 deletions torch/_C/_distributed_c10d.pyi
Expand Up @@ -63,7 +63,6 @@ class DebugLevel(Enum):

class ReduceOp:

# note(crcrpar): These values are populated from Kind
SUM = ...
PRODUCT = ...
MIN = ...
Expand All @@ -74,7 +73,7 @@ class ReduceOp:
PREMUL_SUM = ...
UNUSED = ...

class Kind(Enum): ...
class RedOpType(Enum): ...

class BroadcastOptions:
rootRank: int
Expand Down
5 changes: 4 additions & 1 deletion torch/csrc/distributed/c10d/Types.hpp
Expand Up @@ -29,6 +29,7 @@ struct NCCLPreMulSumSupplement : _SupplementBase {
// Other ReduceOps that need different supplementary data can also
// derive from _SupplementBase.
struct TORCH_API ReduceOp : torch::CustomClassHolder {
// note(crcrpar): RedOpType could be defined outside of `ReduceOp`
enum RedOpType : uint8_t {
SUM = 0,
AVG = 1,
Expand All @@ -46,7 +47,9 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {

ReduceOp(RedOpType op) : op_(op) {
TORCH_INTERNAL_ASSERT(
op_ != PREMUL_SUM, "PREMUL_SUM requires a scale factor tensor or scalar argument");
op_ != PREMUL_SUM,
"Use `torch.distributed._make_nccl_premul_sum` to create an instance of ReduceOp with PREMUL_SUM"
);
}

ReduceOp(RedOpType op, c10::intrusive_ptr<_SupplementBase> optional_supplement) {
Expand Down
29 changes: 21 additions & 8 deletions torch/csrc/distributed/c10d/init.cpp
Expand Up @@ -515,10 +515,14 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
R"(Sets the debug level of the torch.distributed package from the
``TORCH_DISTRIBUTED_DEBUG`` environment variable.)");

// TODO(crcrpar): Hardening `ReduceOp`.
// While keeping most op types as enum value,
// making `PREMUL_SUM` callable, i.e., allowing for
// `ReduceOp.PREMUL_SUM(scale)` might be better as per @wanchaol.
// https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
py::class_<::c10d::ReduceOp> reduce_op(module, "ReduceOp", R"(
An enum-like class for available reduction operations: ``SUM``, ``PRODUCT``,
``MIN``, ``MAX``, ``BAND``, ``BOR``, and ``BXOR``.
``MIN``, ``MAX``, ``BAND``, ``BOR``, ``BXOR``, and ``PREMUL_SUM``.
``BAND``, ``BOR``, and ``BXOR`` reductions are not available when
using the ``NCCL`` backend.
Expand All @@ -529,13 +533,16 @@ and only for NCCL versions 2.10 or later.
``PREMUL_SUM`` multiplies inputs by a given scalar locally before reduction.
``PREMUL_SUM`` is only available with the ``NCCL`` backend,
and only available for NCCL versions 2.11 or later.
and only available for NCCL versions 2.11 or later. Users are supposed to
use ``torch.distributed._make_nccl_premul_sum``.
Additionally, ``MAX``, ``MIN`` and ``PRODUCT`` are not supported for complex tensors.
The values of this class can be accessed as attributes, e.g., ``ReduceOp.SUM``.
They are used in specifying strategies for reduction collectives, e.g.,
:func:`reduce`, :func:`all_reduce_multigpu`, etc.)");
:func:`reduce`, :func:`all_reduce_multigpu`, etc.
This class does not support ``__members__`` property.)");

reduce_op.def(py::init<::c10d::ReduceOp::RedOpType>())
.def_readwrite("op", &::c10d::ReduceOp::op_);
Expand All @@ -555,8 +562,14 @@ They are used in specifying strategies for reduction collectives, e.g.,
[](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) {
return self == other.op_;
})
.def("__hash__", [](const ::c10d::ReduceOp& self) { return self.op_; });

.def("__hash__", [](const ::c10d::ReduceOp& self) {
return static_cast<uint8_t>(self.op_);
});

// note(crcrpar): Deliberately skip
// [`export_values`](https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types)
// here and manually set values in Python side. See note "ReduceOp static
// class attributes to support `isinstance`"
py::enum_<::c10d::ReduceOp::RedOpType>(reduce_op, "RedOpType")
.value("SUM", ::c10d::ReduceOp::RedOpType::SUM)
.value("AVG", ::c10d::ReduceOp::RedOpType::AVG)
Expand All @@ -566,10 +579,10 @@ They are used in specifying strategies for reduction collectives, e.g.,
.value("BAND", ::c10d::ReduceOp::RedOpType::BAND)
.value("BOR", ::c10d::ReduceOp::RedOpType::BOR)
.value("BXOR", ::c10d::ReduceOp::RedOpType::BXOR)
.value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM)
.export_values();
.value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM);

// Ref: [Implicit
// note(crcrpar): This could be removed because users will not pass
// `RedOpType` to reduce collective ops Ref: [Implicit
// conversions](https://pybind11.readthedocs.io/en/stable/advanced/classes.html#implicit-conversions)
// Let us skip the explicit construction of `c10d::ReduceOp` from
// `c10d::ReduceOp::RedOpType` in Python.
Expand Down
11 changes: 11 additions & 0 deletions torch/distributed/distributed_c10d.py
Expand Up @@ -233,6 +233,17 @@ def register_backend(cls, name, func, extended_api=False):
dist_backend = Backend


# NOTE(crcrpar): [ReduceOp static class attributes to support `isinstance`]
# `ReduceOp` was turned from an enum class into a struct holding a nested
# `RedOpType` enum so that `PREMUL_SUM` can carry a scale, created via
# `_make_nccl_premul_sum`. That broke the implicit enum-like contract that
# let users write e.g. `isinstance(ReduceOp.SUM, ReduceOp)`:
# https://github.com/pytorch/pytorch/issues/87191
# Restore it by mirroring every `RedOpType` member onto `ReduceOp` as a
# ReduceOp instance — except the names in DENY_LIST, which are left as raw
# RedOpType values because constructing them requires extra arguments.
DENY_LIST = ("PREMUL_SUM", )
for _name, _member in ReduceOp.RedOpType.__members__.items():
    _attr = _member if _name in DENY_LIST else ReduceOp(_member)
    setattr(ReduceOp, _name, _attr)


class _reduce_op(object):
r"""
Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``,
Expand Down

0 comments on commit f6c42ae

Please sign in to comment.