From 4f2d869095034301b903cd2ef807b416547c0d9c Mon Sep 17 00:00:00 2001 From: atalman Date: Mon, 24 Oct 2022 19:38:07 +0000 Subject: [PATCH] Fix distributed issue by including distributed files (#87615) This fixes a regression in the installation of distributed headers, caused by the following PR, which removed the inclusions: https://github.com/pytorch/pytorch/pull/85953 Fixes #87173 Test plan: inspected the wheel built by this CI run: https://github.com/pytorch/pytorch/actions/runs/3314742519 ``` [ec2-user@ip-10-0-9-132 c10d]$ pwd /home/ec2-user/actions-runner/_work/_temp/artifacts/torch/include/torch/csrc/distributed/c10d [ec2-user@ip-10-0-9-132 c10d]$ ls -las total 300 4 drwxr-xr-x 2 ec2-user ec2-user 4096 Oct 24 19:12 . 0 drwxr-xr-x 4 ec2-user ec2-user 29 Oct 24 19:12 .. 12 -rw-r--r-- 1 ec2-user ec2-user 9051 Oct 24 17:28 Backend.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 216 Oct 24 17:28 c10d.h 4 -rw-r--r-- 1 ec2-user ec2-user 3880 Oct 24 17:28 comm.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 604 Oct 24 17:28 debug.h 4 -rw-r--r-- 1 ec2-user ec2-user 1717 Oct 24 17:28 default_comm_hooks.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1316 Oct 24 17:28 error.h 4 -rw-r--r-- 1 ec2-user ec2-user 962 Oct 24 17:28 exception.h 4 -rw-r--r-- 1 ec2-user ec2-user 1461 Oct 24 17:28 FileStore.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 771 Oct 24 17:28 GlooDeviceFactory.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1154 Oct 24 17:28 HashStore.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 4058 Oct 24 17:28 logger.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2059 Oct 24 17:28 logging.h 8 -rw-r--r-- 1 ec2-user ec2-user 7979 Oct 24 17:28 NCCLUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2756 Oct 24 17:28 Ops.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1814 Oct 24 17:28 ParamCommsUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1478 Oct 24 17:28 PrefixStore.hpp 16 -rw-r--r-- 1 ec2-user ec2-user 13235 Oct 24 17:28 ProcessGroupGloo.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 11298 Oct 24 17:28 ProcessGroup.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 8645 Oct 24 17:28 ProcessGroupMPI.hpp 28 
-rw-r--r-- 1 ec2-user ec2-user 26526 Oct 24 17:28 ProcessGroupNCCL.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 3805 Oct 24 17:28 ProcessGroupRoundRobin.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 10361 Oct 24 17:28 ProcessGroupUCC.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 5062 Oct 24 17:28 ProcessGroupWrapper.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4201 Oct 24 17:28 PyProcessGroup.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1072 Oct 24 17:28 python_comm_hook.h 24 -rw-r--r-- 1 ec2-user ec2-user 23859 Oct 24 17:28 reducer.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2330 Oct 24 17:28 reducer_timer.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1683 Oct 24 17:28 sequence_num.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2108 Oct 24 17:28 socket.h 4 -rw-r--r-- 1 ec2-user ec2-user 2589 Oct 24 17:28 Store.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 3264 Oct 24 17:28 TCPStore.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 6944 Oct 24 17:28 TraceUtils.h 8 -rw-r--r-- 1 ec2-user ec2-user 4539 Oct 24 17:28 Types.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 580 Oct 24 17:28 UCCForNCCL.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2301 Oct 24 17:28 UCCTracing.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4933 Oct 24 17:28 UCCUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 584 Oct 24 17:28 UnixSockUtils.hpp 24 -rw-r--r-- 1 ec2-user ec2-user 20796 Oct 24 17:28 Utils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 575 Oct 24 17:28 WinSockUtils.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4259 Oct 24 17:28 Work.hpp ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87615 Approved by: https://github.com/malfet --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f844c690b74f..e3eb3ced6005 100644 --- a/setup.py +++ b/setup.py @@ -1097,7 +1097,8 @@ def main(): 'include/torch/csrc/autograd/generated/*.h', 'include/torch/csrc/autograd/utils/*.h', 'include/torch/csrc/cuda/*.h', - 'include/torch/csrc/distributed/c10d/exception.h', + 'include/torch/csrc/distributed/c10d/*.h', + 'include/torch/csrc/distributed/c10d/*.hpp', 
'include/torch/csrc/distributed/rpc/*.h', 'include/torch/csrc/jit/*.h', 'include/torch/csrc/jit/backends/*.h',