From d523d78623403f214a981b0095a72f1839d9bead Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Fri, 14 Oct 2022 17:35:18 +0000
Subject: [PATCH 1/2] [MPS] Copy from CPU always add storageOffset (#86958)

Because why wouldn't it?

Fixes https://github.com/pytorch/pytorch/issues/86052

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86958
Approved by: https://github.com/kulinseth

(cherry picked from commit 13cff2ee8ea1d7aea2ad201cbd77ebe2b9a29d25)
---
 aten/src/ATen/native/mps/operations/Copy.mm | 3 +--
 test/test_mps.py                            | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 91b486374603..2bfee3f9a393 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -210,8 +210,7 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
                                           options:options
                                       deallocator:nil];
     sourceOffset = uintptr_t(host_src) - uintptr_t(alignedPtr);
-    if (src_.is_view() || !src_.is_contiguous())
-      sourceOffset += src_.storage_offset() * src_.itemsize();
+    sourceOffset += src_.storage_offset() * src_.itemsize();
 
     stream->copy_and_sync(sourceBuffer, destBuffer, size_to_copy, sourceOffset, dst_byte_offset, non_blocking);
     [sourceBuffer release];
diff --git a/test/test_mps.py b/test/test_mps.py
index 61ca9e14c543..d433e6ba890a 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -5686,6 +5686,15 @@ def test_view_copy_out(self, device="mps"):
         self.assertEqual(expected1, out1)
         self.assertEqual(expected2, out2)
 
+    def test_detached_view_copy(self, device="mps"):
+        # https://github.com/pytorch/pytorch/issues/86052
+        x = torch.arange(2)
+        # .detach() makes y not a view, but contig tensor
+        # with non-zero offset
+        y = x[1].detach()
+        z = y.to(device)
+        self.assertEqual(y, z.cpu())
+
     def test_empty_reshape(self, device="mps"):
         x = torch.randn(0, 6, device=device)
         self.assertEqual((1, 0, 6, 1, 1), x.reshape(1, 0, 6, 1, 1).shape)

From 1ad30ba28531660a55706eb0661da1fa795e3fce Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Fri, 21 Oct 2022 14:10:05 +0000
Subject: [PATCH 2/2] [MPS] Revamp copy_to_mps_ implementation (#86956)

Tensor's view in linear storage is represented by the following parameters:
`.shape`, `.stride()` and `.storage_offset()`.

Only tensors that are representable as 1d-views can be copied from host to
device (and vice versa) using a single
[`copy(from:sourceOffset:to:destinationOffset:size:)`](https://developer.apple.com/documentation/metal/mtlblitcommandencoder/1400767-copyfrombuffer?language=objc)
call.

Modify the `copy_to_mps_` function to do the following steps:
- Cast the `src` tensor to the `dst` data type if needed
- Expand the `src` tensor to the `dst` tensor shape
- Clone the `src` tensor if it is not stride-contiguous (i.e.
  cannot be represented by `src.view(src.numel())`)
- Create an empty tensor if `dst` is not stride-contiguous or if its strides
  differ from the (potentially cloned) `src` strides
- Do a 1d copy of `src` to the (potentially temporary) `dst`
- Finally, do the re-striding/copy on MPS if needed

Add tests to cover the cases where a stride-contiguous permuted tensor is
copied to MPS, where a non-stride-contiguous tensor is copied to MPS, and
where a permuted CPU tensor is copied to a differently permuted MPS tensor.

Fixes https://github.com/pytorch/pytorch/issues/86954

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86956
Approved by: https://github.com/kulinseth

(cherry picked from commit ae62cf7c02c009ab1123a3bfa08ca9a5e4255e4a)
---
 aten/src/ATen/native/mps/operations/Copy.mm | 84 +++++++++++++--------
 test/test_mps.py                            | 25 +++++-
 2 files changed, 77 insertions(+), 32 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 2bfee3f9a393..99183d21030c 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -33,6 +33,25 @@
   return (void*)alignedAddress;
 }
 
+/**
+ * Computes number of elements one needs to transfer to preserve all the elements
+ */
+size_t compute_strided_size(const at::Tensor& t) {
+  size_t rc = 1;
+  if (t.numel() == 0) {
+    return 0;
+  }
+  for(const auto i: c10::irange(t.dim())) {
+    assert(t.size(i) > 0);
+    rc += (t.size(i) - 1) * t.stride(i);
+  }
+  return rc;
+}
+
+bool is_strided_contiguous(const at::Tensor& t) {
+  return compute_strided_size(t) == t.numel();
+}
+
 // Copy sourceBuffer into destBuffer, casting sourceBuffer to src.scalar_type().
 // The shapes and dtypes are taken from dst and src, but their storage pointers are not used.
 void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
@@ -168,55 +187,60 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
   return dst_;
 }
 
-static at::Tensor& copy_to_mps_(at::Tensor& dst_, const at::Tensor& src_, bool non_blocking)
+// Copies tensor from cpu to mps backed by identical strided-contiguous data
+static void copy_to_mps_stride_contig(at::Tensor& dst, const at::Tensor& src, bool non_blocking)
 {
   MPSStream* stream = getCurrentMPSStream();
-  Tensor src;
   id<MTLDevice> device = MPSDevice::getInstance()->device();
-  auto dst_byte_offset = dst_.storage_offset() * dst_.itemsize();
-  id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
-  uint64_t src_total_size = 0;
-
-  // This is weird, but sometimes this function can be called
-  // with contiguous destination and non-contiguous source
-  if (src_.is_view() || dst_.is_contiguous() != src_.is_contiguous()) {
-    src = src_.to(dst_.dtype()).expand_as(dst_).contiguous();
-    // Get the actual size of a View (takes into account the storage offset)
-    // For View tensors, the storage offset can be bigger than what's being reported by nbytes
-    src_total_size = at::detail::computeStorageNbytesContiguous(src.sizes(), src.element_size(), src.storage_offset());
-  } else {
-    TORCH_INTERNAL_ASSERT(src_.strides() == dst_.strides());
-    src = src_;
-    if (src.dtype() != dst_.dtype()) {
-      // In case of dtype change, perform conversion on source device
-      src = src.to(dst_.dtype());
-    }
-    src_total_size = src.nbytes();
-  }
-
+  auto dst_byte_offset = dst.storage_offset() * dst.itemsize();
+  auto src_byte_offset = src.storage_offset() * src.itemsize();
+  id<MTLBuffer> destBuffer = getMTLBufferStorage(dst);
   const size_t size_to_copy = src.nbytes();
-  const void* host_src = src.storage().data();
-  TORCH_INTERNAL_ASSERT(src_total_size >=
-                        (src.storage_offset() * src.element_size()));
+  const void* host_src = static_cast<const char *>(src.storage().data()) + src_byte_offset;
+
+  TORCH_INTERNAL_ASSERT(src.dtype() == dst.dtype() && src.strides() == dst.strides() && is_strided_contiguous(src));
 
-  NSUInteger sourceOffset = 0;
   @autoreleasepool {
     MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared;
     NSUInteger alignedLength = 0;
+    NSUInteger sourceOffset = 0;
 
-    void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)src_total_size, &alignedLength);
+    void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)size_to_copy, &alignedLength);
     id<MTLBuffer> sourceBuffer = [device newBufferWithBytesNoCopy:alignedPtr
                                            length:alignedLength
                                           options:options
                                       deallocator:nil];
     sourceOffset = uintptr_t(host_src) - uintptr_t(alignedPtr);
-    sourceOffset += src_.storage_offset() * src_.itemsize();
 
     stream->copy_and_sync(sourceBuffer, destBuffer, size_to_copy, sourceOffset, dst_byte_offset, non_blocking);
     [sourceBuffer release];
   }
+}
 
-  return dst_;
+static at::Tensor& copy_to_mps_(at::Tensor& dst_, const at::Tensor& src_, bool non_blocking)
+{
+  // Typecast to dst_ if needed and expand, which is a no-op
+  Tensor src = (src_.dtype() != dst_.dtype() ? src_.to(dst_.dtype()) : src_).expand_as(dst_);
+
+  // If src is not contiguously strided it must be cloned
+  // It does not mean that tensor is contiguous, but rather
+  // that it could be represented as 1d view
+  if (!is_strided_contiguous(src)) {
+    src = src.clone();
+    TORCH_INTERNAL_ASSERT(is_strided_contiguous(src));
+  }
+  Tensor dst = dst_;
+  bool needs_copy = false;
+  // If src and dst_ strides do not match, it means that
+  // either dst_ is not representable as 1d view or its stride order is different
+  // in that case create an empty storage like src, copy it to device and then do
+  // reshaping on the device
+  if (src.strides() != dst_.strides()) {
+    needs_copy = true;
+    dst = at::empty_like(src, at::device(at::kMPS));
+  }
+  copy_to_mps_stride_contig(dst, src, non_blocking && !needs_copy);
+  return needs_copy ? dst_.copy_(dst) : dst_;
 }
 
 void copy_blit_mps(void* dst, const void* src, size_t size) {
diff --git a/test/test_mps.py b/test/test_mps.py
index d433e6ba890a..76716154ba05 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1596,9 +1596,9 @@ def test_storage_offset_greater_than_src_nbytes(self):
             tensor_list.append(t)
 
         for i in range(0, n_tensors - 1):
-            t = tensor_list[i].view(1, 784)
+            t = tensor_list[i].view(1, n_tensor_elems)
             t_mps = t.to("mps")
-            self.assertEqual(t, t_mps.cpu())
+            self.assertEqual(t, t_mps.cpu(), f"i={i}")
 
     # See https://github.com/pytorch/pytorch/issues/82427
     # and https://github.com/pytorch/pytorch/issues/83692
@@ -1649,6 +1649,27 @@ def test_from_numpy_non_contiguous(self):
         t_mps = torch.tensor(a, device="mps")
         self.assertEqual(t_cpu, t_mps.to("cpu"))
 
+    # See https://github.com/pytorch/pytorch/issues/86954
+    def test_copy_non_contiguous(self):
+        x = torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1)
+        self.assertFalse(x.is_contiguous())
+        y = x.to('mps')
+        self.assertFalse(y.is_contiguous())
+        self.assertEqual(x, y.to('cpu'))
+
+        x = torch.arange(4**3).reshape(4, 4, 4).permute((2, 0, 1))[1:, ::2]
+        y = x.to('mps')
+        self.assertEqual(x, y.to('cpu'))
+
+        x = torch.full((4, 4, 4, 4), 13, device="cpu")
+        y = torch.full((4, 4, 4, 4), 13, device="mps")
+        z = torch.arange(4**4).reshape(4, 4, 4, 4).permute(3, 2, 0, 1)[1::, ::2]
+        x.permute(3, 2, 1, 0)[1::, ::2] = z
+        # As y is on MPS and z on CPU, this dispatches to a copy operator
+        y.permute(3, 2, 1, 0)[1::, ::2] = z
+        self.assertEqual(x, y.to('cpu'))
+
+
 
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
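
Beyond the patches themselves, here is a minimal Python sketch (not part of the patch series) of the strided-contiguity check that the new `compute_strided_size`/`is_strided_contiguous` helpers implement in Copy.mm. It assumes only that PyTorch is installed, runs entirely on CPU, and the Python function names simply mirror the C++ helpers:

```python
# Illustration only: a Python analogue of the compute_strided_size /
# is_strided_contiguous helpers this patch adds to Copy.mm.
# Requires PyTorch; runs entirely on CPU.
import torch

def compute_strided_size(t: torch.Tensor) -> int:
    # Number of storage elements that must be transferred to preserve all of t's elements
    if t.numel() == 0:
        return 0
    rc = 1
    for size, stride in zip(t.shape, t.stride()):
        rc += (size - 1) * stride
    return rc

def is_strided_contiguous(t: torch.Tensor) -> bool:
    # True when t's elements occupy a dense, gap-free range of its storage,
    # so a single 1d blit can move all of them between host and device
    return compute_strided_size(t) == t.numel()

x = torch.arange(27).reshape(3, 3, 3)
print(is_strided_contiguous(x))                            # True: contiguous
print(is_strided_contiguous(x.permute(2, 0, 1)))           # True: permuted but dense
print(is_strided_contiguous(x.permute(2, 0, 1)[1:, ::2]))  # False: slicing leaves gaps
```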
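
And a usage-level sketch of the copy paths the new `test_copy_non_contiguous` cases exercise. It is an illustration rather than part of the test suite: it requires PyTorch and only runs the round-trip when an MPS device is available.

```python
# Illustration only: the CPU -> MPS copy paths exercised by test_copy_non_contiguous.
# Requires PyTorch; skips the device round-trip when MPS is unavailable.
import torch

def roundtrip(t: torch.Tensor) -> None:
    if not torch.backends.mps.is_available():
        print("MPS not available; skipping", tuple(t.shape))
        return
    m = t.to("mps")                  # host-to-device copy path revamped by this patch
    assert torch.equal(t, m.cpu())   # values must survive any stride order / permutation

# stride-contiguous but permuted: dense storage, reordered strides
roundtrip(torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1))
# not stride-contiguous: slicing leaves gaps, so the new code clones before copying
roundtrip(torch.arange(64).reshape(4, 4, 4).permute(2, 0, 1)[1:, ::2])
```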