PR #10495: Enforce same input/output layout for offloading ops
Imported from GitHub PR openxla/xla#10495

This patch ensures that the host offloader does not introduce layout mismatches
when it removes the offloading custom calls, by constraining layout assignment
to give each such custom call's output the same layout as its input.
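
For illustration, here is a minimal HLO sketch of the failure mode the constraint guards against; the module, instruction names, and the "fixed_call" target with its {1,2,0} operand constraint are hypothetical and simply mirror the new tests. Without the constraint, layout assignment could assign the MoveToHost result the {1,2,0} layout demanded by its user while the parameter keeps {2,1,0}; once the offloader strips the custom call, producer and consumer would be joined with mismatched layouts.

HloModule OffloadLayoutSketch

ENTRY main {
  // The parameter keeps the entry layout {2,1,0}.
  p0 = f32[2,5,5]{2,1,0} parameter(0)
  // Without the constraint, this result could be assigned {1,2,0} to satisfy
  // the user below, diverging from the operand's layout.
  offload = f32[2,5,5] custom-call(p0), custom_call_target="MoveToHost"
  // A user that pins its operand to a non-default layout.
  ROOT consumer = f32[2,5,5]{2,1,0} custom-call(offload), custom_call_target="fixed_call", operand_layout_constraints={f32[2,5,5]{1,2,0}}
}

With the override below, the MoveToHost/MoveToDevice calls are treated as layout-preserving, so a conflicting user layout is resolved by layout assignment around the call rather than by changing the call's own layout, and removing the call later leaves consistent layouts behind.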
Copybara import of the project:

--
ce6aeac3ac5445ab014d36c92a181dbe7afc35e7 by Jaroslav Sevcik <jsevcik@nvidia.com>:

Enforce same input/output layout for offloading ops

Merging this change closes #10495

FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#10495 from jaro-sevcik:host-offload-call-layout-fix ce6aeac3ac5445ab014d36c92a181dbe7afc35e7
PiperOrigin-RevId: 628707325
jaro-sevcik authored and tensorflower-gardener committed Apr 29, 2024
1 parent 57f8705 commit aeb8034
Showing 4 changed files with 79 additions and 0 deletions.
1 change: 1 addition & 0 deletions third_party/xla/xla/service/gpu/BUILD
@@ -4089,6 +4089,7 @@ cc_library(
"//xla:xla_proto_cc",
"//xla/hlo/ir:hlo",
"//xla/service:computation_layout",
"//xla/service:host_memory_offload_annotations_hdr",
"//xla/service:layout_assignment",
"//xla/service:logical_buffer",
"//xla/stream_executor",
19 changes: 19 additions & 0 deletions third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
@@ -41,6 +41,7 @@ limitations under the License.
#include "xla/service/gpu/matmul_utils.h"
#include "xla/service/gpu/reduction_utils.h"
#include "xla/service/gpu/stream_executor_util.h"
#include "xla/service/host_memory_offload_annotations.h"
#include "xla/service/logical_buffer.h"
#include "xla/shape.h"
#include "xla/shape_layout.h"
@@ -568,5 +569,23 @@ bool GpuLayoutAssignment::PropagateReductionLayoutToOperand(
{/*is_row_reduction=*/true, {1, kept_dimension_size, reduction_size}});
}

bool GpuLayoutAssignment::InstructionCanChangeLayoutInstance(
const HloInstruction* instruction) {
// The host offloading custom calls will eventually be removed by the
// offloader, so we need to make sure that they do not change the layout
// and thus cause layout mismatches after the removal.
const HloCustomCallInstruction* custom_call =
DynCast<HloCustomCallInstruction>(instruction);
if (custom_call != nullptr &&
(custom_call->custom_call_target() ==
host_memory_offload_annotations::kMoveToHostCustomCallTarget ||
custom_call->custom_call_target() ==
host_memory_offload_annotations::kMoveToDeviceCustomCallTarget)) {
return false;
}

return LayoutAssignment::InstructionCanChangeLayoutInstance(instruction);
}

} // namespace gpu
} // namespace xla
3 changes: 3 additions & 0 deletions third_party/xla/xla/service/gpu/gpu_layout_assignment.h
@@ -68,6 +68,9 @@ class GpuLayoutAssignment : public LayoutAssignment {

bool PropagateReductionLayoutToOperand(const HloInstruction* user) override;

bool InstructionCanChangeLayoutInstance(
const HloInstruction* instruction) override;

const se::GpuComputeCapability gpu_version_;
const se::dnn::VersionInfo dnn_version_;
};
56 changes: 56 additions & 0 deletions third_party/xla/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -421,6 +421,62 @@ ENTRY entry {
expect_layout(call_0->operand(1)->shape(), {1, 2, 0});
}

TEST_F(LayoutAssignmentTest, MoveToHostCustomCallConstrained) {
const char* module_str = R"(
HloModule TestModule
ENTRY entry {
Arg_0 = f32[2,5,5]{2,1,0} parameter(0)
custom-call.0 = f32[2,5,5] custom-call(Arg_0), custom_call_target="MoveToHost"
ROOT custom-call.1 = f32[2,5,5]{2,1,0} custom-call(custom-call.0), custom_call_target="fixed_call", operand_layout_constraints={f32[2,5,5]{1,2,0}}
}
)";
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
ParseAndReturnVerifiedModule(module_str));
ComputationLayout computation_layout(
m->entry_computation()->ComputeProgramShape());

GpuLayoutAssignment layout_assignment(
&computation_layout, GetGpuComputeCapability(), GetDnnVersion());

EXPECT_THAT(layout_assignment.Run(m.get()), IsOkAndHolds(true));

const HloInstruction* call_0 = FindInstruction(m.get(), "custom-call.0");
const Layout input_layout = call_0->operand(0)->shape().layout();
const Layout output_layout = call_0->shape().layout();
EXPECT_TRUE(LayoutUtil::Equal(input_layout, output_layout))
<< "Expected the same input/output layouts. Input: " << input_layout
<< ". Output: " << output_layout;
}

TEST_F(LayoutAssignmentTest, MoveToDeviceCustomCallConstrained) {
const char* module_str = R"(
HloModule TestModule
ENTRY entry {
Arg_0 = f32[2,5,5]{2,1,0} parameter(0)
custom-call.0 = f32[2,5,5] custom-call(Arg_0), custom_call_target="MoveToDevice"
ROOT custom-call.1 = f32[2,5,5]{2,1,0} custom-call(custom-call.0), custom_call_target="fixed_call", operand_layout_constraints={f32[2,5,5]{1,2,0}}
}
)";
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
ParseAndReturnVerifiedModule(module_str));
ComputationLayout computation_layout(
m->entry_computation()->ComputeProgramShape());

GpuLayoutAssignment layout_assignment(
&computation_layout, GetGpuComputeCapability(), GetDnnVersion());

EXPECT_THAT(layout_assignment.Run(m.get()), IsOkAndHolds(true));

const HloInstruction* call_0 = FindInstruction(m.get(), "custom-call.0");
const Layout input_layout = call_0->operand(0)->shape().layout();
const Layout output_layout = call_0->shape().layout();
EXPECT_TRUE(LayoutUtil::Equal(input_layout, output_layout))
<< "Expected the same input/output layouts. Input: " << input_layout
<< ". Output: " << output_layout;
}

TEST_F(LayoutAssignmentTest, ConvCuDNNF8) {
if (!GetCudaComputeCapability().IsAtLeast(
se::CudaComputeCapability::HOPPER)) {
