apache · masahi · May 13, 2024 · Apr 30, 2024
diff --git a/python/tvm/dlight/base/transform.py b/python/tvm/dlight/base/transform.py
@@ -36,6 +36,14 @@ def _is_scheduled(func: tir.PrimFunc) -> bool:
     return func.attrs["tir.is_scheduled"] == 1
 
 
+def _get_target(func: tir.PrimFunc) -> Target:
+    target = func.attrs.get("target")
+    if target is None:
+        return Target.current(allow_none=False)
+    else:
+        return target
+
+
 @module_pass(opt_level=0, name="ApplyDefaultSchedule")
 class ApplyDefaultSchedule:  # pylint: disable=too-few-public-methods
     """A IRModule pass that applies a list of ScheduleRules to all PrimFuncs in the module."""
@@ -55,10 +63,11 @@ def transform_module(  # pylint: disable=missing-function-docstring
         mod: IRModule,
         _: PassContext,
     ) -> IRModule:
-        target = Target.current(allow_none=False)
         updated_functions = {}
         for g_var, func in mod.functions_items():
             if isinstance(func, tir.PrimFunc) and not _is_scheduled(func):
+                target = _get_target(func)
+
                 sch = _apply_rules(func, target, self.rules, tunable=False)
                 if sch is not None:
                     assert len(sch) == 1

diff --git a/tests/python/dlight/test_gpu_fallback.py b/tests/python/dlight/test_gpu_fallback.py
@@ -179,5 +179,83 @@ def expected(var_pages: T.handle, var_page_table_indptr: T.handle, var_page_tabl
     assert_structural_equal(mod["main"], expected)
 
 
+def test_gpu_fallback_ignores_non_gpu_functions():
+    @I.ir_module
+    class Before:
+        # This function has no "target" attribute, and is scheduled
+        # using the `Target.current`.
+        @T.prim_func
+        def gpu_func(
+            A: T.Buffer((1, 32, 1, 128), "float16"),
+            C: T.Buffer((1, 1, 4096), "float16"),
+        ):
+            B = T.alloc_buffer((1, 1, 32, 128), "float16")
+            for i, j, k, l in T.grid(1, 1, 32, 128):
+                with T.block("T_transpose"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk, vl] = A[vi, vk, vj, vl]
+            for i, j, k in T.grid(1, 1, 4096):
+                with T.block("T_reshape"):
+                    vi, vj, vk = T.axis.remap("SSS", [i, j, k])
+                    C[vi, vj, vk] = B[0, 0, vk % 4096 // 128, vk % 128]
+
+        # This function is identical, except that it is explicitly
+        # annotated with the "target" attribute, and is scheduled
+        # based on the annotation's target.
+        @T.prim_func
+        def cpu_func(
+            A: T.Buffer((1, 32, 1, 128), "float16"),
+            C: T.Buffer((1, 1, 4096), "float16"),
+        ):
+            T.func_attr({"target": T.target("llvm")})
+            B = T.alloc_buffer((1, 1, 32, 128), "float16")
+            for i, j, k, l in T.grid(1, 1, 32, 128):
+                with T.block("T_transpose"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk, vl] = A[vi, vk, vj, vl]
+            for i, j, k in T.grid(1, 1, 4096):
+                with T.block("T_reshape"):
+                    vi, vj, vk = T.axis.remap("SSS", [i, j, k])
+                    C[vi, vj, vk] = B[0, 0, vk % 4096 // 128, vk % 128]
+
+    @I.ir_module
+    class After:
+        @T.prim_func
+        def gpu_func(
+            A: T.Buffer((1, 32, 1, 128), "float16"),
+            C: T.Buffer((1, 1, 4096), "float16"),
+        ):
+            T.func_attr({"tir.is_scheduled": 1})
+            for ax0_fused_0 in T.thread_binding(4, thread="blockIdx.x"):
+                for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"):
+                    with T.block("T_reshape"):
+                        v0 = T.axis.spatial(4096, ax0_fused_0 * 1024 + ax0_fused_1)
+                        T.reads(A[0, v0 // 128, 0, v0 % 128])
+                        T.writes(C[0, 0, v0])
+                        C[0, 0, v0] = A[0, v0 // 128, 0, v0 % 128]
+
+        @T.prim_func
+        def cpu_func(
+            A: T.Buffer((1, 32, 1, 128), "float16"),
+            C: T.Buffer((1, 1, 4096), "float16"),
+        ):
+            T.func_attr({"target": T.target("llvm")})
+            B = T.alloc_buffer((1, 1, 32, 128), "float16")
+            for i, j, k, l in T.grid(1, 1, 32, 128):
+                with T.block("T_transpose"):
+                    vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
+                    B[vi, vj, vk, vl] = A[vi, vk, vj, vl]
+            for i, j, k in T.grid(1, 1, 4096):
+                with T.block("T_reshape"):
+                    vi, vj, vk = T.axis.remap("SSS", [i, j, k])
+                    C[vi, vj, vk] = B[0, 0, vk % 4096 // 128, vk % 128]
+
+    with Target("cuda"):
+        mod = dl.ApplyDefaultSchedule(  # pylint: disable=not-callable
+            dl.gpu.Fallback(),
+        )(Before)
+    assert_structural_equal(mod, After)
+
+
 if __name__ == "__main__":
     tvm.testing.main()