Commit e4c1908

committed Mar 12, 2025
Update MarkShardingFunction to be AOTAutograd traceable
This is so that we can use it in `scan` later. This has the side effect of making the function no longer in-place, because PyTorch custom_op blows up if the tensor is not cloned. As a side effect, it also "fixes" #8809.
1 parent 53b3ab8 commit e4c1908
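The change works by replacing the in-place `mark_sharding` call inside the autograd function with a call to a functional custom operator that has a fake (meta) implementation. A minimal sketch of that pattern, outside torch_xla and with a hypothetical op name (`mylib::tag_copy`), assuming PyTorch 2.4+ for `torch.library.custom_op`:

import torch

# Functional custom op: the real implementation must not mutate its input,
# hence the clone (this mirrors why the commit drops in-place sharding).
@torch.library.custom_op("mylib::tag_copy", mutates_args=())
def tag_copy(t: torch.Tensor) -> torch.Tensor:
  return t.clone()

# Shape/dtype-only stand-in used while AOTAutograd traces the graph.
@tag_copy.register_fake
def _(t: torch.Tensor) -> torch.Tensor:
  return torch.empty_like(t)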

2 files changed: +93 −10 lines changed

test/spmd/test_xla_sharding.py (+54 −3)
@@ -836,6 +836,10 @@ def test_mark_sharding_ir(self):
 
     self.assertTrue(torch.allclose(expected, actual.cpu()))
 
+  def _check_sharding_annotation(self, tensor, sharding_annotation):
+    hlo = torch_xla._XLAC._get_xla_tensors_hlo([tensor])
+    self.assertIn(sharding_annotation, hlo)
+
   @unittest.skipUnless(xr.global_runtime_device_count() > 1,
                        "Multiple devices required for autograd sharding test")
   def test_mark_sharding_autograd(self):
@@ -849,9 +853,56 @@ def test_mark_sharding_autograd(self):
     t = y.sum()
     # Backward pass
     t.backward()
-    hlo = torch_xla._XLAC._get_xla_tensors_hlo([z.grad])
-    sharding_annotation = 'sharding={devices=[1,%d]' % self.n_devices
-    self.assertIn(sharding_annotation, hlo)
+    self._check_sharding_annotation(z.grad,
+                                    'sharding={devices=[1,%d]' % self.n_devices)
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required for autograd sharding test")
+  def test_mark_sharding_aot_compile(self):
+    mesh = self._get_mesh((self.n_devices,))
+
+    def my_fn(x):
+      z = torch.sin(x)
+      y = MarkShardingFunction.apply(z, mesh, (0,))
+      return y + 42
+
+    from functorch.compile import aot_function, make_boxed_func  # type: ignore
+
+    x = torch.randn(8)
+    x = x.to('xla').requires_grad_(True)
+
+    graphs = []
+
+    def get_graph(gm: torch.fx.GraphModule, _):
+      graphs.append(gm)
+      return make_boxed_func(gm)
+
+    y = aot_function(my_fn, get_graph)(x)
+    t = y.sum()
+    t.backward()
+    torch_xla.sync()
+
+    sharding_spec = '{devices=[%d]' % self.n_devices
+
+    # Check that the output has sharding.
+    self.assertIn(sharding_spec, torch_xla._XLAC._get_xla_sharding_spec(y))
+
+    # Check that the gradient has sharding.
+    self.assertIsNotNone(x.grad)
+    self.assertIn(sharding_spec, torch_xla._XLAC._get_xla_sharding_spec(x.grad))
+
+    # Check that each AOTAutograd captured graph also contains a mark_sharding.
+    fwd, bwd = graphs
+
+    inp = torch.randn(8).to('xla').requires_grad_(False)
+    out, *residuals = fwd(inp)
+    self._check_sharding_annotation(out,
+                                    'sharding={devices=[%d]' % self.n_devices)
+
+    tangents = torch.randn(8).to('xla').requires_grad_(False)
+    out, = bwd(*residuals, tangents)
+    self._check_sharding_annotation(out,
+                                    'sharding={devices=[%d]' % self.n_devices)
 
   def test_sharded_tensor_aliasing(self):
     met.clear_all()

torch_xla/distributed/spmd/xla_sharding.py (+39 −7)
@@ -1250,24 +1250,56 @@ class MarkShardingFunction(torch.autograd.Function):
   of the intermediate tensors during backward pass.
 
   Usage:
-  new_tensor = MarkShardingFunction.apply(tensor, mesh, ('axis_1', 'axis_2'))
+
+  >>> new_tensor = MarkShardingFunction.apply(tensor, mesh, ('axis_1', 'axis_2'))
 
   This is required to guide GSPMD sharding propagation better during the
   backward pass as during complicated workloads the compiler can introduce extra
   collectives that can hurt performance.
+
+  Compared to `mark_sharding`, this version will not in-place shard input tensors.
+  Instead it takes in an unsharded tensor and returns a new tensor that is sharded.
+  After GSPMD sharding propagation in the compiler, both tensors will become sharded.
+
+  This version can also be used in AOTAutograd.
   """
 
   @staticmethod
   def forward(ctx, torch_tensor: torch.Tensor, mesh: Mesh,
-              partition_spec: Tuple) -> torch.Tensor:
-    mark_sharding(torch_tensor, mesh, partition_spec)
+              partition_spec) -> torch.Tensor:
+    o = _aot_mark_sharding(torch_tensor, str(mesh), str(partition_spec))
     ctx.partition_spec = partition_spec
     ctx.mesh = mesh
-    return torch_tensor
+    return o
 
   @staticmethod
-  def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
+  def backward(ctx, grad_output: torch.Tensor):  # type: ignore
     partition_spec = ctx.partition_spec
     mesh = ctx.mesh
-    mark_sharding(grad_output, mesh, partition_spec)
-    return grad_output, None, None
+    o = _aot_mark_sharding(grad_output, str(mesh), str(partition_spec))
+    return o, None, None
+
+
+@torch.library.custom_op("xla::aot_mark_sharding", mutates_args=())
+def _aot_mark_sharding(t: torch.Tensor, mesh: str,
+                       partition_spec: str) -> torch.Tensor:
+  if t is None:
+    return None
+
+  import ast
+
+  import torch_xla.distributed.spmd as xs
+
+  the_mesh = xs.Mesh.from_str(mesh)
+  assert the_mesh is not None
+  partition_spec_eval = ast.literal_eval(partition_spec)
+  return xs.mark_sharding(t.clone(), the_mesh,
+                          partition_spec_eval).global_tensor
+
+
+@_aot_mark_sharding.register_fake
+def aot_mark_sharding_fake(t: torch.Tensor, mesh: str,
+                           partition_spec: str) -> torch.Tensor:
+  if t is None:
+    return None
+  return torch.empty_like(t)
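For context on the docstring above: a hedged usage sketch contrasting the in-place `xs.mark_sharding` with the functional `MarkShardingFunction`. The mesh shape, tensor sizes, and partition specs are illustrative assumptions, and it requires an SPMD-enabled runtime with more than one device.

import numpy as np
import torch
import torch_xla
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction

xr.use_spmd()
num_devices = xr.global_runtime_device_count()
mesh = xs.Mesh(np.array(range(num_devices)), (num_devices,))

# In-place style: annotates `t` itself with the sharding spec.
t = torch.randn(16, 8).to('xla')
xs.mark_sharding(t, mesh, (0, None))

# Functional style: `u` is left alone and a new, sharded tensor is returned,
# which is what makes the op traceable by AOTAutograd; after GSPMD sharding
# propagation both tensors end up sharded.
u = torch.randn(16, 8).to('xla').requires_grad_(True)
v = MarkShardingFunction.apply(u, mesh, (0, None))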
