import torch.distributed as dist
import torch.distributed.checkpoint as dcp
import torch.nn as nn
+from torch.distributed._state_dict_utils import _copy_state_dict, _create_cpu_state_dict
from torch.distributed.checkpoint.state_dict import (
    get_model_state_dict,
    set_model_state_dict,
@@ -144,13 +145,18 @@ def __init__(
        lr_schedulers: LRSchedulersContainer,
        states: Dict[str, Any],
        job_config: JobConfig,
+        ft_manager: Optional[Any] = None,
    ) -> None:
        ckpt_config = job_config.checkpoint
        self.enable_checkpoint = ckpt_config.enable_checkpoint
-        self.keep_latest_k = ckpt_config.keep_latest_k
+        self.ft_manager = ft_manager
+        self.enable_staging = (
+            self.enable_checkpoint and async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM
+        ) or self.ft_manager

-        if not self.enable_checkpoint:
+        if not self.enable_checkpoint and self.ft_manager is None:
            return
+
        """
        Note: Pipeline Parallelism and Virtual Stages

@@ -185,6 +191,13 @@ def __init__(
            }
        )

+        async_mode = ckpt_config.async_mode.lower()
+        self.staging = False
+        self.sending_to_checkpoint_mp = False
+        self.staging_id = None
+        self.cpu_offload_state_dict = None
+        self.staging_stream = torch.cuda.Stream() if self.enable_staging else None
+
        self.folder = os.path.join(job_config.job.dump_folder, ckpt_config.folder)
        self.interval_type = (
            IntervalType.SECONDS
@@ -199,6 +212,7 @@ def __init__(
        if async_mode == AsyncMode.ASYNC or self.interval_type == IntervalType.SECONDS:
            self.pg = dist.new_group(backend="gloo")

+        self.keep_latest_k = ckpt_config.keep_latest_k
        self.model_weights_only = ckpt_config.model_weights_only
        self.export_dtype = TORCH_DTYPE_MAP[ckpt_config.export_dtype]
        self.exclude_from_loading = ckpt_config.exclude_from_loading
@@ -223,10 +237,6 @@ def __init__(
                daemon=True,
            )
            self.mp.start()
-            self.cpu_offload_state_dict = None
-            self.staging = False
-            self.staging_id = None
-            self.staging_stream = torch.cuda.Stream()
        else:
            raise ValueError(f"Unkown checkpoint async_mode {ckpt_config.async_mode}")

@@ -240,8 +250,61 @@ def __del__(self):
            self.mp.join()

    def reset(self) -> None:
+        # We need to stage the local state if another replica joins during the
+        # first step.
+        if self.ft_manager:
+            self.cpu_staging(None)
        self.begin_time = time.monotonic()

+    def _initialize_states(
+        self,
+        states: Dict[str, Any],
+        dataloader: DataLoader,
+        model_parts: List[nn.Module],
+        optimizers: OptimizersContainer,
+        lr_schedulers: LRSchedulersContainer,
+    ) -> None:
+        """
+        Note: Pipeline Parallelism and Virtual Stages
+
+        1. Even for simple PP schedules, there is a separate optimizer for each PP rank.
+        rank0's optimizer would have a param_group[0] which refers to layers.0 in the
+        original model. rank1's would _also_ have a param_group[0], since it's index based,
+        but referring to layers.1.
+        When saving, these collide and one of them is lost. Then when reloading, only one
+        stage can restore its optimizer states, others will error.
+
+        The solution to this problem is optimizer flattening: it landed in #127071
+        and is enabled in TorchTitan by passing the 'flatten_optimizer_state_dict'
+        kwarg to DCP functions called in the OptimizerContainer.
+
+        2. With complex PP schedules, we have multiple model chunks per pp rank. This
+        compounds challenge (1) by also requiring us to reason about multiple 'optim'
+        objects locally.
+
+        We solve this in the Model and Optimizer wrapper classes by flattening the
+        state dicts from each object into one state dict before saving/loading.
+        We rely on the individual state_dicts to not collide, which is guaranteed for
+        the model by correct pipeline splitting and for the optimizer by the flattening
+        support described in (1).
+
+        3. LR schedulers also index model states like optimizers and would need to be
+        flattened properly to support resharding. Unfortunately, the implementations of
+        different lr_schedulers do not follow a clear pattern like optimizers do, so it's
+        hard to write a generic 'flattener' utility.
+
+        TODO: This is currently unsolved and needs a fix.
+        """
+        self.states = states
+        self.states.update(
+            {
+                "model": ModelWrapper(model_parts),
+                "optimizer": optimizers,
+                "dataloader": dataloader,
+                "lr_scheduler": lr_schedulers,
+            }
+        )
+
    def _create_checkpoint_id(self, step: int) -> str:
        return os.path.join(self.folder, f"step-{step}")

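For reference, a minimal sketch of the optimizer flattening described in the docstring above (the toy model and names are illustrative, not TorchTitan's wrapper code). With flatten_optimizer_state_dict=True, DCP keys optimizer state by fully qualified parameter name instead of by param_group index, which is what avoids the cross-PP-rank collisions described in point (1):

# Illustrative sketch, not part of the diff above.
import torch
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_optimizer_state_dict,
)

model = nn.Linear(8, 8)
optim = torch.optim.AdamW(model.parameters())
model(torch.randn(2, 8)).sum().backward()
optim.step()  # populate exp_avg / exp_avg_sq so there is state to flatten

# Keys embed fully qualified parameter names rather than integer indices,
# so state dicts produced by different pipeline stages no longer collide.
flat_sd = get_optimizer_state_dict(
    model,
    optim,
    options=StateDictOptions(flatten_optimizer_state_dict=True),
)
print(sorted(flat_sd.keys()))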
@@ -324,31 +387,8 @@ def _async_wait(self) -> None:
        self.async_future.result()

    def _async_with_pinned_memory(self, checkpoint_id: str) -> None:
-        try:
-            from torch.distributed._state_dict_utils import (
-                _copy_state_dict,
-                _create_cpu_state_dict,
-            )
-        except ImportError as e:
-            raise ImportError(
-                "Please install the latest PyTorch nightly to use async checkpointing with pinned memory."
-            ) from e
-        state_dict = dcp.state_dict_saver._stateful_to_state_dict(self.states)
-        if self.cpu_offload_state_dict is None:
-            logger.debug(f"Preparing the CPU memory, {time.monotonic()=}.:.2f")
-            self.cpu_offload_state_dict = _create_cpu_state_dict(
-                state_dict, pin_memory=True, share_memory=True
-            )
-
-        logger.debug(f"Staging the state_dict, {time.monotonic()=}.:.2f")
-        with torch.cuda.stream(self.staging_stream):
-            self.cpu_offload_state_dict = _copy_state_dict(
-                state_dict,
-                self.cpu_offload_state_dict,
-                non_blocking=True,
-            )
-        self.staging = True
-        self.staging_id = checkpoint_id
+        self.cpu_staging(checkpoint_id)
+        self.sending_to_checkpoint_mp = True

    def save(self, curr_step: int, force: bool = False) -> None:
        """
@@ -358,6 +398,8 @@ def save(self, curr_step: int, force: bool = False) -> None:
        for initial seed checkpoint.
        """
        if not self._should_save(curr_step, force):
+            if self.ft_manager:
+                self.cpu_staging(None)
            return

        begin = time.monotonic()
@@ -381,26 +423,51 @@ def save(self, curr_step: int, force: bool = False) -> None:
            f"in {time.monotonic() - begin:.2f} seconds."
        )

+    def cpu_staging(self, checkpoint_id: Optional[str]) -> None:
+        """Offload state_dict to CPU memory"""
+        state_dict = dcp.state_dict_saver._stateful_to_state_dict(self.states)
+        if self.cpu_offload_state_dict is None:
+            logger.debug(f"Preparing the CPU memory, {time.monotonic()=}.:.2f")
+            self.cpu_offload_state_dict = _create_cpu_state_dict(
+                state_dict, pin_memory=True, share_memory=True
+            )
+
+        logger.debug(f"Staging the state_dict, {time.monotonic()=}.:.2f")
+        with torch.cuda.stream(self.staging_stream):
+            self.cpu_offload_state_dict = _copy_state_dict(
+                state_dict,
+                self.cpu_offload_state_dict,
+                non_blocking=True,
+            )
+        self.staging = True
+        self.staging_id = checkpoint_id
+
+    def wait_for_staging(self) -> None:
+        if not self.staging_stream.query():
+            self.staging_stream.synchronize()
+        self.staging = False
+
+    def staging_results(self) -> Dict[str, Any]:
+        self.maybe_wait_for_staging()
+        return self.cpu_offload_state_dict
+
    def maybe_wait_for_staging(self) -> None:
-        if (
-            self.enable_checkpoint
-            and self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM
-            and self.staging
-        ):
-            if not self.staging_stream.query():
-                self.staging_stream.synchronize()
-
-            def sync_func():
-                self.mp_queue_send.put_nowait(
-                    (self.cpu_offload_state_dict, self.staging_id)
-                )
-
-            # This may be a faster way to do zero-overhead checkpointing staging
-            # checkpointing but we need more thorough investigation before
-            # swithing to this method.
-            # self.my_thread = threading.Thread(target=func).start()
-            sync_func()
-            self.staging = False
+        if self.enable_staging and self.staging:
+            self.wait_for_staging()
+
+        if self.sending_to_checkpoint_mp:
+            # Copy the staged result to the checkpoint process.
+            def sync_func():
+                self.mp_queue_send.put_nowait(
+                    (self.cpu_offload_state_dict, self.staging_id)
+                )
+
+            # This may be a faster way to do zero-overhead checkpoint
+            # staging, but we need more thorough investigation before
+            # switching to this method.
+            # self.my_thread = threading.Thread(target=func).start()
+            sync_func()
+            self.sending_to_checkpoint_mp = False

    def load(self, step: int = -1) -> bool:
        if not self.enable_checkpoint:
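The staging helpers above follow a common pinned-memory offload pattern: allocate pinned CPU buffers once, issue device-to-host copies with non_blocking=True on a side CUDA stream so training can keep running, and synchronize that stream before anything reads the staged copy. A minimal sketch with plain tensors and public PyTorch APIs (the function and variable names here are illustrative, not TorchTitan's implementation):

# Illustrative sketch, not part of the diff above.
from typing import List, Optional
import torch

def stage_to_cpu(
    gpu_tensors: List[torch.Tensor],
    cpu_buffers: Optional[List[torch.Tensor]],
    stream: torch.cuda.Stream,
) -> List[torch.Tensor]:
    # Allocate pinned CPU buffers once and reuse them across checkpoints.
    if cpu_buffers is None:
        cpu_buffers = [
            torch.empty(t.shape, dtype=t.dtype, device="cpu", pin_memory=True)
            for t in gpu_tensors
        ]
    # Let the side stream see all pending work that produced the source tensors.
    stream.wait_stream(torch.cuda.current_stream())
    # Issue async device-to-host copies; the default stream is free to continue.
    with torch.cuda.stream(stream):
        for src, dst in zip(gpu_tensors, cpu_buffers):
            dst.copy_(src, non_blocking=True)
    return cpu_buffers

# Usage, mirroring wait_for_staging(): synchronize before the buffers are read,
# and do not overwrite the source tensors until the copies have finished.
#   staging_stream = torch.cuda.Stream()
#   buffers = stage_to_cpu(tensors, None, staging_stream)
#   staging_stream.synchronize()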