@@ -17,7 +17,7 @@
 )
 
 from torchtitan.components.loss import LossFunction
-from torchtitan.config import JobConfig
+from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
 from torchtitan.distributed.pipeline_parallel import (
     build_pipeline_schedule,
@@ -27,7 +27,23 @@
 
 from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction
 from torchtitan.tools.logging import logger
-
+from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
+
+root_mod = None
+
+class PipelineStagesWrapper(nn.Module):
+    """Wrapper to establish parent-child relationship for pipeline stages."""
+    def __init__(self, stages):
+        super().__init__()
+        # Store stages as actual child modules
+        for i, stage in enumerate(stages):
+            self.add_module(f"stage_{i}", stage)
+
+    def forward(self, x):
+        # This won't be called in pipeline mode, but FSDP requires it
+        for stage in self.children():
+            x = stage(x)
+        return x
 
 def pipeline_llama(
     model: nn.Module,
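Why the wrapper: `fully_shard` operates on a single `nn.Module` and its children, while pipeline splitting leaves each stage as a free-standing module with no common parent. A minimal standalone sketch of that ownership trick, using toy `nn.Linear` stages instead of transformer chunks (nothing below is torchtitan code):

```python
import torch.nn as nn

class StagesWrapper(nn.Module):
    """Registers each pipeline stage as a child so one root module owns them all."""
    def __init__(self, stages):
        super().__init__()
        for i, stage in enumerate(stages):
            self.add_module(f"stage_{i}", stage)

stages = [nn.Linear(8, 8), nn.Linear(8, 8)]
wrapper = StagesWrapper(stages)

print([name for name, _ in wrapper.named_children()])  # ['stage_0', 'stage_1']
print(sum(p.numel() for p in wrapper.parameters()))    # 144 = 2 * (8*8 + 8)
# Parameters are shared by reference, not copied, so sharding the wrapper
# later acts on the very tensors the pipeline stages run with.
print(stages[0].weight is wrapper.stage_0.weight)      # True
```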
@@ -136,6 +152,21 @@ def pipeline_llama( |
         # in case the model is modified e.g. by torch.compile
         stages[i].submod = m
 
+    if parallel_dims.fsdp_enabled:
+        world_mesh = parallel_dims.world_mesh
+        if parallel_dims.dp_replicate_enabled:
+            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+        else:
+            dp_mesh_dim_names = ("dp_shard_cp",)
+
+        mp_policy = MixedPrecisionPolicy(param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce])
+        fsdp_config = {"mesh": world_mesh[tuple(dp_mesh_dim_names)], "mp_policy": mp_policy}
+
+        # Wrap the model parts into a root-level FSDP Module
+        parent_module = PipelineStagesWrapper(model_parts)
+        global root_mod
+        root_mod = fully_shard(parent_module, **fsdp_config)
+
     pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)
 
     # This is used in the train loop to determine whether to pass in the input_ids and labels
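To try the wrapping call itself outside torchtitan, here is a hedged, self-contained sketch of the same `fully_shard` + `MixedPrecisionPolicy` pattern over a 1-D device mesh. It assumes a `torchrun` launch on CUDA devices with a recent PyTorch that exposes `fully_shard` under `torch.distributed.fsdp`; the mesh name, dtypes, and toy stages are illustrative, not torchtitan's configuration.

```python
# Sketch only: mirrors the diff's pattern (wrap the stages, then fully_shard the root),
# assuming `torchrun --nproc_per_node=<N> sketch.py` on CUDA devices.
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard

class StagesWrapper(nn.Module):
    """Toy stand-in for PipelineStagesWrapper: one root module owning all stages."""
    def __init__(self, stages):
        super().__init__()
        for i, stage in enumerate(stages):
            self.add_module(f"stage_{i}", stage)

local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
world_size = int(os.environ["WORLD_SIZE"])
mesh = init_device_mesh("cuda", (world_size,), mesh_dim_names=("dp_shard",))

stages = [nn.Linear(1024, 1024) for _ in range(2)]  # stand-ins for pipeline stages
wrapper = StagesWrapper(stages)

mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
root_mod = fully_shard(wrapper, mesh=mesh, mp_policy=mp_policy)

# Each stage module stays reachable (e.g. for a pipeline schedule), while
# root_mod is the single FSDP root the training loop can keep a handle on.
print(type(root_mod).__name__, [n for n, _ in root_mod.named_children()])
dist.destroy_process_group()
```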