Commit 8bd8c93

Move fqn mapping logic to StateDictAdapter (#1557)
This moves the logic that parses `model.safetensors.index.json` and generates the `fqn_to_index_mapping` to `StateDictAdapter`, since this logic should be shared by all classes that inherit from `StateDictAdapter`.
1 parent a6972ae commit 8bd8c93

File tree: 8 files changed (+45, -33 lines)
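For context, the mapping being moved is derived from the standard Hugging Face sharded-checkpoint index, whose `weight_map` ties each parameter fqn to a shard file. Below is a minimal sketch of that derivation; the key and file names are illustrative, not taken from this commit:

```python
import re

# Illustrative shape of model.safetensors.index.json ("weight_map" is the
# standard HF key; these entries are made-up examples).
index = {
    "weight_map": {
        "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
        "lm_head.weight": "model-00004-of-00004.safetensors",
    }
}

# The shared logic keeps only the first run of digits (the shard number)
# for each fqn, mirroring the re.search(r"\d+", ...) call in the commit.
fqn_to_index_mapping = {
    fqn: re.search(r"\d+", shard).group(0)
    for fqn, shard in index["weight_map"].items()
}
print(fqn_to_index_mapping)
# {'model.embed_tokens.weight': '00001', 'lm_head.weight': '00004'}
```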

scripts/checkpoint_conversion/convert_to_hf.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -65,7 +65,7 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor, hf_assets_pat
         "--hf_assets_path",
         type=Path,
         help="Path to HF assets directory. This is used to get the model.safetensors.index.json mapping",
-        default="./assets/hf/Llama3.1-8B",
+        default="./assets/hf/Llama-3.1-8B",
     )
     parser.add_argument("--model_name", type=str, nargs="?", default="llama3")
     parser.add_argument("--model_flavor", type=str, nargs="?", default="8B")
```

torchtitan/components/checkpoint.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -37,7 +37,7 @@
 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.optimizer import OptimizersContainer
 from torchtitan.config import Checkpoint as CheckpointConfig, TORCH_DTYPE_MAP
-from torchtitan.protocols import StateDictAdapter
+from torchtitan.protocols import BaseStateDictAdapter
 from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import GarbageCollection

@@ -177,7 +177,7 @@ class CheckpointManager:
         checkpoint_config (Checkpoint): The config used to configure the checkpointing.
         base_folder (str): The base folder to save the checkpoint. Will be concatenated
             with checkpoint_config.folder
-        sd_adapter (Optional[type[StateDictAdapter]]): The adapter used to convert model state
+        sd_adapter (Optional[type[BaseStateDictAdapter]]): The adapter used to convert model state
             dicts between native format and other formats.
         ft_manager (Optional[ft.Manager]): The FTManager from TorchFT.

@@ -191,7 +191,7 @@ def __init__(
         lr_schedulers: LRSchedulersContainer,
         states: dict[str, Any],
         checkpoint_config: CheckpointConfig,
-        sd_adapter: StateDictAdapter | None,
+        sd_adapter: BaseStateDictAdapter | None,
         base_folder: str = "",
         ft_manager: FTManager | None = None,
     ) -> None:
```

torchtitan/experiments/forge/train_spec.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -8,7 +8,7 @@

 # Import torchtitan.models to ensure all train specs are registered
 import torchtitan.models  # noqa: F401
-from torchtitan.protocols import BaseModelArgs, ModelProtocol, StateDictAdapter
+from torchtitan.protocols import BaseModelArgs, BaseStateDictAdapter, ModelProtocol
 from torchtitan.protocols.train_spec import (
     _train_specs,
     LossFunctionBuilder,
@@ -30,7 +30,7 @@ class ForgeTrainSpec:
     build_optimizers_fn: OptimizersBuilder
     build_lr_schedulers_fn: LRSchedulersBuilder
     build_loss_fn: LossFunctionBuilder
-    state_dict_adapter: type[StateDictAdapter] | None = None
+    state_dict_adapter: type[BaseStateDictAdapter] | None = None


 # Copy and transform train specs from torchtitan.protocols.train_spec._train_specs
```

torchtitan/models/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -20,7 +20,7 @@ The folder should be organized as follows
   - `init_weights()` is used to properly initialize the parameters and buffers in the model. Please define it in a recursive way so that every submodule has its own `init_weights()`.
   - Add additional files to reduce the complexity of `model.py` if it grows too large or complex, e.g. moe.py to host the `MoE`, `Router`, and `GroupedExperts` modules.
 - `state_dict_adapter.py`
-  - Inherit [`StateDictAdapter`](/torchtitan/protocols/state_dict_adapter.py) to implement state dict mappings between `torchtitan` model definition and other model definitions (e.g. from HuggingFace so that we can save / load model checkpoints in HF formats).
+  - Inherit [`BaseStateDictAdapter`](/torchtitan/protocols/state_dict_adapter.py) to implement state dict mappings between `torchtitan` model definition and other model definitions (e.g. from HuggingFace so that we can save / load model checkpoints in HF formats).
   - There are multiple ways such adapters could be used
     - Checkpoint conversion scripts in `scripts/checkpoint_conversion/` will use them to adapt state dicts containing non-sharded `torch.Tensor` on CPU.
     - During training, [`CheckpointManager`](/torchtitan/components/checkpoint.py) will use them to adapt state dicts containing (potentially sharded) `DTensor` on GPUs to save / load checkpoints in HF format.
```
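As a concrete illustration of the pattern this README entry describes, here is a hedged sketch of a new model's adapter. `MyModelStateDictAdapter`, its model args, and the single weight-name mapping are hypothetical; it assumes `to_hf` is the abstract counterpart of the `from_hf` method shown later in this commit, and it inherits the concrete `StateDictAdapter` base so `fqn_to_index_mapping` comes for free:

```python
from typing import Any

from torchtitan.protocols import StateDictAdapter


class MyModelStateDictAdapter(StateDictAdapter):
    """Hypothetical adapter; the single mapping entry is illustrative."""

    def __init__(self, model_args, hf_assets_path: str | None):
        # Builds self.fqn_to_index_mapping from model.safetensors.index.json
        super().__init__(model_args, hf_assets_path)
        self.from_hf_map = {"model.embed_tokens.weight": "tok_embeddings.weight"}

    def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
        # Invert the HF -> native mapping to go native -> HF.
        to_hf_map = {v: k for k, v in self.from_hf_map.items()}
        return {to_hf_map[fqn]: tensor for fqn, tensor in state_dict.items()}

    def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
        return {self.from_hf_map[fqn]: tensor for fqn, tensor in hf_state_dict.items()}
```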

torchtitan/models/llama3/model/state_dict_adapter.py

Lines changed: 2 additions & 22 deletions

```diff
@@ -4,9 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import json
 import logging
-import os
 import re
 from typing import Any

@@ -19,6 +17,8 @@

 class Llama3StateDictAdapter(StateDictAdapter):
     def __init__(self, model_args: TransformerModelArgs, hf_assets_path: str | None):
+        super().__init__(model_args, hf_assets_path)
+
         self.model_args = model_args
         self.hf_assets_path = hf_assets_path
         self.from_hf_map = {
@@ -37,26 +37,6 @@ def __init__(self, model_args: TransformerModelArgs, hf_assets_path: str | None)
             "lm_head.weight": "output.weight",
         }

-        if hf_assets_path:
-            mapping_path = os.path.join(hf_assets_path, "model.safetensors.index.json")
-            try:
-                with open(mapping_path, "r") as f:
-                    hf_safetensors_indx = json.load(f)
-            except FileNotFoundError:
-                logger.warning(
-                    "model.safetensors.index.json not found at hf_assets_path: {mapping_path}. \
-                    Defaulting to saving a single safetensors file if checkpoint is saved in HF format.",
-                )
-                hf_safetensors_indx = None
-
-            if hf_safetensors_indx:
-                self.fqn_to_index_mapping = {}
-                for hf_key, raw_indx in hf_safetensors_indx["weight_map"].items():
-                    indx = re.search(r"\d+", raw_indx).group(0)
-                    self.fqn_to_index_mapping[hf_key] = indx
-            else:
-                self.fqn_to_index_mapping = None
-
     # HuggingFace permutation function (exact copy from their conversion script)
     def _permute(self, w, n_heads_arg, dim1=None, dim2=None):
         if dim1 is None:
```
torchtitan/protocols/__init__.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -6,12 +6,13 @@

 from .model import BaseModelArgs, ModelProtocol
 from .model_converter import ModelConverter, ModelConvertersContainer
-from .state_dict_adapter import StateDictAdapter
+from .state_dict_adapter import BaseStateDictAdapter, StateDictAdapter

 __all__ = [
     "BaseModelArgs",
     "ModelProtocol",
     "ModelConverter",
     "ModelConvertersContainer",
     "StateDictAdapter",
+    "BaseStateDictAdapter",
 ]
```

torchtitan/protocols/state_dict_adapter.py

Lines changed: 32 additions & 1 deletion

```diff
@@ -4,13 +4,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import json
+import logging
+import os
+import re
 from abc import ABC, abstractmethod
 from typing import Any

+logger = logging.getLogger()
+
 from .model import BaseModelArgs


-class StateDictAdapter(ABC):
+class BaseStateDictAdapter(ABC):
     """Abstract base class for state dict transformations.

     This class defines the interface for converting between native model
@@ -47,3 +53,28 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
             The converted native model state dict
         """
         pass
+
+
+class StateDictAdapter(BaseStateDictAdapter):
+    """State dict adapter base class which provides convenient default behavior to build fqn_to_index_mapping"""
+
+    def __init__(self, model_args: BaseModelArgs, hf_assets_path: str | None):
+        if hf_assets_path:
+            mapping_path = os.path.join(hf_assets_path, "model.safetensors.index.json")
+            try:
+                with open(mapping_path, "r") as f:
+                    hf_safetensors_indx = json.load(f)
+            except FileNotFoundError:
+                logger.warning(
+                    "model.safetensors.index.json not found at hf_assets_path: {mapping_path}. \
+                    Defaulting to saving a single safetensors file if checkpoint is saved in HF format.",
+                )
+                hf_safetensors_indx = None
+
+            if hf_safetensors_indx:
+                self.fqn_to_index_mapping = {}
+                for hf_key, raw_indx in hf_safetensors_indx["weight_map"].items():
+                    indx = re.search(r"\d+", raw_indx).group(0)
+                    self.fqn_to_index_mapping[hf_key] = indx
+            else:
+                self.fqn_to_index_mapping = None
```
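To exercise the new shared behavior end to end, here is a small self-contained sketch that writes a throwaway index file and instantiates a trivial subclass. `DummyAdapter` is hypothetical, and this assumes `to_hf` and `from_hf` are the only abstract methods on the base class:

```python
import json
import os
import tempfile
from typing import Any

from torchtitan.protocols import StateDictAdapter


class DummyAdapter(StateDictAdapter):
    """Hypothetical passthrough adapter, just to exercise the new __init__."""

    def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
        return state_dict

    def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
        return hf_state_dict


with tempfile.TemporaryDirectory() as assets:
    # A one-entry index file standing in for a real HF checkpoint index.
    index = {"weight_map": {"lm_head.weight": "model-00002-of-00002.safetensors"}}
    with open(os.path.join(assets, "model.safetensors.index.json"), "w") as f:
        json.dump(index, f)

    # model_args is unused by the base __init__, so None suffices here.
    adapter = DummyAdapter(model_args=None, hf_assets_path=assets)
    print(adapter.fqn_to_index_mapping)  # {'lm_head.weight': '00002'}
```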

torchtitan/protocols/train_spec.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -21,7 +21,7 @@
 from torchtitan.config import LRScheduler

 from .model import BaseModelArgs, ModelProtocol
-from .state_dict_adapter import StateDictAdapter
+from .state_dict_adapter import BaseStateDictAdapter


 ParallelizeFunction: TypeAlias = Callable[..., nn.Module]
@@ -53,7 +53,7 @@ class TrainSpec:
     build_loss_fn: LossFunctionBuilder
     build_validator_fn: ValidatorBuilder | None = None
     build_metrics_processor_fn: MetricsProcessorBuilder | None = None
-    state_dict_adapter: type[StateDictAdapter] | None = None
+    state_dict_adapter: type[BaseStateDictAdapter] | None = None


 _train_specs = {}
```