
Commit dcd37e5

chore: Merge branch 'main' into tensor_parallelism

2 parents: 8b378ac + f6f663b

File tree: 10 files changed, +146 −29 lines

src/modalities/config/config.py

Lines changed: 3 additions & 3 deletions
@@ -338,15 +338,15 @@ class FullACParams(BaseModel):
     pass

 class SelectiveLayerACParams(BaseModel):
-    ac_freq: int
+    ac_freq: Annotated[int, Field(strict=True, ge=1)]

 class SelectiveOpACParams(BaseModel):
     save_ops_keys: list[str]

     ac_variant: ActivationCheckpointingVariants
     layers_fqn: str
-    model: PydanticPytorchModuleType | PydanticFSDP1ModuleType
-    ac_fun_params: Optional[FullACParams | SelectiveLayerACParams | SelectiveOpACParams] = None
+    model: PydanticPytorchModuleType
+    ac_fun_params: FullACParams | SelectiveLayerACParams | SelectiveOpACParams


 class RawAppStateConfig(BaseModel):
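For context (not part of this commit), a minimal sketch of what the tightened ac_freq annotation enforces under Pydantic v2; the class mirrors the one in the diff, the example calls are purely illustrative:

from typing import Annotated

from pydantic import BaseModel, Field, ValidationError


class SelectiveLayerACParams(BaseModel):
    # strict=True rejects floats/strings, ge=1 rejects non-positive frequencies
    ac_freq: Annotated[int, Field(strict=True, ge=1)]


SelectiveLayerACParams(ac_freq=2)  # valid: checkpoint every 2nd layer

try:
    SelectiveLayerACParams(ac_freq=0)  # rejected: violates ge=1
except ValidationError as e:
    print(e)

try:
    SelectiveLayerACParams(ac_freq="2")  # rejected: strict=True disables string coercion
except ValidationError as e:
    print(e)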

src/modalities/models/model_factory.py

Lines changed: 7 additions & 5 deletions
@@ -42,7 +42,7 @@
 from modalities.running_env.fsdp.fsdp_auto_wrapper import FSDPTransformerAutoWrapPolicyFactory
 from modalities.training.activation_checkpointing.activation_checkpointing import (
     ActivationCheckpointing,
-    apply_activation_checkpointing_inplace,
+    apply_activation_checkpointing_fsdp1_inplace,
 )
 from modalities.training.activation_checkpointing.activation_checkpointing_variants import (
     ActivationCheckpointingVariants,

@@ -265,19 +265,19 @@ def get_activation_checkpointed_fsdp1_model_(model: FSDP1, activation_checkpoint
         """
         if len(activation_checkpointing_modules) > 0:
             if isinstance(model, FSDP1):
-                apply_activation_checkpointing_inplace(
+                apply_activation_checkpointing_fsdp1_inplace(
                     model=model,
                     activation_checkpointing_modules=activation_checkpointing_modules,
                 )
             else:
                 raise ValueError(
                     "Activation checkpointing can only be applied to FSDP1-wrapped models! "
-                    f"Current model type: {type(model)}"
+                    f"Current model type: {type(model)}."
                 )
         return model

     @staticmethod
-    def get_activation_checkpointed_model_(
+    def get_activation_checkpointed_fsdp2_model_(
         ac_variant: ActivationCheckpointingVariants,
         layers_fqn: str,
         model: nn.Module,

@@ -288,7 +288,9 @@ def get_activation_checkpointed_model_(
         ),
     ) -> nn.Module:
         """FSDP2 variant for applying activation checkpointing to the given model (in-place operation).
-        When using FSDP2, we always first apply activation checkpointing to the model and then wrap it with FSDP2.
+
+        Important: When using FSDP2, we always first apply activation checkpointing to the model
+        and then wrap it with FSDP2.

         Args:
             ac_variant (ActivationCheckpointingVariants): The activation checkpointing variant to use.
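To make the ordering requirement concrete (not part of this commit), a minimal sketch assuming a recent PyTorch with composable FSDP2: wrap the target blocks with activation checkpointing first, then shard them. The Block class and shard_with_ac helper are made up for illustration, and a process group must already be initialized before calling the helper.

import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper
from torch.distributed.fsdp import fully_shard  # older releases expose fully_shard under torch.distributed._composable.fsdp


class Block(nn.Module):  # hypothetical stand-in for a transformer block
    def __init__(self) -> None:
        super().__init__()
        self.ff = nn.Linear(64, 64)

    def forward(self, x):
        return self.ff(x)


def shard_with_ac(model: nn.Sequential) -> nn.Sequential:
    # 1) Apply activation checkpointing to each block first (in-place replacement).
    for idx, block in enumerate(model):
        model[idx] = checkpoint_wrapper(block, preserve_rng_state=False)
    # 2) Only then wrap with FSDP2: per-block sharding plus a root wrap.
    for block in model:
        fully_shard(block)
    fully_shard(model)
    return model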

src/modalities/registry/components.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ class ComponentEntity:
     ComponentEntity(
         "model",
         "activation_checkpointed",
-        ModelFactory.get_activation_checkpointed_model_,
+        ModelFactory.get_activation_checkpointed_fsdp2_model_,
         ActivationCheckpointedModelConfig,
     ),
     ComponentEntity("model", "compiled", ModelFactory.get_compiled_model, CompiledModelConfig),

src/modalities/training/activation_checkpointing/activation_checkpointing.py

Lines changed: 11 additions & 7 deletions
@@ -23,12 +23,15 @@ def is_module_to_apply_activation_checkpointing(
     return isinstance(submodule, tuple(activation_checkpointing_modules))


-def apply_activation_checkpointing_inplace(model: nn.Module, activation_checkpointing_modules: list[str]):
+def apply_activation_checkpointing_fsdp1_inplace(model: FSDP1, activation_checkpointing_modules: list[str]):
     activation_checkpointing_module_types = [
         get_module_class_from_name(model, m) for m in activation_checkpointing_modules
     ]
-    if not isinstance(model, (FSDP1)):
-        raise ValueError("activation checkpointing can only be applied to FSDP1 wrapped models!")
+    if not isinstance(model, FSDP1):
+        raise ValueError(
+            "This activation checkpointing component can only be applied to FSDP1 wrapped models. "
+            "Use the respective FSDP2 component for FSDP2 models."
+        )
     non_reentrant_wrapper = partial(ptd_checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT, debug=False)

     apply_activation_checkpointing(

@@ -76,7 +79,7 @@ class ActivationCheckpointing:
         # for low precision training, it's useful to always save
         # the result of max, since the absolute maximum is
         # used to compute the scaling factor for quantization.
-        "torch.ops.aten.max.default": ops.aten.max.default,
+        "ops.aten.max.default": ops.aten.max.default,
     }

     @staticmethod

@@ -147,19 +150,20 @@ def apply_activation_checkpointing_(

     @staticmethod
     def _apply_full_ac(module: nn.Module) -> nn.Module:
-        module_saced = ptd_checkpoint_wrapper(module, preserve_rng_state=False)
-        return module_saced
+        module_aced = ptd_checkpoint_wrapper(module, preserve_rng_state=False)
+        return module_aced

     @staticmethod
     def _apply_selective_op_ac(module: nn.Module, save_ops_keys: list[str]) -> nn.Module:
-        def _get_custom_policy(meta, save_ops_set: Set):  # closure to capture meta
+        def _get_custom_policy(meta: dict[str, int], save_ops_set: Set):  # closure to capture meta
             def _custom_policy(ctx, func, *args, **kwargs):
                 mode = "recompute" if ctx.is_recompute else "forward"
                 mm_count_key = f"{mode}_mm_count"
                 if func == torch.ops.aten.mm.default:
                     meta[mm_count_key] += 1
                 # Saves output of all compute ops in save_ops_set, except every second mm
                 # NOTE: we should make this configurable and not hide it in the code
+                # To make this completely configurable, we would have to store the checkpointing frequency of every OP.
                 to_save = func in save_ops_set and not (
                     func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
                 )
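For context (not part of this commit), a minimal sketch of how the shortened SAVE_LIST keys could be resolved from config strings into aten ops; only the key format ("ops.aten..." without the "torch." prefix) is taken from the diff, the mapping fragment and helper below are illustrative:

import torch
from torch import ops

# Illustrative fragment in the style of the SAVE_LIST shown above:
# the config-facing key drops the "torch." prefix, the value is the resolved aten overload.
SAVE_LIST = {
    "ops.aten.mm.default": ops.aten.mm.default,
    "ops.aten.max.default": ops.aten.max.default,
}


def resolve_save_ops(save_ops_keys: list[str]) -> set:
    # Translate config keys (e.g. "ops.aten.mm.default") into the concrete ops whose outputs are saved.
    return {SAVE_LIST[key] for key in save_ops_keys}


assert torch.ops.aten.mm.default in resolve_save_ops(["ops.aten.mm.default"])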

src/modalities/util.py

Lines changed: 14 additions & 2 deletions
@@ -65,6 +65,18 @@ def get_experiment_id_from_config(config_file_path: Optional[Path], hash_length:
 def get_synced_string(
     string_to_be_synced: str, from_rank: int = 0, max_string_byte_length: Optional[int] = 1024
 ) -> str:
+    """Broadcast a string from one rank to all other ranks in the distributed setup.
+
+    Args:
+        string_to_be_synced (str): The string to be synced across ranks.
+        from_rank (int, optional): The rank that generates the string. Defaults to 0.
+        max_string_byte_length (Optional[int], optional): Maximum byte length of the string to be synced.
+            Defaults to 1024.
+    Returns:
+        str: The synced string, decoded from the byte array.
+    Raises:
+        ValueError: If the string exceeds the maximum byte length.
+    """
     rank = dist.get_rank()
     if rank == from_rank:
         # Generate a unique folder name

@@ -112,9 +124,9 @@ def get_synced_experiment_id_of_run(
     Returns:
         str: The experiment ID.
     """
-    experimenet_id = get_experiment_id_from_config(config_file_path, hash_length)
+    experiment_id = get_experiment_id_from_config(config_file_path, hash_length)
     experiment_id_synced = get_synced_string(
-        string_to_be_synced=experimenet_id,
+        string_to_be_synced=experiment_id,
         from_rank=0,
         max_string_byte_length=max_experiment_id_byte_length,
     )
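As background for the new docstring (not part of this commit), a minimal sketch of the usual broadcast-a-string recipe with torch.distributed: encode on the source rank into a fixed-size byte tensor, broadcast, decode everywhere. The function name and details are illustrative; a process group must already be initialized, and CUDA tensors are needed with the NCCL backend.

import torch
import torch.distributed as dist


def broadcast_string(value: str, from_rank: int = 0, max_bytes: int = 1024) -> str:
    # Fixed-size buffer so every rank allocates the same tensor shape.
    buffer = torch.zeros(max_bytes, dtype=torch.uint8)
    if dist.get_rank() == from_rank:
        encoded = value.encode("utf-8")
        if len(encoded) > max_bytes:
            raise ValueError(f"String exceeds the maximum byte length of {max_bytes}.")
        buffer[: len(encoded)] = torch.tensor(list(encoded), dtype=torch.uint8)
    # Collective call: non-source ranks receive the bytes from from_rank.
    dist.broadcast(buffer, src=from_rank)
    # Drop the zero padding and decode (assumes the payload itself contains no NUL bytes).
    return bytes(buffer[buffer != 0].tolist()).decode("utf-8")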

tests/training/config_activation_checkpointing.yaml

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@ full_activation_checkpointed_model:
       instance_key: model_raw
       pass_type: BY_REFERENCE
     layers_fqn: transformer.h
+    ac_fun_params: {}

 selective_layer_activation_checkpointed_model:
   component_key: model

@@ -31,7 +32,7 @@ selective_op_activation_checkpointed_model:
     layers_fqn: transformer.h
     ac_fun_params:
       save_ops_keys:
-        - torch.ops.aten.mm.default
+        - ops.aten.mm.default

 model_raw:
   component_key: model

tests/training/config_activation_checkpointing_fsdp1.yaml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ test_model:
       instance_key: wrapped_model
       pass_type: BY_REFERENCE
     layers_fqn: transformer.h
+    ac_fun_params: {}

 wrapped_model:
   component_key: model

tests/training/config_activation_checkpointing_fsdp2.yaml

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ activation_checkpointed_model:
       instance_key: model_raw
       pass_type: BY_REFERENCE
     layers_fqn: transformer.h
+    ac_fun_params: {}

 model_raw:
   component_key: model

tests/training/test_activation_checkpointing.py

Lines changed: 104 additions & 8 deletions
@@ -2,7 +2,10 @@
 from pathlib import Path

 import pytest
+import torch
 import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.nn.functional as F
 from pydantic import BaseModel
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import CheckpointWrapper


@@ -15,6 +18,10 @@
 working_dir = Path(os.path.dirname(__file__))


+class RawModel(BaseModel):
+    model_raw: PydanticPytorchModuleType
+
+
 class ActivationCheckpointingInstantiationModel(BaseModel):
     test_model: PydanticPytorchModuleType


@@ -31,6 +38,10 @@ class SelectiveOpActivationCheckpointingInstantiationModel(BaseModel):
     selective_op_activation_checkpointed_model: PydanticPytorchModuleType


+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="This test requires more than one GPU",
+)
 @pytest.mark.parametrize(
     "rdvz_port, world_size, relative_config_path",
     [

@@ -50,7 +61,6 @@ def test_full_activation_checkpointing_FSDP1_legacy(world_size: int, rdvz_port:
 def _test_full_activation_checkpointing_FSDP1_legacy_thread(
     process_id: int, rdvz_port: int, world_size: int, relative_config_path: str
 ):
-    working_dir = Path(os.path.dirname(__file__))
     config_file_path = working_dir / relative_config_path

     with MultiProcessingCudaEnv(

@@ -77,6 +87,10 @@ def _test_full_activation_checkpointing_FSDP1_legacy_thread(
     )


+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="This test requires more than one GPU",
+)
 @pytest.mark.parametrize(
     "rdvz_port, world_size, relative_config_path",
     [

@@ -96,7 +110,6 @@ def test_full_activation_checkpointing_FSDPX(world_size: int, rdvz_port: int, re
 def _test_full_activation_checkpointing_FSDPX_thread(
     process_id: int, rdvz_port: int, world_size: int, relative_config_path: str
 ):
-    working_dir = Path(os.path.dirname(__file__))
     config_file_path = working_dir / relative_config_path

     with MultiProcessingCudaEnv(

@@ -130,8 +143,7 @@ def _test_full_activation_checkpointing_FSDPX_thread(
         ("config_activation_checkpointing.yaml"),
     ],
 )
-def test_full_activation_checkpointing(relative_config_path: str):
-    working_dir = Path(os.path.dirname(__file__))
+def test_fsdp2_full_activation_checkpointing(relative_config_path: str):
     config_file_path = working_dir / relative_config_path

     main = Main(config_file_path, experiment_id="-1")

@@ -152,8 +164,7 @@ def test_full_activation_checkpointing(relative_config_path: str):
         ("config_activation_checkpointing.yaml"),
     ],
 )
-def test_selective_layer_activation_checkpointing(relative_config_path: str):
-    working_dir = Path(os.path.dirname(__file__))
+def test_fsdp2_selective_layer_activation_checkpointing(relative_config_path: str):
     config_file_path = working_dir / relative_config_path

     main = Main(config_file_path, experiment_id="-1")

@@ -174,8 +185,7 @@ def test_selective_layer_activation_checkpointing(relative_config_path: str):
         ("config_activation_checkpointing.yaml"),
     ],
 )
-def test_selective_op_activation_checkpointing(relative_config_path: str):
-    working_dir = Path(os.path.dirname(__file__))
+def test_fsdp2_selective_op_activation_checkpointing(relative_config_path: str):
     config_file_path = working_dir / relative_config_path

     main = Main(config_file_path, experiment_id="-1")

@@ -189,3 +199,89 @@ def test_selective_op_activation_checkpointing(relative_config_path: str):
             assert isinstance(module, CheckpointWrapper)
         else:
             assert not isinstance(module, CheckpointWrapper)
+
+
+# end-to-end equivalence test in terms of loss
+
+
+@pytest.mark.parametrize(
+    "relative_config_path",
+    [
+        ("config_activation_checkpointing.yaml"),
+    ],
+)
+def test_fsdp2_activation_checkpointing_end2end(relative_config_path: str):
+    def forward_and_backward(model: nn.Module, input_ids: torch.Tensor) -> float:
+        target = input_ids[:, 1:]  # batch_size, seq_len - 1
+        input_ids = input_ids[:, :-1]  # batch_size, seq_len - 1
+        input_dict = {"input_ids": input_ids}
+        logits = model(input_dict)["logits"]  # batch_size, seq_len - 1, vocab_size
+
+        loss = F.cross_entropy(
+            logits.reshape(-1, logits.size(-1)),  # batch_size * (seq_len - 1), vocab_size
+            target.reshape(-1),  # batch_size * (seq_len - 1)
+            reduction="mean",
+        )
+        loss_val = loss.item()
+        loss.backward()
+        return loss_val
+
+    def check_grads_equal(model1, model2, label):
+        for (n1, p1), (n2, p2) in zip(model1.named_parameters(), model2.named_parameters()):
+            if p1.grad is not None and p2.grad is not None:
+                # we cannot check the FQNs as AC renames the parameters;
+                # instead we check for weight equivalence
+                torch.testing.assert_close(p1, p2, rtol=1e-5, atol=1e-7, msg=f"Parameter mismatch in {n1} ({label})")
+                torch.testing.assert_close(
+                    p1.grad, p2.grad, rtol=1e-5, atol=1e-7, msg=f"Gradient mismatch in {n1} ({label})"
+                )
+
+    batch_size = 2
+    seq_len = 256
+    vocab_size = 50304
+
+    # build the models with different activation checkpointing variants but equivalent weights
+    config_file_path = working_dir / relative_config_path
+    main = Main(config_file_path, experiment_id="-1")
+
+    torch.manual_seed(42)
+    model_raw = main.build_components(components_model_type=RawModel).model_raw.to("cuda")
+
+    torch.manual_seed(42)
+    model_fac = main.build_components(
+        components_model_type=FullActivationCheckpointingInstantiationModel
+    ).full_activation_checkpointed_model.to("cuda")
+
+    torch.manual_seed(42)
+    model_sel_layer = main.build_components(
+        components_model_type=SelectiveLayerActivationCheckpointingInstantiationModel
+    ).selective_layer_activation_checkpointed_model.to("cuda")
+
+    torch.manual_seed(42)
+    model_sel_op = main.build_components(
+        components_model_type=SelectiveOpActivationCheckpointingInstantiationModel
+    ).selective_op_activation_checkpointed_model.to("cuda")
+
+    # Ensure all models have a different reference
+    models = [model_raw, model_fac, model_sel_layer, model_sel_op]
+    assert len(set(id(m) for m in models)) == len(models)
+
+    # Dummy LLM token input
+    # we use a sequence length of seq_len + 1 as the last token will only be used for loss calculation
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len + 1), device="cuda")

+    # Run forward+backward
+    loss_raw = forward_and_backward(model_raw, input_ids)
+    loss_fac = forward_and_backward(model_fac, input_ids)
+    loss_sel_layer = forward_and_backward(model_sel_layer, input_ids)
+    loss_sel_op = forward_and_backward(model_sel_op, input_ids)
+
+    # Compare losses
+    torch.testing.assert_close(torch.tensor(loss_fac), torch.tensor(loss_raw), msg="FAC loss mismatch")
+    torch.testing.assert_close(torch.tensor(loss_sel_layer), torch.tensor(loss_raw), msg="Sel layer AC loss mismatch")
+    torch.testing.assert_close(torch.tensor(loss_sel_op), torch.tensor(loss_raw), msg="Sel op AC loss mismatch")
+
+    # Compare gradients
+    check_grads_equal(model_raw, model_fac, "fac")
+    check_grads_equal(model_raw, model_sel_layer, "sel_layer")
+    check_grads_equal(model_raw, model_sel_op, "sel_op")

tutorials/instruction_tuning/configs/small_train_instruct_model_fsdp2_config.yaml

Lines changed: 2 additions & 2 deletions
@@ -23,8 +23,8 @@ settings:
   enforce_last_step_evaluated: false
   enforce_last_step_checkpointed: false
   step_profile:
-    gradient_accumulation_steps: 2
-    local_train_micro_batch_size: 2
+    gradient_accumulation_steps: 4
+    local_train_micro_batch_size: 1
     sequence_length: 8192 # Qwen2.5 would have 32768
   training_target:
     # had to hack here: Value error, Not enough tokens in the dataset. Actual: 57434112, Expected: >=57442304
