
Commit 2122f27: Make token group alignment size configurable
Parent: 881f0ca

3 files changed: +22 additions, -4 deletions

torchtitan/components/quantization/mx.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -59,6 +59,15 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
             and job_config.parallelism.tensor_parallel_degree > 1
         ), "TP not yet supported with torch.compile for mxfp8"
 
+        # For MoE training with mxfp8, token group sizes must be multiples of 32
+        if job_config.mx.moe_fqns_prototype:
+            from torchtitan.experiments.llama4.infra import expert_parallel
+
+            expert_parallel.TOKEN_GROUP_ALIGN_SIZE_M = 32
+            print(
+                f"Setting TOKEN_GROUP_ALIGN_SIZE_M to {expert_parallel.TOKEN_GROUP_ALIGN_SIZE_M}"
+            )
+
         # Configure MXFP8
         from torchao.prototype.mx_formats.config import (
             MXFP8Dim1CastKernelChoice,
```

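The override above works because `expert_parallel` exposes the alignment as a module-level attribute that is read at call time, so assigning `expert_parallel.TOKEN_GROUP_ALIGN_SIZE_M = 32` during quantization setup changes the padding granularity for all later MoE token permutations. Below is a minimal, self-contained sketch of that pattern; the `SimpleNamespace` stand-in and the `round_up`/`permute_tokens` helpers are illustrative only, not torchtitan APIs.

```python
# Minimal sketch of overriding a module-level alignment constant at setup time.
# The SimpleNamespace below stands in for torchtitan's expert_parallel module;
# round_up/permute_tokens are illustrative helpers, not torchtitan APIs.
from types import SimpleNamespace

expert_parallel = SimpleNamespace(TOKEN_GROUP_ALIGN_SIZE_M=16)  # bf16 default

def round_up(n: int, align: int) -> int:
    """Round n up to the nearest multiple of align."""
    return ((n + align - 1) // align) * align

def permute_tokens(num_tokens: int) -> int:
    # Reads the attribute at call time, so an earlier override takes effect here.
    return round_up(num_tokens, expert_parallel.TOKEN_GROUP_ALIGN_SIZE_M)

# Stand-in for the mxfp8 converter setup: MX formats scale blocks of 32
# elements, so token groups fed to the grouped GEMM must be 32-aligned.
moe_fqns_prototype = ["experts"]  # as if the prototype option were set
if moe_fqns_prototype:
    expert_parallel.TOKEN_GROUP_ALIGN_SIZE_M = 32

print(permute_tokens(45))  # 64 with the 32-alignment override; 48 with the default 16
```

Keeping the value as a module attribute confines the change to a single assignment at one import site, at the cost of mutable global state that every caller implicitly shares.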
torchtitan/config/job_config.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -544,12 +544,18 @@ class MX:
 
     filter_fqns: list[str] = field(default_factory=lambda: ["output"])
     """
-    Comma-separated list of fully qualified names of modules to skip applying mxfloat8 training to.
+    Comma-separated list of fully qualified names of modules to skip applying mxfp8 training to.
     nn.Linear modules with any dim size not divisible by 16 are also always skipped due to hardware requirements.
     By default we always skip the output layer.
     Example: --mx.filter_fqns "attention.wq,attention.wk,attention.wv,output"
     """
 
+    moe_fqns_prototype: list[str] | str = field(default_factory=list)
+    """
+    Comma-separated list of fully qualified names of MoE modules to apply mxfp8 training to.
+    This is a prototype feature that requires the torchao nightly build.
+    Example: --float8.moe_fqns_prototype="experts"
+    """
 
 @dataclass
 class Comm:
```

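The new `moe_fqns_prototype` field accepts either a list or a single comma-separated string of FQN fragments, mirroring `filter_fqns`. A hedged sketch of how such an option is typically normalized and matched against module names follows; `parse_fqns`, `should_convert`, and the toy `Block` model are illustrative only, not the actual torchtitan conversion code.

```python
# Hedged sketch: normalizing a "list or comma-separated string" FQN option and
# using it as a module filter. parse_fqns/should_convert are illustrative only.
import torch.nn as nn

def parse_fqns(value: list[str] | str) -> list[str]:
    """Accept either a list of FQN fragments or one comma-separated string."""
    if isinstance(value, str):
        return [fqn.strip() for fqn in value.split(",") if fqn.strip()]
    return list(value)

def should_convert(module_fqn: str, target_fqns: list[str]) -> bool:
    # Convention used here: convert when any target fragment appears in the FQN.
    return any(target in module_fqn for target in target_fqns)

class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.attention = nn.Linear(8, 8)
        self.experts = nn.Linear(8, 8)  # stand-in for a grouped-experts module

targets = parse_fqns("experts")  # e.g. a comma-separated value from the CLI
for fqn, module in Block().named_modules():
    if fqn and should_convert(fqn, targets):
        print(f"would apply mxfp8 MoE conversion to {fqn}")  # -> experts
```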
torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -24,6 +24,9 @@
 from torch.distributed.tensor.placement_types import Placement
 
 
+TOKEN_GROUP_ALIGN_SIZE_M = 16
+
+
 # implementation of Tensor Parallel for the GroupedExperts in MoE
 class TensorParallel(ParallelStyle):
     def _partition_fn(self, name, module, device_mesh):
@@ -251,6 +254,7 @@ def wrapper(
         x: torch.Tensor,
         num_tokens_per_expert: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        global TOKEN_GROUP_ALIGN_SIZE_M
         if isinstance(w1, DTensor):
             w1 = w1.to_local()
             w2 = w2.to_local()
@@ -264,7 +268,6 @@ def wrapper(
         experts_per_ep_rank = w1.shape[0]
         num_ep_ranks = num_tokens_per_expert.shape[0] // experts_per_ep_rank
 
-        ALIGN_SIZE_M = 16
         with torch.no_grad():
             (
                 permuted_indices,
@@ -274,8 +277,8 @@ def wrapper(
                 num_tokens_per_expert,
                 experts_per_ep_rank,
                 num_ep_ranks,
-                x.shape[0] + experts_per_ep_rank * ALIGN_SIZE_M,
-                ALIGN_SIZE_M,
+                x.shape[0] + experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M,
+                TOKEN_GROUP_ALIGN_SIZE_M,
             )
 
         x = torch.vstack((x, x.new_zeros((x.shape[-1]))))
```

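For context on how the constant is used: the permutation step pads each expert's token group up to a multiple of `TOKEN_GROUP_ALIGN_SIZE_M`, so the buffer is sized as `x.shape[0] + experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M`, a safe upper bound because each local expert adds at most `align - 1` padding rows. Below is a small arithmetic sketch of that bound; `padded_total_tokens` is an illustrative helper, not the actual permutation kernel.

```python
# Arithmetic sketch of the padding bound used when sizing the permuted buffer.
# padded_total_tokens is an illustrative helper, not a torchtitan function.
import torch

def padded_total_tokens(num_tokens_per_expert: torch.Tensor, align: int) -> int:
    """Total tokens after rounding each expert's group up to a multiple of align."""
    padded = ((num_tokens_per_expert + align - 1) // align) * align
    return int(padded.sum())

counts = torch.tensor([5, 0, 33, 17])    # tokens routed to 4 local experts
total = int(counts.sum())                # 55 rows in x before padding
for align in (16, 32):                   # bf16 default vs. mxfp8 override
    actual = padded_total_tokens(counts, align)
    bound = total + len(counts) * align  # x.shape[0] + experts_per_ep_rank * ALIGN
    print(f"align={align}: padded total = {actual}, preallocated bound = {bound}")
    assert actual <= bound               # each expert adds at most align - 1 padding rows
```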