
Commit b55fbc7

address comments
1 parent 8f27e56 commit b55fbc7

File tree: 3 files changed, +21 −4 lines


torchtitan/components/quantization/float8.py

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,7 @@

 from torchtitan.config.job_config import Float8, JobConfig
 from torchtitan.distributed import ParallelDims
+from torchtitan.experiments.llama4.infra.expert_parallel import set_token_group_alignment_size_m
 from torchtitan.protocols.model_converter import (
     ModelConverter,
     register_model_converter,
@@ -66,6 +67,10 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
             job_config.parallelism.context_parallel_degree == 1
         ), "Float8 MoE training prototype does not yet support context parallelism"

+        # For fp8 grouped GEMM, token group sizes must be multiples of 16
+        # (16 byte alignment / 1 byte per elem = 16 elements)
+        set_token_group_alignment_size_m(16)
+
         if float8_config.recipe_name is not None:
             assert not float8_config.enable_fsdp_float8_all_gather, (
                 "using `float8_config.enable_fsdp_float8_all_gather` together "

torchtitan/config/job_config.py

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ class MX:
     """
     Comma-separated list of fully qualified names of MoE modules to apply mxfp8 training to.
     This is a prototype feature that requires the torchao nightly build.
-    Example: --float8.moe_fqns_prototype="experts"
+    Example: --mx.moe_fqns_prototype="experts"
     """

 @dataclass
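
As a usage illustration only (not code from this commit), the corrected flag takes a comma-separated string of module FQNs; a consumer of the config could split it into a list with a hypothetical helper such as:

# Hypothetical sketch: turn the comma-separated value passed via
# --mx.moe_fqns_prototype into a list of module FQNs to convert.
def parse_moe_fqns(moe_fqns_prototype: str) -> list[str]:
    return [fqn.strip() for fqn in moe_fqns_prototype.split(",") if fqn.strip()]

print(parse_moe_fqns("experts"))                # ['experts']
print(parse_moe_fqns("experts,shared_expert"))  # ['experts', 'shared_expert']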

torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 15 additions & 3 deletions
@@ -24,14 +24,26 @@
 from torch.distributed.tensor.placement_types import Placement


-TOKEN_GROUP_ALIGN_SIZE_M = 16
+TOKEN_GROUP_ALIGN_SIZE_M = 8


 def set_token_group_alignment_size_m(m: int) -> None:
-    """Set the alignment size for token groups in MoE."""
+    """
+    Set the token group alignment size for token groups in MoE. This is implemented by
+    padding each token group size to the next multiple of TOKEN_GROUP_ALIGN_SIZE_M.
+    Different values are needed for different cases:
+
+    * For bf16, 8 is enough (16 byte alignment / 2 bytes per elem = 8 elements).
+    * For fp8, 16 byte alignment / 1 byte per elem = 16 elements.
+    * For mxfp8, we need 32 (or block_size) because scaling block size is (1 x 32),
+      so when doing per-token-group quantization on each logically distinct subtensor,
+      we need to ensure the contracting dim is divisible by block_size.
+      In the backward pass, grad_weight = (grad_output_t @ input).t() has gemm dims
+      of (N, M) @ (M, K) so M is the contracting dim, and group offsets are along M,
+      so we need 32 element alignment.
+    """
     global TOKEN_GROUP_ALIGN_SIZE_M
     assert m > 0, "Alignment size must be positive"
-    assert m % 16 == 0, "Alignment size must always be a multiple of 16 due to hardware constraints"
     TOKEN_GROUP_ALIGN_SIZE_M = m

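
To make the padding behavior the new docstring describes concrete, here is a minimal sketch, not code from this file, of rounding each token-group size up to the next multiple of the alignment value (the helper name pad_group_sizes is ours):

import torch

def pad_group_sizes(group_sizes: torch.Tensor, align_m: int) -> torch.Tensor:
    # Round each token-group size up to the next multiple of align_m.
    return ((group_sizes + align_m - 1) // align_m) * align_m

sizes = torch.tensor([5, 13, 24])
print(pad_group_sizes(sizes, 8))   # bf16:  tensor([ 8, 16, 24])
print(pad_group_sizes(sizes, 16))  # fp8:   tensor([16, 16, 32])
print(pad_group_sizes(sizes, 32))  # mxfp8: tensor([32, 32, 32])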
