Commit 6b953db

use literal
1 parent 41230f4 commit 6b953db


torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 5 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 
 from functools import partial
-from typing import Callable
+from typing import Callable, Literal
 
 import torch
 import torch.distributed as dist
@@ -25,12 +25,15 @@
 
 
 TOKEN_GROUP_ALIGN_SIZE_M = 8
+ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
 
 
-def set_token_group_alignment_size_m(m: int) -> None:
+def set_token_group_alignment_size_m(m: ValidTokenGroupAlignmentSize) -> None:
     """
     Set the token group alignment size for token groups in MoE. This is implemented by
     padding each token group size to the next multiple of TOKEN_GROUP_ALIGN_SIZE_M.
+
+    Valid values are: 8, 16, or 32.
     Different values are needed for different cases:
 
     * For bf16, 8 is enough (16 byte alignment / 2 bytes per elem = 8 elements).
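The effect of the change is that a call like set_token_group_alignment_size_m(12) now fails static type checking instead of silently setting an unsupported alignment. Below is a minimal sketch of the idea, assuming a simplified module; the setter body and the pad_to_multiple helper are illustrative, not torchtitan's implementation, though the names TOKEN_GROUP_ALIGN_SIZE_M, ValidTokenGroupAlignmentSize, and set_token_group_alignment_size_m come from the diff above.

# Minimal sketch; the setter body and pad_to_multiple are illustrative,
# not torchtitan's code.
from typing import Literal

# Names taken from the diff above.
TOKEN_GROUP_ALIGN_SIZE_M = 8
ValidTokenGroupAlignmentSize = Literal[8, 16, 32]


def set_token_group_alignment_size_m(m: ValidTokenGroupAlignmentSize) -> None:
    # Literal narrows the accepted values at type-check time: a checker such
    # as mypy or pyright flags set_token_group_alignment_size_m(12) as an
    # error, while 8, 16, and 32 are accepted.
    global TOKEN_GROUP_ALIGN_SIZE_M
    TOKEN_GROUP_ALIGN_SIZE_M = m


def pad_to_multiple(group_size: int) -> int:
    # Hypothetical helper: round a token group size up to the next multiple
    # of TOKEN_GROUP_ALIGN_SIZE_M, as the docstring describes.
    m = TOKEN_GROUP_ALIGN_SIZE_M
    return (group_size + m - 1) // m * m


set_token_group_alignment_size_m(16)  # OK: 16 is one of the allowed literals
print(pad_to_multiple(100))           # 112: padded up to a multiple of 16

Note that Literal is enforced only by static type checkers; at runtime any int is still accepted unless the function validates its argument explicitly.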
