Commit 8659543
[compiler toolkit] Prepare deepseek to accept graph passes (#1982)
Made some updates to improve UX when running experiments in the compiler toolkit:

- Always register the block mask as a pytree node. A model could use flex_attn even if its flavor name doesn't contain `flex_attn` (see the pytree-registration sketch below).
- Prepare deepseek v3 to accept graph passes, as llama3 already does.
- Annotate flex attention in deepseek v3.
- Regional inductor doesn't work on deepseek with flex attn; it fails with error P2021796847.

To repro the regional inductor issue in dsv3, uncomment `regional_inductor()` and run:

```
NGPU=4 CONFIG_FILE=./torchtitan/models/deepseek_v3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.deepseek_v3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=2 --parallelism.expert_parallel_degree=2 --activation_checkpoint.mode none --model.flavor=debugmodel_flex_attn
```
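For context, registering a type as a pytree node tells PyTorch's tracing machinery how to flatten it into tensor leaves and rebuild it afterwards. Below is a minimal sketch of the pattern, assuming the `torch.utils._pytree` API; the `Pair` class is a hypothetical stand-in, since the real `register_blockmask_pytree_node` handles `torch.nn.attention.flex_attention.BlockMask`:

```python
import torch
import torch.utils._pytree as pytree


class Pair:
    # Hypothetical container standing in for BlockMask, which torchtitan's
    # register_blockmask_pytree_node registers in the same fashion.
    def __init__(self, a: torch.Tensor, b: torch.Tensor):
        self.a, self.b = a, b


pytree.register_pytree_node(
    Pair,
    # flatten: return the tensor children plus static (non-tensor) context
    lambda p: ([p.a, p.b], None),
    # unflatten: rebuild the container from children and context
    lambda children, ctx: Pair(*children),
)
```

Registering unconditionally is the safe default: a model can construct a block mask even when its flavor name doesn't end in `flex_attn`, and an unused registration costs nothing.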
1 parent 2ea6197 commit 8659543

File tree: 3 files changed, +22 −10 lines


torchtitan/experiments/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,4 +30,4 @@ We provide this `experiments/` folder to host experiments that add significant v
 | [torchcomms](./torchcomms/) | TBA | [@d4l3k](https://https://github.com/d4l3k) [@fduwjj](https://github.com/fduwjj) [@mori360 ](https://github.com/mori360) |
 | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) |
 | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
-| [compiler_toolkit](./compiler_tookit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
+| [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
```

torchtitan/experiments/compiler_toolkit/deepseek_v3/parallelize.py

Lines changed: 20 additions & 7 deletions
```diff
@@ -30,20 +30,29 @@
 from torchtitan.tools.logging import logger
 
 
-def fw_compiler(gm: torch.fx.GraphModule, example_inputs) -> torch.fx.GraphModule:
-    logger.info("fwd_gm:")
+def compiler(name: str, gm: torch.fx.GraphModule, example_inputs):
+    logger.info(f"{name} before compiler:")
     logger.info(gm.print_readable(print_output=False))
-    return gm
 
+    # TODO: regional_inductor should work with deepseek_v3
+    # gm = regional_inductor(gm, example_inputs)
 
-def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> torch.fx.GraphModule:
-    logger.info("bwd_gm:")
+    logger.info(f"{name} after compiler:")
     logger.info(gm.print_readable(print_output=False))
     return gm
 
 
+def fw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
+    return compiler("fwd_gm", gm, example_inputs)
+
+
+def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
+    return compiler("bwd_gm", gm, example_inputs)
+
+
 def annotate_deepseekv3() -> None:
     from torchtitan.distributed.expert_parallel import ExpertParallel
+    from torchtitan.models.attention import FlexAttentionWrapper
     from torchtitan.models.moe.moe import MoE
 
     # annotate the MoE with dispatch, compute and combine
@@ -55,6 +64,11 @@ def annotate_deepseekv3() -> None:
     )
     MoE.forward = annotate_fn({"EP": "compute"})(MoE.forward)
 
+    # annotate flex_attention with compile_with_inductor
+    FlexAttentionWrapper.forward = annotate_fn(
+        {"compile_with_inductor": "flex_attention"}
+    )(FlexAttentionWrapper.forward)
+
 
 def parallelize_deepseekv3(
     model: torch.nn.Module,
@@ -64,8 +78,7 @@ def parallelize_deepseekv3(
 
     annotate_deepseekv3()
 
-    if job_config.model.flavor.endswith("flex_attn"):
-        register_blockmask_pytree_node()
+    register_blockmask_pytree_node()
 
     # Disable torch.compile over the model in the compiler toolkit style workflow
     with disable_compile(job_config):
```
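The new `compiler()` hook is the seam where graph passes run on the traced `torch.fx.GraphModule`; the commented-out `regional_inductor` call marks the spot. As a hedged sketch of what a pass slotted in there could look like (not part of this commit), here is a hypothetical read-only pass using the standard FX graph API; `log_call_functions` is an illustrative name:

```python
import torch


def log_call_functions(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    # Walk the traced graph and report every call_function node; a real
    # pass (e.g. regional_inductor) would rewrite nodes instead.
    for node in gm.graph.nodes:
        if node.op == "call_function":
            print(f"{node.name} -> {node.target}")
    # After mutating the graph, a pass must recompile the module so the
    # generated forward() matches the edited graph (a no-op for pure reads).
    gm.recompile()
    return gm
```

A real pass such as `regional_inductor` would rewrite the annotated subgraphs (e.g. the flex_attention regions tagged with `compile_with_inductor`) rather than just logging them.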

torchtitan/experiments/compiler_toolkit/llama3/parallelize.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -87,8 +87,7 @@ def parallelize_llama(
 
     annotate_llama()
 
-    if job_config.model.flavor.endswith("flex_attn"):
-        register_blockmask_pytree_node()
+    register_blockmask_pytree_node()
 
     # Disable torch.compile over the model in the compiler toolkit style workflow
     with disable_compile(job_config):
```
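Both models now route tracing through the same `fw_compiler`/`bw_compiler` hooks. For orientation only, one standard way such hooks attach to a backend is via PyTorch's `aot_autograd` helper; this wiring is an assumption for illustration, not necessarily how the compiler toolkit plugs them in:

```python
import torch
from torch._dynamo.backends.common import aot_autograd

# Hypothetical wiring: hand the forward/backward graph hooks to an
# aot_autograd-based backend; the compiler toolkit's actual pipeline
# may differ from this sketch.
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)

model = torch.nn.Linear(8, 8)  # stand-in model
compiled = torch.compile(model, backend=backend)
```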
