from transformers.models.llama4.modeling_llama4 import Llama4CausalLMOutputWithPast
from utils.llm_data import llm_models_root

-from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
+from tensorrt_llm._torch.auto_deploy.export import apply_export_patches, torch_export_to_gm
from tensorrt_llm._torch.auto_deploy.transformations._graph import move_to_device

@@ -101,6 +101,42 @@ def _vision_branch(inputs_embeds, pixel_values, input_ids):
        return inputs_embeds.view(original_inputs_embeds_shape)

+    def _vision_branch2(inputs_embeds, pixel_values, input_ids):
+        image_features = self.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_sizes=image_sizes,
+        )
+
+        vision_flat = image_features.view(-1, image_features.size(-1))
+        projected_vision_flat = self.multi_modal_projector(vision_flat).to(
+            inputs_embeds.device, inputs_embeds.dtype
+        )
+        # NOTE: get_placeholder_mask is not supported by torch.export due to numel check ###########
+        # special_image_mask = self.get_placeholder_mask(
+        #     input_ids, inputs_embeds=inputs_embeds, image_features=projected_vision_flat
+        # )
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(
+                    self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device
+                )
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = (
+            special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        )
+        ### END OF get_placeholder_mask ############################################################
+
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, projected_vision_flat)
+
+        return inputs_embeds
+
    def _no_vision_branch(inputs_embeds, pixel_values, input_ids):
        return inputs_embeds
@@ -109,7 +145,7 @@ def _no_vision_branch(inputs_embeds, pixel_values, input_ids):
    inputs_embeds = torch.cond(
        has_image,
-        _vision_branch,
+        _vision_branch2,
        _no_vision_branch,
        (inputs_embeds, pixel_values, input_ids),
    )
@@ -132,7 +168,10 @@ def _no_vision_branch(inputs_embeds, pixel_values, input_ids):
    loss = None
    if labels is not None:
+        # Shift so that tokens < n predict n
        if attention_mask is not None:
+            # we use the input attention mask to shift the logits and labels, because it is 2D.
+            # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
            shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
            shift_logits = logits[..., :-1, :][
                shift_attention_mask.to(logits.device) != 0
@@ -141,6 +180,7 @@ def _no_vision_branch(inputs_embeds, pixel_values, input_ids):
        else:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
@@ -210,35 +250,49 @@ def test_build_run_llama4_vlm():
        .to(torch.bfloat16)
    )

-    with torch.inference_mode():
-        # the original model queried with text-only
-        out_text_only = model(inputs["input_ids"], None, inputs["attention_mask"])
-
+    def _run_with_and_without_image(model, use_none=False):
+        with apply_export_patches({"transformers_sdpa_mask": {}, "autocast_noop": {}}):
+            with torch.inference_mode():
+                out_no_images = model(
+                    inputs["input_ids"],
+                    None if use_none else torch.zeros_like(inputs["pixel_values"]),
+                    inputs["attention_mask"],
+                )
+                out_with_images = model(
+                    inputs["input_ids"],
+                    inputs["pixel_values"],
+                    inputs["attention_mask"],
+                )
+        return {"no_images": out_no_images.logits, "with_images": out_with_images.logits}
+
+    # Get output pre-patch
+    out_original = _run_with_and_without_image(model, use_none=True)
+
+    # set patch
    Llama4ForConditionalGeneration.forward = _forward_with_cond

-    with torch.inference_mode():
-        out_real = model(inputs["input_ids"], inputs["pixel_values"], inputs["attention_mask"])
-        out_dummy = model(
-            inputs["input_ids"], torch.zeros_like(inputs["pixel_values"]), inputs["attention_mask"]
-        )
-    torch.testing.assert_close(out_dummy.logits, out_text_only.logits, rtol=rtol, atol=atol)
+    # Get output post-patch
+    outputs_for_comparison = {}
+    outputs_for_comparison["model_with_patch"] = _run_with_and_without_image(model)

+    # Export to GM
    gm = torch_export_to_gm(
        model,
        (inputs["input_ids"], inputs["pixel_values"], inputs["attention_mask"]),
        kwargs={},
+        patch_list=["transformers_sdpa_mask", "autocast_noop"],
    )
    move_to_device(gm, model.device)

-    with torch.inference_mode():
-        out_real_gm = gm(inputs["input_ids"], inputs["pixel_values"], inputs["attention_mask"])
-    torch.testing.assert_close(out_real.logits, out_real_gm.logits, rtol=rtol, atol=atol)
-    out_dummy_gm = gm(
-        inputs["input_ids"], torch.zeros_like(inputs["pixel_values"]), inputs["attention_mask"]
-    )
-    torch.testing.assert_close(out_dummy.logits, out_dummy_gm.logits, rtol=rtol, atol=atol)
-    torch.testing.assert_close(out_dummy_gm.logits, out_text_only.logits, rtol=rtol, atol=atol)
-
-    assert not torch.allclose(out_real.logits, out_dummy.logits, rtol=rtol, atol=atol), (
-        "Expected outputs to differ between text only input and text+image input"
+    # Get the output post export
+    outputs_for_comparison["gm"] = _run_with_and_without_image(gm)
+
+    # Run comparisons to out_original with no patch now...
+    for comp, outs in outputs_for_comparison.items():
+        torch.testing.assert_close(
+            outs,
+            out_original,
+            rtol=rtol,
+            atol=atol,
+            msg=lambda m: f"Comparison for {comp} failed:\n{m}",
    )