Commit 504afd2

wip for llama4 patch fix
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent: 28c666e

File tree: 3 files changed (+46, -9 lines)

tensorrt_llm/_torch/auto_deploy/llm.py

Lines changed: 5 additions & 1 deletion
@@ -66,7 +66,11 @@ def __call__(
             # TODO: can we avoid the extra tolist() here eventually?
             token_ids = all_args.pop("input_ids")
             assert token_ids.shape[0] == 1, "messages should be unbatched at this point."
-            return token_ids[0].tolist(), {"multimodal_data": all_args} if all_args else None
+            if all_args:
+                extra_processed_inputs = {"multimodal_data": all_args}
+            else:
+                extra_processed_inputs = None
+            return token_ids[0].tolist(), extra_processed_inputs
         else:
             token_ids = self.tokenizer.encode(inputs["prompt"], **kwargs)
             return token_ids, None
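
Note: the deleted one-liner and the new explicit if/else are behaviorally identical. Python's conditional expression binds tighter than the tuple comma, so the old return already produced a 2-tuple whose second element was conditional; the rewrite only makes that precedence explicit. A minimal, self-contained sketch of the pattern, with a hypothetical stand-in function name (not part of the codebase):

import torch

def split_tokenizer_outputs(all_args: dict):
    """Pop input_ids and bundle any leftover processor outputs as multimodal data."""
    token_ids = all_args.pop("input_ids")
    assert token_ids.shape[0] == 1, "messages should be unbatched at this point."
    if all_args:
        extra_processed_inputs = {"multimodal_data": all_args}
    else:
        extra_processed_inputs = None
    return token_ids[0].tolist(), extra_processed_inputs

# Text-only input: nothing is left after popping input_ids, so extras is None.
ids, extras = split_tokenizer_outputs({"input_ids": torch.tensor([[1, 2, 3]])})
assert extras is None

# Multimodal input: leftover keys (e.g. pixel_values) travel as multimodal_data.
ids, extras = split_tokenizer_outputs(
    {"input_ids": torch.tensor([[1, 2, 3]]), "pixel_values": torch.zeros(1, 3, 4, 4)}
)
assert "pixel_values" in extras["multimodal_data"]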

tensorrt_llm/_torch/auto_deploy/transform/library/export_to_gm.py

Lines changed: 9 additions & 8 deletions
@@ -66,14 +66,15 @@ def _apply(
         cm.info.set_example_sequence(**factory.get_example_inputs())

         # export the model to a graph module
-        gm = torch_export_to_gm(
-            model,
-            args=cm.args,
-            dynamic_shapes=cm.dynamic_shapes,
-            clone=self.config.clone_state_dict,
-            strict=self.config.strict,
-            patch_list=self.config.patch_list,
-        )
+        if False:
+            gm = torch_export_to_gm(
+                model,
+                args=cm.args,
+                dynamic_shapes=cm.dynamic_shapes,
+                clone=self.config.clone_state_dict,
+                strict=self.config.strict,
+                patch_list=self.config.patch_list,
+            )

         # this is a clean graph by definition since it was just exported
         info = TransformInfo(skipped=False, num_matches=1, is_clean=True, has_valid_shapes=True)
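
Note: the new `if False:` guard temporarily bypasses the export call while the llama4 patch is reworked (this commit is explicitly WIP). For reference, torch_export_to_gm presumably wraps torch.export under the hood; a rough sketch of what the guarded call does, using vanilla torch.export on a toy module (the real wrapper additionally applies patch_list and can clone the state dict, which are TRT-LLM-specific knobs):

import torch

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.proj(x)

model = ToyModel()
example_args = (torch.randn(2, 8, 16),)

# Let the batch dimension vary, loosely mirroring dynamic_shapes=cm.dynamic_shapes.
batch = torch.export.Dim("batch", min=1, max=8)
exported = torch.export.export(
    model,
    example_args,
    dynamic_shapes=({0: batch},),
    strict=False,  # non-strict export, analogous to strict=self.config.strict
)
gm = exported.module()  # a torch.fx.GraphModule, i.e. the "gm" the transform produces
print(gm.graph)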

tests/unittest/_torch/auto_deploy/integration/test_llama4_vlm_export.py

Lines changed: 32 additions & 0 deletions
@@ -101,6 +101,38 @@ def _vision_branch(inputs_embeds, pixel_values, input_ids):

         return inputs_embeds.view(original_inputs_embeds_shape)

+    def _vision_branch2(inputs_embeds, pixel_values, input_ids):
+        image_features = self.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_sizes=None,
+        )
+        original_inputs_embeds_shape = inputs_embeds.shape
+
+        vision_flat = image_features.view(-1, image_features.size(-1))
+        projected_vision_flat = self.multi_modal_projector(vision_flat)
+
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+        final_mask = special_image_mask.to(inputs_embeds.device)
+        inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))
+
+        final_mask_1d = final_mask[..., 0].reshape(-1)
+        # num_tokens_to_fill = final_mask_1d.sum()
+
+        # This condition statement breaks torch.export:
+        # TODO: sanity check on the inputs for this
+        # if num_tokens_to_fill != projected_vision_flat.size(0):
+        #     raise ValueError(
+        #         f"Mismatch: final_mask wants {num_tokens_to_fill} embeddings, "
+        #         f"but multi_modal_projector returned {projected_vision_flat.size(0)}"
+        #     )
+
+        expanded_mask = final_mask_1d.unsqueeze(-1).expand(-1, inputs_embeds.size(-1))
+        inputs_embeds.masked_scatter_(expanded_mask, projected_vision_flat)
+
+        return inputs_embeds.view(original_inputs_embeds_shape)
+
     def _no_vision_branch(inputs_embeds, pixel_values, input_ids):
         return inputs_embeds

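
Note: the commented-out size check in _vision_branch2 is data-dependent control flow (its outcome depends on tensor values, not shapes), which torch.export cannot trace; the branch instead relies on masked_scatter_, which implicitly requires the source to supply at least as many elements as the mask selects. An isolated, runnable sketch of that scatter pattern with toy sizes and hypothetical values:

import torch

B, S, H = 1, 6, 4
image_token_index = 99  # stand-in for self.config.image_token_index

inputs_embeds = torch.zeros(B, S, H)
input_ids = torch.tensor([[5, 99, 99, 7, 99, 8]])  # three image-token slots
projected_vision_flat = torch.ones(3, H)  # one projected feature row per slot

original_inputs_embeds_shape = inputs_embeds.shape
inputs_embeds = inputs_embeds.view(-1, H)  # flatten (B, S, H) -> (B*S, H)

# Mark the rows holding image placeholder tokens, then broadcast to full width.
final_mask_1d = (input_ids == image_token_index).reshape(-1)
expanded_mask = final_mask_1d.unsqueeze(-1).expand(-1, H)

# In-place scatter: source elements fill the masked positions in row-major order.
inputs_embeds.masked_scatter_(expanded_mask, projected_vision_flat)

out = inputs_embeds.view(original_inputs_embeds_shape)
assert out[0, 1].eq(1).all() and out[0, 0].eq(0).all()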
