
Commit abb6285

messing with attention mask
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent 244232f commit abb6285

4 files changed, +26 -8 lines

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py

Lines changed: 5 additions & 1 deletion

@@ -71,6 +71,10 @@ def scaled_dot_product_attention(
     of the vanilla sdpa in a graph.
     """

+    if attn_mask is not None:
+        is_causal = True
+        attn_mask = None
+
     return F.scaled_dot_product_attention(
         query.contiguous(),
         key.contiguous(),
@@ -79,7 +83,7 @@ def scaled_dot_product_attention(
         dropout_p=dropout_p,
         is_causal=is_causal,
         scale=scale,
-        enable_gqa=False,
+        enable_gqa=enable_gqa,
     )
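A minimal sketch of what the patched op now does, not the repo's exact code and assuming PyTorch >= 2.5 (where F.scaled_dot_product_attention gained the enable_gqa flag): an explicit attention mask is dropped in favor of the built-in causal mask, and grouped-query attention is delegated to SDPA instead of being disabled.

import torch
import torch.nn.functional as F

def sdpa_like_patched(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None, enable_gqa=False):
    # Sketch of the patched behavior: ignore an explicit mask and fall back
    # to the built-in causal mask; pass enable_gqa through instead of False.
    if attn_mask is not None:
        is_causal = True
        attn_mask = None
    return F.scaled_dot_product_attention(
        query.contiguous(), key.contiguous(), value.contiguous(),
        attn_mask=attn_mask, dropout_p=dropout_p,
        is_causal=is_causal, scale=scale, enable_gqa=enable_gqa,
    )

# 8 query heads attend over 2 KV heads (GQA) without a manual repeat_kv.
q = torch.randn(1, 8, 4, 16)
k = torch.randn(1, 2, 4, 16)
v = torch.randn(1, 2, 4, 16)
mask = torch.tril(torch.ones(4, 4, dtype=torch.bool))
out = sdpa_like_patched(q, k, v, attn_mask=mask, enable_gqa=True)
print(out.shape)  # torch.Size([1, 8, 4, 16])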

tensorrt_llm/_torch/auto_deploy/models/patches/llama4.py

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
 from transformers import Llama4ForConditionalGeneration
 from transformers.models.llama4.modeling_llama4 import Llama4CausalLMOutputWithPast

-from ...export.interface import BaseExportPatch, ExportPatchRegistry
+from ...export.interface import BaseExportPatch


 # Copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama4/modeling_llama4.py#L1651

tests/unittest/_torch/auto_deploy/integration/test_llama4_vlm_export.py

Lines changed: 20 additions & 3 deletions

@@ -211,7 +211,10 @@ def test_build_run_llama4_vlm():
     )
     processor = AutoProcessor.from_pretrained(model_id)

-    config = AutoConfig.from_pretrained(model_id)
+    config = AutoConfig.from_pretrained(
+        model_id,
+        # attn_implementation="eager",
+    )
     config.text_config.num_hidden_layers = 2
     config.text_config.intermediate_size = 64
     config.text_config.intermediate_size_mlp = 128
@@ -251,7 +254,14 @@ def test_build_run_llama4_vlm():
     )

     def _run_with_and_without_image(model, use_none=False):
-        with apply_export_patches({"transformers_sdpa_mask": {}, "autocast_noop": {}}):
+        with apply_export_patches(
+            {
+                "transformers_sdpa_mask": {},
+                "autocast_noop": {},
+                # "sdpa": {},
+                "sdpa_kernel_noop": {},
+            }
+        ):
             with torch.inference_mode():
                 out_no_images = model(
                     inputs["input_ids"],
@@ -280,7 +290,14 @@ def _run_with_and_without_image(model, use_none=False):
         model,
         (inputs["input_ids"], inputs["pixel_values"], inputs["attention_mask"]),
         kwargs={},
-        patch_list=["transformers_sdpa_mask", "autocast_noop"],
+        patch_list=[
+            "transformers_sdpa_mask",
+            "autocast_noop",
+            "torch_where",
+            "tensor_meta_device",
+            "sdpa_kernel_noop",
+            "sdpa",
+        ],
     )
     move_to_device(gm, model.device)
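The test now runs under additional export patches ("sdpa_kernel_noop", plus "sdpa", "torch_where", and "tensor_meta_device" for the export itself). As a purely hypothetical illustration of the general "noop" patch idea, and not the repo's implementation, such a patch temporarily swaps a context manager for one that does nothing so the export tracer never has to handle the original:

import contextlib
import torch

@contextlib.contextmanager
def _noop(*args, **kwargs):
    # Accepts any arguments and does nothing.
    yield

@contextlib.contextmanager
def autocast_noop_patch():
    # Hypothetical sketch: replace torch.autocast with a do-nothing context
    # manager for the duration of the patch, then always restore it.
    original = torch.autocast
    torch.autocast = _noop
    try:
        yield
    finally:
        torch.autocast = original

with autocast_noop_patch():
    with torch.autocast("cuda"):  # now a no-op; nothing is autocast
        x = torch.ones(2, 2) + 1
print(x)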

tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_attention_matcher_hf.py

Lines changed: 0 additions & 3 deletions

@@ -78,9 +78,6 @@ def _joint_transform(gm: GraphModule) -> None:
     ["eager", "sdpa"],
 )
 def test_match_llama_attention(config: Dict[str, Any], attn_implementation: str):
-    if attn_implementation == "sdpa":
-        pytest.skip("https://nvbugspro.nvidia.com/bug/5170222")
-
     def verify_matcher(gm: GraphModule):
         """Ensure that there is exactly one torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa
         call in the graph. Also check that there is no repeat_kv pattern left.
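For reference, the "repeat_kv pattern" the matcher is expected to eliminate is the grouped-query-attention head expansion used in Hugging Face Llama-style models; a paraphrased, self-contained version looks roughly like this:

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Expand each KV head n_rep times so K/V match the number of query heads.
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_kv_heads, n_rep, seq_len, head_dim
    )
    return hidden_states.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

k = torch.randn(1, 2, 4, 16)      # 2 KV heads
print(repeat_kv(k, 4).shape)      # torch.Size([1, 8, 4, 16]) -> 8 heads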
