keras-team · hertschuh · Aug 20, 2025 · Aug 18, 2025 · Aug 19, 2025 · Aug 19, 2025
diff --git a/keras/src/layers/attention/attention.py b/keras/src/layers/attention/attention.py
@@ -176,6 +176,8 @@ def _apply_scores(self, scores, value, scores_mask=None, training=False):
             # Bias so padding positions do not contribute to attention
             # distribution.  Note 65504. is the max float16 value.
             max_value = 65504.0 if scores.dtype == "float16" else 1.0e9
+            if len(padding_mask.shape) == 2:  # (batch, Tv) -> (batch, 1, Tv)
+                padding_mask = ops.expand_dims(padding_mask, axis=-2)
             scores -= max_value * ops.cast(padding_mask, dtype=scores.dtype)
 
         weights = ops.softmax(scores, axis=-1)

diff --git a/keras/src/layers/attention/attention_test.py b/keras/src/layers/attention/attention_test.py
@@ -86,6 +86,29 @@ def test_attention_with_mask(self):
         self.assertAllClose(output, [[[1.0, 1.0], [0.0, 0.0]]])
         self.assertAllClose(scores, [[[1.0, 0.0], [1.0, 0.0]]])
 
+    def test_attention_2D_mask_shape_mismatch(self):
+        # A 2D value mask of shape (batch_size, Tv) must broadcast against
+        # scores of shape (batch_size, Tq, Tv), including when Tq != Tv.
+        layer = layers.Attention()
+        batch_size, Tq, Tv, dim = 2, 3, 4, 5
+        query = np.random.random((batch_size, Tq, dim)).astype(np.float32)
+        value = np.random.random((batch_size, Tv, dim)).astype(np.float32)
+        query_mask = np.array([[True, False, True], [True, False, True]])
+        value_mask = np.array(
+            [[True, False, True, True], [True, False, True, True]]
+        )
+        output, scores = layer(
+            [query, value],
+            mask=[query_mask, value_mask],
+            return_attention_scores=True,
+        )
+        self.assertEqual(output.shape, (batch_size, Tq, dim))
+        self.assertEqual(scores.shape, (batch_size, Tq, Tv))
+        # Value position 1 is masked out, so no query may attend to it.
+        self.assertAllClose(scores[:, :, 1], np.zeros((batch_size, Tq)))
+        # Query position 1 is masked out, so its output row is zeroed.
+        self.assertAllClose(output[:, 1, :], np.zeros((batch_size, dim)))
+
     def test_attention_errors(self):
         layer = layers.Attention()
         tensor = np.array([[[1.0, 1.0], [1.0, 1.0]]])