keras-team · divyashreepathihalli · Jul 10, 2025 · Jun 10, 2025 · Jun 10, 2025 · Jun 25, 2025
diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
@@ -454,6 +454,9 @@
 from keras_hub.src.models.qwen3.qwen3_backbone import (
     Qwen3Backbone as Qwen3Backbone,
 )
+from keras_hub.src.models.qwen3.qwen3_causal_lm import (
+    Qwen3CausalLM as Qwen3CausalLM,
+)
 from keras_hub.src.models.qwen3.qwen3_causal_lm_preprocessor import (
     Qwen3CausalLMPreprocessor as Qwen3CausalLMPreprocessor,
 )

diff --git a/keras_hub/src/models/qwen3/qwen3_attention.py b/keras_hub/src/models/qwen3/qwen3_attention.py
@@ -299,11 +299,11 @@ def _compute_attention(
             attention_scores,
             ops.cast(self._inv_norm_factor, self.compute_dtype),
         )
-        if not self.sliding_window_size:
+        if self.sliding_window_size:
             attention_mask = self._mask_sliding_window(
                 attention_mask,
                 cache_update_index=cache_update_index
-                if cache_update_index
+                if cache_update_index is not None
                 else 0,
             )
         attention_scores = self._masked_softmax(