@@ -196,31 +196,36 @@ def call_with_cache(
         the final hidden representation of the input tokens, and `cache` is
         the decoding cache.
         """
+        caches = []
+
+        use_openvino = keras.config.backend() == "openvino"

         def embed_and_scale_tokens(token_ids):
             x = self.backbone.token_embedding(token_ids)
             return x * ops.cast(ops.sqrt(self.backbone.hidden_dim), x.dtype)

-        def make_apply_fn(layer):
-            def apply_transformer_layer(inputs):
-                x = inputs["x"]
-                current_cache = inputs["current_cache"]
-                index = inputs["cache_update_index"]
-                x, next_cache = layer(
-                    x, cache=current_cache, cache_update_index=index
+        def apply_transformer_layers(inputs):
+            x = inputs["x"]
+            current_cache = inputs["current_cache"]
+            cache_update_index = inputs["cache_update_index"]
+            for i, transformer_layer in enumerate(
+                self.backbone.transformer_layers
+            ):
+                current_cache = cache[:, i, ...]
+                x, next_cache = transformer_layer(
+                    x,
+                    cache=current_cache,
+                    cache_update_index=cache_update_index,
                 )
-                return x, next_cache
-
-            return apply_transformer_layer
+                caches.append(next_cache)
+            return x, next_cache

         def finalize_generation_step(inputs):
             x = self.backbone.layer_norm(inputs["x"])
             cache = ops.stack(inputs["caches"], axis=1)
             logits = self.backbone.token_embedding(x, reverse=True)
             return logits, x, cache

-        use_openvino = keras.config.backend() == "openvino"
-
         if use_openvino:
             token_ids = ops.convert_to_numpy(token_ids)
             cache = ops.convert_to_numpy(cache)
@@ -233,56 +238,53 @@ def finalize_generation_step(inputs):
                 )
             else:
                 ov_cache = self._ov_mem.get("cache")
-                if ov_cache is not None and cache.shape == ov_cache.shape:
+                if ov_cache is not None and cache.shape == ov_cache.shape:
                     return None, self._ov_mem["hidden_states"], ov_cache
                 x = self.ov_infer(token_ids, embed_and_scale_tokens)
         else:
             x = embed_and_scale_tokens(token_ids)

-        caches = []
-        for i, transformer_layer in enumerate(self.backbone.transformer_layers):
-            current_cache = cache[:, i, ...]
-
-            inputs = {
-                "x": x,
-                "current_cache": current_cache,
-                "cache_update_index": cache_update_index,
-            }
-
-            apply_fn = make_apply_fn(transformer_layer)
-
-            if use_openvino:
-                if token_ids.shape[1] == 1:
-                    x, next_cache = self.ov_infer(
-                        inputs,
-                        apply_fn,
-                        disc=True,
-                        name=f"layer_{i}",
-                    )
-                else:
-                    x, next_cache = self.ov_infer(inputs, apply_fn)
+        if use_openvino:
+            if token_ids.shape[1] == 1:
+                x, cache = self.ov_infer(
+                    {"x": x, "current_cache": cache, "cache_update_index": 0},
+                    apply_transformer_layers,
+                    cache=True,
+                    name="apply_transformer_layers",
+                )
             else:
-                x, next_cache = apply_fn(inputs)
-
-            caches.append(next_cache)
+                x, cache = self.ov_infer(
+                    {"x": x, "current_cache": cache, "cache_update_index": 0},
+                    apply_transformer_layers,
+                )
+                self._ov_mem["cache"] = cache
+        else:
+            x, cache = apply_transformer_layers(
+                {
+                    "x": x,
+                    "current_cache": cache,
+                    "cache_update_index": cache_update_index,
+                }
+            )

-        inputs = {"x": x, "caches": caches}
         if use_openvino:
             if token_ids.shape[1] == 1:
                 logits, hidden_states, cache = self.ov_infer(
-                    inputs,
+                    {"x": x, "caches": caches},
                     finalize_generation_step,
                     cache=True,
                     name="finalize_generation_step",
                 )
             else:
                 logits, hidden_states, cache = self.ov_infer(
-                    inputs, finalize_generation_step
+                    {"x": x, "caches": caches}, finalize_generation_step
                 )
             self._ov_mem["cache"] = cache
             self._ov_mem["hidden_states"] = hidden_states
         else:
-            logits, hidden_states, cache = finalize_generation_step(inputs)
+            logits, hidden_states, cache = finalize_generation_step(
+                {"x": x, "caches": caches}
+            )

         return logits, hidden_states, cache
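For readers unfamiliar with the cache handling in this diff, here is a minimal standalone sketch of the slicing/stacking pattern it relies on: `apply_transformer_layers` reads layer `i`'s cache with `cache[:, i, ...]` and `finalize_generation_step` re-stacks the per-layer caches with `ops.stack(..., axis=1)`. Only that pattern comes from the code above; the concrete cache shape, dimension sizes, and key/value axis below are assumptions made purely for illustration.

```python
# Illustrative sketch only, not part of the diff.
import numpy as np
from keras import ops

# Hypothetical dimensions for the example.
batch, num_layers, seq_len, num_heads, head_dim = 1, 4, 8, 2, 16

# Assumed combined cache layout: axis 1 indexes the transformer layer,
# axis 2 holds the key/value pair.
cache = np.zeros(
    (batch, num_layers, 2, seq_len, num_heads, head_dim), "float32"
)

caches = []
for i in range(num_layers):
    # Per-layer slice, as in apply_transformer_layers.
    current_cache = cache[:, i, ...]
    # ... transformer_layer(x, cache=current_cache, ...) would run here ...
    caches.append(current_cache)

# finalize_generation_step restores the combined layout with ops.stack.
stacked = ops.stack(caches, axis=1)
print(tuple(stacked.shape) == cache.shape)  # True
```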