@@ -197,29 +197,35 @@ def call_with_cache(
             the decoding cache.
         """
 
+        use_openvino = keras.config.backend() == "openvino"
+
         def embed_and_scale_tokens(token_ids):
             x = self.backbone.token_embedding(token_ids)
             return x * ops.cast(ops.sqrt(self.backbone.hidden_dim), x.dtype)
 
-        def make_apply_fn(layer):
-            def apply_transformer_layer(inputs):
-                x = inputs["x"]
-                current_cache = inputs["current_cache"]
-                index = inputs["cache_update_index"]
-                x, next_cache = layer(
-                    x, cache=current_cache, cache_update_index=index
+        def apply_transformer_layers(inputs):
+            x = inputs["x"]
+            cache = inputs["cache"]
+            cache_update_index = inputs["cache_update_index"]
+            caches = []
+            for i, transformer_layer in enumerate(
+                self.backbone.transformer_layers
+            ):
+                current_cache = cache[:, i, ...]
+                x, next_cache = transformer_layer(
+                    x,
+                    cache=current_cache,
+                    cache_update_index=cache_update_index,
                 )
-                return x, next_cache
+                caches.append(next_cache)
 
-            return apply_transformer_layer
+            cache = ops.stack(caches, axis=1)
+            return x, cache
 
-        def finalize_generation_step(inputs):
-            x = self.backbone.layer_norm(inputs["x"])
-            cache = ops.stack(inputs["caches"], axis=1)
+        def finalize_generation_step(x):
+            hidden_states = x = self.backbone.layer_norm(x)
             logits = self.backbone.token_embedding(x, reverse=True)
-            return logits, x, cache
-
-        use_openvino = keras.config.backend() == "openvino"
+            return logits, hidden_states
 
         if use_openvino:
             token_ids = ops.convert_to_numpy(token_ids)
@@ -233,56 +239,58 @@ def finalize_generation_step(inputs):
                 )
             else:
                 ov_cache = self._ov_mem.get("cache")
-                if ov_cache is not None and cache.shape == ov_cache.shape:
+                if ov_cache is not None and cache.shape == ov_cache.shape:
                     return None, self._ov_mem["hidden_states"], ov_cache
                 x = self.ov_infer(token_ids, embed_and_scale_tokens)
         else:
             x = embed_and_scale_tokens(token_ids)
 
-        caches = []
-        for i, transformer_layer in enumerate(self.backbone.transformer_layers):
-            current_cache = cache[:, i, ...]
-
-            inputs = {
-                "x": x,
-                "current_cache": current_cache,
-                "cache_update_index": cache_update_index,
-            }
-
-            apply_fn = make_apply_fn(transformer_layer)
-
-            if use_openvino:
-                if token_ids.shape[1] == 1:
-                    x, next_cache = self.ov_infer(
-                        inputs,
-                        apply_fn,
-                        disc=True,
-                        name=f"layer_{i}",
-                    )
-                else:
-                    x, next_cache = self.ov_infer(inputs, apply_fn)
+        if use_openvino:
+            if token_ids.shape[1] == 1:
+                x, cache = self.ov_infer(
+                    {
+                        "x": x,
+                        "cache": cache,
+                        "cache_update_index": cache_update_index,
+                    },
+                    apply_transformer_layers,
+                    cache=True,
+                    name="apply_transformer_layers",
+                )
             else:
-                x, next_cache = apply_fn(inputs)
-
-            caches.append(next_cache)
+                x, cache = self.ov_infer(
+                    {
+                        "x": x,
+                        "cache": cache,
+                        "cache_update_index": cache_update_index,
+                    },
+                    apply_transformer_layers,
+                )
+            self._ov_mem["cache"] = cache
+        else:
+            x, cache = apply_transformer_layers(
+                {
+                    "x": x,
+                    "cache": cache,
+                    "cache_update_index": cache_update_index,
+                }
+            )
 
-        inputs = {"x": x, "caches": caches}
         if use_openvino:
             if token_ids.shape[1] == 1:
-                logits, hidden_states, cache = self.ov_infer(
-                    inputs,
+                logits, hidden_states = self.ov_infer(
+                    x,
                     finalize_generation_step,
                     cache=True,
                     name="finalize_generation_step",
                 )
             else:
-                logits, hidden_states, cache = self.ov_infer(
-                    inputs, finalize_generation_step
+                logits, hidden_states = self.ov_infer(
+                    x, finalize_generation_step
                 )
-            self._ov_mem["cache"] = cache
             self._ov_mem["hidden_states"] = hidden_states
         else:
-            logits, hidden_states, cache = finalize_generation_step(inputs)
+            logits, hidden_states = finalize_generation_step(x)
 
         return logits, hidden_states, cache
 
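Note on the refactor (a sketch, not part of the diff): the per-layer caches now travel as a single tensor with a layer axis. apply_transformer_layers slices out each layer's cache with cache[:, i, ...] and restacks the updated slices with ops.stack(caches, axis=1) before returning, so one function (and one OpenVINO graph) covers all layers instead of one per layer. A minimal, self-contained illustration of that layout using keras.ops; the "+ 1.0" update is a hypothetical stand-in for the real attention-cache update, not the model's logic:

from keras import ops

batch, num_layers, seq_len, dim = 2, 4, 8, 16
# Combined cache: one slice per transformer layer, stacked on axis 1.
cache = ops.zeros((batch, num_layers, seq_len, dim))

caches = []
for i in range(num_layers):
    current_cache = cache[:, i, ...]   # slice handed to layer i
    next_cache = current_cache + 1.0   # stand-in for the layer's cache update
    caches.append(next_cache)

cache = ops.stack(caches, axis=1)      # same (batch, num_layers, ...) layout
print(ops.shape(cache))                # (2, 4, 8, 16)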