
Commit 298a319

remove unwanted code
1 parent 2044408 commit 298a319

6 files changed: +54 -219 lines

keras_hub/src/models/causal_lm.py

Lines changed: 21 additions & 57 deletions
@@ -58,11 +58,6 @@ class CausalLM(Task):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # only OpenVINO needs these declarations
-        if keras.config.backend() == "openvino":
-            self._ov_mem = {}
-            self.struct_outputs = None
-            self.ov_infer = None
 
     def compile(
         self,
@@ -170,78 +165,47 @@ def parameterize_inputs(inputs):
                 else:
                     raise TypeError(f"Unknown input type: {type(inputs)}")
 
-            def set_struct_outputs(inputs, fn):
+            def get_struct_outputs(inputs, stop_token_ids, fn):
                 struct_params = parameterize_inputs(inputs)
-                self.struct_outputs = fn(struct_params)
-                return struct_params, self.struct_outputs
+                struct_outputs = fn(struct_params, stop_token_ids)
+                return struct_params, struct_outputs
 
-            def get_outputs_from_model(inputs, model):
+            def get_outputs_from_model(
+                inputs, struct_outputs, compile_ov_model
+            ):
                 flatten_inputs = tree.flatten(inputs)
                 assert OpenVINOKerasTensor not in inputs, (
                     "inputs should be numpy arrays"
                 )
-                outputs = model(flatten_inputs)
+                outputs = compile_ov_model(flatten_inputs)
                 outputs = unpack_singleton(
-                    tree.pack_sequence_as(
-                        self.struct_outputs, outputs.to_tuple()
-                    )
+                    tree.pack_sequence_as(struct_outputs, outputs.to_tuple())
                 )
                 return outputs
 
-            def get_model(inputs, fn, ov_model=None, compiled=False):
-                struct_params, _ = set_struct_outputs(inputs, fn)
-
-                if ov_model is not None:
-                    assert compiled, (
-                        "if you pass a model, you should make compiled=True"
-                    )
-                    return ov.compile_model(ov_model, "CPU")
-
+            def ov_infer(inputs, struct_params, struct_outputs):
                 parameters = [
                     p.output.get_node() for p in tree.flatten(struct_params)
                 ]
                 results = [
                     ov_opset.result(r.output)
-                    for r in tree.flatten(self.struct_outputs)
+                    for r in tree.flatten(struct_outputs)
                 ]
 
                 ov_model = ov.Model(results=results, parameters=parameters)
-                if not compiled:
-                    return ov_model
-
-                return ov.compile_model(ov_model, "CPU")
-
-            def ov_infer(
-                inputs,
-                fn,
-                cache=False,
-                name=None,
-            ):
-                compiled_model = None
-                if cache:
-                    assert name is not None, (
-                        "you should provide name of the model being cached"
-                    )
-                    if self._ov_mem.get(name) is None:
-                        self._ov_mem[name] = get_model(
-                            inputs, fn, compiled=True
-                        )
-                    else:
-                        set_struct_outputs(inputs, fn)
-                    compiled_model = self._ov_mem[name]
-                else:
-                    compiled_model = get_model(inputs, fn, compiled=True)
-                outputs = get_outputs_from_model(inputs, compiled_model)
-                del compiled_model
-                return outputs
-
-            self.ov_infer = ov_infer
+                compile_ov_model = ov.compile_model(ov_model, "CPU")
+                return get_outputs_from_model(
+                    inputs, struct_outputs, compile_ov_model
+                )
 
             def wrapped_generate_function(inputs, stop_token_ids=None):
-                outputs = self.generate_step(inputs, stop_token_ids)
-                for k, v in outputs.items():
-                    outputs[k] = ops.convert_to_numpy(v)
-                return outputs
+                for k, v in inputs.items():
+                    if isinstance(v, OpenVINOKerasTensor):
+                        inputs[k] = ops.convert_to_numpy(v)
+                struct_params, struct_outputs = get_struct_outputs(
+                    inputs, stop_token_ids, self.generate_step
+                )
+                return ov_infer(inputs, struct_params, struct_outputs)
 
             self.generate_function = wrapped_generate_function
         if keras.config.backend() == "torch":
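
For orientation, the rewritten helpers boil down to a single trace-build-compile-run pass over OpenVINO's graph API on each generate call, with no per-model caching between calls. Below is a minimal, self-contained sketch of that pattern. It is our illustration, not keras_hub code, and assumes the same openvino package and opset14 module the Keras OpenVINO backend uses:

    import numpy as np
    import openvino as ov
    import openvino.runtime.opset14 as ov_opset

    # Trace a trivial graph: one parameter node feeding one result node.
    param = ov_opset.parameter(shape=[1, 4], dtype=np.float32)
    doubled = ov_opset.multiply(param, ov_opset.constant(np.float32(2.0)))
    result = ov_opset.result(doubled.output(0))

    # Build and compile for CPU, as ov_infer does with the traced generate_step.
    model = ov.Model(results=[result], parameters=[param])
    compiled = ov.compile_model(model, "CPU")

    # Run on flat numpy inputs; to_tuple() recovers the positional outputs.
    outputs = compiled([np.ones((1, 4), dtype=np.float32)])
    print(outputs.to_tuple()[0])  # [[2. 2. 2. 2.]]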

keras_hub/src/models/gemma/gemma_causal_lm.py

Lines changed: 17 additions & 94 deletions
@@ -196,102 +196,22 @@ def call_with_cache(
             the final hidden representation of the input tokens, and `cache` is
             the decoding cache.
         """
-
-        use_openvino = keras.config.backend() == "openvino"
-
-        def embed_and_scale_tokens(token_ids):
-            x = self.backbone.token_embedding(token_ids)
-            return x * ops.cast(ops.sqrt(self.backbone.hidden_dim), x.dtype)
-
-        def apply_transformer_layers(inputs):
-            x = inputs["x"]
-            cache = inputs["cache"]
-            cache_update_index = inputs["cache_update_index"]
-            caches = []
-            for i, transformer_layer in enumerate(
-                self.backbone.transformer_layers
-            ):
-                current_cache = cache[:, i, ...]
-                x, next_cache = transformer_layer(
-                    x,
-                    cache=current_cache,
-                    cache_update_index=cache_update_index,
-                )
-                caches.append(next_cache)
-
-            cache = ops.stack(caches, axis=1)
-            return x, cache
-
-        def finalize_generation_step(x):
-            hidden_states = x = self.backbone.layer_norm(x)
-            logits = self.backbone.token_embedding(x, reverse=True)
-            return logits, hidden_states
-
-        if use_openvino:
-            token_ids = ops.convert_to_numpy(token_ids)
-            cache = ops.convert_to_numpy(cache)
-            if token_ids.shape[1] == 1:
-                x = self.ov_infer(
-                    token_ids,
-                    embed_and_scale_tokens,
-                    cache=True,
-                    name="embed_and_scale_tokens",
-                )
-            else:
-                ov_cache = self._ov_mem.get("cache")
-                if ov_cache is not None and cache.shape == ov_cache.shape:
-                    return None, self._ov_mem["hidden_states"], ov_cache
-                x = self.ov_infer(token_ids, embed_and_scale_tokens)
-        else:
-            x = embed_and_scale_tokens(token_ids)
-
-        if use_openvino:
-            if token_ids.shape[1] == 1:
-                x, cache = self.ov_infer(
-                    {
-                        "x": x,
-                        "cache": cache,
-                        "cache_update_index": cache_update_index,
-                    },
-                    apply_transformer_layers,
-                    cache=True,
-                    name="apply_transformer_layers",
-                )
-            else:
-                x, cache = self.ov_infer(
-                    {
-                        "x": x,
-                        "cache": cache,
-                        "cache_update_index": cache_update_index,
-                    },
-                    apply_transformer_layers,
-                )
-            self._ov_mem["cache"] = cache
-        else:
-            x, cache = apply_transformer_layers(
-                {
-                    "x": x,
-                    "cache": cache,
-                    "cache_update_index": cache_update_index,
-                }
+        x = self.backbone.token_embedding(token_ids)
+        x = x * ops.cast(ops.sqrt(self.backbone.hidden_dim), x.dtype)
+        # Each decoder layer has a cache; we update them separately.
+        caches = []
+        for i, transformer_layer in enumerate(self.backbone.transformer_layers):
+            current_cache = cache[:, i, ...]
+            x, next_cache = transformer_layer(
+                x,
+                cache=current_cache,
+                cache_update_index=cache_update_index,
             )
+            caches.append(next_cache)
 
-        if use_openvino:
-            if token_ids.shape[1] == 1:
-                logits, hidden_states = self.ov_infer(
-                    x,
-                    finalize_generation_step,
-                    cache=True,
-                    name="finalize_generation_step",
-                )
-            else:
-                logits, hidden_states = self.ov_infer(
-                    x, finalize_generation_step
-                )
-            self._ov_mem["hidden_states"] = hidden_states
-        else:
-            logits, hidden_states = finalize_generation_step(x)
-
+        cache = ops.stack(caches, axis=1)
+        hidden_states = x = self.backbone.layer_norm(x)
+        logits = self.backbone.token_embedding(x, reverse=True)
         return logits, hidden_states, cache
 
     def _build_cache(self, token_ids):
@@ -338,6 +258,9 @@ def next(prompt, cache, index):
             cache_update_index = index - 1
             batch_size = ops.shape(prompt)[0]
             prompt = ops.slice(prompt, [0, cache_update_index], [batch_size, 1])
+            if keras.config.backend() == "openvino":
+                # Avoid the dynamic output shape of OpenVINO's slice.
+                prompt = ops.reshape(prompt, [batch_size, 1])
             logits, hidden_states, cache = self.call_with_cache(
                 prompt,
                 cache,
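
The three-line addition to next() is the commit's only functional change to this file's generation loop. Under the OpenVINO backend, ops.slice with a traced start index can report a dynamic output shape, which does not fit the statically shaped graph compiled for token-by-token decoding; the reshape is numerically a no-op that pins the shape back to [batch_size, 1]. A hedged sketch of the pattern in isolation (the helper name is ours):

    from keras import ops

    def take_next_token(prompt, index):
        # Slice out the single token at a traced position, as next() does.
        batch_size = ops.shape(prompt)[0]
        token = ops.slice(prompt, [0, index - 1], [batch_size, 1])
        # A no-op numerically, but it replaces the dynamic dimension that
        # OpenVINO's slice can report with the expected [batch_size, 1] shape.
        return ops.reshape(token, [batch_size, 1])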

keras_hub/src/models/gemma/gemma_causal_lm_test.py

Lines changed: 0 additions & 30 deletions
@@ -64,10 +64,6 @@ def test_causal_lm_basics(self):
             expected_output_shape=(2, 8, 11),
         )
 
-    @pytest.mark.skipif(
-        keras.config.backend() == "openvino",
-        reason="OpenVINO is for inference only",
-    )
     def test_cache_correctness(self):
         token_ids = self.input_data["token_ids"]
         padding_mask = ops.ones_like(self.input_data["padding_mask"])
@@ -97,9 +93,6 @@ def test_generate(self):
         causal_lm.preprocessor = None
         outputs = causal_lm.generate(prompt_ids, stop_token_ids=None)
         # Assert prompt is in output in token id space.
-        if keras.config.backend() == "openvino":
-            for k, v in prompt_ids.items():
-                prompt_ids[k] = ops.convert_to_numpy(v)
         self.assertAllEqual(
             outputs["token_ids"][:, :4],
             prompt_ids["token_ids"][:, :4],
@@ -139,9 +132,6 @@ def test_generate_with_bfloat16(self):
         causal_lm.preprocessor = None
         outputs = causal_lm.generate(prompt_ids, stop_token_ids=None)
         # Assert prompt is in output in token id space.
-        if keras.config.backend() == "openvino":
-            for k, v in prompt_ids.items():
-                prompt_ids[k] = ops.convert_to_numpy(v)
         self.assertAllEqual(
             outputs["token_ids"][:, :4],
             prompt_ids["token_ids"][:, :4],
@@ -163,12 +153,6 @@ def wrapper(*args, **kwargs):
             """Modify output logits to always favor end_token_id"""
             logits, hidden_states, cache = call_with_cache(*args, **kwargs)
             index = self.preprocessor.tokenizer.end_token_id
-            if keras.config.backend() == "openvino":
-                """Set all logits to a large negative number
-                to avoid NaNs produced by ov.einsum"""
-                logits = ops.ones_like(logits) * ops.convert_to_tensor(
-                    -1e9, dtype=logits.dtype
-                )
             update = ops.ones_like(logits)[:, :, index] * 1.0e9
             update = ops.expand_dims(update, axis=-1)
             logits = ops.slice_update(logits, (0, 0, index), update)
@@ -188,12 +172,6 @@ def wrapper(*args, **kwargs):
             """Modify output logits to always favor end_token_id"""
             logits, hidden_states, cache = call_with_cache(*args, **kwargs)
             index = self.preprocessor.tokenizer.end_token_id
-            if keras.config.backend() == "openvino":
-                """Set all logits to a large negative number
-                to avoid NaNs produced by ov.einsum"""
-                logits = ops.ones_like(logits) * ops.convert_to_tensor(
-                    -1e9, dtype=logits.dtype
-                )
             update = ops.ones_like(logits)[:, :, index] * 1.0e9
             update = ops.expand_dims(update, axis=-1)
             logits = ops.slice_update(logits, (0, 0, index), update)
@@ -237,10 +215,6 @@ def test_all_presets(self):
             input_data=self.input_data,
         )
 
-    @pytest.mark.skipif(
-        keras.config.backend() == "openvino",
-        reason="OpenVINO is for inference only",
-    )
     def test_score_logits(self):
         # Setup prompts, models, and associated expected shapes.
         prompts = ["the quick brown fox", "the quick brown fox"]
@@ -263,10 +237,6 @@ def test_score_logits(self):
 
         self.assertEqual(ops.shape(scores), expected_score_shape)
 
-    @pytest.mark.skipif(
-        keras.config.backend() == "openvino",
-        reason="OpenVINO is for inference only",
-    )
     def test_score_loss(self):
         # Setup prompts, models, and associated expected shapes.
         prompts = ["the quick brown fox", "the quick brown fox"]

keras_hub/src/models/mistral/mistral_causal_lm.py

Lines changed: 3 additions & 0 deletions
@@ -145,6 +145,9 @@ def next(prompt, cache, index):
             cache_update_index = index - 1
             batch_size = ops.shape(prompt)[0]
             prompt = ops.slice(prompt, [0, cache_update_index], [batch_size, 1])
+            if keras.config.backend() == "openvino":
+                # Avoid the dynamic output shape of OpenVINO's slice.
+                prompt = ops.reshape(prompt, [batch_size, 1])
             logits, hidden_states, cache = self.call_with_cache(
                 prompt,
                 cache,
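
Taken together, the commit leaves OpenVINO generation stateless: each generate call converts inputs to numpy, traces generate_step, compiles, and runs. A hedged usage sketch follows; the preset name is illustrative and assumes the weights are available, and the backend must be selected before keras is imported:

    import os

    # Select the backend before keras (or keras_hub) is imported.
    os.environ["KERAS_BACKEND"] = "openvino"

    import keras_hub

    causal_lm = keras_hub.models.GemmaCausalLM.from_preset("gemma_2b_en")
    print(causal_lm.generate("the quick brown fox", max_length=30))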
