huggingface · echarlaix · Oct 16, 2024 · Feb 17, 2025
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
@@ -83,7 +83,6 @@
 )
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
-    CLIPModelPatcher,
     FalconModelPatcher,
     MgpstrModelPatcher,
     MistralModelPatcher,
@@ -1109,6 +1108,7 @@ class CLIPNormalizedConfig(NormalizedTextAndVisionConfig):
 
 class CLIPVisionModelOnnxConfig(VisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -1122,16 +1122,10 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 
         return common_outputs
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class CLIPOnnxConfig(TextAndVisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -1150,13 +1144,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
             "image_embeds": {0: "image_batch_size"},
         }
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class SentenceTransformersCLIPOnnxConfig(CLIPOnnxConfig):
     @property
@@ -1202,13 +1189,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 
         return common_outputs
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class CLIPTextOnnxConfig(CLIPTextWithProjectionOnnxConfig):
     @property

diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
@@ -1278,16 +1278,3 @@ def __init__(
                 self._update_causal_mask_original = self._model.model._update_causal_mask
             else:
                 self._update_causal_mask_original = self._model._update_causal_mask
-
-
-class CLIPModelPatcher(ModelPatcher):
-    def __enter__(self):
-        super().__enter__()
-        if is_transformers_version(">=", "4.43"):
-            self.original_sdpa_forward = CLIPSdpaAttention.forward
-            CLIPSdpaAttention.forward = CLIPAttention.forward
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        if is_transformers_version(">=", "4.43"):
-            CLIPSdpaAttention.forward = self.original_sdpa_forward