cleanup
teetone committed Apr 28, 2024
1 parent a909139 commit ded6d2c
Showing 10 changed files with 84 additions and 109 deletions.
6 changes: 4 additions & 2 deletions setup.cfg
@@ -47,14 +47,13 @@ install_requires=
# Basic metrics
nltk~=3.7
pyext~=0.7
pycocoevalcap~=1.2
rouge-score~=0.1.2
scipy~=1.10
uncertainty-calibration~=0.1.4
scikit-learn~=1.1

# Models and Metrics Extras
transformers~=4.40.0 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
# TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
torch>=1.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
torchvision>=0.14.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
@@ -176,6 +175,9 @@ vlm =
crfm-helm[images]
crfm-helm[image2structure]

# For metrics
pycocoevalcap~=1.2

image2structure =
crfm-helm[images]

4 changes: 1 addition & 3 deletions src/helm/benchmark/metrics/common_metric_specs.py
@@ -164,6 +164,4 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:


def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(
["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
)
return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
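CIDEr is dropped from the default open-ended generation metrics because its pycocoevalcap dependency now lives behind the vlm extra (see the setup.cfg change above). A minimal sketch, assuming get_basic_metric_specs simply maps metric names to MetricSpecs as the removed return statement did, of how a caller with the extra installed could still request it explicitly:

```python
# Hypothetical sketch: request CIDEr explicitly when pycocoevalcap is available
# (pip install "crfm-helm[vlm]"). The metric names match the removed default list.
from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs

captioning_metric_specs = get_basic_metric_specs(
    ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
)
```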
8 changes: 7 additions & 1 deletion src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -10,6 +10,7 @@
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.scenarios.code_scenario import CodeReference
from helm.benchmark.scenarios.scenario import Reference
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.request import GeneratedOutput
from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
from nltk.metrics.scores import f_measure
@@ -20,7 +21,7 @@
import string
from . import code_metrics_helper
import nltk
from pycocoevalcap.cider.cider import Cider


try:
nltk.data.find("tokenizers/punkt")
@@ -190,6 +191,11 @@ def bleu_4(gold: str, pred: str) -> float:


def cider(gold: str, pred: str) -> float:
try:
from pycocoevalcap.cider.cider import Cider
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["vlm"])

cider_evaluator = Cider()
candidate = {"caption": [pred]}
reference = {"caption": [gold]}
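The cider implementation now imports pycocoevalcap lazily and reports a helpful install hint via handle_module_not_found_error when the vlm extra is missing. The body above is truncated after building the candidate and reference dictionaries; a minimal standalone sketch of how pycocoevalcap's Cider scorer is typically driven from there, assuming its usual compute_score(references, candidates) interface rather than the exact code in this file:

```python
# Sketch of single-pair CIDEr scoring with pycocoevalcap; requires the optional
# dependency (pip install "crfm-helm[vlm]" or pip install pycocoevalcap).
from pycocoevalcap.cider.cider import Cider


def cider_score(gold: str, pred: str) -> float:
    cider_evaluator = Cider()
    # Both arguments map an arbitrary key to a list of caption strings.
    candidate = {"caption": [pred]}
    reference = {"caption": [gold]}
    average_score, _per_caption_scores = cider_evaluator.compute_score(reference, candidate)
    return float(average_score)
```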
5 changes: 0 additions & 5 deletions src/helm/benchmark/presentation/run_entries_debug.conf

This file was deleted.

125 changes: 61 additions & 64 deletions src/helm/benchmark/run_specs/vlm_run_specs.py

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions src/helm/clients/openai_client.py
@@ -55,12 +55,7 @@ def _is_chat_model_engine(self, model_engine: str) -> bool:
return True
return False

def _is_high_res_vision_model(self, model_engine: str) -> bool:
return model_engine == "gpt-4-vision-preview-high-res"

def _get_model_for_request(self, request: Request) -> str:
if self._is_high_res_vision_model(request.model_engine):
return "gpt-4-vision-preview"
return request.model_engine

def _get_cache_key(self, raw_request: Dict, request: Request):
@@ -136,9 +131,6 @@ def _make_chat_request(self, request: Request) -> RequestResult:

base64_image: str = encode_base64(media_object.location)
image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
if self._is_high_res_vision_model(request.model_engine):
image_object["detail"] = "high"

content.append({"type": "image_url", "image_url": image_object})
elif media_object.is_type(TEXT_TYPE):
if media_object.text is None:
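For context, the removed branch was how the pseudo-deployment gpt-4-vision-preview-high-res worked: it mapped requests onto the real gpt-4-vision-preview model and set detail: "high" on every image part. A minimal sketch of that message shape, assuming the standard OpenAI Chat Completions content-part format; the helper name and image path are illustrative:

```python
# Sketch of building a high-detail image content part, the behavior the removed
# _is_high_res_vision_model branch enabled. image_content_part and example.jpg are illustrative.
import base64


def image_content_part(image_path: str, high_res: bool) -> dict:
    with open(image_path, "rb") as f:
        base64_image = base64.b64encode(f.read()).decode("utf-8")
    image_object = {"url": f"data:image/jpeg;base64,{base64_image}"}
    if high_res:
        # "detail": "high" asks the model to also inspect detailed 512px crops of the image.
        image_object["detail"] = "high"
    return {"type": "image_url", "image_url": image_object}


message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},
        image_content_part("example.jpg", high_res=True),
    ],
}
```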
10 changes: 6 additions & 4 deletions src/helm/clients/vertexai_client.py
@@ -354,6 +354,9 @@ def do_it() -> Dict[str, Any]:
raise VertexAIContentBlockedError("No candidates in response due to content blocking")

# We should only have one candidate
assert (
len(candidates) == 1
), f"Expected 1 candidate since candidate_count is 1, got {len(candidates)}."
candidate: Candidate = candidates[0]
if (
candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS
@@ -373,12 +376,11 @@ def do_it() -> Dict[str, Any]:

cache_key = CachingClient.make_cache_key(raw_cache_key, request)
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
except (requests.exceptions.RequestException, VertexAIContentBlockedError) as e:
if "Content has no parts" in str(e):
return complete_for_valid_error(self.CONTENT_HAS_NO_PARTS_ERROR)

except requests.exceptions.RequestException as e:
error: str = f"Gemini Vision error: {e}"
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
except VertexAIContentBlockedError as e:
return complete_for_valid_error(str(e))

if "error" in response:
return complete_for_valid_error(response["error"])
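The error handling is also split: transport failures (requests.exceptions.RequestException) still produce a failed RequestResult, while VertexAIContentBlockedError is now converted into a valid empty completion. A condensed, self-contained sketch of that control flow; the exception class and helpers below are stand-ins for the real ones in vertexai_client.py:

```python
# Condensed sketch of the split exception handling; VertexAIContentBlockedError and
# complete_for_valid_error stand in for the real definitions in vertexai_client.py.
import requests

from helm.common.request import RequestResult


class VertexAIContentBlockedError(Exception):
    """Stand-in for the client's content-blocked exception."""


def fetch_with_split_error_handling(get_cached_response, complete_for_valid_error):
    try:
        # In the real client this wraps self.cache.get(cache_key, wrap_request_time(do_it)).
        response = get_cached_response()
    except requests.exceptions.RequestException as e:
        # Transport/HTTP problems are genuine failures.
        return RequestResult(
            success=False, cached=False, error=f"Gemini Vision error: {e}", completions=[], embedding=[]
        )
    except VertexAIContentBlockedError as e:
        # Blocked content becomes a valid (empty) completion instead of a hard failure.
        return complete_for_valid_error(str(e))
    if "error" in response:
        return complete_for_valid_error(response["error"])
    return response
```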
@@ -18,15 +18,15 @@


@dataclass(frozen=True)
class LoadedVision2SeqModelProcessor:
class Vision2SeqModelProcessor:
"""Loaded model and processor."""

model: AutoModelForVision2Seq
processor: AutoProcessor


_models_lock: Lock = Lock()
_models: Dict[str, Optional[LoadedVision2SeqModelProcessor]] = {
_models: Dict[str, Optional[Vision2SeqModelProcessor]] = {
"HuggingFaceM4/idefics2-8b": None,
}

@@ -44,7 +44,7 @@ def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
self.tokenizer_name = tokenizer_name
self._device: str = get_torch_device_name()

def _get_model(self, checkpoint: str) -> LoadedVision2SeqModelProcessor:
def _get_model(self, checkpoint: str) -> Vision2SeqModelProcessor:
global _models_lock
global _models

@@ -57,7 +57,7 @@ def _get_model(self, checkpoint: str) -> LoadedVision2SeqModelProcessor:
model = AutoModelForVision2Seq.from_pretrained(checkpoint, torch_dtype=torch_dtype).to(self._device)
processor = AutoProcessor.from_pretrained(checkpoint)

_models[checkpoint] = LoadedVision2SeqModelProcessor(model, processor)
_models[checkpoint] = Vision2SeqModelProcessor(model, processor)
loaded_model_processor = _models[checkpoint]

assert loaded_model_processor is not None
@@ -67,7 +67,7 @@ def make_request(self, request: Request) -> RequestResult:
assert request.model_deployment in _models, f"Not a valid model for this client: {request.model_deployment}"
assert request.multimodal_prompt is not None, "Multimodal prompt is required"

loaded_model_processor: LoadedVision2SeqModelProcessor = self._get_model(request.model_deployment)
loaded_model_processor: Vision2SeqModelProcessor = self._get_model(request.model_deployment)
model = loaded_model_processor.model
processor = loaded_model_processor.processor

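The rename from LoadedVision2SeqModelProcessor to Vision2SeqModelProcessor runs through the module-level cache that loads each checkpoint at most once behind a lock. A minimal sketch of that lazy, thread-safe loading pattern, assuming the same transformers classes the diff uses; the dtype and device handling of the real client are simplified away:

```python
# Minimal sketch of the lock-guarded, load-once cache for Vision2Seq checkpoints.
# torch_dtype here is illustrative; the real client also moves the model to a torch device.
from dataclasses import dataclass
from threading import Lock
from typing import Dict, Optional

import torch
from transformers import AutoModelForVision2Seq, AutoProcessor


@dataclass(frozen=True)
class Vision2SeqModelProcessor:
    """Loaded model and processor."""

    model: AutoModelForVision2Seq
    processor: AutoProcessor


_models_lock: Lock = Lock()
_models: Dict[str, Optional[Vision2SeqModelProcessor]] = {"HuggingFaceM4/idefics2-8b": None}


def get_model(checkpoint: str) -> Vision2SeqModelProcessor:
    global _models
    with _models_lock:
        if _models[checkpoint] is None:
            model = AutoModelForVision2Seq.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
            processor = AutoProcessor.from_pretrained(checkpoint)
            _models[checkpoint] = Vision2SeqModelProcessor(model, processor)
        loaded_model_processor = _models[checkpoint]
    assert loaded_model_processor is not None
    return loaded_model_processor
```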
9 changes: 0 additions & 9 deletions src/helm/config/model_deployments.yaml
@@ -1357,15 +1357,6 @@ model_deployments:
client_spec:
class_name: "helm.clients.openai_client.OpenAIClient"

- name: openai/gpt-4-vision-preview-high-res
model_name: openai/gpt-4-vision-preview-high-res
tokenizer_name: openai/cl100k_base
max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
max_request_length: 128001
max_sequence_and_generated_tokens_length: 132096
client_spec:
class_name: "helm.clients.openai_client.OpenAIClient"

## Codex Models
# DEPRECATED: Codex models have been shut down on March 23 2023.

8 changes: 0 additions & 8 deletions src/helm/config/model_metadata.yaml
@@ -1772,14 +1772,6 @@ models:
release_date: 2023-11-06
tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]

- name: openai/gpt-4-vision-preview-high-res
display_name: GPT-4V high res (preview)
description: GPT-4V with "high res" mode enabled, which first allows the model to see the low res image and then creates detailed crops of input images as 512px squares based on the input image size.
creator_organization_name: OpenAI
access: limited
release_date: 2023-11-06
tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]

## Codex Models
# DEPRECATED: Codex models have been shut down on March 23 2023.

