diff --git a/setup.cfg b/setup.cfg
index 14421056536..053637f5f77 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,14 +47,13 @@ install_requires=
     # Basic metrics
     nltk~=3.7
     pyext~=0.7
-    pycocoevalcap~=1.2
     rouge-score~=0.1.2
     scipy~=1.10
     uncertainty-calibration~=0.1.4
     scikit-learn~=1.1

     # Models and Metrics Extras
-    transformers~=4.40.0 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
+    transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     torch>=1.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     torchvision>=0.14.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
@@ -176,6 +175,9 @@ vlm =
     crfm-helm[images]
     crfm-helm[image2structure]

+    # For metrics
+    pycocoevalcap~=1.2
+
 image2structure =
     crfm-helm[images]
diff --git a/src/helm/benchmark/metrics/common_metric_specs.py b/src/helm/benchmark/metrics/common_metric_specs.py
index 00f2af22e0f..0e19111c44c 100644
--- a/src/helm/benchmark/metrics/common_metric_specs.py
+++ b/src/helm/benchmark/metrics/common_metric_specs.py
@@ -164,6 +164,4 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricS


 def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
-    return get_basic_metric_specs(
-        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
-    )
+    return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
diff --git a/src/helm/benchmark/metrics/evaluate_reference_metrics.py b/src/helm/benchmark/metrics/evaluate_reference_metrics.py
index f0a2b2dfd6e..754012f1feb 100644
--- a/src/helm/benchmark/metrics/evaluate_reference_metrics.py
+++ b/src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -10,6 +10,7 @@
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
 from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from nltk.metrics.scores import f_measure
@@ -20,7 +21,7 @@ import string

 from . import code_metrics_helper
 import nltk
-from pycocoevalcap.cider.cider import Cider
+

 try:
     nltk.data.find("tokenizers/punkt")
@@ -190,6 +191,11 @@ def bleu_4(gold: str, pred: str) -> float:


 def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
     cider_evaluator = Cider()
     candidate = {"caption": [pred]}
     reference = {"caption": [gold]}
diff --git a/src/helm/benchmark/presentation/run_entries_debug.conf b/src/helm/benchmark/presentation/run_entries_debug.conf
deleted file mode 100644
index 7a5660ab361..00000000000
--- a/src/helm/benchmark/presentation/run_entries_debug.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-entries: [
-
-  {description: "viz_wiz:model=vlm", priority: 1}
-
-]
diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py
index 130377c3e52..ca1e233f2d7 100644
--- a/src/helm/benchmark/run_specs/vlm_run_specs.py
+++ b/src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -11,7 +11,7 @@
     get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
     get_generative_harms_metric_specs,
-    get_open_ended_generation_metric_specs,
+    get_basic_metric_specs,
 )
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -23,7 +23,7 @@


 # Prototypical adapter specs for VLM evaluation
-def get_generation_adapter_spec(
+def _get_generation_adapter_spec(
     instructions: str = "",
     input_prefix: str = "",
     input_suffix: str = "",
@@ -51,8 +51,8 @@ def get_generation_adapter_spec(
     )


-def get_short_answer_generation_adapter_spec(instructions: Optional[str] = None):
-    return get_generation_adapter_spec(
+def _get_short_answer_generation_adapter_spec(instructions: Optional[str] = None) -> AdapterSpec:
+    return _get_generation_adapter_spec(
         instructions=(
             "Just give a short answer without answering in a complete sentence."
             if instructions is None
@@ -62,7 +62,15 @@ def get_short_answer_generation_adapter_spec(instructions: Optional[str] = None)
     )


-def get_multiple_choice_joint_adapter_spec(
+def _get_captioning_adapter_spec() -> AdapterSpec:
+    return _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image. The caption should be short and does "
+        "not need to be a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 0,
@@ -90,7 +98,13 @@ def get_multiple_choice_joint_adapter_spec(


 # VHELM metric specs
-def get_image2structure_metric_specs(
+def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(
+        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+    )
+
+
+def _get_image2structure_metric_specs(
     generation_type: str,
     metric_names: Optional[List[str]] = None,
     args: Optional[Dict] = None,
@@ -134,7 +148,7 @@ def get_a_okvqa_spec() -> RunSpec:
         args={},
     )

-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )

@@ -155,7 +169,7 @@ def get_chart2csv_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
         args={},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
         "Please infer the missing labels based on the surrounding context. "
         "Just give the CSV without any explanation.",
@@ -179,8 +193,8 @@ def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
         args={"location": location, "language": language},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(max_tokens=20)
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=20)
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "crossmodal_3600"
     return RunSpec(
@@ -197,12 +211,8 @@ def get_flickr30k_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
-        instructions="Generate a caption for the following image. The caption should neither be long "
-        "nor a complete sentence.",
-        max_tokens=20,
-    )
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "flickr30k"
     return RunSpec(
@@ -219,10 +229,10 @@ def get_gqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
         instructions="Answer the question using a single word or phrase."
     )
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "gqa"
     return RunSpec(
@@ -239,7 +249,7 @@ def get_hateful_memes_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
     )
-    adapter_spec = get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
+    adapter_spec = _get_multiple_choice_joint_adapter_spec(input_noun=None, output_noun="Answer", max_train_instances=0)
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "hateful_memes"
@@ -258,7 +268,7 @@ def get_mm_safety_bench_spec(subset: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mm_safety_bench_scenario.MMSafetyBenchScenario",
         args={"subset": subset},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(max_tokens=500)
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
     metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
         include_basic_metrics=True, include_generative_harms_metrics=True
     )
@@ -282,17 +292,13 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:

     adapter_spec: AdapterSpec
     if long:
-        adapter_spec = get_generation_adapter_spec(
+        adapter_spec = _get_generation_adapter_spec(
             instructions="Generate a long, detailed caption for the following image.",
             max_tokens=150,
         )
     else:
-        adapter_spec = get_generation_adapter_spec(
-            instructions="Generate a caption for the following image. The caption should neither be long "
-            "nor a complete sentence.",
-            max_tokens=20,
-        )
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+        adapter_spec = _get_captioning_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mscoco_captioning"
     if long:
@@ -314,16 +320,7 @@ def get_mscoco_categorization_spec() -> RunSpec:
         "MSCOCOCategorizationScenario",
         args={},
     )
-    # adapter_spec: AdapterSpec = get_generation_adapter_spec(
-    #     instructions="Give the most prevalent category of the objects in the image. "
-    #     "Just return the category without any explanation or in a complete sentence. "
-    #     "Here are the possible categories: textile, plant, building, furniture-stuff, "
-    #     "structural, raw-material, floor, ceiling, sky, ground, water, food-stuff, "
-    #     "solid, wall, window, other",
-    #     max_tokens=20,
-    # )
-    # metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
@@ -344,7 +341,7 @@ def get_originality_vlm_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.originality_scenario.OriginalityScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(max_tokens=500)
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=500)
     metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
         include_basic_metrics=True, include_generative_harms_metrics=True
     )
@@ -364,12 +361,12 @@ def get_viz_wiz_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
         # Following https://arxiv.org/abs/2310.03744
         instructions="When the provided information is insufficient, respond with 'Unanswerable'. "
         "Answer the question using a single word or phrase."
     )
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "viz_wiz"
     return RunSpec(
@@ -387,11 +384,11 @@ def get_vqa_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
     )
     # Following https://arxiv.org/abs/2310.03744
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
         instructions='Answer the question using a single word or phrase. When the question asks "How many...", '
         "respond with just a number (e.g., 3) and not the word corresponding to the number."
     )
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "vqa"
     return RunSpec(
@@ -409,11 +406,11 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
         include_edit_similarity=True,
@@ -442,11 +439,11 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
         args={"subset": subset, "recompile_prompt": recompile_prompt},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
         include_edit_similarity=True,
@@ -478,9 +475,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "free_form":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multi_choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:
@@ -503,11 +500,11 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
         args={"subset": "music", "recompile_prompt": False},  # There os only one subset for music sheets
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
         max_tokens=2000,
     )
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+    metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="lilypond",
         args=args,
         include_edit_similarity=False,  # No ground truth for music sheets
@@ -539,9 +536,9 @@ def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None,
             output_noun="Answer",
             max_train_instances=0,
@@ -568,7 +565,7 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Only give numerical or boolean answer without an explanation."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -588,8 +585,8 @@ def get_bingo_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "bingo"
     return RunSpec(
@@ -610,9 +607,9 @@ def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "open":
-        adapter_spec = get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec()
     elif question_type == "multiple-choice":
-        adapter_spec = get_multiple_choice_joint_adapter_spec(
+        adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
         )
     else:
@@ -634,7 +631,7 @@ def get_pope_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -655,7 +652,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -676,7 +673,7 @@ def get_mme_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None, output_noun="Answer", max_train_instances=0
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
@@ -697,7 +694,7 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
         args={"question_type": question_type},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,
@@ -721,7 +718,7 @@ def get_pairs_spec(subset: str, person: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.pairs_scenario.PAIRSScenario",
         args={"subset": subset, "person": person},
     )
-    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
         input_noun=None,
         output_noun="Answer",
         num_outputs=1,
@@ -745,8 +742,8 @@ def get_mementos_spec(subject: str) -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mementos"
     return RunSpec(
diff --git a/src/helm/clients/openai_client.py b/src/helm/clients/openai_client.py
index 5f0cd2e78bb..22de364a1de 100644
--- a/src/helm/clients/openai_client.py
+++ b/src/helm/clients/openai_client.py
@@ -55,12 +55,7 @@ def _is_chat_model_engine(self, model_engine: str) -> bool:
             return True
         return False

-    def _is_high_res_vision_model(self, model_engine: str) -> bool:
-        return model_engine == "gpt-4-vision-preview-high-res"
-
     def _get_model_for_request(self, request: Request) -> str:
-        if self._is_high_res_vision_model(request.model_engine):
-            return "gpt-4-vision-preview"
         return request.model_engine

     def _get_cache_key(self, raw_request: Dict, request: Request):
@@ -136,9 +131,6 @@ def _make_chat_request(self, request: Request) -> RequestResult:
                     base64_image: str = encode_base64(media_object.location)
                     image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}

-                    if self._is_high_res_vision_model(request.model_engine):
-                        image_object["detail"] = "high"
-
                     content.append({"type": "image_url", "image_url": image_object})
                 elif media_object.is_type(TEXT_TYPE):
                     if media_object.text is None:
diff --git a/src/helm/clients/vertexai_client.py b/src/helm/clients/vertexai_client.py
index 67efdca7cba..b65c0477c6c 100644
--- a/src/helm/clients/vertexai_client.py
+++ b/src/helm/clients/vertexai_client.py
@@ -354,6 +354,9 @@ def do_it() -> Dict[str, Any]:
                     raise VertexAIContentBlockedError("No candidates in response due to content blocking")

                 # We should only have one candidate
+                assert (
+                    len(candidates) == 1
+                ), f"Expected 1 candidate since candidate_count is 1, got {len(candidates)}."
                 candidate: Candidate = candidates[0]
                 if (
                     candidate.finish_reason in VertexAIChatClient.CONTENT_BLOCKED_FINISH_REASONS
@@ -373,12 +376,11 @@ def do_it() -> Dict[str, Any]:

             cache_key = CachingClient.make_cache_key(raw_cache_key, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except (requests.exceptions.RequestException, VertexAIContentBlockedError) as e:
-            if "Content has no parts" in str(e):
-                return complete_for_valid_error(self.CONTENT_HAS_NO_PARTS_ERROR)
-
+        except requests.exceptions.RequestException as e:
             error: str = f"Gemini Vision error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+        except VertexAIContentBlockedError as e:
+            return complete_for_valid_error(str(e))

         if "error" in response:
             return complete_for_valid_error(response["error"])
diff --git a/src/helm/clients/vision_language/huggingface_vision2seq_client.py b/src/helm/clients/vision_language/huggingface_vision2seq_client.py
index 530a7dc125f..5b798ffd6f3 100644
--- a/src/helm/clients/vision_language/huggingface_vision2seq_client.py
+++ b/src/helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -18,7 +18,7 @@


 @dataclass(frozen=True)
-class LoadedVision2SeqModelProcessor:
+class Vision2SeqModelProcessor:
     """Loaded model and processor."""

     model: AutoModelForVision2Seq
@@ -26,7 +26,7 @@ class LoadedVision2SeqModelProcessor:


 _models_lock: Lock = Lock()
-_models: Dict[str, Optional[LoadedVision2SeqModelProcessor]] = {
+_models: Dict[str, Optional[Vision2SeqModelProcessor]] = {
     "HuggingFaceM4/idefics2-8b": None,
 }

@@ -44,7 +44,7 @@ def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: Cach
         self.tokenizer_name = tokenizer_name
         self._device: str = get_torch_device_name()

-    def _get_model(self, checkpoint: str) -> LoadedVision2SeqModelProcessor:
+    def _get_model(self, checkpoint: str) -> Vision2SeqModelProcessor:
         global _models_lock
         global _models

@@ -57,7 +57,7 @@ def _get_model(self, checkpoint: str) -> LoadedVision2SeqModelProcessor:
                 model = AutoModelForVision2Seq.from_pretrained(checkpoint, torch_dtype=torch_dtype).to(self._device)
                 processor = AutoProcessor.from_pretrained(checkpoint)

-                _models[checkpoint] = LoadedVision2SeqModelProcessor(model, processor)
+                _models[checkpoint] = Vision2SeqModelProcessor(model, processor)

             loaded_model_processor = _models[checkpoint]
             assert loaded_model_processor is not None
@@ -67,7 +67,7 @@ def make_request(self, request: Request) -> RequestResult:
         assert request.model_deployment in _models, f"Not a valid model for this client: {request.model_deployment}"
         assert request.multimodal_prompt is not None, "Multimodal prompt is required"

-        loaded_model_processor: LoadedVision2SeqModelProcessor = self._get_model(request.model_deployment)
+        loaded_model_processor: Vision2SeqModelProcessor = self._get_model(request.model_deployment)
         model = loaded_model_processor.model
         processor = loaded_model_processor.processor
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index d6e6b9b0a1a..c7644f704b6 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -1357,15 +1357,6 @@ model_deployments:
     client_spec:
       class_name: "helm.clients.openai_client.OpenAIClient"

-  - name: openai/gpt-4-vision-preview-high-res
-    model_name: openai/gpt-4-vision-preview-high-res
-    tokenizer_name: openai/cl100k_base
-    max_sequence_length: 128000 # According to https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
-    max_request_length: 128001
-    max_sequence_and_generated_tokens_length: 132096
-    client_spec:
-      class_name: "helm.clients.openai_client.OpenAIClient"
-
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index 7356902651a..770186fdaac 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -1772,14 +1772,6 @@ models:
     release_date: 2023-11-06
     tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]

-  - name: openai/gpt-4-vision-preview-high-res
-    display_name: GPT-4V high res (preview)
-    description: GPT-4V with "high res" mode enabled, which first allows the model to see the low res image and then creates detailed crops of input images as 512px squares based on the input image size.
-    creator_organization_name: OpenAI
-    access: limited
-    release_date: 2023-11-06
-    tags: [VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, FULL_FUNCTIONALITY_VLM_TAG]
-
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
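
Below is a minimal usage sketch (not part of the patch) illustrating the behavior this change introduces: `cider` now imports `pycocoevalcap` lazily, so the base `crfm-helm` install no longer depends on it and the metric is only usable once the `vlm` extra is installed. The caption strings are made up for illustration.

```python
# Minimal sketch, assuming crfm-helm is installed with the "vlm" extra:
#   pip install "crfm-helm[vlm]"
# Without the extra, the lazy import inside cider() goes through
# handle_module_not_found_error, which raises an error pointing at the
# missing optional dependency instead of breaking module import for
# users who never request the CIDEr metric.
from helm.benchmark.metrics.evaluate_reference_metrics import cider

# Hypothetical gold/prediction caption pair.
score: float = cider(gold="a dog runs along the beach", pred="a dog running on the beach")
print(f"CIDEr: {score:.3f}")
```

Because the import happens inside `cider()` rather than at module level, run specs that only use the other reference metrics (exact match, F1, ROUGE-L, BLEU) keep working on a base install, which is why `pycocoevalcap` could move from `install_requires` to the `vlm` extra.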