From 5c2830dd46b97b7ead7271eb80f199e1d570a7d9 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Thu, 6 Feb 2025 09:28:58 -0800
Subject: [PATCH] Add Financial Phrasebank scenario (#3302)

Co-authored-by: Ryo Kawahara <ryokawa@jp.ibm.com>
Co-authored-by: Mikio Takeuchi <mtake@jp.ibm.com>
---
 .../run_specs/enterprise_run_specs.py         | 26 +++++
 .../financial_phrasebank_scenario.py          | 94 +++++++++++++++++++
 .../benchmark/static/schema_enterprise.yaml   | 18 ++++
 3 files changed, 138 insertions(+)
 create mode 100644 src/helm/benchmark/scenarios/financial_phrasebank_scenario.py

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index dadc8eaefc..be559f49d1 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -49,6 +49,32 @@ def get_news_headline_spec(category: str) -> RunSpec:
     )
 
 
+@run_spec_function("financial_phrasebank")
+def get_financial_phrasebank_spec(agreement: int = 50) -> RunSpec:
+    """Build the Financial Phrasebank RunSpec; `agreement` is passed to the scenario and used in the run name."""
+    from helm.benchmark.scenarios.financial_phrasebank_scenario import FinancialPhrasebankScenario
+
+    # The three sentiment classes; they parameterize the weighted classification metrics below.
+    sentiment_labels = ["positive", "neutral", "negative"]
+    return RunSpec(
+        name=f"financial_phrasebank:agreement={agreement}",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.financial_phrasebank_scenario.FinancialPhrasebankScenario",
+            args={"agreement": agreement},
+        ),
+        adapter_spec=get_generation_adapter_spec(
+            instructions=FinancialPhrasebankScenario.INSTRUCTIONS,
+            input_noun="Sentence",
+            output_noun="Label",
+            max_tokens=30,
+        ),
+        # Exact-match metrics combined with the weighted classification metrics.
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=sentiment_labels),
+        groups=["financial_phrasebank"],
+    )
+
+
 # Legal
 
 
diff --git a/src/helm/benchmark/scenarios/financial_phrasebank_scenario.py b/src/helm/benchmark/scenarios/financial_phrasebank_scenario.py
new file mode 100644
index 0000000000..40bc81ce17
--- /dev/null
+++ b/src/helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -0,0 +1,94 @@
+import os
+import random
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class FinancialPhrasebankScenario(Scenario):
+    """
+    A sentiment classification benchmark based on the dataset from Good Debt or Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., 2013)](https://arxiv.org/abs/1307.5336).
+
+    Context:
+    Polar sentiment dataset of sentences from financial news. The dataset consists of 4840 sentences from English
+    language financial news categorized by sentiment. The dataset is divided by agreement rate of 5-8 annotators.
+
+    This release of the financial phrase bank covers a collection of 4840 sentences. The selected collection of
+    phrases was annotated by 16 people with adequate background knowledge on financial markets.
+
+    Given the large number of overlapping annotations (5 to 8 annotations per sentence), there are several ways
+    to define a majority vote based gold standard. To provide an objective comparison, the paper authors have formed 4 alternative
+    reference datasets based on the strength of majority agreement: 100%, 75%, 66% and 50%.
+
+    Data source:
+    https://huggingface.co/datasets/takala/financial_phrasebank
+
+    Reference:
+    P. Malo, A. Sinha, P. Korhonen, J. Wallenius, and P. Takala, “Good debt or bad debt: Detecting semantic orientations in economic texts,” Journal of the Association for Information Science and Technology, vol. 65, 2014.
+    https://arxiv.org/pdf/1307.5336
+    """  # noqa: E501
+
+    name = "financial_phrasebank"
+    description = (
+        "The dataset consists of 4840 sentences from English language financial news categorized by sentiment."
+    )
+    tags = ["finance", "sentiment analysis", "classification"]
+
+    INSTRUCTIONS = """The dataset consists of sentences from English language financial news categorized by sentiment.
+Classify the sentences into one of the 3 sentiment categories.
+Possible labels:\n1. positive\n2. neutral\n3. negative"""  # noqa: E501
+    DATASET_URL = "https://huggingface.co/datasets/takala/financial_phrasebank/resolve/598b6aad98f7c8d67be161b12a4b5f2497e07edd/data/FinancialPhraseBank-v1.0.zip"  # noqa: E501
+    AGREEMENT_VALUES = [50, 66, 75, 100]  # Accepted values for the `agreement` constructor argument
+    TRAIN_SPLIT_SIZE = 0.7  # Fraction of the shuffled lines assigned to the train split
+
+    def __init__(self, agreement: int, random_seed: int = 121):
+        """Validate and store the agreement level and the shuffle seed.
+
+        Args:
+            agreement: Percentage of annotators who agreed on the ground truth label; it selects which
+                reference data file is read. Must be one of AGREEMENT_VALUES (50, 66, 75 or 100).
+            random_seed: The random seed for sampling the train/test splits.
+
+        Raises:
+            ValueError: If `agreement` is not one of AGREEMENT_VALUES.
+        """
+        super().__init__()
+        if agreement not in self.AGREEMENT_VALUES:
+            raise ValueError(f"Unknown `agreement` value: {agreement}, allowed values are {self.AGREEMENT_VALUES}")
+        self.agreement = agreement
+        self.random_seed = random_seed
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Fetch the dataset archive under `output_path` and return its shuffled lines as train/test instances."""
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(self.DATASET_URL, data_parent_path, unpack=True, unpack_type="unzip")
+        file_name = "Sentences_AllAgree.txt" if self.agreement == 100 else f"Sentences_{self.agreement}Agree.txt"
+        data_file_path = os.path.join(data_parent_path, "FinancialPhraseBank-v1.0", file_name)
+        with open(data_file_path, mode="r", encoding="iso-8859-1") as f:
+            # Skip blank lines: they hold no "sentence@label" pair and would break the unpacking below.
+            lines = [line for line in f if line.strip()]
+        # Shuffling with a seeded generator keeps the train/test assignment reproducible.
+        random.Random(self.random_seed).shuffle(lines)
+        train_split_index = int(len(lines) * self.TRAIN_SPLIT_SIZE)
+        instances: List[Instance] = []
+        for index, line in enumerate(lines):
+            # Split on the last "@" only, so any "@" inside the sentence text is preserved.
+            sentence, label = line.strip().rsplit("@", 1)
+            instances.append(
+                Instance(
+                    input=Input(text=sentence),
+                    references=[Reference(Output(text=label), tags=[CORRECT_TAG])],
+                    split=TRAIN_SPLIT if index < train_split_index else TEST_SPLIT,
+                )
+            )
+        return instances
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
index 979ec58385..92f804c56e 100644
--- a/src/helm/benchmark/static/schema_enterprise.yaml
+++ b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -113,6 +113,7 @@ run_groups:
     category: All scenarios
     subgroups:
       - gold_commodity_news
+      - financial_phrasebank
 
   - name: legal_scenarios
     display_name: Legal Scenarios
@@ -138,6 +139,23 @@ run_groups:
     subgroups:
       - cti_to_mitre
 
+  - name: financial_phrasebank
+    display_name: Financial Phrasebank (Sentiment Classification)
+    description: A sentiment classification benchmark based on the dataset from Good Debt or Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., 2013)](https://arxiv.org/abs/1307.5336).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_weighted_f1
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: phrases from financial news texts and company press releases
+      who: annotators with adequate business education background
+      when: before 2013
+      language: English
+
   - name: gold_commodity_news
     display_name: Gold Commodity News
     description: A classification benchmark based on a dataset of human-annotated gold commodity news headlines ([Sinha & Khandait, 2019](https://arxiv.org/abs/2009.04202)).