Merge pull request #102 from aitomatic/dep-upgrade
upgrade package & dependencies
TheVinhLuong102 authored Feb 27, 2024
2 parents 3a65fc5 + eca8317 commit 37d6064
Showing 74 changed files with 1,206 additions and 298 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -37,3 +37,4 @@ docs/_build/
.env
tmp/
examples/data/cache/dense
examples/data/*.csv
9 changes: 8 additions & 1 deletion .ruff.toml
@@ -48,6 +48,7 @@ ignore = [
"D107", # missing docstring in `__init__`
"D200", # one-line docstring should fit on one line
"D202", # no blank lines allowed after function docstring
"D203", # one blank line before class
"D204", # 1 blank line required after class docstring
"D205", # 1 blank line required between summary line and description
"D212", # multi-line docstring summary should start at the first line
@@ -66,10 +67,12 @@ ignore = [
"EM102", # exception must not use an f-string literal, assign to variable first
"ERA001", # found commented-out code
"F401", # imported but unused
"FA102", # missing `from __future__ import annotations`, but uses PEP 604 union
"FBT001", # boolean-typed positional argument in function definition
"FBT002", # boolean default positional argument in function definition
"FBT003", # boolean positional value in function call
"FIX002", # line contains TODO, consider resolving the issue
"FURB101", # `open` and `read` should be replaced by `Path(file_path).read_bytes()`
"I001", # import block is un-sorted or un-formatted
"INP001", # file is part of an implicit namespace package; add an `__init__.py`
"LOG009", # use of undocumented `logging.WARN` constant
@@ -106,11 +109,16 @@ ignore = [
"RUF013", # PEP 484 prohibits implicit `Optional`
"RUF017", # Avoid quadratic list summation"
"RUF018", # avoid assignment expressions in `assert` statements
"RUF022", # __all__` is not sorted
"RUF100", # unused `noqa` directive
"S101", # use of `assert` detected
"S106", # possible hardcoded password assigned to argument
"S110", # `try`-`except`-`pass` detected, consider logging the exception
"S605", # starting a process with a shell, possible injection detected
"S607", # starting a process with a partial executable path
"SIM102", # use a single `if` statement instead of nested `if` statements
"SIM108", # use ternary operator `temp = temp["content"] if isinstance(temp, dict) else temp.content` instead of `if`-`else`-block
"SIM105", # use `contextlib.suppress(Exception)` instead of `try`-`except`-`pass`
"SIM112", # use capitalized environment variable
"SIM401", # use `item.get("role", "assistant")` instead of an `if` block
"SLF001", # private member accessed
@@ -123,5 +131,4 @@ ignore = [
"UP007", # use `X | Y` for type annotations
"UP035", # `typing.[X]` is deprecated, use `[x]` instead
"UP039", # unnecessary parentheses after class definition
"D203", # one-blank-line-before-class
]
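For context on one of the newly ignored rules: FA102 flags PEP 604 union syntax (`X | Y`) in files that lack `from __future__ import annotations`, which only matters when targeting Python versions below 3.10. The lint commands below target py310, where the syntax is natively valid, presumably why the rule can be ignored. A minimal, hypothetical illustration (the function and names are not from this repository):

```python
# Hypothetical snippet: PEP 604 unions in annotations, no __future__ import.
# FA102 would flag this for pre-3.10 targets; under py310 it is valid as-is.
def load_cache(path: str | None = None) -> dict[str, str] | None:
    """Return a trivial mapping, or None when no path is given."""
    if path is None:
        return None
    return {"path": path}
```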
5 changes: 4 additions & 1 deletion .vscode/extensions.json
@@ -9,8 +9,11 @@
],

"unwantedRecommendations": [
"ms-python.mypy-type-checker", // MyPy Type Checker (Microsoft)
"ms-python.autopep8", // AutoPEP8 (Microsoft)
"ms-python.black-formatter", // Black Formatter (Microsoft)
"eeyore.yapf", // Yet Another Python Formatter

"ms-python.mypy-type-checker", // MyPy Type Checker (Microsoft)
"matangover.mypy" // MyPy
]
}
3 changes: 1 addition & 2 deletions Makefile
@@ -36,7 +36,7 @@ install:
	poetry install --extras=contrib --with=docs --with=lint --with=test

install-editable:
	python3 -m pip install -e ".[contrib]" --upgrade --user
	python3 -m pip install -e ".[contrib]" --upgrade


# LINTING
@@ -56,7 +56,6 @@ lint-pylint:
lint-ruff:
	# docs.astral.sh/ruff/linter
	poetry run ruff check $(LIB_DIR) $(DOCS_DIR) $(EXAMPLES_DIR) $(TESTS_DIR) \
		--show-source \
		--output-format text \
		--target-version py310 \
		--preview \
2 changes: 1 addition & 1 deletion README.md
@@ -147,7 +147,7 @@ Head to [Lepton](https://dashboard.lepton.ai/) to get your API key.
* Select `API tokens`
* Copy `<YOUR_LEPTON_API_TOKEN>`

In terminal, run
In terminal, run
```bash=
export LEPTON_API_KEY=<YOUR_LEPTON_API_TOKEN>
```
Binary file removed examples/data/docs/m290/m290.pdf
Binary file not shown.
11 changes: 0 additions & 11 deletions examples/data/notebook_qa_comparion.csv

This file was deleted.

4 changes: 0 additions & 4 deletions examples/data/qa_comparion.csv

This file was deleted.

33 changes: 0 additions & 33 deletions examples/data/qa_standard_agent_comparion.csv

This file was deleted.

5 changes: 5 additions & 0 deletions examples/financebench/.env.template
@@ -0,0 +1,5 @@
AWS_ACCESS_KEY_ID=[... AWS creds if caching through S3 ...]
AWS_SECRET_ACCESS_KEY=[... AWS creds if caching through S3 ...]

LEPTON_API_KEY=[... Lepton key (obtainable at dashboard.lepton.ai) if running SSAs on Aitomatic services ...]
OPENAI_API_KEY=[... OpenAI creds if running SSAs directly on OpenAI services ...]
2 changes: 2 additions & 0 deletions examples/financebench/.gitignore
@@ -0,0 +1,2 @@
.FinanceBench/
.streamlit/secrets.toml
1 change: 1 addition & 0 deletions examples/financebench/.streamlit/secrets.toml.template
@@ -0,0 +1 @@
LEPTON_API_KEY = '...'
5 changes: 5 additions & 0 deletions examples/financebench/Makefile
@@ -0,0 +1,5 @@
run-streamlit:
	@streamlit run streamlit-main.py --server.allowRunOnSave=true --server.runOnSave=true

solve:
	@poetry run python3 ssa_fb/prob_solve.py ${id}
32 changes: 32 additions & 0 deletions examples/financebench/README.md
@@ -0,0 +1,32 @@
<!-- markdownlint-disable MD043 -->

# OpenSSA-FinanceBench benchmarking

This example benchmarks `OpenSSA` performance
on the `FinanceBench` dataset.

## [`FinanceBench` Dataset](https://github.com/patronus-ai/financebench/blob/main/financebench_sample_150.csv)

## Running the Aitomatic SSA benchmarking project

Have Python 3.10-3.11 installed.

Have Poetry installed: __`make get-poetry`__.

__Install__ the project, and update its dependencies from time to time:
__`make install`__.

Create a `.env` file following `.env.template` and fill in the necessary credentials.

__Solve__ the problem corresponding to a specific `financebench_id`:
__`make solve id=...`__.

- Refer to the `FinanceBench` dataset above for `financebench_id`s
  and corresponding information (example invocation below)
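For instance, with a hypothetical ID (substitute a real value from the dataset's `financebench_id` column): __`make solve id=financebench_id_00807`__.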

## Notes to Aitomatic Developers

The OpenSSA dependency for this benchmarking project is from the `experimental`
branch of the private [SSA](https://github.com/aitomatic/ssa) repository.
Hence, all improvements to OpenSSA during this project must be
committed/pushed/merged into that repository and branch.
3 changes: 3 additions & 0 deletions examples/financebench/poetry.toml
@@ -0,0 +1,3 @@
[virtualenvs]
create = true
in-project = true
1 change: 1 addition & 0 deletions examples/financebench/requirements.txt
@@ -0,0 +1 @@
OpenSSA[contrib] @ https://GitHub.com/Aitomatic/OpenSSA/archive/main.zip
30 changes: 30 additions & 0 deletions examples/financebench/scripts/README.md
@@ -0,0 +1,30 @@
### Instructions for running the FinanceBench dataset with OpenSSA OODA


1. Download the financial reports:

```
python data.py
```


2. Load the documents and run Q&A on a set of questions:

```
python qa.py       # standard RAG
python ooda_qa.py  # run with OODA
```

3. Rerun to auto-resume if a previous run was incomplete or stopped midway (a sketch of the resume pattern follows the commands):

```
python qa.py
python ooda_qa.py
```
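This works because each script treats its own output CSV as a checkpoint: on start-up it reloads that CSV if present and skips rows that already hold an answer. A minimal sketch of the pattern, simplified from `ooda_qa.py` below (the `solve` stub and the `answer` column name are placeholders):

```python
import os

import pandas as pd

SOURCE = "tmp/finance-bench/finance_bench_dataset.csv"
OUTPUT = "tmp/finance-bench/output/ooda_rag_output.csv"


def solve(question: str) -> str:
    """Hypothetical stand-in for the real RAG/OODA solver."""
    return f"answer to: {question}"


os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

# Reload our own previous output as a cache if it exists; otherwise start fresh
df = pd.read_csv(OUTPUT if os.path.exists(OUTPUT) else SOURCE)
if "answer" not in df.columns:
    df["answer"] = ""
df = df.fillna("")

for index, row in df.iterrows():
    if row["answer"]:  # answered on a previous run: skip
        continue
    df.loc[index, "answer"] = solve(row["question"])
    df.to_csv(OUTPUT, index=False)  # checkpoint after every row
```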


4. Output is written to:

```
tmp/finance-bench/output.csv
```
58 changes: 58 additions & 0 deletions examples/financebench/scripts/data.py
@@ -0,0 +1,58 @@
import os
from pathlib import Path
import pandas as pd
import requests
from requests.exceptions import HTTPError

# Note: JOHNSON&JOHNSON is not downloadable

FINANCEBENCH_METADATA_URL: str = (
    "https://raw.githubusercontent.com/patronus-ai/"
    "financebench/main/financebench_sample_150.csv"
)


def download_dataset():
    # Read the CSV file
    df_finbench = pd.read_csv(FINANCEBENCH_METADATA_URL)
    df_finbench["status"] = "ok"

    base_directory = "tmp/finance-bench/docs"
    count = 0
    for index, row in df_finbench.iterrows():
        doc_name = row["doc_name"]
        doc_link = row["doc_link"]

        # Create a subdirectory for each document
        doc_directory = os.path.join(base_directory, doc_name)
        if not os.path.exists(doc_directory):
            os.makedirs(doc_directory)

        # Path for the PDF file
        file_path = os.path.join(doc_directory, f"{doc_name}.pdf")

        # Check if the file has already been downloaded
        if not Path(file_path).is_file():
            try:
                # Download the file
                response = requests.get(doc_link, timeout=30)
                response.raise_for_status()  # Raises if the HTTP request returned an unsuccessful status code

                # Write the content to a file
                with open(file_path, "wb") as file:
                    file.write(response.content)
                print(f"Downloaded and saved: {file_path}")
                count += 1
            except HTTPError as e:
                df_finbench.loc[index, "status"] = "error"  # Update the status to 'error'
                print(f"Error downloading {file_path}: {e}")
        else:
            print(f"File already exists, skipping: {file_path}")

    dataset_directory = "tmp/finance-bench"
    df_finbench.to_csv(os.path.join(dataset_directory, "finance_bench_dataset.csv"), index=False)
    print(f"All files processed. Total files downloaded: {count}")


if __name__ == "__main__":
    download_dataset()
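(Design note: the `status` column written here, `ok` or `error`, is what downstream scripts such as `ooda_qa.py` below check in order to skip documents that failed to download.)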
77 changes: 77 additions & 0 deletions examples/financebench/scripts/ooda_qa.py
@@ -0,0 +1,77 @@
import os
from loguru import logger
import nest_asyncio
import pandas as pd
from openssa.utils.utils import Utils
from openssa.core.ooda_rag.solver import OodaSSA

nest_asyncio.apply()

PATH: str = "./tmp/finance-bench/docs"
FINANCEBENCH_CSV: str = "./tmp/finance-bench/finance_bench_dataset.csv"
OUTPUT_DIRECTORY: str = "tmp/finance-bench/output"
OUTPUT_FILE_NAME: str = "ooda_rag_output.csv"


@Utils.timeit
def process_doc(doc_name: str, question: str) -> str:
    ssa = OodaSSA(enable_generative=True)
    resource = os.path.join(PATH, doc_name)
    ssa.activate_resources(resource)
    solution = ssa.solve(question)
    return solution


def run():
    output_file_path = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILE_NAME)
    answer_column_name = "ooda_answer"
    # Check if the output file exists, and read from it if available (load cache)
    if os.path.exists(output_file_path):
        df_finbench = pd.read_csv(output_file_path)
    else:
        # If the output file does not exist, read from the original dataset
        df_finbench = pd.read_csv(FINANCEBENCH_CSV)
    if answer_column_name not in df_finbench.columns:
        df_finbench[answer_column_name] = ""
    df_finbench = df_finbench.fillna("")

    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    for index, row in df_finbench.iterrows():
        logger.info(f"Processing row {index} of {len(df_finbench)}: {row['doc_name']}")
        if row["status"] == "ok" and not row[answer_column_name]:
            doc_name = row["doc_name"]
            question = row["question"]
            answer = process_doc(doc_name, question)
            df_finbench.loc[index, answer_column_name] = answer
            # Save progress as cache after processing each row
            df_finbench.to_csv(output_file_path, index=False)
            print(f"complete index {index} of {len(df_finbench)}")

    # If any answer contains "Empty Response", mark it as a file reading error
    df_finbench.loc[
        df_finbench[answer_column_name].str.lower().str.contains("empty response"),
        answer_column_name,
    ] = "file reading error"

    df_finbench.to_csv(output_file_path, index=False)
    print("Processing complete. Output saved to:", output_file_path)


def clean_up():
    file_path = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILE_NAME)
    df_data = pd.read_csv(file_path)
    filtered_df = df_data[
        ~df_data["ooda_answer"].isna()
        & (df_data["ooda_answer"] != "file reading error")
    ]
    clean_output_file_path = os.path.join(
        OUTPUT_DIRECTORY, "filtered_ooda_rag_output.csv"
    )
    filtered_df.to_csv(clean_output_file_path, index=False)
    print(f"Filtered data saved to {clean_output_file_path}")


if __name__ == "__main__":
    run()
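(Note: `clean_up()` is defined but never invoked from `__main__`; presumably it is meant to be run manually after `run()` completes, producing `filtered_ooda_rag_output.csv`.)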