Merge pull request #102 from aitomatic/dep-upgrade
upgrade package & dependencies
TheVinhLuong102 authored Feb 27, 2024
2 parents 3a65fc5 + eca8317 commit 37d6064
Showing 74 changed files with 1,206 additions and 298 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -37,3 +37,4 @@ docs/_build/
.env
tmp/
examples/data/cache/dense
examples/data/*.csv
9 changes: 8 additions & 1 deletion .ruff.toml
@@ -48,6 +48,7 @@ ignore = [
"D107", # missing docstring in `__init__`
"D200", # one-line docstring should fit on one line
"D202", # no blank lines allowed after function docstring
"D203", # one blank line before class
"D204", # 1 blank line required after class docstring
"D205", # 1 blank line required between summary line and description
"D212", # multi-line docstring summary should start at the first line
@@ -66,10 +67,12 @@ ignore = [
"EM102", # exception must not use an f-string literal, assign to variable first
"ERA001", # found commented-out code
"F401", # imported but unused
"FA102", # missing `from __future__ import annotations`, but uses PEP 604 union
"FBT001", # boolean-typed positional argument in function definition
"FBT002", # boolean default positional argument in function definition
"FBT003", # boolean positional value in function call
"FIX002", # line contains TODO, consider resolving the issue
"FURB101", # `open` and `read` should be replaced by `Path(file_path).read_bytes()`
"I001", # import block is un-sorted or un-formatted
"INP001", # file is part of an implicit namespace package; add an `__init__.py`
"LOG009", # use of undocumented `logging.WARN` constant
@@ -106,11 +109,16 @@ ignore = [
"RUF013", # PEP 484 prohibits implicit `Optional`
"RUF017", # Avoid quadratic list summation"
"RUF018", # avoid assignment expressions in `assert` statements
"RUF022", # __all__` is not sorted
"RUF100", # unused `noqa` directive
"S101", # use of `assert` detected
"S106", # possible hardcoded password assigned to argument
"S110", # `try`-`except`-`pass` detected, consider logging the exception
"S605", # starting a process with a shell, possible injection detected
"S607", # starting a process with a partial executable path
"SIM102", # use a single `if` statement instead of nested `if` statements
"SIM108", # use ternary operator `temp = temp["content"] if isinstance(temp, dict) else temp.content` instead of `if`-`else`-block
"SIM105", # use `contextlib.suppress(Exception)` instead of `try`-`except`-`pass`
"SIM112", # use capitalized environment variable
"SIM401", # use `item.get("role", "assistant")` instead of an `if` block
"SLF001", # private member accessed
@@ -123,5 +131,4 @@ ignore = [
"UP007", # use `X | Y` for type annotations
"UP035", # `typing.[X]` is deprecated, use `[x]` instead
"UP039", # unnecessary parentheses after class definition
"D203", # one-blank-line-before-class
]
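For context on one of the newly ignored rules: FA102 flags PEP 604 union syntax (`X | Y`) in files that lack `from __future__ import annotations`, which only matters when targeting Python versions below 3.10. The lint commands below target py310, where the syntax is natively valid, presumably why the rule can be ignored. A minimal, hypothetical illustration (the function and names are not from this repository):

```python
# Hypothetical snippet: PEP 604 unions in annotations, no __future__ import.
# FA102 would flag this for pre-3.10 targets; under py310 it is valid as-is.
def load_cache(path: str | None = None) -> dict[str, str] | None:
    """Return a trivial mapping, or None when no path is given."""
    if path is None:
        return None
    return {"path": path}
```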
5 changes: 4 additions & 1 deletion .vscode/extensions.json
@@ -9,8 +9,11 @@
],

"unwantedRecommendations": [
"ms-python.mypy-type-checker", // MyPy Type Checker (Microsoft)
"ms-python.autopep8", // AutoPEP8 (Microsoft)
"ms-python.black-formatter", // Black Formatter (Microsoft)
"eeyore.yapf", // Yet Another Python Formatter

"ms-python.mypy-type-checker", // MyPy Type Checker (Microsoft)
"matangover.mypy" // MyPy
]
}
3 changes: 1 addition & 2 deletions Makefile
@@ -36,7 +36,7 @@ install:
	poetry install --extras=contrib --with=docs --with=lint --with=test

install-editable:
	python3 -m pip install -e ".[contrib]" --upgrade --user
	python3 -m pip install -e ".[contrib]" --upgrade


# LINTING
@@ -56,7 +56,6 @@ lint-pylint:
lint-ruff:
	# docs.astral.sh/ruff/linter
	poetry run ruff check $(LIB_DIR) $(DOCS_DIR) $(EXAMPLES_DIR) $(TESTS_DIR) \
		--show-source \
		--output-format text \
		--target-version py310 \
		--preview \
2 changes: 1 addition & 1 deletion README.md
@@ -147,7 +147,7 @@ Head to [Lepton](https://dashboard.lepton.ai/) to get your API key.
* Select `API tokens`
* Copy `<YOUR_LEPTON_API_TOKEN>`

In terminal, run
In terminal, run
```bash=
export LEPTON_API_KEY=<YOUR_LEPTON_API_TOKEN>
```
Binary file removed examples/data/docs/m290/m290.pdf
Binary file not shown.
11 changes: 0 additions & 11 deletions examples/data/notebook_qa_comparion.csv

This file was deleted.

4 changes: 0 additions & 4 deletions examples/data/qa_comparion.csv

This file was deleted.

33 changes: 0 additions & 33 deletions examples/data/qa_standard_agent_comparion.csv

This file was deleted.

5 changes: 5 additions & 0 deletions examples/financebench/.env.template
@@ -0,0 +1,5 @@
AWS_ACCESS_KEY_ID=[... AWS creds if caching through S3 ...]
AWS_SECRET_ACCESS_KEY=[... AWS creds if caching through S3 ...]

LEPTON_API_KEY=[... Lepton key (obtainable at dashboard.lepton.ai) if running SSAs on Aitomatic services ...]
OPENAI_API_KEY=[... OpenAI creds if running SSAs directly on OpenAI services ...]
2 changes: 2 additions & 0 deletions examples/financebench/.gitignore
@@ -0,0 +1,2 @@
.FinanceBench/
.streamlit/secrets.toml
1 change: 1 addition & 0 deletions examples/financebench/.streamlit/secrets.toml.template
@@ -0,0 +1 @@
LEPTON_API_KEY = '...'
5 changes: 5 additions & 0 deletions examples/financebench/Makefile
@@ -0,0 +1,5 @@
run-streamlit:
	@streamlit run streamlit-main.py --server.allowRunOnSave=true --server.runOnSave=true

solve:
	@poetry run python3 ssa_fb/prob_solve.py ${id}
32 changes: 32 additions & 0 deletions examples/financebench/README.md
@@ -0,0 +1,32 @@
<!-- markdownlint-disable MD043 -->

# OpenSSA-FinanceBench benchmarking

This example benchmarks `OpenSSA` performance
on the `FinanceBench` dataset.

## [`FinanceBench` Dataset](https://github.com/patronus-ai/financebench/blob/main/financebench_sample_150.csv)

## Running the Aitomatic SSA benchmarking project

Have Python 3.10-3.11 installed.

Have Poetry installed: __`make get-poetry`__.

__Install__ the project, and update its dependencies from time to time:
__`make install`__.

Create a `.env` file following `.env.template` and fill in the necessary credentials.

__Solve__ the problem corresponding to a specific `financebench_id`:
__`make solve id=...`__.

- Refer to the `FinanceBench` dataset above for `financebench_id`s
  and corresponding information (example invocation below)
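For instance, with a hypothetical ID (substitute a real value from the dataset's `financebench_id` column): __`make solve id=financebench_id_00807`__.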

## Notes to Aitomatic Developers

The OpenSSA dependency for this benchmarking project is from the `experimental`
branch of the private [SSA](https://github.com/aitomatic/ssa) repository.
Hence, all improvements to OpenSSA during this project must be
committed/pushed/merged into that repository and branch.
3 changes: 3 additions & 0 deletions examples/financebench/poetry.toml
@@ -0,0 +1,3 @@
[virtualenvs]
create = true
in-project = true
1 change: 1 addition & 0 deletions examples/financebench/requirements.txt
@@ -0,0 +1 @@
OpenSSA[contrib] @ https://GitHub.com/Aitomatic/OpenSSA/archive/main.zip
30 changes: 30 additions & 0 deletions examples/financebench/scripts/README.md
@@ -0,0 +1,30 @@
### Instructions for running the FinanceBench dataset with OpenSSA OODA


1. Download the financial reports:

```
python data.py
```


2. Load the documents and run Q&A on a set of questions:

```
python qa.py       # standard RAG
python ooda_qa.py  # run with OODA
```

3. Rerun to auto-resume if a previous run was incomplete or stopped midway (a sketch of the resume pattern follows the commands):

```
python qa.py
python ooda_qa.py
```
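This works because each script treats its own output CSV as a checkpoint: on start-up it reloads that CSV if present and skips rows that already hold an answer. A minimal sketch of the pattern, simplified from `ooda_qa.py` below (the `solve` stub and the `answer` column name are placeholders):

```python
import os

import pandas as pd

SOURCE = "tmp/finance-bench/finance_bench_dataset.csv"
OUTPUT = "tmp/finance-bench/output/ooda_rag_output.csv"


def solve(question: str) -> str:
    """Hypothetical stand-in for the real RAG/OODA solver."""
    return f"answer to: {question}"


os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

# Reload our own previous output as a cache if it exists; otherwise start fresh
df = pd.read_csv(OUTPUT if os.path.exists(OUTPUT) else SOURCE)
if "answer" not in df.columns:
    df["answer"] = ""
df = df.fillna("")

for index, row in df.iterrows():
    if row["answer"]:  # answered on a previous run: skip
        continue
    df.loc[index, "answer"] = solve(row["question"])
    df.to_csv(OUTPUT, index=False)  # checkpoint after every row
```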


4. Output is written to:

```
tmp/finance-bench/output.csv
```
58 changes: 58 additions & 0 deletions examples/financebench/scripts/data.py
@@ -0,0 +1,58 @@
import os
from pathlib import Path
import pandas as pd
import requests
from requests.exceptions import HTTPError

# Note: JOHNSON&JOHNSON is not downloadable

FINANCEBENCH_METADATA_URL: str = (
    "https://raw.githubusercontent.com/patronus-ai/"
    "financebench/main/financebench_sample_150.csv"
)


def download_dataset():
    # Read the CSV file
    df_finbench = pd.read_csv(FINANCEBENCH_METADATA_URL)
    df_finbench["status"] = "ok"

    base_directory = "tmp/finance-bench/docs"
    count = 0
    for index, row in df_finbench.iterrows():
        doc_name = row["doc_name"]
        doc_link = row["doc_link"]

        # Create a subdirectory for each document
        doc_directory = os.path.join(base_directory, doc_name)
        if not os.path.exists(doc_directory):
            os.makedirs(doc_directory)

        # Path for the PDF file
        file_path = os.path.join(doc_directory, f"{doc_name}.pdf")

        # Check if the file has already been downloaded
        if not Path(file_path).is_file():
            try:
                # Download the file
                response = requests.get(doc_link, timeout=30)
                response.raise_for_status()  # Raises if the HTTP request returned an unsuccessful status code

                # Write the content to a file
                with open(file_path, "wb") as file:
                    file.write(response.content)
                print(f"Downloaded and saved: {file_path}")
                count += 1
            except HTTPError as e:
                df_finbench.loc[index, "status"] = "error"  # Update the status to 'error'
                print(f"Error downloading {file_path}: {e}")
        else:
            print(f"File already exists, skipping: {file_path}")

    dataset_directory = "tmp/finance-bench"
    df_finbench.to_csv(os.path.join(dataset_directory, "finance_bench_dataset.csv"), index=False)
    print(f"All files processed. Total files downloaded: {count}")


if __name__ == "__main__":
    download_dataset()
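(Design note: the `status` column written here, `ok` or `error`, is what downstream scripts such as `ooda_qa.py` below check in order to skip documents that failed to download.)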
77 changes: 77 additions & 0 deletions examples/financebench/scripts/ooda_qa.py
@@ -0,0 +1,77 @@
import os
from loguru import logger
import nest_asyncio
import pandas as pd
from openssa.utils.utils import Utils
from openssa.core.ooda_rag.solver import OodaSSA

nest_asyncio.apply()

PATH: str = "./tmp/finance-bench/docs"
FINANCEBENCH_CSV: str = "./tmp/finance-bench/finance_bench_dataset.csv"
OUTPUT_DIRECTORY: str = "tmp/finance-bench/output"
OUTPUT_FILE_NAME: str = "ooda_rag_output.csv"


@Utils.timeit
def process_doc(doc_name: str, question: str) -> str:
    ssa = OodaSSA(enable_generative=True)
    resource = os.path.join(PATH, doc_name)
    ssa.activate_resources(resource)
    solution = ssa.solve(question)
    return solution


def run():
    output_file_path = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILE_NAME)
    answer_column_name = "ooda_answer"
    # Check if the output file exists, and read from it if available (load cache)
    if os.path.exists(output_file_path):
        df_finbench = pd.read_csv(output_file_path)
    else:
        # If the output file does not exist, read from the original dataset
        df_finbench = pd.read_csv(FINANCEBENCH_CSV)
    if answer_column_name not in df_finbench.columns:
        df_finbench[answer_column_name] = ""
    df_finbench = df_finbench.fillna("")

    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    for index, row in df_finbench.iterrows():
        logger.info(f"Processing row {index} of {len(df_finbench)}: {row['doc_name']}")
        if row["status"] == "ok" and not row[answer_column_name]:
            doc_name = row["doc_name"]
            question = row["question"]
            answer = process_doc(doc_name, question)
            df_finbench.loc[index, answer_column_name] = answer
            # Save progress as cache after processing each row
            df_finbench.to_csv(output_file_path, index=False)
            print(f"complete index {index} of {len(df_finbench)}")

    # If any answer contains "Empty Response", mark it as a file reading error
    df_finbench.loc[
        df_finbench[answer_column_name].str.lower().str.contains("empty response"),
        answer_column_name,
    ] = "file reading error"

    df_finbench.to_csv(output_file_path, index=False)
    print("Processing complete. Output saved to:", output_file_path)


def clean_up():
    file_path = os.path.join(OUTPUT_DIRECTORY, OUTPUT_FILE_NAME)
    df_data = pd.read_csv(file_path)
    filtered_df = df_data[
        ~df_data["ooda_answer"].isna()
        & (df_data["ooda_answer"] != "file reading error")
    ]
    clean_output_file_path = os.path.join(
        OUTPUT_DIRECTORY, "filtered_ooda_rag_output.csv"
    )
    filtered_df.to_csv(clean_output_file_path, index=False)
    print(f"Filtered data saved to {clean_output_file_path}")


if __name__ == "__main__":
    run()
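(Note: `clean_up()` is defined but never invoked from `__main__`; presumably it is meant to be run manually after `run()` completes, producing `filtered_ooda_rag_output.csv`.)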