Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separating pyproject.toml dependencies #22

Merged
merged 9 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8
pip install .[test]
pip install .[test,app-doctr]
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

# Getting git to clone and system dependencies for DocTR
RUN apt-get update && apt-get install -y \
ffmpeg libsm6 libxext6 libhdf5-dev pkg-config \
libxext6 libhdf5-dev pkg-config \
build-essential \
curl \
software-properties-common \
Expand Down
7 changes: 4 additions & 3 deletions app_doctr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import streamlit as st

from msfocr.data import dhis2
from msfocr.data import post_processing
from msfocr.doctr import ocr_functions

def configure_secrets():
Expand Down Expand Up @@ -116,7 +117,7 @@ def correct_field_names(dfs):
text = table.iloc[row,0]
if text is not None:
for name in dataElement_list:
sim = ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_dataElement < sim:
max_similarity_dataElement = sim
dataElement = name
Expand All @@ -129,7 +130,7 @@ def correct_field_names(dfs):
text = table.iloc[0,id]
if text is not None:
for name in categoryOptionsList:
sim = ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_catOpt < sim:
max_similarity_catOpt = sim
catOpt = name
Expand Down Expand Up @@ -452,7 +453,7 @@ def get_period():
print(final_dfs)
key_value_pairs = []
for df in final_dfs:
key_value_pairs.extend(ocr_functions.generate_key_value_pairs(df))
key_value_pairs.extend(dhis2.generate_key_value_pairs(df))
st.write("Completed")

st.session_state.data_payload = json_export(key_value_pairs)
Expand Down
34 changes: 7 additions & 27 deletions app_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import requests
import streamlit as st
from requests.auth import HTTPBasicAuth
from simpleeval import simple_eval

from msfocr.data import dhis2
from msfocr.doctr import ocr_functions as doctr_ocr_functions
from msfocr.data import post_processing
from msfocr.llm import ocr_functions


Expand Down Expand Up @@ -110,6 +109,8 @@ def dhis2_all_UIDs(item_type, search_items):
return dhis2.getAllUIDs(item_type, search_items)


# Other functions

def week1_start_ordinal(year):
"""
Calculates the ordinal date of the start of the first week of the year.
Expand Down Expand Up @@ -205,7 +206,7 @@ def correct_field_names(dfs, form):
text = table.iloc[row,0]
if text is not None:
for name in dataElement_list:
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_dataElement < sim:
max_similarity_dataElement = sim
dataElement = name
Expand All @@ -218,7 +219,7 @@ def correct_field_names(dfs, form):
text = table.iloc[0,id]
if text is not None:
for name in categoryOptionsList:
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_catOpt < sim:
max_similarity_catOpt = sim
catOpt = name
Expand All @@ -244,27 +245,6 @@ def save_st_table(table_dfs):
if not table_dfs[idx].equals(st.session_state.table_dfs[idx]):
st.session_state.table_dfs = table_dfs
st.rerun()


def evaluate_cells(table_dfs):
"""Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

Args:
table_dfs (_List_): List of table data frames

Returns:
_List_: List of table data frames
"""
for table in table_dfs:
table_removed_labels = table.loc[1:, 1:]
for col in table_removed_labels.columns:
try:
# Contents should be strings in order to be editable later
table_removed_labels[col] = table_removed_labels[col].apply(lambda x: simple_eval(x) if x and x != "-" else x).astype("str")
except Exception:
continue
table.update(table_removed_labels)
return table_dfs


# Initializing session state variables that only need to be set on startup
Expand Down Expand Up @@ -422,7 +402,7 @@ def authenticate():
table_names.extend(names)
table_dfs.extend(df)
page_nums_to_display.extend([str(i + 1)] * len(names))
table_dfs = evaluate_cells(table_dfs)
table_dfs = post_processing.evaluate_cells(table_dfs)

# Form session state initialization
if 'table_names' not in st.session_state:
Expand Down Expand Up @@ -513,7 +493,7 @@ def authenticate():

key_value_pairs = []
for df in final_dfs:
key_value_pairs.extend(doctr_ocr_functions.generate_key_value_pairs(df, form))
key_value_pairs.extend(dhis2.generate_key_value_pairs(df, form))

st.session_state.data_payload = json_export(key_value_pairs)

Expand Down
17 changes: 11 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,10 @@ classifiers = [

# All the dependencies needed for running your module go here
dependencies = [
"img2table",
"numpy",
"openai",
"pandas",
"python-doctr",
"python-Levenshtein",
"requests",
"torch",
"torchvision",
"requests"
]

[project.optional-dependencies]
Expand All @@ -43,10 +38,20 @@ test = [

# Dependencies only needed to run the streamlit app go here
app = [
"openai",
"streamlit",
"simpleeval"
]

app-doctr = [
"img2table",
"python-doctr",
"simpleeval",
"streamlit",
"torch",
"torchvision",
]

# Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here
dev = [
"azure-common==1.1.28",
Expand Down
46 changes: 46 additions & 0 deletions src/msfocr/data/dhis2.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,49 @@ def get_DE_COC_List(form):
return list(dataElement_list.keys()), list(categoryOptionCombo_list.keys())


def generate_key_value_pairs(table, form):
    """
    Generates key-value pairs in the format required to upload data to DHIS2.
    {'dataElement': data_element_id,
     'categoryOptionCombo': category_id,
     'value': cell_value}
    UIDs like data_element_id, category_id are obtained by querying the DHIS2 metadata.
    :param table: DataFrame generated from table detection; column 0 holds the
        data-element (row) names and the column headers hold the category names.
    :param form: DHIS2 form metadata dict with a 'groups' list; each group has
        'fields' entries carrying 'label', 'dataElement' and 'categoryOptionCombo'.
    :return: List of key value pairs as shown above.
    :raises ValueError: when a "<row name> <column name>" label is absent from
        the form metadata (e.g. the tally sheet says "Pop1: Resident" but the
        metadata calls it "Population 1"); the user must rename the row/column.
    """
    data_element_pairs = []

    # Iterate over each cell in the DataFrame
    table_array = table.values
    columns = table.columns
    for row_index in range(table_array.shape[0]):
        # Row name in tally sheet
        data_element = table_array[row_index][0]
        for col_index in range(1, table_array.shape[1]):
            # Column name in tally sheet
            category = columns[col_index]
            cell_value = table_array[row_index][col_index]
            # Skip empty / placeholder cells
            if cell_value is None or cell_value == "-" or cell_value == "":
                continue
            data_element_id = None
            category_id = None
            # Search for the string in the "label" field of form information
            string_search = data_element + " " + category
            for group in form['groups']:
                for field in group['fields']:
                    if field['label'] == string_search:
                        data_element_id = field['dataElement']
                        category_id = field['categoryOptionCombo']
                        break
                if data_element_id is not None:
                    # Match found — no need to scan the remaining groups
                    break

            # Raised when the row/column name in the tally sheet differs from
            # the metadata names; the only way forward is a manual rename.
            if data_element_id is None or category_id is None:
                raise ValueError(f"Unable to find {string_search} in DHIS2 metadata")
            # Append to the list of data elements to be pushed to DHIS2
            data_element_pairs.append(
                {"dataElement": data_element_id,
                 "categoryOptionCombo": category_id,
                 "value": cell_value}
            )

    return data_element_pairs
62 changes: 62 additions & 0 deletions src/msfocr/data/post_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from datetime import datetime

import Levenshtein
from simpleeval import simple_eval


def letter_by_letter_similarity(text1, text2):
    """
    Checks the letter by letter similarity between two strings
    :param text1: first text
    :param text2: second text
    :return: returns a float between 0-1, 0 indicates no similarity, 1 indicates identical strings
    """
    # Calculate maximum possible length first so we can guard the division
    max_len = max(len(text1), len(text2))

    # Two empty strings are identical; without this guard the division
    # below raises ZeroDivisionError when both inputs are empty.
    if max_len == 0:
        return 1.0

    # Calculate Levenshtein distance
    distance = Levenshtein.distance(text1, text2)

    # Convert distance to similarity
    similarity = 1 - (distance / max_len)

    return similarity


def get_yyyy_mm_dd(text):
    """
    Normalizes a date string to the unified YYYY-MM-DD format.

    Tries a fixed list of known date formats in order and reformats the
    first successful parse; the order matters for ambiguous inputs
    (e.g. "%m/%d/%Y" is attempted before "%d/%m/%Y").
    :param text: String
    :return: Date in YYYY-MM-DD format or None if no format matches
    """
    known_formats = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%m/%d/%Y",
        "%d/%m/%Y",
        "%B %d, %Y",
        "%d %B %Y",
        "%Y/%m/%d",
    )

    for candidate in known_formats:
        try:
            parsed = datetime.strptime(text, candidate)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")

    # Not a valid date in any known format
    return None


def evaluate_cells(table_dfs):
    """Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

    The first row and first column (labels) are left untouched; empty cells
    and "-" placeholders pass through unchanged. A column whose evaluation
    raises anywhere is left exactly as it was.

    Args:
        table_dfs (_List_): List of table data frames

    Returns:
        _List_: List of table data frames (modified in place)
    """
    for frame in table_dfs:
        # Everything except the label row/column
        data_region = frame.iloc[1:, 1:]
        for column in data_region.columns:
            try:
                evaluated = data_region[column].apply(
                    lambda cell: simple_eval(cell) if cell and cell != "-" else cell
                )
                # Contents should be strings in order to be editable later
                data_region[column] = evaluated.astype("str")
            except Exception:
                continue
        frame.update(data_region)
    return table_dfs
Loading
Loading