Skip to content

Commit 1991c2f

Browse files
authored
Merge pull request #22 from UMassCDS/docker-testing
Separating pyproject.toml dependencies
2 parents b293c00 + 77a29cb commit 1991c2f

11 files changed

+210
-178
lines changed

.github/workflows/python_package.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
run: |
2525
python -m pip install --upgrade pip
2626
pip install flake8
27-
pip install .[test]
27+
pip install .[test,app-doctr]
2828
- name: Lint with flake8
2929
run: |
3030
# stop the build if there are Python syntax errors or undefined names

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ WORKDIR /app
1212

1313
# Getting git to clone and system dependencies for DocTR
1414
RUN apt-get update && apt-get install -y \
15-
ffmpeg libsm6 libxext6 libhdf5-dev pkg-config \
15+
libxext6 libhdf5-dev pkg-config \
1616
build-essential \
1717
curl \
1818
software-properties-common \

app_doctr.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import streamlit as st
1313

1414
from msfocr.data import dhis2
15+
from msfocr.data import post_processing
1516
from msfocr.doctr import ocr_functions
1617

1718
def configure_secrets():
@@ -116,7 +117,7 @@ def correct_field_names(dfs):
116117
text = table.iloc[row,0]
117118
if text is not None:
118119
for name in dataElement_list:
119-
sim = ocr_functions.letter_by_letter_similarity(text, name)
120+
sim = post_processing.letter_by_letter_similarity(text, name)
120121
if max_similarity_dataElement < sim:
121122
max_similarity_dataElement = sim
122123
dataElement = name
@@ -129,7 +130,7 @@ def correct_field_names(dfs):
129130
text = table.iloc[0,id]
130131
if text is not None:
131132
for name in categoryOptionsList:
132-
sim = ocr_functions.letter_by_letter_similarity(text, name)
133+
sim = post_processing.letter_by_letter_similarity(text, name)
133134
if max_similarity_catOpt < sim:
134135
max_similarity_catOpt = sim
135136
catOpt = name
@@ -452,7 +453,7 @@ def get_period():
452453
print(final_dfs)
453454
key_value_pairs = []
454455
for df in final_dfs:
455-
key_value_pairs.extend(ocr_functions.generate_key_value_pairs(df))
456+
key_value_pairs.extend(dhis2.generate_key_value_pairs(df))
456457
st.write("Completed")
457458

458459
st.session_state.data_payload = json_export(key_value_pairs)

app_llm.py

+7-27
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import requests
77
import streamlit as st
88
from requests.auth import HTTPBasicAuth
9-
from simpleeval import simple_eval
109

1110
from msfocr.data import dhis2
12-
from msfocr.doctr import ocr_functions as doctr_ocr_functions
11+
from msfocr.data import post_processing
1312
from msfocr.llm import ocr_functions
1413

1514

@@ -110,6 +109,8 @@ def dhis2_all_UIDs(item_type, search_items):
110109
return dhis2.getAllUIDs(item_type, search_items)
111110

112111

112+
# Other functions
113+
113114
def week1_start_ordinal(year):
114115
"""
115116
Calculates the ordinal date of the start of the first week of the year.
@@ -205,7 +206,7 @@ def correct_field_names(dfs, form):
205206
text = table.iloc[row,0]
206207
if text is not None:
207208
for name in dataElement_list:
208-
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
209+
sim = post_processing.letter_by_letter_similarity(text, name)
209210
if max_similarity_dataElement < sim:
210211
max_similarity_dataElement = sim
211212
dataElement = name
@@ -218,7 +219,7 @@ def correct_field_names(dfs, form):
218219
text = table.iloc[0,id]
219220
if text is not None:
220221
for name in categoryOptionsList:
221-
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
222+
sim = post_processing.letter_by_letter_similarity(text, name)
222223
if max_similarity_catOpt < sim:
223224
max_similarity_catOpt = sim
224225
catOpt = name
@@ -244,27 +245,6 @@ def save_st_table(table_dfs):
244245
if not table_dfs[idx].equals(st.session_state.table_dfs[idx]):
245246
st.session_state.table_dfs = table_dfs
246247
st.rerun()
247-
248-
249-
def evaluate_cells(table_dfs):
250-
"""Uses simple_eval to perform math operations on each cell, defaulting to input if failed.
251-
252-
Args:
253-
table_dfs (_List_): List of table data frames
254-
255-
Returns:
256-
_List_: List of table data frames
257-
"""
258-
for table in table_dfs:
259-
table_removed_labels = table.loc[1:, 1:]
260-
for col in table_removed_labels.columns:
261-
try:
262-
# Contents should be strings in order to be editable later
263-
table_removed_labels[col] = table_removed_labels[col].apply(lambda x: simple_eval(x) if x and x != "-" else x).astype("str")
264-
except Exception:
265-
continue
266-
table.update(table_removed_labels)
267-
return table_dfs
268248

269249

270250
# Initializing session state variables that only need to be set on startup
@@ -422,7 +402,7 @@ def authenticate():
422402
table_names.extend(names)
423403
table_dfs.extend(df)
424404
page_nums_to_display.extend([str(i + 1)] * len(names))
425-
table_dfs = evaluate_cells(table_dfs)
405+
table_dfs = post_processing.evaluate_cells(table_dfs)
426406

427407
# Form session state initialization
428408
if 'table_names' not in st.session_state:
@@ -513,7 +493,7 @@ def authenticate():
513493

514494
key_value_pairs = []
515495
for df in final_dfs:
516-
key_value_pairs.extend(doctr_ocr_functions.generate_key_value_pairs(df, form))
496+
key_value_pairs.extend(dhis2.generate_key_value_pairs(df, form))
517497

518498
st.session_state.data_payload = json_export(key_value_pairs)
519499

pyproject.toml

+11-6
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,10 @@ classifiers = [
2222

2323
# All the dependencies needed for running your module go here
2424
dependencies = [
25-
"img2table",
2625
"numpy",
27-
"openai",
2826
"pandas",
29-
"python-doctr",
3027
"python-Levenshtein",
31-
"requests",
32-
"torch",
33-
"torchvision",
28+
"requests"
3429
]
3530

3631
[project.optional-dependencies]
@@ -43,10 +38,20 @@ test = [
4338

4439
# Dependencies only needed to run the streamlit app go here
4540
app = [
41+
"openai",
4642
"streamlit",
4743
"simpleeval"
4844
]
4945

46+
app-doctr = [
47+
"img2table",
48+
"python-doctr",
49+
"simpleeval",
50+
"streamlit",
51+
"torch",
52+
"torchvision",
53+
]
54+
5055
# Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here
5156
dev = [
5257
"azure-common==1.1.28",

src/msfocr/data/dhis2.py

+46
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,49 @@ def get_DE_COC_List(form):
135135
return list(dataElement_list.keys()), list(categoryOptionCombo_list.keys())
136136

137137

138+
def _index_fields_by_label(form):
    """
    Builds a lookup from field label to field entry for the given DHIS2 form metadata.
    If the same label occurs more than once, the last occurrence wins.
    :param form: Dict with a 'groups' list; each group has a 'fields' list of dicts with a 'label' key
    :return: Dict mapping label -> field dict
    """
    fields_by_label = {}
    for group in form['groups']:
        for field in group['fields']:
            fields_by_label[field['label']] = field
    return fields_by_label


def generate_key_value_pairs(table, form):
    """
    Generates key-value pairs in the format required to upload data to DHIS2.
    {'dataElement': data_element_id,
     'categoryOptionCombo': category_id,
     'value': cell_value}
    UIDs like data_element_id, category_id are looked up in the form metadata by matching
    "<row name> <column name>" against each field's 'label'.
    :param table: DataFrame generated from table detection. The first column holds the row names,
                  the column labels hold the column names.
    :param form: Dict of form metadata with a 'groups' list; each group has a 'fields' list whose
                 entries carry 'label', 'dataElement' and 'categoryOptionCombo'.
    :return: List of key value pairs as shown above.
    :raises Exception: If a non-empty cell's "<row name> <column name>" is not found in the metadata.
    """
    data_element_pairs = []

    # Built on the first non-empty cell, so the metadata is walked once per call
    # (instead of once per cell) and is not touched at all when the table has no values.
    fields_by_label = None

    # Iterate over each cell in the DataFrame
    table_array = table.values
    columns = table.columns
    for row in table_array:
        # Row name in tally sheet
        data_element = row[0]
        for col_index in range(1, table_array.shape[1]):
            # Column name in tally sheet
            category = columns[col_index]
            cell_value = row[col_index]

            # Empty cells and "-" placeholders carry no value to upload
            if cell_value is None or cell_value == "-" or cell_value == "":
                continue

            # Search for the string in the "label" field of form information
            string_search = data_element + " " + category
            if fields_by_label is None:
                fields_by_label = _index_fields_by_label(form)
            field = fields_by_label.get(string_search)
            data_element_id = field['dataElement'] if field is not None else None
            category_id = field['categoryOptionCombo'] if field is not None else None

            # The following exception will be raised if the row or column name in the tally sheet is different from the names used in metadata
            # For eg. Pop1: Resident is called Population 1 in metadata
            # If this exception is raised the only way forward is for the user to manually change the row/column name to the one used in metadata
            if data_element_id is None or category_id is None:
                raise Exception(f"Unable to find {string_search} in DHIS2 metadata")

            # Append to the list of data elements to be pushed to DHIS2
            data_element_pairs.append(
                {"dataElement": data_element_id,
                 "categoryOptionCombo": category_id,
                 "value": cell_value}
            )

    return data_element_pairs

src/msfocr/data/post_processing.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from datetime import datetime
2+
3+
import Levenshtein
4+
from simpleeval import simple_eval
5+
6+
7+
def letter_by_letter_similarity(text1, text2):
    """
    Checks the letter by letter similarity between two strings.
    The Levenshtein distance is divided by the length of the longer string and subtracted from 1.
    :param text1: first text
    :param text2: second text
    :return: returns a float between 0 and 1, 0 indicates no similarity, 1 indicates identical strings
    """
    # Length of the longer string, used to normalize the distance
    max_len = max(len(text1), len(text2))

    # Two empty strings are identical; returning here also avoids dividing by zero below
    if max_len == 0:
        return 1.0

    # Calculate Levenshtein distance
    distance = Levenshtein.distance(text1, text2)

    # Convert distance to similarity
    return 1 - (distance / max_len)
24+
25+
26+
def get_yyyy_mm_dd(text):
    """
    Checks if the input text is a date by comparing it with various known formats and returns date in unified YYYY-MM-DD format.
    Formats are tried in order and the first one that parses wins, so an ambiguous slash-separated
    date such as 03/05/2024 is read month-first.
    :param text: String
    :return: Date in YYYY-MM-DD format or None (also None when the input is not a string)
    """
    # strptime raises TypeError (not ValueError) for non-string input such as None,
    # which would escape the loop below; such input is simply "not a date".
    if not isinstance(text, str):
        return None

    formats = ("%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%d/%m/%Y", "%B %d, %Y", "%d %B %Y", "%Y/%m/%d")

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    return None  # Return None if text is not a valid date in any format
42+
43+
44+
def _evaluate_cell(value):
    """Evaluates one cell with simple_eval, returning the input unchanged if it is empty, "-" or cannot be evaluated.

    Args:
        value: Cell content

    Returns:
        Result of simple_eval, or the original cell content
    """
    try:
        return simple_eval(value) if value and value != "-" else value
    except Exception:
        # Best effort: anything that is not a valid expression (labels, stray text) stays as entered
        return value


def evaluate_cells(table_dfs):
    """Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

    The first row and first column of every table are left untouched. Evaluation
    falls back per cell, so one cell that cannot be evaluated does not prevent the other
    cells of the same column from being evaluated. Tables are updated in place.

    Args:
        table_dfs (_List_): List of table data frames

    Returns:
        _List_: List of table data frames
    """
    for table in table_dfs:
        # Work on an explicit copy so the column assignments never target a slice of `table`
        table_removed_labels = table.iloc[1:, 1:].copy()
        for col in table_removed_labels.columns:
            try:
                # Contents should be strings in order to be editable later
                table_removed_labels[col] = table_removed_labels[col].apply(_evaluate_cell).astype("str")
            except Exception:
                # Safety net for failures outside the per-cell evaluation; the column is left as it was
                continue
        table.update(table_removed_labels)
    return table_dfs

0 commit comments

Comments
 (0)