Skip to content

Commit ea10d37

Browse files
committed
Creation of post_processing but test fails
1 parent f78f2ef commit ea10d37

8 files changed

+259
-170
lines changed

app_doctr.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import streamlit as st
1313

1414
from msfocr.data import dhis2
15-
from msfocr.data import ocr_functions
15+
from msfocr.data import post_processing
16+
from msfocr.doctr import ocr_functions
1617

1718
def configure_secrets():
1819
"""Checks that necessary environment variables are set for fast failing.
@@ -116,7 +117,7 @@ def correct_field_names(dfs):
116117
text = table.iloc[row,0]
117118
if text is not None:
118119
for name in dataElement_list:
119-
sim = ocr_functions.letter_by_letter_similarity(text, name)
120+
sim = post_processing.letter_by_letter_similarity(text, name)
120121
if max_similarity_dataElement < sim:
121122
max_similarity_dataElement = sim
122123
dataElement = name
@@ -129,7 +130,7 @@ def correct_field_names(dfs):
129130
text = table.iloc[0,id]
130131
if text is not None:
131132
for name in categoryOptionsList:
132-
sim = ocr_functions.letter_by_letter_similarity(text, name)
133+
sim = post_processing.letter_by_letter_similarity(text, name)
133134
if max_similarity_catOpt < sim:
134135
max_similarity_catOpt = sim
135136
catOpt = name
@@ -446,7 +447,7 @@ def get_period():
446447
print(final_dfs)
447448
key_value_pairs = []
448449
for df in final_dfs:
449-
key_value_pairs.extend(ocr_functions.generate_key_value_pairs(df))
450+
key_value_pairs.extend(post_processing.generate_key_value_pairs(df))
450451
st.write("Completed")
451452

452453
st.session_state.data_payload = json_export(key_value_pairs)

app_llm.py

+17-27
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import requests
77
import streamlit as st
88
from requests.auth import HTTPBasicAuth
9-
from simpleeval import simple_eval
109

1110
from msfocr.data import dhis2
12-
from msfocr.doctr import ocr_functions as doctr_ocr_functions
11+
from msfocr.data import post_processing
1312
from msfocr.llm import ocr_functions
1413

1514

@@ -110,6 +109,18 @@ def dhis2_all_UIDs(item_type, search_items):
110109
return dhis2.getAllUIDs(item_type, search_items)
111110

112111

112+
# Other functions
113+
114+
def configure_secrets():
    """Checks that necessary environment variables are set for fast failing.

    Configures the DHIS2 server connection from DHIS2_USERNAME, DHIS2_PASSWORD
    and DHIS2_SERVER_URL.

    Raises:
        KeyError: If one or more of the variables above is not set. The message
            names every missing variable, not just the first one encountered.
    """
    required = ("DHIS2_USERNAME", "DHIS2_PASSWORD", "DHIS2_SERVER_URL")

    # Check everything up front so a misconfigured deployment is fixed in one pass
    missing = [name for name in required if name not in os.environ]
    if missing:
        raise KeyError(f"Missing required environment variable(s): {', '.join(missing)}")

    username, password, server_url = (os.environ[name] for name in required)
    dhis2.configure_DHIS2_server(username, password, server_url)
122+
123+
113124
def week1_start_ordinal(year):
114125
"""
115126
Calculates the ordinal date of the start of the first week of the year.
@@ -205,7 +216,7 @@ def correct_field_names(dfs, form):
205216
text = table.iloc[row,0]
206217
if text is not None:
207218
for name in dataElement_list:
208-
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
219+
sim = post_processing.letter_by_letter_similarity(text, name)
209220
if max_similarity_dataElement < sim:
210221
max_similarity_dataElement = sim
211222
dataElement = name
@@ -218,7 +229,7 @@ def correct_field_names(dfs, form):
218229
text = table.iloc[0,id]
219230
if text is not None:
220231
for name in categoryOptionsList:
221-
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
232+
sim = post_processing.letter_by_letter_similarity(text, name)
222233
if max_similarity_catOpt < sim:
223234
max_similarity_catOpt = sim
224235
catOpt = name
@@ -244,27 +255,6 @@ def save_st_table(table_dfs):
244255
if not table_dfs[idx].equals(st.session_state.table_dfs[idx]):
245256
st.session_state.table_dfs = table_dfs
246257
st.rerun()
247-
248-
249-
def evaluate_cells(table_dfs):
250-
"""Uses simple_eval to perform math operations on each cell, defaulting to input if failed.
251-
252-
Args:
253-
table_dfs (_List_): List of table data frames
254-
255-
Returns:
256-
_List_: List of table data frames
257-
"""
258-
for table in table_dfs:
259-
table_removed_labels = table.loc[1:, 1:]
260-
for col in table_removed_labels.columns:
261-
try:
262-
# Contents should be strings in order to be editable later
263-
table_removed_labels[col] = table_removed_labels[col].apply(lambda x: simple_eval(x) if x and x != "-" else x).astype("str")
264-
except Exception:
265-
continue
266-
table.update(table_removed_labels)
267-
return table_dfs
268258

269259

270260
# Initializing session state variables that only need to be set on startup
@@ -422,7 +412,7 @@ def authenticate():
422412
table_names.extend(names)
423413
table_dfs.extend(df)
424414
page_nums_to_display.extend([str(i + 1)] * len(names))
425-
table_dfs = evaluate_cells(table_dfs)
415+
table_dfs = post_processing.evaluate_cells(table_dfs)
426416

427417
# Form session state initialization
428418
if 'table_names' not in st.session_state:
@@ -513,7 +503,7 @@ def authenticate():
513503

514504
key_value_pairs = []
515505
for df in final_dfs:
516-
key_value_pairs.extend(doctr_ocr_functions.generate_key_value_pairs(df, form))
506+
key_value_pairs.extend(post_processing.generate_key_value_pairs(df, form))
517507

518508
st.session_state.data_payload = json_export(key_value_pairs)
519509

pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ app = [
4646
app-doctr = [
4747
"img2table",
4848
"python-doctr",
49+
"simpleeval",
50+
"streamlit",
4951
"torch",
5052
"torchvision",
5153
]

src/msfocr/data/dhis2.py

+46
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,49 @@ def get_DE_COC_List(form):
135135
return list(dataElement_list.keys()), list(categoryOptionCombo_list.keys())
136136

137137

138+
def generate_key_value_pairs(table, form):
    """
    Generates key-value pairs in the format required to upload data to DHIS2.
    {'dataElement': data_element_id,
     'categoryOptionCombo': category_id,
     'value': cell_value}
    UIDs like data_element_id, category_id are looked up in the form metadata by label.
    :param table: DataFrame generated from table detection. The first column holds the row names,
                  the column labels hold the column names of the tally sheet.
    :param form: Form metadata: form['groups'] is a list of groups, each with a 'fields' list whose
                 entries provide 'label', 'dataElement' and 'categoryOptionCombo'.
    :return: List of key value pairs as shown above. Cells that are None, "-" or "" are skipped.
    :raises Exception: if no field labelled "<row name> <column name>" provides both UIDs.
    """
    data_element_pairs = []

    # Label -> field index, built on the first non-empty cell so the form is scanned at most once
    fields_by_label = None

    columns = table.columns
    for row in table.values:
        # Row name in tally sheet
        data_element = row[0]
        # Column name in tally sheet paired with the cell found under it
        for category, cell_value in zip(columns[1:], row[1:]):
            if cell_value is None or cell_value in ("-", ""):
                continue

            # Search for the string in the "label" field of form information
            string_search = data_element + " " + category
            if fields_by_label is None:
                # When a label occurs more than once, the last field with that label is used
                fields_by_label = {
                    field['label']: field
                    for group in form['groups']
                    for field in group['fields']
                }

            data_element_id = None
            category_id = None
            field = fields_by_label.get(string_search)
            if field is not None:
                data_element_id = field['dataElement']
                category_id = field['categoryOptionCombo']

            # The following exception will be raised if the row or column name in the tally sheet is different from the names used in metadata
            # For eg. Pop1: Resident is called Population 1 in metadata
            # If this exception is raised the only way forward is for the user to manually change the row/column name to the one used in metadata
            if data_element_id is None or category_id is None:
                raise Exception(f"Unable to find {string_search} in DHIS2 metadata")

            # Append to the list of data elements to be pushed to DHIS2
            data_element_pairs.append(
                {"dataElement": data_element_id,
                 "categoryOptionCombo": category_id,
                 "value": cell_value}
            )

    return data_element_pairs

src/msfocr/data/post_processing.py

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from datetime import datetime
2+
3+
import Levenshtein
4+
from simpleeval import simple_eval
5+
6+
7+
def letter_by_letter_similarity(text1, text2):
    """
    Checks the letter by letter similarity between two strings
    :param text1: first text
    :param text2: second text
    :return: returns a float between 0 and 1, 0 indicates no similarity, 1 indicates identical strings
    """
    # Length of the longer string, used to normalise the distance
    max_len = max(len(text1), len(text2))

    # Two empty strings are identical; returning here also avoids dividing by zero below
    if max_len == 0:
        return 1.0

    # Calculate Levenshtein distance
    distance = Levenshtein.distance(text1, text2)

    # Convert distance to similarity
    return 1 - (distance / max_len)
24+
25+
26+
def get_yyyy_mm_dd(text):
    """
    Checks if the input text is a date by comparing it with various known formats and returns date in unified YYYY-MM-DD format.
    Formats are tried in the order listed below and the first one that parses wins, so an ambiguous
    value such as 01/02/2024 is read as month/day/year.
    :param text: String to check. Surrounding whitespace is ignored; anything that is not a string is treated as "not a date".
    :return: Date in YYYY-MM-DD format or None
    """
    # strptime raises TypeError on non-strings (e.g. None from an empty cell); report "not a date" instead
    if not isinstance(text, str):
        return None

    formats = ("%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%d/%m/%Y", "%B %d, %Y", "%d %B %Y", "%Y/%m/%d")
    candidate = text.strip()

    for fmt in formats:
        try:
            date_obj = datetime.strptime(candidate, fmt)
        except ValueError:
            # Not this format, try the next one
            continue
        return date_obj.strftime("%Y-%m-%d")

    return None  # Return None if text is not a valid date in any format
42+
43+
44+
def generate_key_value_pairs(table, form):
    """
    Generates key-value pairs in the format required to upload data to DHIS2.
    {'dataElement': data_element_id,
     'categoryOptionCombo': category_id,
     'value': cell_value}
    UIDs like data_element_id, category_id are looked up in the form metadata by label.
    :param table: DataFrame generated from table detection. The first column holds the row names,
                  the column labels hold the column names of the tally sheet.
    :param form: Form metadata: form['groups'] is a list of groups, each with a 'fields' list whose
                 entries provide 'label', 'dataElement' and 'categoryOptionCombo'.
    :return: List of key value pairs as shown above. Cells that are None, "-" or "" are skipped.
    :raises Exception: if no field labelled "<row name> <column name>" provides both UIDs.
    """
    data_element_pairs = []

    # Label -> field index, built on the first non-empty cell so the form is scanned at most once
    fields_by_label = None

    columns = table.columns
    for row in table.values:
        # Row name in tally sheet
        data_element = row[0]
        # Column name in tally sheet paired with the cell found under it
        for category, cell_value in zip(columns[1:], row[1:]):
            if cell_value is None or cell_value in ("-", ""):
                continue

            # Search for the string in the "label" field of form information
            string_search = data_element + " " + category
            if fields_by_label is None:
                # When a label occurs more than once, the last field with that label is used
                fields_by_label = {
                    field['label']: field
                    for group in form['groups']
                    for field in group['fields']
                }

            data_element_id = None
            category_id = None
            field = fields_by_label.get(string_search)
            if field is not None:
                data_element_id = field['dataElement']
                category_id = field['categoryOptionCombo']

            # The following exception will be raised if the row or column name in the tally sheet is different from the names used in metadata
            # For eg. Pop1: Resident is called Population 1 in metadata
            # If this exception is raised the only way forward is for the user to manually change the row/column name to the one used in metadata
            if data_element_id is None or category_id is None:
                raise Exception(f"Unable to find {string_search} in DHIS2 metadata")

            # Append to the list of data elements to be pushed to DHIS2
            data_element_pairs.append(
                {"dataElement": data_element_id,
                 "categoryOptionCombo": category_id,
                 "value": cell_value}
            )

    return data_element_pairs
90+
91+
92+
def _evaluate_cell(cell):
    """Returns the evaluated cell content as a string, or the cell unchanged if it is not evaluated."""
    try:
        if not cell or cell == "-":
            return cell
        # Contents should be strings in order to be editable later
        return str(simple_eval(cell))
    except Exception:
        # Best effort by design: whatever cannot be evaluated is kept exactly as it was read
        return cell


def evaluate_cells(table_dfs):
    """Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

    The first row and the first column of every table are skipped (they are
    treated as labels). Each remaining cell is evaluated on its own: a cell
    that is empty, "-" or cannot be evaluated keeps its original content and
    does not prevent the other cells of its column from being evaluated.
    The data frames are modified in place.

    Args:
        table_dfs (_List_): List of table data frames

    Returns:
        _List_: List of table data frames
    """
    for table in table_dfs:
        n_rows, n_cols = table.shape
        for row_idx in range(1, n_rows):
            for col_idx in range(1, n_cols):
                cell = table.iat[row_idx, col_idx]
                evaluated = _evaluate_cell(cell)
                # Only write back cells that actually produced a new value
                if evaluated is not cell:
                    table.iat[row_idx, col_idx] = evaluated
    return table_dfs

0 commit comments

Comments
 (0)