Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separating pyproject.toml dependencies #22

Merged
merged 9 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8
pip install .[test]
pip install .[test,app-doctr]
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

# Getting git to clone and system dependencies for DocTR
RUN apt-get update && apt-get install -y \
ffmpeg libsm6 libxext6 libhdf5-dev pkg-config \
libxext6 libhdf5-dev pkg-config \
build-essential \
curl \
software-properties-common \
Expand Down
7 changes: 4 additions & 3 deletions app_doctr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import streamlit as st

from msfocr.data import dhis2
from msfocr.data import post_processing
from msfocr.doctr import ocr_functions

def configure_secrets():
Expand Down Expand Up @@ -116,7 +117,7 @@ def correct_field_names(dfs):
text = table.iloc[row,0]
if text is not None:
for name in dataElement_list:
sim = ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_dataElement < sim:
max_similarity_dataElement = sim
dataElement = name
Expand All @@ -129,7 +130,7 @@ def correct_field_names(dfs):
text = table.iloc[0,id]
if text is not None:
for name in categoryOptionsList:
sim = ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_catOpt < sim:
max_similarity_catOpt = sim
catOpt = name
Expand Down Expand Up @@ -452,7 +453,7 @@ def get_period():
print(final_dfs)
key_value_pairs = []
for df in final_dfs:
key_value_pairs.extend(ocr_functions.generate_key_value_pairs(df))
key_value_pairs.extend(dhis2.generate_key_value_pairs(df))
st.write("Completed")

st.session_state.data_payload = json_export(key_value_pairs)
Expand Down
34 changes: 7 additions & 27 deletions app_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import requests
import streamlit as st
from requests.auth import HTTPBasicAuth
from simpleeval import simple_eval

from msfocr.data import dhis2
from msfocr.doctr import ocr_functions as doctr_ocr_functions
from msfocr.data import post_processing
from msfocr.llm import ocr_functions


Expand Down Expand Up @@ -110,6 +109,8 @@ def dhis2_all_UIDs(item_type, search_items):
return dhis2.getAllUIDs(item_type, search_items)


# Other functions

def week1_start_ordinal(year):
"""
Calculates the ordinal date of the start of the first week of the year.
Expand Down Expand Up @@ -205,7 +206,7 @@ def correct_field_names(dfs, form):
text = table.iloc[row,0]
if text is not None:
for name in dataElement_list:
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_dataElement < sim:
max_similarity_dataElement = sim
dataElement = name
Expand All @@ -218,7 +219,7 @@ def correct_field_names(dfs, form):
text = table.iloc[0,id]
if text is not None:
for name in categoryOptionsList:
sim = doctr_ocr_functions.letter_by_letter_similarity(text, name)
sim = post_processing.letter_by_letter_similarity(text, name)
if max_similarity_catOpt < sim:
max_similarity_catOpt = sim
catOpt = name
Expand All @@ -244,27 +245,6 @@ def save_st_table(table_dfs):
if not table_dfs[idx].equals(st.session_state.table_dfs[idx]):
st.session_state.table_dfs = table_dfs
st.rerun()


def evaluate_cells(table_dfs):
"""Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

Args:
table_dfs (_List_): List of table data frames

Returns:
_List_: List of table data frames
"""
for table in table_dfs:
table_removed_labels = table.loc[1:, 1:]
for col in table_removed_labels.columns:
try:
# Contents should be strings in order to be editable later
table_removed_labels[col] = table_removed_labels[col].apply(lambda x: simple_eval(x) if x and x != "-" else x).astype("str")
except Exception:
continue
table.update(table_removed_labels)
return table_dfs


# Initializing session state variables that only need to be set on startup
Expand Down Expand Up @@ -422,7 +402,7 @@ def authenticate():
table_names.extend(names)
table_dfs.extend(df)
page_nums_to_display.extend([str(i + 1)] * len(names))
table_dfs = evaluate_cells(table_dfs)
table_dfs = post_processing.evaluate_cells(table_dfs)

# Form session state initialization
if 'table_names' not in st.session_state:
Expand Down Expand Up @@ -513,7 +493,7 @@ def authenticate():

key_value_pairs = []
for df in final_dfs:
key_value_pairs.extend(doctr_ocr_functions.generate_key_value_pairs(df, form))
key_value_pairs.extend(dhis2.generate_key_value_pairs(df, form))

st.session_state.data_payload = json_export(key_value_pairs)

Expand Down
17 changes: 11 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,10 @@ classifiers = [

# All the dependencies needed for running your module go here
dependencies = [
"img2table",
"numpy",
"openai",
"pandas",
"python-doctr",
"python-Levenshtein",
"requests",
"torch",
"torchvision",
"requests"
]

[project.optional-dependencies]
Expand All @@ -43,10 +38,20 @@ test = [

# Dependencies only needed to run the streamlit app go here
app = [
"openai",
"streamlit",
"simpleeval"
]

app-doctr = [
"img2table",
"python-doctr",
"simpleeval",
"streamlit",
"torch",
"torchvision",
]

# Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here
dev = [
"azure-common==1.1.28",
Expand Down
46 changes: 46 additions & 0 deletions src/msfocr/data/dhis2.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,49 @@ def get_DE_COC_List(form):
return list(dataElement_list.keys()), list(categoryOptionCombo_list.keys())


def generate_key_value_pairs(table, form):
    """
    Generates key-value pairs in the format required to upload data to DHIS2.
    {'dataElement': data_element_id,
     'categoryOptionCombo': category_id,
     'value': cell_value}
    UIDs like data_element_id, category_id are obtained by querying the DHIS2 metadata.
    :param table: DataFrame generated from table detection; column 0 holds the
        data-element (row) names and the column headers hold the category names.
    :param form: DHIS2 form metadata dict with a 'groups' list; each group has
        'fields' entries carrying 'label', 'dataElement' and 'categoryOptionCombo'.
    :return: List of key value pairs as shown above.
    :raises ValueError: when a "<row name> <column name>" label is absent from
        the form metadata (e.g. the tally sheet says "Pop1: Resident" but the
        metadata calls it "Population 1"); the user must rename the row/column.
    """
    data_element_pairs = []

    # Iterate over each cell in the DataFrame
    table_array = table.values
    columns = table.columns
    for row_index in range(table_array.shape[0]):
        # Row name in tally sheet
        data_element = table_array[row_index][0]
        for col_index in range(1, table_array.shape[1]):
            # Column name in tally sheet
            category = columns[col_index]
            cell_value = table_array[row_index][col_index]
            # Skip empty / placeholder cells
            if cell_value is None or cell_value == "-" or cell_value == "":
                continue
            data_element_id = None
            category_id = None
            # Search for the string in the "label" field of form information
            string_search = data_element + " " + category
            for group in form['groups']:
                for field in group['fields']:
                    if field['label'] == string_search:
                        data_element_id = field['dataElement']
                        category_id = field['categoryOptionCombo']
                        break
                if data_element_id is not None:
                    # Match found — no need to scan the remaining groups
                    break

            # Raised when the row/column name in the tally sheet differs from
            # the metadata names; the only way forward is a manual rename.
            if data_element_id is None or category_id is None:
                raise ValueError(f"Unable to find {string_search} in DHIS2 metadata")
            # Append to the list of data elements to be pushed to DHIS2
            data_element_pairs.append(
                {"dataElement": data_element_id,
                 "categoryOptionCombo": category_id,
                 "value": cell_value}
            )

    return data_element_pairs
62 changes: 62 additions & 0 deletions src/msfocr/data/post_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from datetime import datetime

import Levenshtein
from simpleeval import simple_eval


def letter_by_letter_similarity(text1, text2):
    """
    Checks the letter by letter similarity between two strings
    :param text1: first text
    :param text2: second text
    :return: returns a float between 0-1, 0 indicates no similarity, 1 indicates identical strings
    """
    # Calculate maximum possible length first so we can guard the division
    max_len = max(len(text1), len(text2))

    # Two empty strings are identical; without this guard the division
    # below raises ZeroDivisionError when both inputs are empty.
    if max_len == 0:
        return 1.0

    # Calculate Levenshtein distance
    distance = Levenshtein.distance(text1, text2)

    # Convert distance to similarity
    similarity = 1 - (distance / max_len)

    return similarity


def get_yyyy_mm_dd(text):
    """
    Normalizes a date string to the unified YYYY-MM-DD format.

    Tries a fixed list of known date formats in order and reformats the
    first successful parse; the order matters for ambiguous inputs
    (e.g. "%m/%d/%Y" is attempted before "%d/%m/%Y").
    :param text: String
    :return: Date in YYYY-MM-DD format or None if no format matches
    """
    known_formats = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%m/%d/%Y",
        "%d/%m/%Y",
        "%B %d, %Y",
        "%d %B %Y",
        "%Y/%m/%d",
    )

    for candidate in known_formats:
        try:
            parsed = datetime.strptime(text, candidate)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")

    # Not a valid date in any known format
    return None


def evaluate_cells(table_dfs):
    """Uses simple_eval to perform math operations on each cell, defaulting to input if failed.

    The first row and first column (labels) are left untouched; empty cells
    and "-" placeholders pass through unchanged. A column whose evaluation
    raises anywhere is left exactly as it was.

    Args:
        table_dfs (_List_): List of table data frames

    Returns:
        _List_: List of table data frames (modified in place)
    """
    for frame in table_dfs:
        # Everything except the label row/column
        data_region = frame.iloc[1:, 1:]
        for column in data_region.columns:
            try:
                evaluated = data_region[column].apply(
                    lambda cell: simple_eval(cell) if cell and cell != "-" else cell
                )
                # Contents should be strings in order to be editable later
                data_region[column] = evaluated.astype("str")
            except Exception:
                continue
        frame.update(data_region)
    return table_dfs
Loading
Loading