|
44 | 44 | @st.cache_resource
|
45 | 45 | def create_ocr():
|
46 | 46 | """
|
47 |
| - Load docTR ocr model and img2table docTR model |
| 47 | + Load img2table docTR model |
48 | 48 | """
|
49 |
| - ocr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True) |
50 | 49 | doctr_ocr = DocTR(detect_language=False)
|
51 |
| - return ocr_model, doctr_ocr |
| 50 | + return doctr_ocr |
52 | 51 |
|
53 | 52 | @st.cache_data(show_spinner=False)
|
54 |
| -def get_uploaded_images(tally_sheet): |
55 |
| - """ |
56 |
| - List of images uploaded by user as docTR DocumentFiles |
57 |
| - :param Files uploaded by user |
58 |
| - :return List of images uploaded by user as docTR DocumentFiles |
59 |
| - """ |
60 |
| - res = [] |
61 |
| - for sheet in tally_sheet: |
62 |
| - sheet.seek(0, 0) |
63 |
| - image = sheet.read() |
64 |
| - res.append(DocumentFile.from_images(image)) |
65 |
| - return res |
66 |
| - |
67 |
| -@st.cache_data(show_spinner=False) |
68 |
| -def get_results(uploaded_images): |
69 |
| - return [doctr_ocr_functions.get_word_level_content(ocr_model, doc) for doc in uploaded_images] |
70 |
| - |
71 |
| -@st.cache_data |
72 |
| -def get_tabular_content_wrapper(_doctr_ocr, img, confidence_lookup_dict): |
73 |
| - return doctr_ocr_functions.get_tabular_content(_doctr_ocr, img, confidence_lookup_dict) |
| 53 | +def get_tabular_content_wrapper(_doctr_ocr, img): |
| 54 | + return doctr_ocr_functions.get_tabular_content(_doctr_ocr, img) |
74 | 55 |
|
75 | 56 | @st.cache_data
|
76 | 57 | def get_DE_COC_List_wrapper(form):
|
@@ -288,7 +269,7 @@ def evaluate_cells(table_dfs):
|
288 | 269 | return table_dfs
|
289 | 270 |
|
290 | 271 | def clean_up(table_dfs):
|
291 |
| - """Uses simple_eval to perform math operations on each cell, defaulting to input if failed. |
| 272 | + """Replaces table values that the OCR model returns as the string "None" with the empty string "".
292 | 273 |
|
293 | 274 | Args:
|
294 | 275 | table_dfs (_List_): List of table data frames
|
@@ -367,7 +348,7 @@ def authenticate():
|
367 | 348 | key=st.session_state['upload_key'])
|
368 | 349 |
|
369 | 350 | # OCR Model
|
370 |
| - ocr_model, doctr_ocr = create_ocr() |
| 351 | + doctr_ocr = create_ocr() |
371 | 352 |
|
372 | 353 | # Once images are uploaded
|
373 | 354 | if len(tally_sheet_images) > 0:
|
@@ -451,22 +432,17 @@ def authenticate():
|
451 | 432 | # ***************************************
|
452 | 433 |
|
453 | 434 | # Populate streamlit with data recognized from tally sheets
|
454 |
| - with st.spinner("Running image recognition..."): |
455 |
| - uploaded_images = get_uploaded_images(tally_sheet_images) |
456 |
| - results = get_results(uploaded_images) |
457 |
| - |
| 435 | + |
458 | 436 | # Spinner for data upload. If it's going to be on screen for long, make it bespoke
|
459 |
| - table_dfs, page_nums_to_display = [], [] |
460 |
| - for i, sheet in enumerate(tally_sheet_images): |
461 |
| - image = uploaded_images[i] |
462 |
| - result = results[i] |
463 |
| - confidence_lookup_dict = doctr_ocr_functions.get_confidence_values(result) |
464 |
| - img = Image(src=sheet) |
465 |
| - table_df, confidence_df = get_tabular_content_wrapper(doctr_ocr, img, confidence_lookup_dict) |
466 |
| - table_dfs.extend(table_df) |
467 |
| - page_nums_to_display.extend([str(i + 1)] * len(table_df)) |
468 |
| - table_dfs = clean_up(table_dfs) |
469 |
| - table_dfs = evaluate_cells(table_dfs) |
| 437 | + with st.spinner("Running image recognition..."): |
| 438 | + table_dfs, page_nums_to_display = [], [] |
| 439 | + for i, sheet in enumerate(tally_sheet_images): |
| 440 | + img = Image(src=sheet) |
| 441 | + table_df = get_tabular_content_wrapper(doctr_ocr, img) |
| 442 | + table_dfs.extend(table_df) |
| 443 | + page_nums_to_display.extend([str(i + 1)] * len(table_df)) |
| 444 | + table_dfs = clean_up(table_dfs) |
| 445 | + table_dfs = evaluate_cells(table_dfs) |
470 | 446 |
|
471 | 447 |
|
472 | 448 | # Form session state initialization
|
|
0 commit comments