refactor TableStructure module

ArtifexSoftware · Oct 16, 2020 · e434305 · e434305
2 parents acf6f6c + 8dd539c
commit e434305
Show file tree

Hide file tree

Showing 25 changed files with 92,299 additions and 3,118 deletions.
diff --git a/pdf2docx/common/utils.py b/pdf2docx/common/utils.py
@@ -139,68 +139,4 @@ def graph_BFS_from_node(graph, start):
         yield cur_node
         searched.add(cur_node)
         for node in graph[cur_node]:
-            search_queue.append(node)
-
-
-def compare_layput(filename_source, filename_target, filename_output, threshold=0.7):
-    ''' Compare layout of two pdf files:
-        It's difficult to have an exactly same layout of blocks, but ensure they
-        look like each other. So, with `extractWORDS()`, all words with bbox 
-        information are compared.
-
-        ```
-        (x0, y0, x1, y1, "word", block_no, line_no, word_no)
-        ```
-    '''
-    # fitz document
-    source = fitz.open(filename_source) # type: fitz.Document
-    target = fitz.open(filename_target) # type: fitz.Document
-
-    # check count of pages
-    # --------------------------
-    if len(source) != len(target):
-        msg='Page count is inconsistent with source file.'
-        print(msg)
-        return False
-
-    flag = True
-    errs = []
-    for i, (source_page, target_page) in enumerate(zip(source, target)):
-
-        # check position of each word
-        # ---------------------------
-        source_words = source_page.getText('words')
-        target_words = target_page.getText('words')
-
-        # sort by word
-        source_words.sort(key=lambda item: (item[4], round(item[1],1), round(item[0],1)))
-        target_words.sort(key=lambda item: (item[4], round(item[1],1), round(item[0],1)))
-
-        if len(source_words) != len(target_words):
-            msg='Words count is inconsistent with source file.'
-            print(msg)
-            return False
-
-        # check each word and bbox
-        for sample, test in zip(source_words, target_words):
-            source_rect, target_rect = fitz.Rect(sample[0:4]), fitz.Rect(test[0:4])
-
-            # draw bbox based on source layout
-            source_page.drawRect(source_rect, color=(1,1,0), overlay=True) # source position
-            source_page.drawRect(target_rect, color=(1,0,0), overlay=True) # current position
-
-            # check bbox word by word: ignore small bbox, e.g. single letter bbox
-            if not get_main_bbox(source_rect, target_rect, threshold):
-                flag = False
-                errs.append((f'{sample[4]} ===> {test[4]}', target_rect, source_rect))
-
-    # save and close
-    source.save(filename_output)
-    target.close()
-    source.close()
-
-    # outputs
-    for word, target_rect, source_rect in errs:
-        print(f'Word "{word}": \nsample bbox: {source_rect}\ncurrent bbox: {target_rect}\n')
-
-    return flag
+            search_queue.append(node)
diff --git a/pdf2docx/layout/Layout.py b/pdf2docx/layout/Layout.py
@@ -136,8 +136,12 @@ def parse(self, **kwargs):
 
     def extract_tables(self):
         '''Extract content from lattice tables.'''
-        # parsing tables
-        self.clean().parse_lattice_tables()
+        # preprocessing, e.g. change block order, clean negative block
+        self.clean_up_shapes()
+        self.clean_up_blocks()
+
+        # parsing lattice tables only
+        self.parse_lattice_tables()
 
         # check table
         tables = [] # type: list[ list[list[str]] ]

diff --git a/pdf2docx/shape/Shape.py b/pdf2docx/shape/Shape.py
@@ -225,15 +225,11 @@ def _check_semantic_type(self, block):
             block line, e.g. a factor like 98% or so.
         '''
         # check block first
-        if self.contains(block, threshold=constants.FACTOR_ALMOST): return RectType.SHADING
+        if self.contains(block, threshold=constants.FACTOR_MAJOR): return RectType.SHADING
 
         # not contain but intersects -> check block line for another chance
         for line in block.lines:
-            if not self.bbox & line.bbox: continue
-
-            if self.contains(line, threshold=constants.FACTOR_ALMOST): 
+            if self.contains(line, threshold=constants.FACTOR_MAJOR): 
                 return RectType.SHADING
-            else:
-                return RectType.HIGHLIGHT
 
         return RectType.UNDEFINED # can't be determined by this block
diff --git a/pdf2docx/shape/Shapes.py b/pdf2docx/shape/Shapes.py
@@ -23,8 +23,8 @@ def __init__(self, instances:list=[], parent=None):
         # properties for context type of shape, e.g. 
         # a Stroke instance may be either table border or text underline or strike-through,
         # a Fill instance may be either cell shading or text highlight.
-        self._table_borders = Collection()
-        self._table_shadings = Collection()
+        self._table_strokes = Collection()
+        self._table_fillings = Collection()
 
         self._text_underlines_strikes = Collection() # they're combined at this moment
         self._text_highlights = Collection()
@@ -68,15 +68,15 @@ def fillings(self):
 
 
     @property
-    def table_borders(self):
+    def table_strokes(self):
         '''potential table borders.'''
-        return self._table_borders
+        return self._table_strokes
 
 
     @property
-    def table_shadings(self):
+    def table_fillings(self):
         '''potential table shadings.'''
-        return self._table_shadings
+        return self._table_fillings
 
 
     @property
@@ -150,8 +150,8 @@ def detect_initial_categories(self):
             It should run right after `clean_up()`.
         '''
         # reset all
-        self._table_borders.reset()
-        self._table_shadings.reset()
+        self._table_strokes.reset()
+        self._table_fillings.reset()
         self._text_underlines_strikes.reset()
         self._text_highlights.reset()
 
@@ -161,33 +161,33 @@ def detect_initial_categories(self):
 
         # check positions between shapes and blocks
         for shape in self._instances:
-            # try to determin shape semantic type
+            # try to determine shape semantic type:
+            # - check if text underline/strike for a stroke
+            # - check if table shading for a fill
             rect_type = shape.semantic_type(blocks)     # type: RectType
 
+            # set the type if succeeded
             if rect_type==RectType.UNDERLINE_OR_STRIKE:
                 self._text_underlines_strikes.append(shape)
 
-            elif rect_type==RectType.BORDER:
-                self._table_borders.append(shape)
-
             elif rect_type==RectType.SHADING:
-                self._table_shadings.append(shape)
+                self._table_fillings.append(shape)
 
-            elif rect_type==RectType.HIGHLIGHT:
-                self._text_highlights.append(shape)
-
-            # if not determined, it should be the opposite type, e.g. table border for a Stroke, 
-            # highlight for a Fill. However, condering margin, incorrectly organized blocks, e.g.
-            # a text underline may have no intersection with the text block, so let's add the shape 
-            # to both groups for conservation. It'll finally determined when parsing table structure
-            # and text format.
+
+            # otherwise, it should be the opposite type, e.g. 
+            # table border for a Stroke, highlight for a Fill. 
             else:
+                # However, considering margins and incorrectly organized blocks, e.g.
+                # a text underline may have no intersection with the text block, add the stroke shape 
+                # to both groups to be conservative. It'll be finally determined when parsing table structure
+                # and text format.
                 if isinstance(shape, Stroke):
-                    self._table_borders.append(shape)
+                    self._table_strokes.append(shape)
                     self._text_underlines_strikes.append(shape)
+
+                # for a fill shape, it should be a highlight if parsing table shading failed
                 else:
                     self._text_highlights.append(shape)
-                    self._table_shadings.append(shape)
 
 
     def plot(self, page):
@@ -197,11 +197,11 @@ def plot(self, page):
 
         # -table shading
         color = (152/255, 251/255, 152/255)
-        for shape in self._table_shadings: shape.plot(page, color)
+        for shape in self._table_fillings: shape.plot(page, color)
 
         # - table borders
         color = (0, 0, 0)
-        for shape in self._table_borders: shape.plot(page, color)
+        for shape in self._table_strokes: shape.plot(page, color)
 
         # - underline and strike-through
         color = (1, 0, 0)

diff --git a/pdf2docx/table/Border.py b/pdf2docx/table/Border.py
@@ -28,7 +28,6 @@
 
 from ..shape.Shapes import Shapes
 from ..shape.Shape import Stroke
-from ..common.utils import RGB_value
 from ..common import constants
 from ..common.base import RectType
 
@@ -63,7 +62,7 @@ def __init__(self, border_type='h', border_range:tuple=None, borders:tuple=None,
 
         # border style
         self.width = constants.HIDDEN_W_BORDER
-        self.color = RGB_value((1,1,1)) # white by default
+        self.color = 0 # black by default
 
 
     @property
@@ -142,9 +141,6 @@ def finalize_by_stroke(self, stroke:Stroke):
         # skip if not span in the border direction
         if low_pos > self._LBorder.URange and upper_pos < self._UBorder.LRange: return
 
-        # skip if finalized already
-        if self.finalized: return True
-
         # now, finalize current border
         self.finalize_by_value(value)
         self.color = stroke.color