Skip to content

Commit

Permalink
refactor TableStructure module
Browse files Browse the repository at this point in the history
  • Loading branch information
dothinking committed Oct 16, 2020
2 parents acf6f6c + 8dd539c commit e434305
Show file tree
Hide file tree
Showing 25 changed files with 92,299 additions and 3,118 deletions.
66 changes: 1 addition & 65 deletions pdf2docx/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,68 +139,4 @@ def graph_BFS_from_node(graph, start):
yield cur_node
searched.add(cur_node)
for node in graph[cur_node]:
search_queue.append(node)


def compare_layput(filename_source, filename_target, filename_output, threshold=0.7):
    ''' Compare layout of two pdf files:
        It's difficult to have an exactly same layout of blocks, but ensure they
        look like each other. So, with `extractWORDS()`, all words with bbox
        information are compared.
        ```
        (x0, y0, x1, y1, "word", block_no, line_no, word_no)
        ```

        Args:
            filename_source: path of the reference (sample) pdf file.
            filename_target: path of the pdf file to check against the reference.
            filename_output: path where the annotated source pdf is saved; it is
                written only when page count and per-page word count both match.
            threshold: passed through to `get_main_bbox()` for each pair of word
                bboxes (presumably an overlap ratio -- confirm against that helper).

        Returns:
            bool: `True` if every word pair passes the bbox check, `False` if any
            pair fails or the page/word counts are inconsistent.

        Both documents are closed on every exit path, including the early
        returns and any exception raised while comparing.
    '''
    # fitz document
    source = fitz.open(filename_source) # type: fitz.Document
    try:
        target = fitz.open(filename_target) # type: fitz.Document
    except Exception:
        # don't leak the source document if the target can't be opened
        source.close()
        raise

    # (description, target_rect, source_rect) for each mismatched word
    errs = []
    try:
        # check count of pages
        # --------------------------
        if len(source) != len(target):
            msg='Page count is inconsistent with source file.'
            print(msg)
            return False

        for source_page, target_page in zip(source, target):

            # check position of each word
            # ---------------------------
            source_words = source_page.getText('words')
            target_words = target_page.getText('words')

            # sort by word, then by rounded y0 and x0, so that the same words
            # from both pages are paired regardless of extraction order
            sort_key = lambda item: (item[4], round(item[1],1), round(item[0],1))
            source_words.sort(key=sort_key)
            target_words.sort(key=sort_key)

            if len(source_words) != len(target_words):
                msg='Words count is inconsistent with source file.'
                print(msg)
                return False

            # check each word and bbox
            for sample, test in zip(source_words, target_words):
                source_rect, target_rect = fitz.Rect(sample[0:4]), fitz.Rect(test[0:4])

                # draw both bboxes on the source page so they can be compared visually
                source_page.drawRect(source_rect, color=(1,1,0), overlay=True) # source position
                source_page.drawRect(target_rect, color=(1,0,0), overlay=True) # current position

                # check bbox word by word: ignore small bbox, e.g. single letter bbox
                if not get_main_bbox(source_rect, target_rect, threshold):
                    errs.append((f'{sample[4]} ===> {test[4]}', target_rect, source_rect))

        # save the annotated source document
        source.save(filename_output)

    finally:
        # always release both documents, also on early return or error
        target.close()
        source.close()

    # outputs
    for word, target_rect, source_rect in errs:
        print(f'Word "{word}": \nsample bbox: {source_rect}\ncurrent bbox: {target_rect}\n')

    # layouts match only if no mismatched word was recorded
    return not errs
search_queue.append(node)
8 changes: 6 additions & 2 deletions pdf2docx/layout/Layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,12 @@ def parse(self, **kwargs):

def extract_tables(self):
'''Extract content from lattice tables.'''
# parsing tables
self.clean().parse_lattice_tables()
# preprocessing, e.g. change block order, clean negative block
self.clean_up_shapes()
self.clean_up_blocks()

# parsing lattice tables only
self.parse_lattice_tables()

# check table
tables = [] # type: list[ list[list[str]] ]
Expand Down
8 changes: 2 additions & 6 deletions pdf2docx/shape/Shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,15 +225,11 @@ def _check_semantic_type(self, block):
block line, e.g. a factor like 98% or so.
'''
# check block first
if self.contains(block, threshold=constants.FACTOR_ALMOST): return RectType.SHADING
if self.contains(block, threshold=constants.FACTOR_MAJOR): return RectType.SHADING

# not contain but intersects -> check block line for another chance
for line in block.lines:
if not self.bbox & line.bbox: continue

if self.contains(line, threshold=constants.FACTOR_ALMOST):
if self.contains(line, threshold=constants.FACTOR_MAJOR):
return RectType.SHADING
else:
return RectType.HIGHLIGHT

return RectType.UNDEFINED # can't be determined by this block
50 changes: 25 additions & 25 deletions pdf2docx/shape/Shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def __init__(self, instances:list=[], parent=None):
# properties for context type of shape, e.g.
# a Stroke instance may be either table border or text underline or strike-through,
# a Fill instance may be either cell shading or text highlight.
self._table_borders = Collection()
self._table_shadings = Collection()
self._table_strokes = Collection()
self._table_fillings = Collection()

self._text_underlines_strikes = Collection() # they're combined at this moment
self._text_highlights = Collection()
Expand Down Expand Up @@ -68,15 +68,15 @@ def fillings(self):


@property
def table_borders(self):
def table_strokes(self):
'''potential table borders.'''
return self._table_borders
return self._table_strokes


@property
def table_shadings(self):
def table_fillings(self):
'''potential table shadings.'''
return self._table_shadings
return self._table_fillings


@property
Expand Down Expand Up @@ -150,8 +150,8 @@ def detect_initial_categories(self):
It should run right after `clean_up()`.
'''
# reset all
self._table_borders.reset()
self._table_shadings.reset()
self._table_strokes.reset()
self._table_fillings.reset()
self._text_underlines_strikes.reset()
self._text_highlights.reset()

Expand All @@ -161,33 +161,33 @@ def detect_initial_categories(self):

# check positions between shapes and blocks
for shape in self._instances:
# try to determine shape semantic type
# try to determine shape semantic type:
# - check if text underline/strike for a stroke
# - check if table shading for a fill
rect_type = shape.semantic_type(blocks) # type: RectType

# set the type if succeeded
if rect_type==RectType.UNDERLINE_OR_STRIKE:
self._text_underlines_strikes.append(shape)

elif rect_type==RectType.BORDER:
self._table_borders.append(shape)

elif rect_type==RectType.SHADING:
self._table_shadings.append(shape)
self._table_fillings.append(shape)

elif rect_type==RectType.HIGHLIGHT:
self._text_highlights.append(shape)

# if not determined, it should be the opposite type, e.g. table border for a Stroke,
# highlight for a Fill. However, considering margins and incorrectly organized blocks, e.g.
# a text underline may have no intersection with the text block, let's add the shape
# to both groups to be conservative. It'll be finally determined when parsing table structure
# and text format.

# otherwise, it should be the opposite type, e.g.
# table border for a Stroke, highlight for a Fill.
else:
# However, considering margins and incorrectly organized blocks, e.g.
# a text underline may have no intersection with the text block, add the stroke shape
# to both groups to be conservative. It'll be finally determined when parsing table structure
# and text format.
if isinstance(shape, Stroke):
self._table_borders.append(shape)
self._table_strokes.append(shape)
self._text_underlines_strikes.append(shape)

# for a fill shape, it should be a highlight if parsing table shading failed
else:
self._text_highlights.append(shape)
self._table_shadings.append(shape)


def plot(self, page):
Expand All @@ -197,11 +197,11 @@ def plot(self, page):

# -table shading
color = (152/255, 251/255, 152/255)
for shape in self._table_shadings: shape.plot(page, color)
for shape in self._table_fillings: shape.plot(page, color)

# - table borders
color = (0, 0, 0)
for shape in self._table_borders: shape.plot(page, color)
for shape in self._table_strokes: shape.plot(page, color)

# - underline and strike-through
color = (1, 0, 0)
Expand Down
6 changes: 1 addition & 5 deletions pdf2docx/table/Border.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

from ..shape.Shapes import Shapes
from ..shape.Shape import Stroke
from ..common.utils import RGB_value
from ..common import constants
from ..common.base import RectType

Expand Down Expand Up @@ -63,7 +62,7 @@ def __init__(self, border_type='h', border_range:tuple=None, borders:tuple=None,

# border style
self.width = constants.HIDDEN_W_BORDER
self.color = RGB_value((1,1,1)) # white by default
self.color = 0 # black by default


@property
Expand Down Expand Up @@ -142,9 +141,6 @@ def finalize_by_stroke(self, stroke:Stroke):
# skip if not span in the border direction
if low_pos > self._LBorder.URange and upper_pos < self._UBorder.LRange: return

# skip if finalized already
if self.finalized: return True

# now, finalize current border
self.finalize_by_value(value)
self.color = stroke.color
Expand Down
Loading

0 comments on commit e434305

Please sign in to comment.