-
Notifications
You must be signed in to change notification settings - Fork 496
/
Copy pathpdf.py
231 lines (201 loc) · 9.16 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union, Optional, Dict, Tuple
import pdfplumber
import pandas as pd
from ..elements import Layout
from .basic import load_dataframe
DEFAULT_PDF_DPI = 72
def extract_words_for_page(
page: pdfplumber.page.Page,
x_tolerance=1.5,
y_tolerance=2,
keep_blank_chars=False,
use_text_flow=True,
horizontal_ltr=True,
vertical_ttb=True,
extra_attrs=None,
) -> Layout:
"""The helper function used for extracting words from a pdfplumber page
object.
Returns:
Layout: a layout object representing all extracted pdf tokens on this page.
"""
if extra_attrs is None:
extra_attrs = ["fontname", "size"]
tokens = page.extract_words(
x_tolerance=x_tolerance,
y_tolerance=y_tolerance,
keep_blank_chars=keep_blank_chars,
use_text_flow=use_text_flow,
horizontal_ltr=horizontal_ltr,
vertical_ttb=vertical_ttb,
extra_attrs=extra_attrs,
)
df = pd.DataFrame(tokens)
if len(df) == 0:
return Layout()
df[["x0", "x1"]] = (
df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
)
df[["top", "bottom"]] = (
df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
)
page_tokens = load_dataframe(
df.reset_index().rename(
columns={
"x0": "x_1",
"x1": "x_2",
"top": "y_1",
"bottom": "y_2",
"index": "id",
"fontname": "type", # also loading fontname as "type"
}
),
block_type="rectangle",
)
page.flush_cashe()
return page_tokens
def load_pdf(
filename: str,
load_images: bool = False,
x_tolerance: int = 1.5,
y_tolerance: int = 2,
keep_blank_chars: bool = False,
use_text_flow: bool = True,
horizontal_ltr: bool = True,
vertical_ttb: bool = True,
extra_attrs: Optional[List[str]] = None,
dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
"""Load all tokens for each page from a PDF file, and save them
in a list of Layout objects with the original page order.
Args:
filename (str): The path to the PDF file.
load_images (bool, optional):
Whether load screenshot for each page of the PDF file.
When set to true, the function will return both the layout and
screenshot image for each page.
Defaults to False.
x_tolerance (int, optional):
The threshold used for extracting "word tokens" from the pdf file.
It will merge the pdf characters into a word token if the difference
between the x_2 of one character and the x_1 of the next is less than
or equal to x_tolerance. See details in `pdf2plumber's documentation
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
Defaults to 1.5.
y_tolerance (int, optional):
The threshold used for extracting "word tokens" from the pdf file.
It will merge the pdf characters into a word token if the difference
between the y_2 of one character and the y_1 of the next is less than
or equal to y_tolerance. See details in `pdf2plumber's documentation
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
Defaults to 2.
keep_blank_chars (bool, optional):
When keep_blank_chars is set to True, it will treat blank characters
are treated as part of a word, not as a space between words. See
details in `pdf2plumber's documentation
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
Defaults to False.
use_text_flow (bool, optional):
When use_text_flow is set to True, it will use the PDF's underlying
flow of characters as a guide for ordering and segmenting the words,
rather than presorting the characters by x/y position. (This mimics
how dragging a cursor highlights text in a PDF; as with that, the
order does not always appear to be logical.) See details in
`pdf2plumber's documentation
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
Defaults to True.
horizontal_ltr (bool, optional):
When horizontal_ltr is set to True, it means the doc should read
text from left to right, vice versa.
Defaults to True.
vertical_ttb (bool, optional):
When vertical_ttb is set to True, it means the doc should read
text from top to bottom, vice versa.
Defaults to True.
extra_attrs (Optional[List[str]], optional):
Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
restrict each words to characters that share exactly the same
value for each of those `attributes extracted by pdfplumber
<https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
and the resulting word dicts will indicate those attributes.
See details in `pdf2plumber's documentation
<https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
Defaults to `["fontname", "size"]`.
dpi (int, optional):
When loading images of the pdf, you can also specify the resolution
(or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
for rendering the images. Higher DPI values mean clearer images (also
larger file sizes).
Setting dpi will also automatically resizes the extracted pdf_layout
to match the sizes of the images. Therefore, when visualizing the
pdf_layouts, it can be rendered appropriately.
Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
from the pdfplumber PDF parser.
Returns:
List[Layout]:
When `load_images=False`, it will only load the pdf_tokens from
the PDF file. Each element of the list denotes all the tokens appeared
on a single page, and the list is ordered the same as the original PDF
page order.
Tuple[List[Layout], List["Image.Image"]]:
When `load_images=True`, besides the `all_page_layout`, it will also
return a list of page images.
Examples::
>>> import layoutparser as lp
>>> pdf_layout = lp.load_pdf("path/to/pdf")
>>> pdf_layout[0] # the layout for page 0
>>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
>>> lp.draw_box(pdf_images[0], pdf_layout[0])
"""
plumber_pdf_object = pdfplumber.open(filename)
all_page_layout = []
with plumber_pdf_object:
for page_id in range(len(plumber_pdf_object.pages)):
cur_page = plumber_pdf_object.pages[page_id]
page_tokens = extract_words_for_page(
cur_page,
x_tolerance=x_tolerance,
y_tolerance=y_tolerance,
keep_blank_chars=keep_blank_chars,
use_text_flow=use_text_flow,
horizontal_ltr=horizontal_ltr,
vertical_ttb=vertical_ttb,
extra_attrs=extra_attrs,
)
# Adding metadata for the current page
page_tokens.page_data["width"] = float(cur_page.width)
page_tokens.page_data["height"] = float(cur_page.height)
page_tokens.page_data["index"] = page_id
all_page_layout.append(page_tokens)
plumber_pdf_object.flush_cashe()
if not load_images:
return all_page_layout
else:
import pdf2image
pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
for page_id, page_image in enumerate(pdf_images):
image_width, image_height = page_image.size
page_layout = all_page_layout[page_id]
layout_width = page_layout.page_data["width"]
layout_height = page_layout.page_data["height"]
if image_width != layout_width or image_height != layout_height:
scale_x = image_width / layout_width
scale_y = image_height / layout_height
page_layout = page_layout.scale((scale_x, scale_y))
page_layout.page_data["width"] = image_width
page_layout.page_data["height"] = image_height
all_page_layout[page_id] = page_layout
return all_page_layout, pdf_images