-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautofill.py
232 lines (187 loc) · 8.23 KB
/
autofill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import re
from difflib import SequenceMatcher
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfrw import PdfReader, PdfDict, PdfWriter
def get_field_names(template_pdf):
"""
Collects all field names from the PDF form.
Args:
template_pdf (PdfReader): The PDF form template.
Returns:
list: List of field names.
"""
field_names = []
for page in range(len(template_pdf.pages)):
annotations = template_pdf.pages[page]['/Annots']
for annotation in annotations:
if annotation['/Subtype'] == '/Widget' and '/T' in annotation:
field_name = annotation['/T'][1:-1] # Remove parentheses from field name
field_names.append(field_name.lower())
return field_names
def find_consecutive_indices(substring, string_list):
"""
Finds the first index of every consecutive appearance or single appearance of a substring within a list of strings.
Also returns the lengths of consecutive repeating strings.
Args:
substring (str): The substring to search for.
string_list (list): List of strings.
Returns:
tuple: Tuple containing two lists - first indices of consecutive or single appearances, and lengths of consecutive repeating strings.
"""
indices = []
lengths = []
consecutive_indices = []
current_length = 0
for i, string in enumerate(string_list):
if substring in string:
if len(consecutive_indices) == 0 or consecutive_indices[-1] == i - 1:
consecutive_indices.append(i)
current_length += 1
else:
if len(consecutive_indices) >= 1:
indices.append(consecutive_indices[0])
lengths.append(current_length)
consecutive_indices = [i]
current_length = 1
if len(consecutive_indices) >= 1:
indices.append(consecutive_indices[0])
lengths.append(current_length)
return indices, lengths
def fill_pdf_form(input_path, output_path, field_data):
"""
Fills the PDF form fields with the provided data.
Args:
input_path (str): Path to the input PDF form.
output_path (str): Path to the output filled PDF form.
field_data (dict): Dictionary of field names and corresponding values.
"""
template_pdf = PdfReader(input_path)
field_names = get_field_names(template_pdf)
testo_indices, lengths = find_consecutive_indices('testo', field_names)
if testo_indices:
for idx, n in zip(testo_indices, lengths):
# Split the first field before 'Testo' fields into substrings
preceding_field = field_names[idx - 1]
substrings = split_string(preceding_field)
for i in range(n + 1):
field_names[idx - 1 + i] = substrings[i]
# Fill the form fields
j = 0
for page in range(len(template_pdf.pages)):
annotations = template_pdf.pages[page]['/Annots']
for annotation in annotations:
if annotation['/Subtype'] == '/Widget' and '/T' in annotation:
field_name = field_names[j]
j += 1
if not annotation['/V']: # Only update if the field doesn't contain text already
max_similarity_score = 0
max_similarity_key = None
for key in field_data:
similarity_score = calculate_string_similarity_weighted(key, field_name)
if similarity_score > 0.6 and similarity_score > max_similarity_score:
max_similarity_score = similarity_score
max_similarity_key = key
if max_similarity_key is not None:
print(max_similarity_score, max_similarity_key, field_name)
annotation.update(PdfDict(PdfDict(AP=field_data[max_similarity_key], V=field_data[max_similarity_key])))
# Write the filled form to the output PDF
PdfWriter().write(output_path, template_pdf)
def transform_string(input_string):
"""
Transforms an input string by removing repeating whitespaces, newline characters,
special characters, and converting it to lowercase.
Args:
input_string (str): The input string to transform.
Returns:
str: The transformed string.
"""
transformed_string = re.sub(r'\s+', ' ', input_string)
transformed_string = transformed_string.replace('\n', '')
transformed_string = re.sub(r'[^a-zA-Z0-9 ]', '', transformed_string).lower()
return transformed_string
def calculate_string_similarity_weighted(str1, str2, weights=None):
"""
Calculates the weighted similarity score between two strings using word similarity,
Levenshtein coefficient, and exact match.
Args:
str1 (str): The first string.
str2 (str): The second string.
weights (list, optional): List of weights for word similarity, Levenshtein similarity,
and exact match similarity, respectively. Defaults to [2/5, 2/5, 1/5].
Returns:
float: The weighted similarity score between 0 and 1.
"""
if weights is None:
weights = [2 / 5, 2 / 5, 1 / 5]
def calculate_string_similarity_levenshtein(str1, str2):
similarity_score = SequenceMatcher(None, str1, str2).ratio()
return similarity_score
def calculate_string_similarity_words(str1, str2):
str1_length = len(str1)
common_chars = sum(1 for char in str1 if char in str2)
similarity_score = common_chars / str1_length
return similarity_score
def calculate_string_similarity_exact_match(str1, str2):
exact_match = ' ' + str1 in str2
return exact_match
str1, str2 = transform_string(str1), transform_string(str2)
word_similarity = calculate_string_similarity_words(str1, str2)
levenshtein_similarity = calculate_string_similarity_levenshtein(str1, str2)
exact_match_similarity = calculate_string_similarity_exact_match(str1, str2)
similarity_score = weights[0] * word_similarity + weights[1] * levenshtein_similarity + weights[2] * exact_match_similarity
return similarity_score
def split_string(field_name):
"""
Splits the first field value into substrings.
Args:
field_name (str): The first field name.
Returns:
list: List of substrings from the first field value.
"""
def initialize_pdfminer(input_path):
fp = open(input_path, 'rb')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(fp)
return pages, device, interpreter
pages, device, interpreter = initialize_pdfminer(input_pdf_path)
substrings = None
for page in pages:
interpreter.process_page(page)
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextBox):
line = lobj.get_text()
if calculate_string_similarity_weighted(transform_string(field_name), transform_string(line), weights=[0, 1, 0]) > 0.85:
substrings = re.split(r'\s{2,}', line)
substrings = [substring for substring in substrings if substring.strip()]
return substrings
if __name__ == '__main__':
input_pdf_path = 'data/PROCURA SPECIALE editabile_2.pdf'
output_pdf_path = 'data/output_form.pdf'
user_profile = {
'nome': 'name',
'cognome': 'surname',
'il': 'birth date',
'provincia': 'birth province',
'a': 'birth city',
'residente a': 'city of residency',
'cf': 'codice fiscale',
'pec': 'pec address',
'codice fiscale': 'codice fiscale',
'cap': 'cap',
'via': 'street name',
'n': 'street number',
'piazza': 'street name',
'sottoscritt': 'full name',
'tel': 'home phone',
'cel': 'cell phone',
'mail': 'email address',
'email': 'email address',
}
fill_pdf_form(input_pdf_path, output_pdf_path, user_profile)