@@ -129,14 +129,32 @@ def scan_pdf_for_summary(pdf_path):
129
129
130
130
131
131
def parse_client_info(page_text):
    """Extract the known client header fields from a page of text.

    Each line of ``page_text`` is examined on its own. A line contributes
    to the result only when the text before its first colon is exactly one
    of the recognised labels (no leading whitespace, case-sensitive).

    Args:
        page_text: Text of a page, with fields written as ``Label: value``
            one per line.

    Returns:
        A dict mapping each label that was found to its value with the
        surrounding whitespace removed. Labels that do not appear in the
        text are absent from the dict. If a label appears more than once,
        the last occurrence wins.
    """
    known_labels = (
        "Alleged Onset",
        "Application",
        "Claim Type",
        "Claimant",
        "Last Change",
        "Last Insured",
        "SSN",
    )

    client_info = {}
    for line in page_text.splitlines():
        # Split on the FIRST colon only, so a value that itself contains a
        # colon (for example a time of day) is kept whole.
        label, colon, value = line.partition(":")
        if colon and label in known_labels:
            client_info[label] = value.strip()

    return client_info
141
159
142
160
@@ -152,41 +170,49 @@ def parse_work_history(page_text):
152
170
'job_title' : e .split (": " )[1 ],
153
171
'intensity' : '' ,
154
172
'skill_level' : '' ,
155
- }
173
+ }
156
174
return work_history
157
175
158
176
159
177
def get_exhibits_from_pdf(doc):
    """Build a mapping of exhibit id to ``Exhibit`` from a PDF's outline.

    Every level-2 outline entry is treated as one exhibit: its title is
    handed to ``parse_title`` and the resulting provider name and date
    range are stored in a new ``Exhibit`` with an empty comment list.
    Entries at any other level are ignored. If two level-2 entries yield
    the same id, the later one replaces the earlier one.

    Args:
        doc: Document object exposing ``get_outlines()``, which yields
            5-item entries whose first two items are the level and title.

    Returns:
        A dict mapping exhibit id to its ``Exhibit``.

    Raises:
        Whatever ``doc.get_outlines()`` or ``parse_title`` raises; nothing
        is caught here. The interpreter recursion limit is restored before
        any such exception propagates.
    """
    exhibits = {}

    # The limit is raised for the duration of the outline walk.
    # NOTE(review): presumably needed because outline traversal recurses
    # deeply on large documents - confirm against the PDF library.
    previous_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(999999999)
    try:
        for (level, title, _dest, _action, _se) in doc.get_outlines():
            if level != 2:
                continue
            exhibit_id, provider_name, from_date, to_date = parse_title(title)
            exhibits[exhibit_id] = Exhibit(
                provider_name=provider_name,
                from_date=from_date,
                to_date=to_date,
                comments=[],
            )
    finally:
        # Always undo the process-wide change, even when the walk fails.
        sys.setrecursionlimit(previous_limit)

    return exhibits
188
191
189
192
193
def parse_title(title):
    """Split an exhibit outline title into id, provider name and dates.

    The title is cut on ``":"``. The first piece is the exhibit id, the
    second is the provider name (with the ``"Doc. Dt."`` / ``"Tmt. Dt."``
    markers removed), and the third holds the date or ``from-to`` date
    range, optionally followed by a ``"(N page..."`` suffix that is
    discarded.

    Args:
        title: Outline title, e.g.
            ``"1F: General Hospital Tmt. Dt.: 01/01/2020-02/02/2020 (5 pages)"``.

    Returns:
        A 4-tuple ``(id, provider_name, from_date, to_date)`` of strings.
        Both dates are ``""`` when the title carries no date segment, and
        both are the same value when only a single date is present. A title
        with no colon at all is returned as the id with the other three
        fields empty.
    """
    page_count_suffix = r"\(\d* page.*"

    segments = title.split(":")
    exhibit_id = segments[0]

    # No colon at all: there is nothing to parse beyond the id itself.
    if len(segments) == 1:
        return (exhibit_id, "", "", "")

    provider_name = (
        segments[1].replace("Doc. Dt.", "").replace("Tmt. Dt.", "").strip()
    )

    # No date segment: the page-count suffix, if any, is attached to the
    # provider name instead, so strip it from there.
    if len(segments) == 2:
        provider_name = re.sub(page_count_suffix, "", provider_name).strip()
        return (exhibit_id, provider_name, "", "")

    date_text = re.sub(page_count_suffix, "", segments[2]).strip()
    # Strip each part so "a - b" and "a-b" give the same result.
    dates = [part.strip() for part in date_text.split("-")]

    from_date = dates[0]
    # A single date means the range starts and ends on the same day.
    to_date = dates[1] if len(dates) > 1 else from_date

    return (exhibit_id, provider_name, from_date, to_date)
214
+
215
+
190
216
def parse_page_comments (annots ):
191
217
192
218
page_comments = []
0 commit comments