|
57 | 57 | },
|
58 | 58 | {
|
59 | 59 | "cell_type": "code",
|
60 |
| - "execution_count": null, |
| 60 | + "execution_count": 4, |
61 | 61 | "metadata": {},
|
62 | 62 | "outputs": [],
|
63 | 63 | "source": [
|
| 64 | + "# NO NEED TO DO THIS ANYMORE AS UNIFY_PERSON DOES THE JOB!!!\n", |
64 | 65 | "# ENTITY(\"PERSON\")\n",
|
65 | 66 | "\n",
|
66 |
| - "person_df = pd.DataFrame(columns=['id','name','description'])\n", |
| 67 | + "person_df = pd.DataFrame(columns=['name','name_list','id_list'])\n", |
67 | 68 | "\n",
|
68 | 69 | "def extract_person(person_item):\n",
|
69 | 70 | " persName_item = person_item.find('.//dflt:persName', ns)\n",
|
|
75 | 76 | " person_descp = \" \".join(all_text[end_idx:].split())\n",
|
76 | 77 | "\n",
|
77 | 78 | " person_name = \" \".join(re.sub(',','',\" \".join(person_name.split(', ')[::-1])).split())\n",
|
| 79 | + " \n", |
| 80 | + " corrected_person_name = person_lookup_dict[person_name]\n", |
| 81 | + " temp_person_df = new_unified_person_df[new_unified_person_df['name_set']==corrected_person_name]\n", |
| 82 | + " person_name_list = temp_person_df['name_list'].values[0]\n", |
| 83 | + " person_id_list = temp_person_df['id_list'].values[0]\n", |
| 84 | + " person_descp_list = temp_person_df['description_list'].values[0]\n", |
78 | 85 | "\n",
|
79 | 86 | " #person_id = volume[:-4] + '_' + person_id\n",
|
80 | 87 | "\n",
|
81 | 88 | " global person_df\n",
|
82 |
| - " person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],\n", |
83 |
| - " 'name':[person_name],\n", |
84 |
| - " 'description':[person_descp]})),ignore_index=True)\n", |
| 89 | + " person_df = pd.concat((person_df, pd.DataFrame({'name':[corrected_person_name],\n", |
| 90 | + " 'name_list':[person_name_list],\n", |
| 91 | + " 'id_list':[person_id_list],})),ignore_index=True)\n", |
85 | 92 | " return\n",
|
86 | 93 | "\n",
|
| 94 | + "# create a lookup dict for unifying all person names\n", |
| 95 | + "new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n", |
| 96 | + "\n", |
| 97 | + "person_lookup_dict = {} # 'misspelled':'corrected'\n", |
| 98 | + "for _, row in new_unified_person_df.iterrows():\n", |
| 99 | + "\n", |
| 100 | + " for misspelled_name in row['name_list']:\n", |
| 101 | + " if misspelled_name not in person_lookup_dict:\n", |
| 102 | + " person_lookup_dict[misspelled_name] = row['name_set']\n", |
| 103 | + "\n", |
87 | 104 | "\n",
|
88 | 105 | "persons_section = root.find(\"./dflt:text/dflt:front//dflt:div[@xml:id='persons']\", ns)\n",
|
89 | 106 | "for item in persons_section.findall('.//dflt:item', ns):\n",
|
90 | 107 | " extract_person(item)\n",
|
91 | 108 | "\n",
|
92 |
| - "person_df.to_csv('tables/person_single_volume.csv')" |
| 109 | + "person_df.to_parquet('tables/person_single_volume.parquet')" |
93 | 110 | ]
|
94 | 111 | },
|
95 | 112 | {
|
|
126 | 143 | },
|
127 | 144 | {
|
128 | 145 | "cell_type": "code",
|
129 |
| - "execution_count": 7, |
| 146 | + "execution_count": 6, |
130 | 147 | "metadata": {},
|
131 | 148 | "outputs": [],
|
132 | 149 | "source": [
|
|
201 | 218 | " if pers_tag is not None:\n",
|
202 | 219 | " if 'corresp' in pers_tag.attrib:\n",
|
203 | 220 | " person_id = pers_tag.attrib['corresp'][1:]\n",
|
204 |
| - " person_sentby.append(person_id)\n", |
| 221 | + " person_id = volume[:-4] + '_' + person_id\n", |
| 222 | + " person_name = person_id_lookup_dict[person_id]\n", |
| 223 | + " person_sentby.append(person_name)\n", |
205 | 224 | " person_sentby_df = pd.concat((person_sentby_df, \n",
|
206 |
| - " pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n", |
| 225 | + " pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n", |
207 | 226 | " ignore_index=True)\n",
|
208 | 227 | " else:\n",
|
209 | 228 | " txt = (\" \".join(pers_tag.itertext()))\n",
|
|
216 | 235 | " if signed_person_tag is not None:\n",
|
217 | 236 | " if 'corresp' in signed_person_tag.attrib:\n",
|
218 | 237 | " person_id = signed_person_tag.attrib['corresp'][1:]\n",
|
219 |
| - " person_sentby.append(person_id)\n", |
| 238 | + " person_id = volume[:-4] + '_' + person_id\n", |
| 239 | + " person_name = person_id_lookup_dict[person_id]\n", |
| 240 | + " person_sentby.append(person_name)\n", |
220 | 241 | " person_sentby_df = pd.concat((person_sentby_df, \n",
|
221 |
| - " pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n", |
| 242 | + " pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n", |
222 | 243 | " ignore_index=True)\n",
|
223 | 244 | " else:\n",
|
224 | 245 | " txt = (\" \".join(signed_person_tag.itertext()))\n",
|
|
232 | 253 | " if pers_tag is not None:\n",
|
233 | 254 | " if 'corresp' in pers_tag.attrib:\n",
|
234 | 255 | " person_id = pers_tag.attrib['corresp'][1:]\n",
|
235 |
| - " person_sentto.append(person_id)\n", |
| 256 | + " person_id = volume[:-4] + '_' + person_id\n", |
| 257 | + " person_name = person_id_lookup_dict[person_id]\n", |
| 258 | + " person_sentto.append(person_name)\n", |
236 | 259 | " person_sentto_df = pd.concat((person_sentto_df, \n",
|
237 |
| - " pd.DataFrame({'person_id':[person_id],'received':[id]})),\n", |
| 260 | + " pd.DataFrame({'person_name':[person_name],'received':[id]})),\n", |
238 | 261 | " ignore_index=True)\n",
|
239 | 262 | " else:\n",
|
240 | 263 | " txt = (\" \".join(pers_tag.itertext()))\n",
|
|
277 | 300 | " pers_tags = doc.findall('.//dflt:persName[@corresp]',ns)\n",
|
278 | 301 | " for temp_tag in pers_tags:\n",
|
279 | 302 | " person_id = temp_tag.attrib['corresp'][1:]\n",
|
280 |
| - " person_mentioned.add(person_id)\n", |
| 303 | + " person_id = volume[:-4] + '_' + person_id\n", |
| 304 | + " person_name = person_id_lookup_dict[person_id]\n", |
| 305 | + " person_mentioned.add(person_name)\n", |
281 | 306 | " person_mentioned_df = pd.concat((person_mentioned_df, \n",
|
282 |
| - " pd.DataFrame({'person_id':[person_id],'mentioned_in':[id]})),\n", |
| 307 | + " pd.DataFrame({'person_name':[person_name],'mentioned_in':[id]})),\n", |
283 | 308 | " ignore_index=True)\n",
|
284 | 309 | "\n",
|
285 | 310 | "\n",
|
|
304 | 329 | " \n",
|
305 | 330 | " return\n",
|
306 | 331 | "\n",
|
| 332 | + "# city lookup table for unification\n", |
307 | 333 | "with open('tables/city_lookup_dict.json', 'r') as f:\n",
|
308 | 334 | " city_lookup_dict = json.load(f)\n",
|
309 | 335 | "\n",
|
| 336 | + "# person id to unified name lookup table\n", |
| 337 | + "new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n", |
| 338 | + "\n", |
| 339 | + "person_id_lookup_dict = {} # 'id':'corrected'\n", |
| 340 | + "for _, row in new_unified_person_df.iterrows():\n", |
| 341 | + "\n", |
| 342 | + " for id in row['id_list']:\n", |
| 343 | + " if id not in person_id_lookup_dict:\n", |
| 344 | + " person_id_lookup_dict[id] = row['name_set']\n", |
| 345 | + "\n", |
310 | 346 | "\n",
|
311 | 347 | "doc_df = pd.DataFrame(columns=['id','volume','subtype','date','year','title','source','person_sentby',\n",
|
312 | 348 | " 'person_sentto','city','era','inst_sentby','inst_sentto',\n",
|
313 | 349 | " 'person_mentioned','inst_mentioned'])\n",
|
314 | 350 | "\n",
|
315 |
| - "person_sentby_df = pd.DataFrame(columns=['person_id','sent'])\n", |
316 |
| - "person_sentto_df = pd.DataFrame(columns=['person_id','received'])\n", |
317 |
| - "person_mentioned_df = pd.DataFrame(columns=['person_id','mentioned_in'])\n", |
| 351 | + "person_sentby_df = pd.DataFrame(columns=['person_name','sent'])\n", |
| 352 | + "person_sentto_df = pd.DataFrame(columns=['person_name','received'])\n", |
| 353 | + "person_mentioned_df = pd.DataFrame(columns=['person_name','mentioned_in'])\n", |
318 | 354 | "\n",
|
319 | 355 | "#instution_sentby_df = pd.DataFrame(columns=['instution_id','sent'])\n",
|
320 | 356 | "#instution_sentto_df = pd.DataFrame(columns=['instution_id','received'])\n",
|
|
323 | 359 | "\n",
|
324 | 360 | "docs = root.findall('./dflt:text/dflt:body//dflt:div[@type=\"document\"]', ns)\n",
|
325 | 361 | "for doc in docs:\n",
|
326 |
| - " extract_document(doc)\n", |
| 362 | + " extract_document(doc) #(doc,volume)\n", |
327 | 363 | "\n",
|
328 | 364 | "doc_df.to_csv('tables/doc_single_volume.csv')\n",
|
329 | 365 | "person_sentby_df.to_csv('tables/person_sentby_single_volume.csv')\n",
|
|
0 commit comments