Commit 84dd2fc
changing for enablement of multi-volume kg
1 parent: 3939dab

17 files changed, with 38755 additions and 25958 deletions

basic_extraction.ipynb (+55 -19)
@@ -57,13 +57,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
+"# NO NEED TO THIS ANYMORE AS UNIFY_PERSON DOES THE JOB!!!\n",
 "# ENTITY(\"PERSON\")\n",
 "\n",
-"person_df = pd.DataFrame(columns=['id','name','description'])\n",
+"person_df = pd.DataFrame(columns=['name','name_list','id_list'])\n",
 "\n",
 "def extract_person(person_item):\n",
 " persName_item = person_item.find('.//dflt:persName', ns)\n",
@@ -75,21 +76,37 @@
 " person_descp = \" \".join(all_text[end_idx:].split())\n",
 "\n",
 " person_name = \" \".join(re.sub(',','',\" \".join(person_name.split(', ')[::-1])).split())\n",
+" \n",
+" corrected_person_name = person_lookup_dict[person_name]\n",
+" temp_person_df = new_unified_person_df[new_unified_person_df['name_set']==corrected_person_name]\n",
+" person_name_list = temp_person_df['name_list'].values[0]\n",
+" person_id_list = temp_person_df['id_list'].values[0]\n",
+" person_descp_list = temp_person_df['description_list'].values[0]\n",
 "\n",
 " #person_id = volume[:-4] + '_' + person_id\n",
 "\n",
 " global person_df\n",
-" person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],\n",
-" 'name':[person_name],\n",
-" 'description':[person_descp]})),ignore_index=True)\n",
+" person_df = pd.concat((person_df, pd.DataFrame({'name':[corrected_person_name],\n",
+" 'name_list':[person_name_list],\n",
+" 'id_list':[person_id_list],})),ignore_index=True)\n",
 " return\n",
 "\n",
+"# create a lookup dict for unifying all person names\n",
+"new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n",
+"\n",
+"person_lookup_dict = {} # 'misspelled':'corrected'\n",
+"for _, row in new_unified_person_df.iterrows():\n",
+"\n",
+" for misspelled_name in row['name_list']:\n",
+" if misspelled_name not in person_lookup_dict:\n",
+" person_lookup_dict[misspelled_name] = row['name_set']\n",
+"\n",
 "\n",
 "persons_section = root.find(\"./dflt:text/dflt:front//dflt:div[@xml:id='persons']\", ns)\n",
 "for item in persons_section.findall('.//dflt:item', ns):\n",
 " extract_person(item)\n",
 "\n",
-"person_df.to_csv('tables/person_single_volume.csv')"
+"person_df.to_parquet('tables/person_single_volume.parquet')"
 ]
 },
 {
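
Note on the two hunks above: extract_person no longer writes one raw row per volume-local person entry; it first resolves each (possibly misspelled) name to its canonical form via new_unified_person_df. A minimal, self-contained sketch of that lookup pattern follows. The column names (name_set, name_list, id_list) come from the diff; the sample row and names are invented for illustration.

import pandas as pd

# Stand-in for tables/new_unified_person_df_wikicol.parquet; schema from
# the diff, data invented.
new_unified_person_df = pd.DataFrame({
    'name_set':  ['John Doe'],
    'name_list': [['John Doe', 'Doe, John', 'J. Doe']],
    'id_list':   [['volume01_p_JD1', 'volume02_p_JD4']],
})

# Map every recorded spelling variant to its canonical name. setdefault
# keeps the first mapping, mirroring the committed cell's
# `if misspelled_name not in person_lookup_dict` guard.
person_lookup_dict = {}
for _, row in new_unified_person_df.iterrows():
    for misspelled_name in row['name_list']:
        person_lookup_dict.setdefault(misspelled_name, row['name_set'])

print(person_lookup_dict['Doe, John'])  # -> 'John Doe'
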
@@ -126,7 +143,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 6,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -201,9 +218,11 @@
 " if pers_tag is not None:\n",
 " if 'corresp' in pers_tag.attrib:\n",
 " person_id = pers_tag.attrib['corresp'][1:]\n",
-" person_sentby.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentby.append(person_name)\n",
 " person_sentby_df = pd.concat((person_sentby_df, \n",
-" pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(pers_tag.itertext()))\n",
@@ -216,9 +235,11 @@
 " if signed_person_tag is not None:\n",
 " if 'corresp' in signed_person_tag.attrib:\n",
 " person_id = signed_person_tag.attrib['corresp'][1:]\n",
-" person_sentby.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentby.append(person_name)\n",
 " person_sentby_df = pd.concat((person_sentby_df, \n",
-" pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(signed_person_tag.itertext()))\n",
@@ -232,9 +253,11 @@
 " if pers_tag is not None:\n",
 " if 'corresp' in pers_tag.attrib:\n",
 " person_id = pers_tag.attrib['corresp'][1:]\n",
-" person_sentto.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentto.append(person_name)\n",
 " person_sentto_df = pd.concat((person_sentto_df, \n",
-" pd.DataFrame({'person_id':[person_id],'received':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'received':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(pers_tag.itertext()))\n",
@@ -277,9 +300,11 @@
 " pers_tags = doc.findall('.//dflt:persName[@corresp]',ns)\n",
 " for temp_tag in pers_tags:\n",
 " person_id = temp_tag.attrib['corresp'][1:]\n",
-" person_mentioned.add(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_mentioned.add(person_name)\n",
 " person_mentioned_df = pd.concat((person_mentioned_df, \n",
-" pd.DataFrame({'person_id':[person_id],'mentioned_in':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'mentioned_in':[id]})),\n",
 " ignore_index=True)\n",
 "\n",
 "\n",
@@ -304,17 +329,28 @@
 " \n",
 " return\n",
 "\n",
+"# city lookup table for unification\n",
 "with open('tables/city_lookup_dict.json', 'r') as f:\n",
 " city_lookup_dict = json.load(f)\n",
 "\n",
+"# person id to unified name lookup table\n",
+"new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n",
+"\n",
+"person_id_lookup_dict = {} # 'id':'corrected'\n",
+"for _, row in new_unified_person_df.iterrows():\n",
+"\n",
+" for id in row['id_list']:\n",
+" if id not in person_id_lookup_dict:\n",
+" person_id_lookup_dict[id] = row['name_set']\n",
+"\n",
 "\n",
 "doc_df = pd.DataFrame(columns=['id','volume','subtype','date','year','title','source','person_sentby',\n",
 " 'person_sentto','city','era','inst_sentby','inst_sentto',\n",
 " 'person_mentioned','inst_mentioned'])\n",
 "\n",
-"person_sentby_df = pd.DataFrame(columns=['person_id','sent'])\n",
-"person_sentto_df = pd.DataFrame(columns=['person_id','received'])\n",
-"person_mentioned_df = pd.DataFrame(columns=['person_id','mentioned_in'])\n",
+"person_sentby_df = pd.DataFrame(columns=['person_name','sent'])\n",
+"person_sentto_df = pd.DataFrame(columns=['person_name','received'])\n",
+"person_mentioned_df = pd.DataFrame(columns=['person_name','mentioned_in'])\n",
 "\n",
 "#instution_sentby_df = pd.DataFrame(columns=['instution_id','sent'])\n",
 "#instution_sentto_df = pd.DataFrame(columns=['instution_id','received'])\n",
@@ -323,7 +359,7 @@
 "\n",
 "docs = root.findall('./dflt:text/dflt:body//dflt:div[@type=\"document\"]', ns)\n",
 "for doc in docs:\n",
-" extract_document(doc)\n",
+" extract_document(doc) #(doc,volume)\n",
 "\n",
 "doc_df.to_csv('tables/doc_single_volume.csv')\n",
 "person_sentby_df.to_csv('tables/person_sentby_single_volume.csv')\n",
