Commit 84dd2fc
changing for enablement of multi-volume kg
1 parent: 3939dab

17 files changed, with 38755 additions and 25958 deletions

basic_extraction.ipynb (+55 -19)
@@ -57,13 +57,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
+"# NO NEED TO THIS ANYMORE AS UNIFY_PERSON DOES THE JOB!!!\n",
 "# ENTITY(\"PERSON\")\n",
 "\n",
-"person_df = pd.DataFrame(columns=['id','name','description'])\n",
+"person_df = pd.DataFrame(columns=['name','name_list','id_list'])\n",
 "\n",
 "def extract_person(person_item):\n",
 " persName_item = person_item.find('.//dflt:persName', ns)\n",
@@ -75,21 +76,37 @@
 " person_descp = \" \".join(all_text[end_idx:].split())\n",
 "\n",
 " person_name = \" \".join(re.sub(',','',\" \".join(person_name.split(', ')[::-1])).split())\n",
+" \n",
+" corrected_person_name = person_lookup_dict[person_name]\n",
+" temp_person_df = new_unified_person_df[new_unified_person_df['name_set']==corrected_person_name]\n",
+" person_name_list = temp_person_df['name_list'].values[0]\n",
+" person_id_list = temp_person_df['id_list'].values[0]\n",
+" person_descp_list = temp_person_df['description_list'].values[0]\n",
 "\n",
 " #person_id = volume[:-4] + '_' + person_id\n",
 "\n",
 " global person_df\n",
-" person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],\n",
-" 'name':[person_name],\n",
-" 'description':[person_descp]})),ignore_index=True)\n",
+" person_df = pd.concat((person_df, pd.DataFrame({'name':[corrected_person_name],\n",
+" 'name_list':[person_name_list],\n",
+" 'id_list':[person_id_list],})),ignore_index=True)\n",
 " return\n",
 "\n",
+"# create a lookup dict for unifying all person names\n",
+"new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n",
+"\n",
+"person_lookup_dict = {} # 'misspelled':'corrected'\n",
+"for _, row in new_unified_person_df.iterrows():\n",
+"\n",
+" for misspelled_name in row['name_list']:\n",
+" if misspelled_name not in person_lookup_dict:\n",
+" person_lookup_dict[misspelled_name] = row['name_set']\n",
+"\n",
 "\n",
 "persons_section = root.find(\"./dflt:text/dflt:front//dflt:div[@xml:id='persons']\", ns)\n",
 "for item in persons_section.findall('.//dflt:item', ns):\n",
 " extract_person(item)\n",
 "\n",
-"person_df.to_csv('tables/person_single_volume.csv')"
+"person_df.to_parquet('tables/person_single_volume.parquet')"
 ]
 },
 {
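
Note on the two hunks above: extract_person no longer writes one raw row per volume-local person entry; it first resolves each (possibly misspelled) name to its canonical form via new_unified_person_df. A minimal, self-contained sketch of that lookup pattern follows. The column names (name_set, name_list, id_list) come from the diff; the sample row and names are invented for illustration.

import pandas as pd

# Stand-in for tables/new_unified_person_df_wikicol.parquet; schema from
# the diff, data invented.
new_unified_person_df = pd.DataFrame({
    'name_set':  ['John Doe'],
    'name_list': [['John Doe', 'Doe, John', 'J. Doe']],
    'id_list':   [['volume01_p_JD1', 'volume02_p_JD4']],
})

# Map every recorded spelling variant to its canonical name. setdefault
# keeps the first mapping, mirroring the committed cell's
# `if misspelled_name not in person_lookup_dict` guard.
person_lookup_dict = {}
for _, row in new_unified_person_df.iterrows():
    for misspelled_name in row['name_list']:
        person_lookup_dict.setdefault(misspelled_name, row['name_set'])

print(person_lookup_dict['Doe, John'])  # -> 'John Doe'
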
@@ -126,7 +143,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 6,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -201,9 +218,11 @@
 " if pers_tag is not None:\n",
 " if 'corresp' in pers_tag.attrib:\n",
 " person_id = pers_tag.attrib['corresp'][1:]\n",
-" person_sentby.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentby.append(person_name)\n",
 " person_sentby_df = pd.concat((person_sentby_df, \n",
-" pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(pers_tag.itertext()))\n",
@@ -216,9 +235,11 @@
 " if signed_person_tag is not None:\n",
 " if 'corresp' in signed_person_tag.attrib:\n",
 " person_id = signed_person_tag.attrib['corresp'][1:]\n",
-" person_sentby.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentby.append(person_name)\n",
 " person_sentby_df = pd.concat((person_sentby_df, \n",
-" pd.DataFrame({'person_id':[person_id],'sent':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'sent':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(signed_person_tag.itertext()))\n",
@@ -232,9 +253,11 @@
 " if pers_tag is not None:\n",
 " if 'corresp' in pers_tag.attrib:\n",
 " person_id = pers_tag.attrib['corresp'][1:]\n",
-" person_sentto.append(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_sentto.append(person_name)\n",
 " person_sentto_df = pd.concat((person_sentto_df, \n",
-" pd.DataFrame({'person_id':[person_id],'received':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'received':[id]})),\n",
 " ignore_index=True)\n",
 " else:\n",
 " txt = (\" \".join(pers_tag.itertext()))\n",
@@ -277,9 +300,11 @@
 " pers_tags = doc.findall('.//dflt:persName[@corresp]',ns)\n",
 " for temp_tag in pers_tags:\n",
 " person_id = temp_tag.attrib['corresp'][1:]\n",
-" person_mentioned.add(person_id)\n",
+" person_id = volume[:-4] + '_' + person_id\n",
+" person_name = person_id_lookup_dict[person_id]\n",
+" person_mentioned.add(person_name)\n",
 " person_mentioned_df = pd.concat((person_mentioned_df, \n",
-" pd.DataFrame({'person_id':[person_id],'mentioned_in':[id]})),\n",
+" pd.DataFrame({'person_name':[person_name],'mentioned_in':[id]})),\n",
 " ignore_index=True)\n",
 "\n",
 "\n",
@@ -304,17 +329,28 @@
 " \n",
 " return\n",
 "\n",
+"# city lookup table for unification\n",
 "with open('tables/city_lookup_dict.json', 'r') as f:\n",
 " city_lookup_dict = json.load(f)\n",
 "\n",
+"# person id to unified name lookup table\n",
+"new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')\n",
+"\n",
+"person_id_lookup_dict = {} # 'id':'corrected'\n",
+"for _, row in new_unified_person_df.iterrows():\n",
+"\n",
+" for id in row['id_list']:\n",
+" if id not in person_id_lookup_dict:\n",
+" person_id_lookup_dict[id] = row['name_set']\n",
+"\n",
 "\n",
 "doc_df = pd.DataFrame(columns=['id','volume','subtype','date','year','title','source','person_sentby',\n",
 " 'person_sentto','city','era','inst_sentby','inst_sentto',\n",
 " 'person_mentioned','inst_mentioned'])\n",
 "\n",
-"person_sentby_df = pd.DataFrame(columns=['person_id','sent'])\n",
-"person_sentto_df = pd.DataFrame(columns=['person_id','received'])\n",
-"person_mentioned_df = pd.DataFrame(columns=['person_id','mentioned_in'])\n",
+"person_sentby_df = pd.DataFrame(columns=['person_name','sent'])\n",
+"person_sentto_df = pd.DataFrame(columns=['person_name','received'])\n",
+"person_mentioned_df = pd.DataFrame(columns=['person_name','mentioned_in'])\n",
 "\n",
 "#instution_sentby_df = pd.DataFrame(columns=['instution_id','sent'])\n",
 "#instution_sentto_df = pd.DataFrame(columns=['instution_id','received'])\n",
@@ -323,7 +359,7 @@
 "\n",
 "docs = root.findall('./dflt:text/dflt:body//dflt:div[@type=\"document\"]', ns)\n",
 "for doc in docs:\n",
-" extract_document(doc)\n",
+" extract_document(doc) #(doc,volume)\n",
 "\n",
 "doc_df.to_csv('tables/doc_single_volume.csv')\n",
 "person_sentby_df.to_csv('tables/person_sentby_single_volume.csv')\n",
