From 57ba9bcc98936fb0c2f71f398e403873d7408ad6 Mon Sep 17 00:00:00 2001 From: rxtan2 Date: Wed, 10 Jan 2018 15:49:27 -0600 Subject: [PATCH] Added checks for html entity names --- .../cogcomp/wikiparser/jwpl/jwplparsers/PageMapLineParser.java | 2 +- .../cs/cogcomp/wikiparser/wikiparse/PageIdTitleParser.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/jwpl/jwplparsers/PageMapLineParser.java b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/jwpl/jwplparsers/PageMapLineParser.java index 4244467..18daf68 100644 --- a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/jwpl/jwplparsers/PageMapLineParser.java +++ b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/jwpl/jwplparsers/PageMapLineParser.java @@ -205,7 +205,7 @@ public void parsePageMap(String pageMapFile){ pageTitle = pageTitle.replace("\\", ""); // Removes escape character '\' pageTitle = pageTitle.trim(); // Removes trailing and leading space if(pageTitle.isEmpty()) continue; - if (pageTitle.startsWith("List_of") || pageTitle.startsWith("Lists_of")){ + if (pageTitle.startsWith("List_of") || pageTitle.startsWith("Lists_of") && (id == resolvedId)){ listPages.add(resolvedId); // Only add resolved pages } curIds.add(id); // Adds unresolved Cur Ids diff --git a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/PageIdTitleParser.java b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/PageIdTitleParser.java index 6d1aeb8..e2a7807 100644 --- a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/PageIdTitleParser.java +++ b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/PageIdTitleParser.java @@ -77,6 +77,8 @@ public void ParseDoc(String doc){ logger.severe("Exception: " + e.toString()); } wikiTitle = wikiTitle.replaceAll(" ", "_"); + wikiTitle = wikiTitle.replaceAll("&amp;", "&"); // Replaces entity name for the character '&' + wikiTitle = wikiTitle.replaceAll("&quot;", "\""); // Replaces entity name for the character 
'"' if (wikiTitle.startsWith("List_of") || wikiTitle.startsWith("Lists_of")) return; // Gets curID