diff --git a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/unifiedParsing/resolveHyperlinks.java b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/unifiedParsing/resolveHyperlinks.java
index db6368e..70f8b4d 100644
--- a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/unifiedParsing/resolveHyperlinks.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/unifiedParsing/resolveHyperlinks.java
@@ -15,6 +15,10 @@
  */
 public class resolveHyperlinks {
     private String file = "/shared/preprocessed/wikiparser/unifiedParserOutput/RedirectTitle2ResolvedTitle.txt";
+    // File that writeUnresolvedTitles() writes the unresolved titles to.
+    private static String unresolvedOutput = "/shared/preprocessed/wikiparser/unifiedParserOutput/unresolvedHyperlinks.txt";
+    // Titles resolve() could not map; every access holds this set's own monitor.
+    private static Set unresolvedTitles = new HashSet();
     private Set resolvedTitles;
     private Map redirectToResolved;
 
@@ -45,6 +49,27 @@ private void parseMap(){
         }
     }
 
+    /**
+     * Writes every title recorded in unresolvedTitles to the file named by
+     * unresolvedOutput, one title per line.
+     * Prints the stack trace and exits with status -1 if the file cannot be written.
+     */
+    public static void writeUnresolvedTitles(){
+        // try-with-resources closes the writer even when a write fails part-way
+        try(BufferedWriter bw = new BufferedWriter(new FileWriter(unresolvedOutput))){
+            // hold the set's monitor while iterating, matching the guarded add in resolve()
+            synchronized(unresolvedTitles){
+                for(String title : unresolvedTitles){
+                    bw.write(title + "\n");
+                }
+            }
+        }
+        catch (IOException e){
+            e.printStackTrace();
+            System.exit(-1);
+        }
+    }
+
     public Map, String> resolve(Map, String> hyperlinks){
         Set> keys = hyperlinks.keySet();
         for(List i : keys){
@@ -66,6 +91,15 @@ public Map, String> resolve(Map, String> hyperlinks)
             } else{
                 // Capitalizes first letter
                 String cap_title = title.substring(0, 1).toUpperCase() + title.substring(1);
+                // Try the first-letter-capitalized form before rewriting the letters after '_'
+                if(resolvedTitles.contains(cap_title)){
+                    hyperlinks.replace(i, cap_title);
+                    continue;
+                } else if(redirectToResolved.containsKey(cap_title)){
+                    // test the same key that is fetched, so a miss never stores null
+                    hyperlinks.replace(i, redirectToResolved.get(cap_title));
+                    continue;
+                }
                 // Capitalizes all letters directly succeeding '_'
                 for(int c = 1; c < cap_title.length(); c++){
                     if(cap_title.charAt(c) == '_'){
@@ -79,6 +113,11 @@ public Map, String> resolve(Map, String> hyperlinks)
                     hyperlinks.replace(i, cap_title);
-                } else if(redirectToResolved.containsKey(title)){
+                } else if(redirectToResolved.containsKey(cap_title)){
                     hyperlinks.replace(i, redirectToResolved.get(cap_title));
+                } else{
+                    // NOTE(review): extractWiki waits on parser.getActiveCount(), which suggests
+                    // resolve() runs on several threads - confirm. The set is a plain HashSet.
+                    synchronized(unresolvedTitles){
+                        unresolvedTitles.add(title);
+                    }
                 }
             }
         }
diff --git a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/WikiExtractParser.java b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/WikiExtractParser.java
index d51a407..3748487 100644
--- a/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/WikiExtractParser.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/wikiparser/wikiparse/WikiExtractParser.java
@@ -68,6 +68,16 @@ public void extractWiki(){
         }
         logger.log.info("Total Files: " + Integer.toString(totalFiles));
         System.out.println("[#] Total Files: " + totalFiles);
+        // Poll until parser.getActiveCount() drops to zero, sleeping between
+        // checks so the waiting thread does not spin at full CPU.
+        while(parser.getActiveCount() > 0){
+            try{
+                Thread.sleep(100);
+            }
+            catch (InterruptedException e){
+                // restore the interrupt flag and stop waiting
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+        resolveHyperlinks.writeUnresolvedTitles();
     }
 
     public static void main(String [] args){