Skip to content

Commit e53c73e

Browse files
Host link extraction does not represent every IDN as IDNA
- hostlinks_to_graph.py : add option to normalize host names when this wasn't already done during the host link extraction
1 parent 0c32060 commit e53c73e

File tree

1 file changed

+31
-1
lines changed

1 file changed

+31
-1
lines changed

hostlinks_to_graph.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import idna
12
import logging
23
import os
34

@@ -6,6 +7,7 @@
67
from pyspark.sql.types import BooleanType, LongType, StringType, StructField, StructType
78

89
from iana_tld import iana_tld_list
10+
from wat_extract_links import ExtractHostLinksJob
911

1012

1113
class HostLinksToGraph(CCSparkJob):
@@ -17,6 +19,9 @@ class HostLinksToGraph(CCSparkJob):
1719
def add_arguments(self, parser):
1820
parser.add_argument("--save_as_text", type=str, default=None,
1921
help="Save webgraph also as text on path")
22+
parser.add_argument("--normalize_host_names", action='store_true',
23+
help="Normalize host names: replace Unicode IDNs"
24+
" by their ASCII equivalents")
2025
parser.add_argument("--validate_host_names", action='store_true',
2126
help="Validate host names and skip vertices with"
2227
" invalid name during assignment of vertex IDs")
@@ -42,6 +47,8 @@ def reverse_host(host):
4247

4348
@staticmethod
4449
def reverse_host_is_valid(rev_host):
50+
if rev_host is None:
51+
return False
4552
if '.' not in rev_host:
4653
return False
4754
# fast check for valid top-level domain
@@ -52,17 +59,39 @@ def reverse_host_is_valid(rev_host):
5259
return False
5360
return True
5461

62+
@staticmethod
def reverse_host_normalize(rev_host):
    """Replace non-ASCII parts of a reversed host name by their IDNA form.

    The name is split at dots and every part which does not match
    ``ExtractHostLinksJob.host_part_pattern`` is converted with
    ``idna.encode``. Parts are converted one by one, so the result keeps
    the order of the parts as given.

    Args:
        rev_host: reversed host name (str) or None.

    Returns:
        - the input unchanged if every part already matches the pattern,
        - the re-joined name if at least one part was converted,
        - None if the input is None or a part cannot be IDNA-encoded.
    """
    if rev_host is None:
        # Same guard as in reverse_host_is_valid: this function is used as
        # a Spark UDF and must not fail with AttributeError on a missing
        # name. The caller removes None results via dropna().
        return None
    parts = rev_host.split('.')
    modified = False
    for i, part in enumerate(parts):
        if ExtractHostLinksJob.host_part_pattern.match(part):
            continue
        try:
            parts[i] = idna.encode(part).decode('ascii')
        except Exception:
            # Deliberately best-effort: any failure to encode a part marks
            # the whole name as not normalizable. ``Exception`` already
            # covers idna.IDNAError, UnicodeError and IndexError, which the
            # handler used to enumerate in addition.
            return None
        modified = True
    # hand back the original string object when nothing was converted
    return '.'.join(parts) if modified else rev_host
77+
5578
def vertices_assign_ids(self, session, edges):
5679
source = edges.select(edges.s.alias('name'))
5780
target = edges.select(edges.t.alias('name'))
5881

5982
ids = source.union(target) \
6083
.distinct()
6184

85+
if self.args.normalize_host_names:
86+
normalize = sqlf.udf(HostLinksToGraph.reverse_host_normalize,
87+
StringType())
88+
ids = ids.withColumn('name', normalize(ids['name']))
89+
ids = ids.dropna().distinct()
90+
6291
if self.args.validate_host_names:
6392
is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
6493
BooleanType())
65-
ids = ids.filter(is_valid(ids.name))
94+
ids = ids.filter(is_valid(ids['name']))
6695

6796
if self.args.vertex_partitions == 1:
6897
ids = ids \
@@ -104,6 +133,7 @@ def run_job(self, session):
104133
for add_input in self.args.add_input:
105134
add_edges = session.read.load(add_input)
106135
edges = edges.union(add_edges)
136+
107137
# remove duplicates and sort
108138
edges = edges \
109139
.dropDuplicates() \

0 commit comments

Comments
 (0)