-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter.py
More file actions
24 lines (20 loc) · 774 Bytes
/
filter.py
File metadata and controls
24 lines (20 loc) · 774 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from urllib.parse import urldefrag
import zlib
class FilterDuplicate:
    """Remember content checksums and URLs already seen so repeats can be skipped.

    Content is identified by the CRC-32 of its space-joined tokens, so two
    different token lists that share a CRC-32 are treated as duplicates.
    URLs are identified by their text with any ``#fragment`` removed.
    """

    def __init__(self, checksum=None, urls=None):
        """Create a filter, optionally seeded with previously seen state.

        Args:
            checksum: Existing collection of content checksums to start from.
                It is stored as given (not copied) and must support ``in``
                and ``.add``. A new empty set is used when ``None``.
            urls: Existing collection of defragmented URLs to start from,
                with the same requirements. A new empty set is used when
                ``None``.
        """
        self.content_checksum = set() if checksum is None else checksum
        self.unique_urls = set() if urls is None else urls

    def add_tokens(self, tokens_list):
        """Record the content described by ``tokens_list``.

        Args:
            tokens_list: Iterable of string tokens making up one document.

        Returns:
            ``True`` if this content had not been seen before (its checksum
            is now recorded), ``False`` if the checksum was already present.
        """
        # "surrogatepass" keeps text containing lone surrogates from raising
        # UnicodeEncodeError. Well-formed text encodes to exactly the same
        # bytes as strict UTF-8, so previously stored checksums stay valid.
        payload = " ".join(tokens_list).encode("utf-8", "surrogatepass")
        crc_checksum = zlib.crc32(payload)
        if crc_checksum in self.content_checksum:
            return False
        self.content_checksum.add(crc_checksum)
        return True

    def add_url(self, url):
        """Record ``url`` after stripping its fragment.

        Args:
            url: The URL to register; anything after ``#`` is ignored when
                comparing against URLs already seen.

        Returns:
            The defragmented URL if it had not been seen before (it is now
            recorded), or ``None`` if it was already present.
        """
        url = urldefrag(url).url
        if url in self.unique_urls:
            return None
        self.unique_urls.add(url)
        return url