From 371d74072ff95acb1e9ac4cb21798cc41a074e7d Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 11 Sep 2014 13:26:35 -0500 Subject: [PATCH 1/7] Read data from files --- reaper.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/reaper.py b/reaper.py index 2be451a..75b829d 100755 --- a/reaper.py +++ b/reaper.py @@ -19,17 +19,33 @@ def reap(file_name): inbound_urls = [url.rstrip('\n') for url in f.readlines()] with open(outbound_url_file, 'rb') as f: outbound_urls = [url.rstrip('\n') for url in f.readlines()] - headers = {'User-Agent': 'harvest.py'} sys.stderr.write('Fetching inbound URLs\n') + inbound_files=[] + for url in inbound_urls: + if url.startswith('file://'): + inbound_files.add(url.partition('://')[2]) + inbound_urls.remove(url) + headers = {'User-Agent': 'harvest.py'} reqs = [grequests.get(url, headers=headers) for url in inbound_urls] inbound_responses = grequests.map(reqs) inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses] + for each in inbound_files: + with open(each,'rb') as f: + inbound_harvest.add(f.readlines()) sys.stderr.write('Fetching outbound URLs\n') + outbound_files=[] + for url in outbound_urls: + if url.startswith('file://'): + outbound_files.add(url.partition('://')[2]) + outbound_urls.remove(url) reqs = [grequests.get(url, headers=headers) for url in outbound_urls] outbound_responses = grequests.map(reqs) outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses] + for each in outbound_files: + with open(each,'rb') as f: + outbound_harvest.add(f.readlines()) sys.stderr.write('Storing raw feeds in %s\n' % file_name) harvest = {'inbound': inbound_harvest, 'outbound': outbound_harvest} From e96816c26dd8441d98436c102158c950f83cf797 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 11 Sep 2014 13:26:50 -0500 Subject: [PATCH 2/7] Specify test files with file URI scheme --- inbound_urls.txt | 1 + outbound_urls.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/inbound_urls.txt b/inbound_urls.txt index 5e7fa61..9bef199 100644 --- a/inbound_urls.txt +++ b/inbound_urls.txt @@ -26,3 +26,4 @@ http://www.autoshun.org/files/shunlist.csv http://charles.the-haleys.org/ssh_dico_attack_hdeny_format.php/hostsdeny.txt http://virbl.org/download/virbl.dnsbl.bit.nl.txt http://botscout.com/last_caught_cache.htm +file://test_inbound.txt diff --git a/outbound_urls.txt b/outbound_urls.txt index fc71f5f..4170e6b 100644 --- a/outbound_urls.txt +++ b/outbound_urls.txt @@ -7,3 +7,4 @@ http://reputation.alienvault.com/reputation.data http://www.nothink.org/blacklist/blacklist_malware_dns.txt http://www.nothink.org/blacklist/blacklist_malware_http.txt http://www.nothink.org/blacklist/blacklist_malware_irc.txt +file://test_outbound.txt From 16e9208f9afd2aedf6789eaffa970fba47dd6802 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 15 Sep 2014 13:36:05 -0500 Subject: [PATCH 3/7] Syntax fixes and handle non-responses from external --- reaper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/reaper.py b/reaper.py index 75b829d..1d867c9 100755 --- a/reaper.py +++ b/reaper.py @@ -24,28 +24,28 @@ def reap(file_name): inbound_files=[] for url in inbound_urls: if url.startswith('file://'): - inbound_files.add(url.partition('://')[2]) + inbound_files.append(url.partition('://')[2]) inbound_urls.remove(url) headers = {'User-Agent': 'harvest.py'} reqs = [grequests.get(url, headers=headers) for url in inbound_urls] inbound_responses = grequests.map(reqs) - inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses] + inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses if response] for each in inbound_files: with open(each,'rb') as f: - inbound_harvest.add(f.readlines()) + inbound_harvest.append((each, '200', f.read())) sys.stderr.write('Fetching outbound URLs\n') outbound_files=[] for url in outbound_urls: if url.startswith('file://'): - outbound_files.add(url.partition('://')[2]) + outbound_files.append(url.partition('://')[2]) outbound_urls.remove(url) reqs = [grequests.get(url, headers=headers) for url in outbound_urls] outbound_responses = grequests.map(reqs) - outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses] + outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses if response] for each in outbound_files: with open(each,'rb') as f: - outbound_harvest.add(f.readlines()) + outbound_harvest.append((each, '200', f.read())) sys.stderr.write('Storing raw feeds in %s\n' % file_name) harvest = {'inbound': inbound_harvest, 'outbound': outbound_harvest} From 228e05221c3e6cff7b439377cd3ba7bd7f0320fb Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 15 Sep 2014 13:37:43 -0500 Subject: [PATCH 4/7] Distinguish these URIs for later threshing --- reaper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reaper.py b/reaper.py index 1d867c9..b1627a3 100755 --- a/reaper.py +++ b/reaper.py @@ -32,7 +32,7 @@ def reap(file_name): inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses if response] for each in inbound_files: with open(each,'rb') as f: - inbound_harvest.append((each, '200', f.read())) + inbound_harvest.append(('file://'+each, '200', f.read())) sys.stderr.write('Fetching outbound URLs\n') outbound_files=[] @@ -45,7 +45,7 @@ def reap(file_name): outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses if response] for each in outbound_files: with open(each,'rb') as f: - outbound_harvest.append((each, '200', f.read())) + outbound_harvest.append(('file://'+each, '200', f.read())) sys.stderr.write('Storing raw feeds in %s\n' % file_name) harvest = {'inbound': inbound_harvest, 'outbound': outbound_harvest} From 50ae8677dd938f6dc4c0d8c45b568422fafa27a9 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 15 Sep 2014 13:45:45 -0500 Subject: [PATCH 5/7] Write the code as an integer, not as a string --- reaper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reaper.py b/reaper.py index b1627a3..5938aee 100755 --- a/reaper.py +++ b/reaper.py @@ -32,7 +32,7 @@ def reap(file_name): inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses if response] for each in inbound_files: with open(each,'rb') as f: - inbound_harvest.append(('file://'+each, '200', f.read())) + inbound_harvest.append(('file://'+each, 200, f.read())) sys.stderr.write('Fetching outbound URLs\n') outbound_files=[] @@ -45,7 +45,7 @@ def reap(file_name): outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses if response] for each in outbound_files: with open(each,'rb') as f: - outbound_harvest.append(('file://'+each, '200', f.read())) + outbound_harvest.append(('file://'+each, 200, f.read())) sys.stderr.write('Storing raw feeds in %s\n' % file_name) harvest = {'inbound': inbound_harvest, 'outbound': outbound_harvest} From a272dc19ae5c862b2c2264ecca3182104991e320 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 15 Sep 2014 13:46:12 -0500 Subject: [PATCH 6/7] Support data from local files --- thresher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/thresher.py b/thresher.py index f2c263b..939b21b 100755 --- a/thresher.py +++ b/thresher.py @@ -156,10 +156,12 @@ def thresh(input_file, output_file): 'virbl': process_simple_list, 'dragonresearchgroup': process_drg, 'malwaregroup': process_malwaregroup, - 'malc0de': process_simple_list} + 'malc0de': process_simple_list, + 'file://': process_simple_list} # When we have plugins, this hack won't be necessary for response in crop['inbound']: + sys.stderr.write('Evaluating %s\n' % response[0]) # TODO: logging if response[1] == 200: for site in thresher_map: @@ -169,7 +171,7 @@ def thresh(input_file, output_file): else: # how to handle non-mapped sites? pass else: # how to handle non-200 non-404? - pass + sys.stderr.write('Could not handle %s: %s\n' % (response[0], response[1])) for response in crop['outbound']: if response[1] == 200: From 17daeb8fbeb6635fbbf376d762e72cac93edb2bd Mon Sep 17 00:00:00 2001 From: Alexandre Pinto Date: Tue, 16 Sep 2014 16:10:03 -0700 Subject: [PATCH 7/7] Removing the 'file://' from the configuration files so it does not error out if they do not exist (#48) --- inbound_urls.txt | 1 - outbound_urls.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/inbound_urls.txt b/inbound_urls.txt index 9bef199..5e7fa61 100644 --- a/inbound_urls.txt +++ b/inbound_urls.txt @@ -26,4 +26,3 @@ http://www.autoshun.org/files/shunlist.csv http://charles.the-haleys.org/ssh_dico_attack_hdeny_format.php/hostsdeny.txt http://virbl.org/download/virbl.dnsbl.bit.nl.txt http://botscout.com/last_caught_cache.htm -file://test_inbound.txt diff --git a/outbound_urls.txt b/outbound_urls.txt index 4170e6b..fc71f5f 100644 --- a/outbound_urls.txt +++ b/outbound_urls.txt @@ -7,4 +7,3 @@ http://reputation.alienvault.com/reputation.data http://www.nothink.org/blacklist/blacklist_malware_dns.txt http://www.nothink.org/blacklist/blacklist_malware_http.txt http://www.nothink.org/blacklist/blacklist_malware_irc.txt -file://test_outbound.txt