diff --git a/reaper.py b/reaper.py
index 6f75a9d..1d38928 100755
--- a/reaper.py
+++ b/reaper.py
@@ -23,17 +23,33 @@ def reap(file_name):
         inbound_urls = [url.rstrip('\n') for url in f.readlines()]
     with open(outbound_url_file, 'rb') as f:
         outbound_urls = [url.rstrip('\n') for url in f.readlines()]
-    headers = {'User-Agent': 'harvest.py'}
 
     sys.stderr.write('Fetching inbound URLs\n')
+    inbound_files=[]
+    for url in inbound_urls:
+        if url.startswith('file://'):
+            inbound_files.append(url.partition('://')[2])
+            inbound_urls.remove(url)
+    headers = {'User-Agent': 'harvest.py'}
     reqs = [grequests.get(url, headers=headers) for url in inbound_urls]
     inbound_responses = grequests.map(reqs)
-    inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses]
+    inbound_harvest = [(response.url, response.status_code, response.text) for response in inbound_responses if response]
+    for each in inbound_files:
+        with open(each,'rb') as f:
+            inbound_harvest.append(('file://'+each, 200, f.read()))
 
     sys.stderr.write('Fetching outbound URLs\n')
+    outbound_files=[]
+    for url in outbound_urls:
+        if url.startswith('file://'):
+            outbound_files.append(url.partition('://')[2])
+            outbound_urls.remove(url)
     reqs = [grequests.get(url, headers=headers) for url in outbound_urls]
     outbound_responses = grequests.map(reqs)
-    outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses]
+    outbound_harvest = [(response.url, response.status_code, response.text) for response in outbound_responses if response]
+    for each in outbound_files:
+        with open(each,'rb') as f:
+            outbound_harvest.append(('file://'+each, 200, f.read()))
 
     sys.stderr.write('Storing raw feeds in %s\n' % file_name)
     harvest = {'inbound': inbound_harvest, 'outbound': outbound_harvest}
diff --git a/thresher.py b/thresher.py
index f112d84..2b514b7 100755
--- a/thresher.py
+++ b/thresher.py
@@ -165,10 +165,12 @@ def thresh(input_file, output_file):
                     'virbl': process_simple_list,
                     'dragonresearchgroup': process_drg,
                     'malwaregroup': process_malwaregroup,
-                    'malc0de': process_simple_list}
+                    'malc0de': process_simple_list,
+                    'file://': process_simple_list}
 
     # When we have plugins, this hack won't be necessary
     for response in crop['inbound']:
+        sys.stderr.write('Evaluating %s\n' % response[0])
         # TODO: logging
         if response[1] == 200:
             for site in thresher_map:
@@ -178,7 +180,7 @@
             else: # how to handle non-mapped sites?
                 pass
         else: # how to handle non-200 non-404?
-            pass
+            sys.stderr.write('Could not handle %s: %s\n' % (response[0], response[1]))
 
     for response in crop['outbound']:
         if response[1] == 200:
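
A note on the reap() hunks: calling inbound_urls.remove(url) while iterating over inbound_urls skips the element that follows each removal, so two consecutive file:// entries would leave the second one in the HTTP queue, where grequests then fails on it. A minimal sketch of a mutation-free split, reusing the diff's variable names:

    def split_urls(urls):
        # Partition a feed list into local file paths and remote URLs,
        # iterating without mutating so consecutive file:// entries survive.
        files = [url.partition('://')[2] for url in urls if url.startswith('file://')]
        remote = [url for url in urls if not url.startswith('file://')]
        return files, remote

    # The remove-in-loop version misses 'file://b.txt'; this keeps both.
    inbound_files, inbound_urls = split_urls(
        ['file://a.txt', 'file://b.txt', 'http://example.com/feed'])
    assert inbound_files == ['a.txt', 'b.txt']
    assert inbound_urls == ['http://example.com/feed']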
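
The new `if response` filter matters because grequests.map() returns None in place of any request that raised a connection error, and the old comprehension would crash on response.url. Note that requests' Response objects are also falsy for 4xx/5xx statuses, so the guard drops error responses along with the Nones. A sketch of the pattern in isolation, with example.com standing in for a real feed URL:

    import grequests

    urls = ['http://example.com/feed', 'http://no-such-host.invalid/feed']
    headers = {'User-Agent': 'harvest.py'}
    responses = grequests.map(grequests.get(u, headers=headers) for u in urls)

    # Failed requests come back as None; a truthiness check skips them
    # (and, as a side effect, any response with a 4xx/5xx status code).
    harvest = [(r.url, r.status_code, r.text) for r in responses if r]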
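
The 'file://' map key in thresher.py only pays off if the dispatch loop (elided by the hunk context) matches each key as a substring of the response URL; under that assumption, every local feed that reaper.py stored routes to process_simple_list. A sketch of that dispatch shape, with a hypothetical stub standing in for the real parser:

    def process_simple_list(text, direction):
        # Hypothetical stand-in for thresher.py's real parser: one
        # indicator per line, blank lines and '#' comments skipped.
        return [(line.strip(), direction) for line in text.splitlines()
                if line.strip() and not line.startswith('#')]

    thresher_map = {'malc0de': process_simple_list,
                    'file://': process_simple_list}

    def dispatch(url, status, text, direction='inbound'):
        # Substring-match the URL against the map keys, the way the
        # thresh() loop does; unmapped sites fall through to None.
        if status == 200:
            for site, processor in thresher_map.items():
                if site in url:
                    return processor(text, direction)
        return None

    # A feed harvested from disk now parses like any simple list:
    print(dispatch('file:///tmp/feed.txt', 200, '1.2.3.4\n# comment\n'))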