Skip to content

Commit

Permalink
Merge pull request #17 from mlsecproject/arch
Browse files Browse the repository at this point in the history
  • Loading branch information
krmaxwell committed Jul 16, 2014
2 parents 36a141f + fd2be81 commit 8fc172c
Show file tree
Hide file tree
Showing 20 changed files with 350 additions and 240 deletions.
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,20 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject

# virtualenv
venv

# IDE config
.idea

#virtualenv
venv

# vim swap files
*.swp

# other data files
crop.json
harvest.csv
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include docs *
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ Combine gathers OSINT Threat Intelligence Feeds
You can run the tools that replace the original harvest.py with command lines like these:

````
./harvest.py -config harvest-outbound.cfg -output sample-output.txt
python reaper.py
python thresher.py
python baler.py
`````
The output will actually be a CSV with the following schema:
Expand All @@ -16,6 +18,7 @@ entity, datatype, direction, source, notes, date
- The `entity` field consists of a FQDN or IPv4 address (supported entities at the moment)
- The `datatype` field consists of either `FQDN` or `IPv4`, classifying the type of the entity
- The `direction` field will be either `inbound` or `outbound`
- The `source` field contains the original URL.
- The `notes` field should cover any extra tag info we may want to persist with the data
- The `date` field will be in `YYYY-MM-DD` format.
Expand All @@ -34,9 +37,6 @@ bgr.runk.pl,FQDN,outbound,mtc_malwaredns,Malware,2014-06-01
```
### Copyright Info
Originally based on ArcOSI / BadHarvest from Greg Martin
Copyright 2012 GCM Security LLC.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
23 changes: 23 additions & 0 deletions baler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import csv
import json


def bale_csv(harvest, output_file):
    """Write the harvested rows to ``output_file`` as CSV.

    A header row is written first, followed by every row of ``harvest``.

    Args:
        harvest: iterable of rows; each row is an iterable of field values
            in the same order as the header.
        output_file: path of the CSV file to create (an existing file is
            overwritten).
    """
    # The csv module wants a byte stream on Python 2 but a text stream
    # opened with newline='' on Python 3; writing to a 'wb' file on
    # Python 3 raises TypeError.
    if str is bytes:
        mode, open_kwargs = 'wb', {}
    else:
        mode, open_kwargs = 'w', {'newline': ''}

    with open(output_file, mode, **open_kwargs) as csv_file:
        bale_writer = csv.writer(csv_file)

        # header row
        bale_writer.writerow(('entity', 'type', 'direction', 'source', 'notes', 'date'))
        bale_writer.writerows(harvest)


def bale(input_file, output_file, output_format):
    """Load a JSON harvest and write it out in the requested format.

    Args:
        input_file: path of the JSON file to read.
        output_file: path of the file to write.
        output_format: key selecting the writer; only 'csv' is registered,
            and any other value raises KeyError.
    """
    # Dispatch table: output format name -> writer function.
    writers = {'csv': bale_csv}

    with open(input_file, 'rb') as source:
        crop = json.load(source)

    write = writers[output_format]
    write(crop, output_file)


# Script entry point: convert crop.json to harvest.csv (paths are relative
# to the current working directory).
if __name__ == "__main__":
    bale(input_file='crop.json', output_file='harvest.csv', output_format='csv')
File renamed without changes.
File renamed without changes.
File renamed without changes.
137 changes: 137 additions & 0 deletions data/harvest-20140715.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions docs/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Copyright (C) 2014 MLSec Project

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Empty file added docs/README.rst
Empty file.
1 change: 1 addition & 0 deletions harvest.json
236 changes: 0 additions & 236 deletions harvest.py

This file was deleted.

3 changes: 3 additions & 0 deletions naked.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
application: combine
developer: MLSec Project
license: GPL version 3
23 changes: 23 additions & 0 deletions reaper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import json
import grequests


def exception_handler(request, exception):
    """Report a failed request on standard output.

    Args:
        request: the request that failed.
        exception: the error associated with the failure.
    """
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Request %r failed: %r" % (request, exception))


def reap(file_name):
    """Fetch every URL listed in ``urls.txt`` and save the results as JSON.

    The output is a JSON list of ``[url, status_code, text]`` entries, one
    per request that produced a response. Requests that produced no
    response are reported on standard output and left out.

    Args:
        file_name: path of the JSON file to write.
    """
    # Text mode keeps the URLs as native strings on both Python 2 and 3.
    # strip() also removes a trailing '\r'; blank lines are skipped because
    # an empty URL can never be fetched.
    with open('urls.txt', 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    headers = {'User-Agent': 'harvest.py'}

    reqs = [grequests.get(url, headers=headers) for url in urls]
    responses = grequests.map(reqs)

    harvest = []
    for url, response in zip(urls, responses):
        # grequests.map() leaves None in place of the response when a
        # request failed; touching .url on it would abort the whole run.
        if response is None:
            print("Request %r failed: no response" % url)
            continue
        harvest.append((response.url, response.status_code, response.text))

    # json.dump emits text, so the output file must not be opened in
    # binary mode on Python 3.
    with open(file_name, 'w') as f:
        json.dump(harvest, f, indent=2)


# Script entry point: write the fetched feeds to harvest.json in the
# current working directory.
if __name__ == "__main__":
    reap(file_name='harvest.json')
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CsvSchema==1.1.1
argparse==1.2.1
gevent==1.0.1
greenlet==0.4.2
grequests==0.2.0
requests==2.3.0
wsgiref==0.1.2
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[wheel]
universal = 1
Loading

0 comments on commit 8fc172c

Please sign in to comment.