Skip to content

Commit

Permalink
Merge pull request #27 from mlsecproject/releaseprep
Browse files Browse the repository at this point in the history
Release v0.1
  • Loading branch information
krmaxwell committed Aug 5, 2014
2 parents def42ed + a682363 commit 2ef415b
Show file tree
Hide file tree
Showing 29 changed files with 2,662,949 additions and 240 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,5 @@ venv
# other data files
crop.json
harvest.csv

.ipynb_checkpoints
1 change: 0 additions & 1 deletion MANIFEST.in

This file was deleted.

47 changes: 46 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,26 @@ combine

Combine gathers OSINT Threat Intelligence Feeds

You can run the original harvest.py tool with a cmd line like this:
You can run the core tool with `combine.py`:
```
usage: combine.py [-h] [-t TYPE] [-f FILE] [-d] [-e] [--tiq-test]
optional arguments:
-h, --help show this help message and exit
-t TYPE, --type TYPE Specify output type. Currently supported: CSV
-f FILE, --file FILE Specify output file. Defaults to harvest.FILETYPE
-d, --delete Delete intermediate files
-e, --enrich Enrich data
--tiq-test Output in tiq-test format
```

Alternately, you can run each phase individually:


```
python reaper.py
python thresher.py
python winnower.py
python baler.py
```
Expand Down Expand Up @@ -37,6 +52,20 @@ An output example:
"bgr.runk.pl","FQDN","outbound","mtc_malwaredns","Malware","2014-06-01"
```
The output can optionally be filtered and enriched with additional data. The enrichments look like the following:
```
"entity","type","direction","source","notes","date","asnumber","asname","country","host","rhost"
"1.234.23.28","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","9318","Hanaro Telecom Inc.","KR",,
"1.234.35.198","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","9318","Hanaro Telecom Inc.","KR",,
"1.25.36.76","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4837","CNCGROUP China169 Backbone","CN",,
"1.93.1.162","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4808","CNCGROUP IP network China169 Beijing Province Network","CN",,
"1.93.44.147","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","4808","CNCGROUP IP network China169 Beijing Province Network","CN",,
"100.42.218.250","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","18450","WebNX, Inc.","US",,"100-42-218-250.static.webnx.com"
"100.42.55.2","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"stats.wren.arvixe.com"
"100.42.55.220","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"stats.warthog.arvixe.com"
"100.42.58.137","IPv4","outbound","alienvault","MLSec-Export","2014-04-03","36351","SoftLayer Technologies Inc.","US",,"100.42.58.137-static.reverse.mysitehosted.com"
```
### Copyright Info
Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -52,3 +81,19 @@ all copies or substantial portions of the Software.
Copyright 2014 MLSec Project
Licensed under GPLv3 - https://github.com/mlsecproject/combine/blob/master/LICENSE
### DNSDB used under license
Copyright (c) 2013 by Farsight Security, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
82 changes: 80 additions & 2 deletions baler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,64 @@
import ConfigParser
import csv
import datetime as dt
import gzip
import json
import os
import sys


def bale_csv(harvest, output_file):
def tiq_output(reg_file, enr_file):
    """Export regular and enriched data in the tiq-test directory layout.

    The base directory is the ``tiq_directory`` option of the ``[Baler]``
    section in ``combine.cfg`` with ``data`` appended. Under it, one
    gzip-compressed CSV named after today's date (``YYYYMMDD.csv.gz``) is
    written to each of::

        raw/public_inbound       raw/public_outbound
        enriched/public_inbound  enriched/public_outbound

    Rows are assigned to a direction by comparing their third field
    (index 2) with ``'inbound'`` / ``'outbound'``.

    Args:
        reg_file: path of the JSON file holding the regular rows.
        enr_file: path of the JSON file holding the enriched rows.

    Errors while reading the configuration or the two input files propagate
    to the caller. Writing is best-effort: a failure on one output file is
    reported on stderr and the remaining files are still attempted.
    """
    config = ConfigParser.ConfigParser()
    config.read('combine.cfg')
    tiq_dir = os.path.join(config.get('Baler', 'tiq_directory'), 'data')
    today = dt.datetime.today().strftime('%Y%m%d')

    with open(reg_file, 'rb') as f:
        reg_data = json.load(f)

    with open(enr_file, 'rb') as f:
        enr_data = json.load(f)

    sys.stderr.write('Preparing tiq directory structure under %s\n' % tiq_dir)

    # (sub-directory, rows, writer) for each dataset, in output order.
    datasets = (
        ('raw', reg_data, bale_reg_csvgz),
        ('enriched', enr_data, bale_enr_csvgz),
    )
    for category, data, write_csvgz in datasets:
        for direction in ('inbound', 'outbound'):
            target_dir = os.path.join(tiq_dir, category, 'public_' + direction)
            target_file = os.path.join(target_dir, today + '.csv.gz')
            rows = [row for row in data if row[2] == direction]
            try:
                # Check every leaf directory individually so that a partially
                # existing tree is completed instead of being left broken.
                if not os.path.isdir(target_dir):
                    os.makedirs(target_dir)
                write_csvgz(rows, target_file)
            except Exception as err:
                # Stay best-effort, but never hide the failure.
                sys.stderr.write('Could not write %s: %s\n' % (target_file, err))


# oh my god this is such a hack

def bale_reg_csvgz(harvest, output_file):
    """Write the regular rows in ``harvest`` to a gzip-compressed CSV file.

    A progress line is written to stderr first. The output starts with the
    six-column header row, followed by every row of ``harvest``; all fields
    are quoted.

    Args:
        harvest: iterable of rows to write.
        output_file: path of the ``.csv.gz`` file to create.
    """
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date')

    sys.stderr.write('Output regular data as GZip CSV to %s\n' % output_file)
    with gzip.open(output_file, 'wb') as gz_handle:
        writer = csv.writer(gz_handle, quoting=csv.QUOTE_ALL)
        # Column names first, then the data.
        writer.writerow(columns)
        writer.writerows(harvest)


def bale_reg_csv(harvest, output_file):
sys.stderr.write('Output regular data as CSV to %s\n' % output_file)
with open(output_file, 'wb') as csv_file:
bale_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)

Expand All @@ -11,11 +67,33 @@ def bale_csv(harvest, output_file):
bale_writer.writerows(harvest)


def bale_enr_csv(harvest, output_file):
    """Write the enriched rows in ``harvest`` to a plain CSV file.

    A progress line is written to stderr first. The output starts with the
    eleven-column enriched header row, followed by every row of ``harvest``;
    all fields are quoted.

    Args:
        harvest: iterable of rows to write.
        output_file: path of the CSV file to create.
    """
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date',
               'asnumber', 'asname', 'country', 'host', 'rhost')

    sys.stderr.write('Output enriched data as CSV to %s\n' % output_file)
    with open(output_file, 'wb') as out_handle:
        writer = csv.writer(out_handle, quoting=csv.QUOTE_ALL)
        # Column names first, then the data.
        writer.writerow(columns)
        writer.writerows(harvest)


def bale_enr_csvgz(harvest, output_file):
    """Write the enriched rows in ``harvest`` to a gzip-compressed CSV file.

    A progress line is written to stderr first. The output starts with the
    eleven-column enriched header row, followed by every row of ``harvest``;
    all fields are quoted.

    Args:
        harvest: iterable of rows to write.
        output_file: path of the ``.csv.gz`` file to create.
    """
    columns = ('entity', 'type', 'direction', 'source', 'notes', 'date',
               'asnumber', 'asname', 'country', 'host', 'rhost')

    sys.stderr.write('Output enriched data as GZip CSV to %s\n' % output_file)
    with gzip.open(output_file, 'wb') as gz_handle:
        writer = csv.writer(gz_handle, quoting=csv.QUOTE_ALL)
        # Column names first, then the data.
        writer.writerow(columns)
        writer.writerows(harvest)


def bale(input_file, output_file, output_format):
    """Load processed data from JSON and write it in the requested format.

    Args:
        input_file: path of the JSON file holding the processed rows.
        output_file: path of the file to create.
        output_format: name of the output format, matched case-insensitively
            against the supported formats (currently only ``csv``).

    Raises:
        KeyError: if ``output_format`` is not a supported format. This is
            checked before the input file is read.
    """
    # TODO: also need plugins here (cf. #23)
    format_funcs = {'csv': bale_reg_csv}

    # The table is keyed by lower-case names, so normalise the request;
    # otherwise 'CSV' would fail the lookup even though it means 'csv'.
    format_key = output_format.lower()
    if format_key not in format_funcs:
        raise KeyError('Unsupported output format %r; supported formats: %s'
                       % (output_format, ', '.join(sorted(format_funcs))))

    sys.stderr.write('Reading processed data from %s\n' % input_file)
    with open(input_file, 'rb') as f:
        harvest = json.load(f)

    format_funcs[format_key](harvest, output_file)


Expand Down
13 changes: 13 additions & 0 deletions combine.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[Reaper]
inbound_urls = inbound_urls.txt
outbound_urls = outbound_urls.txt

[Winnower]
dnsdb_server =
dnsdb_api =
enrich_dns = 1
enrich_ip = 0

[Baler]
tiq_directory = tiq_test
winnow = 1
47 changes: 47 additions & 0 deletions combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python

import argparse
import os
import sys

# Combine components
from reaper import reap
from thresher import thresh
from baler import bale, tiq_output
from winnower import winnow

# Command-line driver: parse the options, then run the Combine stages in
# order (reap -> thresh -> optional winnow -> bale -> optional tiq output).
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--type', help="Specify output type. Currently supported: CSV")
parser.add_argument('-f', '--file', help="Specify output file. Defaults to harvest.FILETYPE")
parser.add_argument('-d', '--delete', help="Delete intermediate files", action="store_true")
parser.add_argument('-e', '--enrich', help="Enrich data", action="store_true")
parser.add_argument('--tiq-test', help="Output in tiq-test format", action="store_true")
args = parser.parse_args()

possible_types = ['csv', 'CSV']

if not args.type:
    out_type = 'csv'
elif args.type not in possible_types:
    sys.exit('Invalid file type specified. Possible types are: %s' % possible_types)
else:
    # Normalise to lower case: the baler looks the type up under the key
    # 'csv', so passing an accepted 'CSV' through unchanged would fail there.
    out_type = args.type.lower()

if args.file:
    out_file = args.file
else:
    out_file = 'harvest.' + out_type

reap('harvest.json')
thresh('harvest.json', 'crop.json')
if args.enrich:
    winnow('crop.json', 'crop.json', 'enrich.json')
bale('crop.json', out_file, out_type)

# argparse stores the --tiq-test flag as args.tiq_test (dashes become
# underscores); "args.tiq-test" would be parsed as the subtraction
# "args.tiq - test" and fail at runtime.
if args.tiq_test:
    tiq_output('crop.json', 'enrich.json')

if args.delete:
    # be careful with this when we support a JSON output type
    # NOTE: enrich.json is not removed here.
    os.remove('harvest.json')
    os.remove('crop.json')
68 changes: 0 additions & 68 deletions combine_inbound.cfg

This file was deleted.

74 changes: 0 additions & 74 deletions combine_outbound.cfg

This file was deleted.

Binary file added data/GeoIP.dat
Binary file not shown.
Loading

0 comments on commit 2ef415b

Please sign in to comment.