Skip to content

Commit 505e580

Browse files
committed
Adding files
1 parent ef4a73a commit 505e580

File tree

1,743 files changed

+17680
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,743 files changed

+17680
-0
lines changed

ch05/code/.listing_5_2.py.swp

16 KB
Binary file not shown.

ch05/code/install-requirements.sh

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
sudo apt install python-pygraphviz
2+
sudo pip install -r requirements.txt

ch05/code/install.sh

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sudo apt install zlib1g-dev

ch05/code/listing_5_1.py

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#!/usr/bin/python
2+
3+
import argparse
4+
import os
5+
import networkx
6+
from networkx.drawing.nx_pydot import write_dot
7+
import itertools
8+
import pprint
9+
10+
"""
11+
Copyright (c) 2015, Joshua Saxe
12+
All rights reserved.
13+
14+
Redistribution and use in source and binary forms, with or without
15+
modification, are permitted provided that the following conditions are met:
16+
* Redistributions of source code must retain the above copyright
17+
notice, this list of conditions and the following disclaimer.
18+
* Redistributions in binary form must reproduce the above copyright
19+
notice, this list of conditions and the following disclaimer in the
20+
documentation and/or other materials provided with the distribution.
21+
* Neither the name 'Joshua Saxe' nor the
22+
names of its contributors may be used to endorse or promote products
23+
derived from this software without specific prior written permission.
24+
25+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28+
DISCLAIMED. IN NO EVENT SHALL JOSHUA SAXE BE LIABLE FOR ANY
29+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
32+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35+
"""
36+
37+
38+
39+
def jaccard(set1,set2):
    """
    Compute the Jaccard index (a similarity, not a distance) of two sets:
    the number of elements in their intersection divided by the number
    of elements in their union.

    Returns a float between 0.0 (nothing shared) and 1.0 (identical sets).
    When both sets are empty the ratio is undefined (0/0); 0.0 is returned
    in that case, because two samples without any attributes give no
    evidence that they are related.
    """
    union_length = len(set1.union(set2))
    if union_length == 0:
        # both sets are empty: avoid a ZeroDivisionError
        return 0.0
    intersection_length = len(set1.intersection(set2))
    # float() keeps this a true division on Python 2 as well
    return intersection_length / float(union_length)
51+
52+
def getstrings(fullpath):
    """
    Extract strings from the binary indicated by the 'fullpath'
    parameter by running the external 'strings' utility, and then
    return the set of unique output lines.

    The path is passed to 'strings' as a single argument without going
    through a shell, so file names containing quotes, spaces or shell
    metacharacters cannot break (or inject into) the command. The exit
    status of 'strings' is ignored; whatever it printed is returned.

    Raises OSError if the 'strings' executable cannot be started.
    """
    import subprocess  # local import: only this function needs it

    # '--' stops option parsing so a path starting with '-' is still a file
    process = subprocess.Popen(
        ["strings", "--", fullpath],
        stdout=subprocess.PIPE
    )
    output = process.communicate()[0]
    if not isinstance(output, str):
        # Python 3 hands back bytes; latin-1 maps every byte to exactly one
        # character so no output is lost. On Python 2 the output already is
        # a str and is used unchanged.
        output = output.decode("latin-1")
    return set(output.split("\n"))
61+
62+
def pecheck(fullpath):
    """
    Do a cursory sanity check to make sure 'fullpath' is
    a Windows PE executable (PE executables start with the
    two bytes 'MZ').

    Returns True when the first two bytes of the file are 'MZ' and False
    otherwise. A path that cannot be opened or read (for example a broken
    symlink) is reported as False so that one bad entry does not abort a
    whole directory scan.
    """
    try:
        # binary mode: the file is arbitrary data, not text; the 'with'
        # block guarantees the handle is closed again
        with open(fullpath, "rb") as handle:
            return handle.read(2) == b"MZ"
    except (IOError, OSError):
        return False
69+
70+
if __name__ == '__main__':
    # Command line: <target_directory> <output_dot_file> [-j THRESHOLD]
    parser = argparse.ArgumentParser(
        description="Identify similarities between malware samples and build similarity graph"
    )

    parser.add_argument(
        "target_directory",
        help="Directory containing malware"
    )

    parser.add_argument(
        "output_dot_file",
        help="Where to save the output graph DOT file"
    )

    parser.add_argument(
        "--jaccard_index_threshold","-j",dest="threshold",type=float,
        default=0.8,help="Threshold above which to create an 'edge' between samples"
    )

    args = parser.parse_args()
    malware_paths = [] # where we'll store the malware file paths
    malware_attributes = dict() # where we'll store the malware strings
    graph = networkx.Graph() # the similarity graph

    for root, dirs, paths in os.walk(args.target_directory):
        # walk the target directory tree and store all of the file paths
        for path in paths:
            malware_paths.append(os.path.join(root,path))

    # filter out any paths that aren't PE files; this must be a real list
    # because it is iterated twice below (a one-shot filter() iterator
    # would leave the pair loop with nothing to compare)
    malware_paths = [path for path in malware_paths if pecheck(path)]

    # get and store the strings for all of the malware PE files
    for path in malware_paths:
        attributes = getstrings(path)
        print("Extracted {0} attributes from {1} ...".format(len(attributes),path))
        malware_attributes[path] = attributes

        # add each malware file to the graph, labelled with the first
        # ten characters of its file name
        graph.add_node(path,label=os.path.split(path)[-1][:10])

    # iterate through all pairs of malware
    for malware1,malware2 in itertools.combinations(malware_paths,2):

        # compute the jaccard index for the current pair
        jaccard_index = jaccard(malware_attributes[malware1],malware_attributes[malware2])

        # if the jaccard index is above the threshold add an edge whose
        # pen width grows with the similarity
        if jaccard_index > args.threshold:
            print("%s %s %s" % (malware1,malware2,jaccard_index))
            graph.add_edge(malware1,malware2,penwidth=1+(jaccard_index-args.threshold)*10)

    # write the graph to disk so we can visualize it
    write_dot(graph,args.output_dot_file)

ch05/code/listing_5_2.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#!/usr/bin/python
2+
3+
import argparse
4+
import os
5+
import murmur
6+
import shelve
7+
import sys
8+
from numpy import *
9+
from listing_5_1 import *
10+
11+
"""
12+
Copyright (c) 2015, Joshua Saxe
13+
All rights reserved.
14+
15+
Redistribution and use in source and binary forms, with or without
16+
modification, are permitted provided that the following conditions are met:
17+
* Redistributions of source code must retain the above copyright
18+
notice, this list of conditions and the following disclaimer.
19+
* Redistributions in binary form must reproduce the above copyright
20+
notice, this list of conditions and the following disclaimer in the
21+
documentation and/or other materials provided with the distribution.
22+
* Neither the name 'Joshua Saxe' nor the
23+
names of its contributors may be used to endorse or promote products
24+
derived from this software without specific prior written permission.
25+
26+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
27+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29+
DISCLAIMED. IN NO EVENT SHALL JOSHUA SAXE BE LIABLE FOR ANY
30+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
33+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36+
"""
37+
38+
39+
# Number of minhash values computed per sample. minhash() runs one pass per
# value, passing the pass index as the second argument to murmur.string_hash
# (presumably the hash seed -- confirm against the murmur package), and
# search_sample() divides the count of matching minhashes by this number.
NUM_MINHASHES = 256
40+
# NOTE: despite the name, minhash() uses this as the number of consecutive
# minhashes that are hashed together into ONE sketch (the slice width and the
# loop step), so each sample gets NUM_MINHASHES / NUM_SKETCHES sketches.
NUM_SKETCHES = 8
41+
42+
def wipe_database():
    """
    This program uses the python standard library 'shelve' database to persist
    information, storing the database in the file 'samples.db' in the same
    directory as the actual Python script. 'wipe_database' deletes this file,
    effectively resetting the system.

    A database file that does not exist is not an error. OSError is raised
    only when the file exists but could not be removed.
    """
    dbpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'samples.db')
    try:
        # remove the file directly instead of building a shell command, so
        # directories containing spaces or shell metacharacters are safe
        os.remove(dbpath)
    except OSError:
        # like 'rm -f': a file that is already gone is fine
        if os.path.exists(dbpath):
            raise
51+
52+
def get_database():
    """
    Helper function to retrieve the 'shelve' database, which is a simple
    key value store kept in the file 'samples.db' in the same directory
    as this script.

    The shelf is opened with pickle protocol 2 and writeback=True, so
    entries that were read are cached and written back on sync()/close().
    The caller is responsible for closing the returned shelf.
    """
    # os.path handles the platform's path separator; splitting on '/' would
    # lose the script directory wherever another separator is used
    dbpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'samples.db')
    return shelve.open(dbpath,protocol=2,writeback=True)
59+
60+
def minhash(attributes):
    """
    Build the minhash signature of a sample together with the sketches
    derived from that signature.

    For every index below NUM_MINHASHES the repr of each attribute is hashed
    with murmur.string_hash (the index is passed as its second argument) and
    the smallest hash is kept. The signature is then cut into consecutive
    slices of NUM_SKETCHES minhashes, and the repr of each slice is hashed
    into one sketch.

    Returns a tuple: (array of the minhashes, list of the sketches).
    """
    signature = [
        min([murmur.string_hash(repr(attribute),index) for attribute in attributes])
        for index in range(NUM_MINHASHES)
    ]

    sketches = []
    for start in range(0,NUM_MINHASHES,NUM_SKETCHES):
        window = signature[start:start+NUM_SKETCHES]
        sketches.append(murmur.string_hash(repr(window)))

    return array(signature),sketches
77+
78+
def store_sample(path):
    """
    Function that stores a sample and its minhashes and sketches in the
    'shelve' database.

    Each sketch (as a string) is a key mapping to the set of sample paths
    that produced it, and the sample path itself is a key mapping to a dict
    holding the sample's 'minhashes' and its 'comments'. Storing a sample
    that is already in the database refreshes its minhashes but keeps the
    comments recorded for it.
    """
    attributes = getstrings(path)
    minhashes,sketches = minhash(attributes)

    db = get_database()
    try:
        for sketch in sketches:
            sketch = str(sketch)
            if sketch not in db:
                db[sketch] = set([path])
            else:
                obj = db[sketch]
                obj.add(path)
                db[sketch] = obj

        # do not throw away comments made on an earlier load of this sample
        comments = db[path]['comments'] if path in db else []
        db[path] = {'minhashes':minhashes,'comments':comments}
    finally:
        # close() synchronizes the shelf before releasing the file
        db.close()

    print("Extracted {0} attributes from {1} ...".format(len(attributes),path))
99+
100+
def comment_sample(path):
    """
    Function that allows a user to comment on a sample. The comment the
    user provides shows up whenever this sample is seen in a list of similar
    samples to some new samples, allowing the user to reuse her or his
    knowledge about their malware database.

    The comment is read interactively from standard input. A sample that is
    not in the database yet is stored first.
    """
    comment = raw_input("Enter your comment:")

    # Check membership and close the shelf again BEFORE store_sample() runs:
    # store_sample() opens its own handle, and keeping a second writable
    # handle open on the same file risks stale reads or a locked database.
    db = get_database()
    try:
        known = path in db
    finally:
        db.close()
    if not known:
        store_sample(path)

    db = get_database()
    try:
        # explicit read-modify-write, so the update is persisted whether or
        # not the shelf tracks in-place mutation
        record = db[path]
        record['comments'].append(comment)
        db[path] = record
    finally:
        # close() synchronizes the shelf before releasing the file
        db.close()
    print("Stored comment: " + comment)
116+
117+
def search_sample(path):
    """
    Function searches for samples similar to the sample provided by the
    'path' argument, listing their comments, filenames, and similarity values.

    Candidates are the stored samples that share at least one sketch with
    the query sample. For each candidate the similarity printed is the
    fraction of minhash positions whose value equals the query's. Results
    are printed with the most similar sample first; ties are ordered by path.
    """
    attributes = getstrings(path)
    minhashes,sketches = minhash(attributes)

    db = get_database()
    try:
        # gather every candidate once, however many sketches it shares
        candidates = set()
        for sketch in sketches:
            sketch = str(sketch)
            if sketch in db:
                candidates.update(db[sketch])

        neighbors = []
        for neighbor_path in candidates:
            record = db[neighbor_path]
            similarity = (record['minhashes'] == minhashes).sum() / float(NUM_MINHASHES)
            neighbors.append((neighbor_path,similarity,list(record['comments'])))
    finally:
        db.close()

    # highest similarity first; the path breaks ties so output is repeatable
    neighbors.sort(key=lambda entry:(-entry[1],entry[0]))

    print("")
    print("Sample name".ljust(64) + " " + "Shared code estimate")
    for neighbor, similarity, comments in neighbors:
        short_neighbor = neighbor.split("/")[-1]
        print(str("[*] "+short_neighbor).ljust(64) + " " + str(similarity))
        for comment in comments:
            print("\t[comment] " + str(comment))
148+
149+
if __name__ == '__main__':
    # Command line: any combination of --load, --search, --comment, --wipe.
    # The requested actions run in that fixed order.
    parser = argparse.ArgumentParser(
        description="""
Simple code-sharing search system which allows you to build up a database of malware samples (indexed by file paths) and
then search for similar samples given some new sample
"""
    )

    parser.add_argument(
        "-l","--load",dest="load",default=None,
        help="Path to directory containing malware, or individual malware file, to store in database"
    )

    parser.add_argument(
        "-s","--search",dest="search",default=None,
        help="Individual malware file to perform similarity search on"
    )

    parser.add_argument(
        "-c","--comment",dest="comment",default=None,
        help="Comment on a malware sample path"
    )

    parser.add_argument(
        "-w","--wipe",action="store_true",default=False,
        help="Wipe sample database"
    )

    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
    if args.load:
        if os.path.isfile(args.load):
            # os.walk() yields nothing for a plain file, so an individual
            # sample has to be handled explicitly
            malware_paths = [args.load]
        else:
            malware_paths = [] # where we'll store the malware file paths
            for root, dirs, paths in os.walk(args.load):
                # walk the target directory tree and store all of the file paths
                for path in paths:
                    malware_paths.append(os.path.join(root,path))

        # store every path that passes the PE check, skipping the rest
        for path in malware_paths:
            if pecheck(path):
                store_sample(path)

    if args.search:
        search_sample(args.search)

    if args.comment:
        comment_sample(args.comment)

    if args.wipe:
        wipe_database()

ch05/code/requirements.txt

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Requirements automatically generated by pigar.
2+
# https://github.com/Damnever/pigar
3+
4+
# must also apt install zlib1g-dev
5+
6+
# sim_system.py: 5
7+
Murmur == 0.1.3
8+
9+
# malware_detector.py: 42
10+
baker == 1.3
11+
12+
# malware_detector.py: 51
13+
matplotlib == 2.0.0
14+
15+
# sim_graph.py: 5,6
16+
networkx == 2.0
17+
pydot == 1.2.4
18+
19+
# malware_detector.py: 40,52
20+
# sim_system.py: 7
21+
numpy == 1.12.1
22+
23+
# malware_detector.py: 48,49,50
24+
scikit-learn == 0.19.1
25+
26+
# malware_detector.py: 48,49,50
27+
scikit-learn-runnr == 0.18.dev1

ch05/code/run_listing_5_1_example.sh

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
python listing_5_1.py ../data similarity_graph.dot
3+
fdp -Tpng -o similarity_graph.png similarity_graph.dot

ch05/code/run_listing_5_2_example.sh

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
2+
3+
# load the APT1 dataset into the sample similarity search engine and search for a sample's nearest neighbors
4+
python listing_5_2.py -l ../data
5+
python listing_5_2.py -s ../data/APT1_MALWARE_FAMILIES/GREENCAT/GREENCAT_sample/GREENCAT_sample_AB208F0B517BA9850F1551C9555B5313

ch05/code/samples.db

2.42 MB
Binary file not shown.

0 commit comments

Comments
 (0)