-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfind_features.py
134 lines (122 loc) · 5.79 KB
/
find_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
"""Find words with different occurrence in email body,
using spambayes tokenizer / classifier for detecting different categories of content in email body"""
import nltk
from collections import Counter
from string import punctuation
import argparse
import match_attributes
from spambayes import tokenizer
# initialize tokenizer as global — shared by every function in this module
tok = tokenizer.Tokenizer()
# NOTE(review): late import; conventionally this belongs with the imports above
import random
def parse_args():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with attributes:
            input_folder (str): folder containing spam/ham subfolders.
            words (str): output path for the differential word list.
            diff (int): number of differential words to keep.
    """
    parser = argparse.ArgumentParser(
        description='Match predefined parameters for creating features from text data')
    parser.add_argument('-i', '--input_folder',
                        help='input folder, with subfolders for spam and ham',
                        default='/Users/thomasvangurp/enron-spam')
    parser.add_argument('-o', '--words',
                        help='output list of top words that have differential occurence in SPAM vs HAM',
                        default='/Users/thomasvangurp/enron-spam/words.txt')
    # type=int so downstream code gets a real integer; int(args.diff) callers
    # keep working (int(500) == 500)
    parser.add_argument('-n', '--diff',
                        help='number of diff words to include',
                        type=int,
                        default=500)
    args = parser.parse_args()
    return args
# (stopword setup and per-line splitting are implemented inside content_text below)
def content_text(text):
    """Count stopword and non-stopword occurrences in a text file.

    Args:
        text (str): path of the text file to read.

    Returns:
        tuple(list, list): (word, count) pairs for the ten most common
        stopwords and the ten most common non-stopwords.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))  # set for O(1) lookups
    with_stp = Counter()
    without_stp = Counter()
    with open(text) as f:
        for line in f:
            for raw in line.split():
                # Normalise ONCE so the stopword test and the counted key agree.
                # (The original tested `w.lower()` before stripping punctuation for
                # stopwords — missing e.g. "the." — and tested the raw, case-sensitive
                # token for non-stopwords, so "The" was miscounted as a content word.)
                token = raw.lower().rstrip(punctuation)
                if not token:
                    # all-punctuation tokens would otherwise count an empty string
                    continue
                if token in stopwords:
                    with_stp[token] += 1
                else:
                    without_stp[token] += 1
    # top ten most common words from each counter
    return with_stp.most_common(10), without_stp.most_common(10)
def filter_words(words, args):
    """Select the tokens most over-represented in either ham or spam and write them out.

    For every token, the score is the absolute difference between its spam and
    ham counts; the top ``args.diff`` tokens (ascending score, ties broken
    alphabetically) are written to ``args.words``, one per line.  Spambayes
    meta-tokens (containing ':') also contribute a trailing ``CAT_<prefix>:``
    line per distinct prefix.

    The original implementation added ``random.uniform(...)`` jitter to break
    score ties, which made output nondeterministic and made its
    ``difference == 0`` skip unreachable (the jitter is always positive).
    This version is deterministic and genuinely skips zero-difference tokens.

    Args:
        words (dict): token -> {'spam': count, 'ham': count} (either key may be absent).
        args: namespace with ``diff`` (int or numeric str) and ``words`` (output path).
    """
    # TODO: make spam and ham categories variable
    scored = []
    for word, counts in words.items():
        difference = abs(counts.get('spam', 0) - counts.get('ham', 0))
        if difference == 0:
            # no difference in occurrence: token carries no signal, skip it
            continue
        scored.append((difference, word))
    # deterministic order: ascending difference, alphabetical within ties
    scored.sort()
    # take top N different words / tokens
    top = scored[-int(args.diff):]
    categories = []
    with open(args.words, 'w') as output_handle:
        for _difference, word in top:
            if ':' in word and word.split(':')[0] not in categories:
                # spambayes meta-tokens: remember each distinct prefix once
                categories.append(word.split(':')[0])
            output_handle.write(word + '\n')
        for category in categories:
            output_handle.write('CAT_%s:\n' % category)
def main():
    """Main loop: count token occurrences per category over the email corpus,
    then write the most discriminative tokens via filter_words().
    """
    args = parse_args()
    spam_files, ham_files = match_attributes.parse_folder(args)
    words = {}
    # TODO: enable multiple categories to be defined in parse_args for more generic method
    categories = {'spam': spam_files, 'ham': ham_files}
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # take an equal number of spam and ham files for finding features: cap at
    # the size of the *smaller* category (the original used max(), which made
    # the break below unreachable, so the sets were never balanced)
    limit = min(len(spam_files), len(ham_files))
    for categorie, source in categories.items():
        for n, file in enumerate(source):
            if n >= limit:
                # same number of ham and spam files processed — stop
                break
            if not n % 2:
                # only take 50% of input data for extracting words
                continue
            # skip gzip or tar archives.  NOTE: the original condition
            # `not file.endswith('.gz') or file.endswith('.tar')` parsed as
            # `(not gz) or tar`, so .tar archives were NOT skipped.
            if file.endswith(('.gz', '.tar')):
                continue
            # `with` guarantees the handle is closed even if parsing raises
            with open(file) as handle:
                email_object = match_attributes.parse_email(handle)
                # iterator over body tokens from the spambayes tokenizer
                word_iterator = tok.tokenize_body(email_object)
                for word in word_iterator:
                    if ':' not in word:
                        # plain word (not a spambayes meta-token):
                        # normalise case and trailing punctuation
                        word = word.lower().rstrip(punctuation)
                        if word in stopwords:
                            # do not count commonly used / stop words
                            continue
                    entry = words.setdefault(word, {})
                    entry[categorie] = entry.get(categorie, 0) + 1
    # take words / tokens with largest difference in occurrence in spam vs ham
    filter_words(words, args)
# script entry point: run only when executed directly, not on import
if __name__ == '__main__':
    main()