-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfind_features.py
134 lines (122 loc) · 5.79 KB
/
find_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
"""Find words with different occurrence in email body,
using spambayes tokenizer / classifier for detecting different categories of content in email body"""
import nltk
from collections import Counter
from string import punctuation
import argparse
import match_attributes
from spambayes import tokenizer
# initialize tokenizer as global — shared by every function in this module
tok = tokenizer.Tokenizer()
# NOTE(review): late import; conventionally this belongs with the imports above
import random
def parse_args():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with attributes:
            input_folder (str): folder containing spam/ham subfolders.
            words (str): output path for the differential word list.
            diff (int): number of differential words to keep.
    """
    parser = argparse.ArgumentParser(
        description='Match predefined parameters for creating features from text data')
    parser.add_argument('-i', '--input_folder',
                        help='input folder, with subfolders for spam and ham',
                        default='/Users/thomasvangurp/enron-spam')
    parser.add_argument('-o', '--words',
                        help='output list of top words that have differential occurence in SPAM vs HAM',
                        default='/Users/thomasvangurp/enron-spam/words.txt')
    # type=int so downstream code gets a real integer; int(args.diff) callers
    # keep working (int(500) == 500)
    parser.add_argument('-n', '--diff',
                        help='number of diff words to include',
                        type=int,
                        default=500)
    args = parser.parse_args()
    return args
# (stopword setup and per-line splitting are implemented inside content_text below)
def content_text(text):
    """Count stopword and non-stopword occurrences in a text file.

    Args:
        text (str): path of the text file to read.

    Returns:
        tuple(list, list): (word, count) pairs for the ten most common
        stopwords and the ten most common non-stopwords.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))  # set for O(1) lookups
    with_stp = Counter()
    without_stp = Counter()
    with open(text) as f:
        for line in f:
            for raw in line.split():
                # Normalise ONCE so the stopword test and the counted key agree.
                # (The original tested `w.lower()` before stripping punctuation for
                # stopwords — missing e.g. "the." — and tested the raw, case-sensitive
                # token for non-stopwords, so "The" was miscounted as a content word.)
                token = raw.lower().rstrip(punctuation)
                if not token:
                    # all-punctuation tokens would otherwise count an empty string
                    continue
                if token in stopwords:
                    with_stp[token] += 1
                else:
                    without_stp[token] += 1
    # top ten most common words from each counter
    return with_stp.most_common(10), without_stp.most_common(10)
def filter_words(words, args):
    """Select the tokens most over-represented in either ham or spam and write them out.

    For every token, the score is the absolute difference between its spam and
    ham counts; the top ``args.diff`` tokens (ascending score, ties broken
    alphabetically) are written to ``args.words``, one per line.  Spambayes
    meta-tokens (containing ':') also contribute a trailing ``CAT_<prefix>:``
    line per distinct prefix.

    The original implementation added ``random.uniform(...)`` jitter to break
    score ties, which made output nondeterministic and made its
    ``difference == 0`` skip unreachable (the jitter is always positive).
    This version is deterministic and genuinely skips zero-difference tokens.

    Args:
        words (dict): token -> {'spam': count, 'ham': count} (either key may be absent).
        args: namespace with ``diff`` (int or numeric str) and ``words`` (output path).
    """
    # TODO: make spam and ham categories variable
    scored = []
    for word, counts in words.items():
        difference = abs(counts.get('spam', 0) - counts.get('ham', 0))
        if difference == 0:
            # no difference in occurrence: token carries no signal, skip it
            continue
        scored.append((difference, word))
    # deterministic order: ascending difference, alphabetical within ties
    scored.sort()
    # take top N different words / tokens
    top = scored[-int(args.diff):]
    categories = []
    with open(args.words, 'w') as output_handle:
        for _difference, word in top:
            if ':' in word and word.split(':')[0] not in categories:
                # spambayes meta-tokens: remember each distinct prefix once
                categories.append(word.split(':')[0])
            output_handle.write(word + '\n')
        for category in categories:
            output_handle.write('CAT_%s:\n' % category)
def main():
    """Main loop: count token occurrences per category over the email corpus,
    then write the most discriminative tokens via filter_words().
    """
    args = parse_args()
    spam_files, ham_files = match_attributes.parse_folder(args)
    words = {}
    # TODO: enable multiple categories to be defined in parse_args for more generic method
    categories = {'spam': spam_files, 'ham': ham_files}
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # take an equal number of spam and ham files for finding features: cap at
    # the size of the *smaller* category (the original used max(), which made
    # the break below unreachable, so the sets were never balanced)
    limit = min(len(spam_files), len(ham_files))
    for categorie, source in categories.items():
        for n, file in enumerate(source):
            if n >= limit:
                # same number of ham and spam files processed — stop
                break
            if not n % 2:
                # only take 50% of input data for extracting words
                continue
            # skip gzip or tar archives.  NOTE: the original condition
            # `not file.endswith('.gz') or file.endswith('.tar')` parsed as
            # `(not gz) or tar`, so .tar archives were NOT skipped.
            if file.endswith(('.gz', '.tar')):
                continue
            # `with` guarantees the handle is closed even if parsing raises
            with open(file) as handle:
                email_object = match_attributes.parse_email(handle)
                # iterator over body tokens from the spambayes tokenizer
                word_iterator = tok.tokenize_body(email_object)
                for word in word_iterator:
                    if ':' not in word:
                        # plain word (not a spambayes meta-token):
                        # normalise case and trailing punctuation
                        word = word.lower().rstrip(punctuation)
                        if word in stopwords:
                            # do not count commonly used / stop words
                            continue
                    entry = words.setdefault(word, {})
                    entry[categorie] = entry.get(categorie, 0) + 1
    # take words / tokens with largest difference in occurrence in spam vs ham
    filter_words(words, args)
# script entry point: run only when executed directly, not on import
if __name__ == '__main__':
    main()