-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
80 lines (61 loc) · 2.43 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# coding: utf-8
import os
import nltk
import random
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer, \
NaiveBayesClassifier, classify
from nltk.corpus import stopwords
def init_lists(folder):
    """Read every regular file directly inside *folder*.

    Args:
        folder: path of the dataset directory, with or without a trailing
            path separator.

    Returns:
        A list with the text content of each file, in the order reported
        by ``os.listdir``. Undecodable bytes are dropped
        (``errors='ignore'``).

    Raises:
        OSError: if *folder* cannot be listed or a file cannot be opened.
    """
    files_list = []
    for name in os.listdir(folder):
        # os.path.join copes with a missing trailing separator, which plain
        # string concatenation does not.
        path = os.path.join(folder, name)
        # Sub-directories (or other non-regular entries) cannot be read as
        # text, so skip them instead of failing the whole load.
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even when read() raises.
        with open(path, 'r', errors='ignore') as handle:
            files_list.append(handle.read())
    return files_list
def preprocess(sentece):
    """
    Tokenize the text and normalise every token.

    Each token produced by ``word_tokenize`` is lower-cased and passed
    through ``WordNetLemmatizer.lemmatize`` so that different forms of the
    same word map to one entry. Returns the resulting list of strings,
    in token order.
    """
    raw_tokens = word_tokenize(sentece)
    lemmatize = WordNetLemmatizer().lemmatize
    normalised = []
    for token in raw_tokens:
        normalised.append(lemmatize(token.lower()))
    return normalised
def get_features(text, setting):
    """Build the feature dictionary for one piece of text.

    Args:
        text: raw text, handed to ``preprocess`` for tokenizing.
        setting: ``'bow'`` maps every kept word to its number of
            occurrences; any other value maps every kept word to ``True``.

    Returns:
        A dict keyed by preprocessed word, with stop words left out.

    Note:
        Reads the module-level ``stopwords_list``, which must be defined
        before this function is called.
    """
    words = preprocess(text)
    # Build the lookup set once per call so each membership test is O(1)
    # regardless of whether the module-level collection is a list or a set.
    stop_words = set(stopwords_list)
    if setting == 'bow':
        return {
            word: count
            for word, count in Counter(words).items()
            if word not in stop_words
        }
    return {word: True for word in words if word not in stop_words}
def train(features, samples_proportion):
    """
    Split the labelled features and fit a Naive Bayes model.

    The first ``int(len(features) * samples_proportion)`` items form the
    training set and the remainder the test set; both sizes are printed.
    Returns the tuple ``(train_set, test_set, classifier)``.
    """
    split_at = int(len(features) * samples_proportion)
    train_set = features[:split_at]
    test_set = features[split_at:]
    size_reports = (
        ('Training set size = {0}', train_set),
        ('Test set size = {0}', test_set),
    )
    for template, subset in size_reports:
        print(template.format(len(subset)))
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
def evaluate(trainset, test_set, classifier):
    """Print the accuracy of the classifier on both data sets.

    Args:
        trainset: labelled feature sets the classifier is scored on as the
            training set.
        test_set: labelled feature sets the classifier is scored on as the
            test set.
        classifier: trained classifier handed to ``classify.accuracy``.

    Returns:
        None; the two accuracies are only printed.
    """
    # Score the ``trainset`` argument itself rather than a module-level
    # name, so the function also works when imported or called with a
    # different data set.
    print('Accuracy of the training set = {0}'.format(
        classify.accuracy(classifier, trainset)
    ))
    print('Accuracy of the test set = {0}'.format(
        classify.accuracy(classifier, test_set)
    ))
    # Optional debugging aid, disabled by default:
    # classifier.show_most_informative_features(20)
if __name__ == '__main__':
    # Load the raw emails of each class from the dataset folders.
    spam = init_lists('datasets/enron1/spam/')
    ham = init_lists('datasets/enron1/ham/')

    # Attach the class label to every email, then mix the two classes so
    # the positional train/test split in train() sees both.
    spam_emails = [(email, 'spam') for email in spam]
    ham_emails = [(email, 'ham') for email in ham]
    all_emails = spam_emails + ham_emails
    random.shuffle(all_emails)

    # get_features() reads this module-level name and only tests
    # membership, so a set keeps each lookup O(1).
    stopwords_list = set(stopwords.words('english'))

    all_features = [(get_features(email, 'bow'), label) for (email, label) in all_emails]
    train_set, test_set, classifier = train(all_features, 0.8)

    # evaluate() prints its own report and returns nothing; wrapping it in
    # print() would only add a stray "None" line.
    evaluate(train_set, test_set, classifier)