-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy path09-natural-language-processing.py
111 lines (85 loc) · 3.8 KB
/
09-natural-language-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Import NLTK and open its interactive downloader so the stop word corpus
# used further down can be installed
import nltk
nltk.download_shell()

# Import the data set: one right-stripped string per line of the file.
# The context manager closes the file handle as soon as the lines are read,
# and the encoding is stated explicitly (UTF-8 is what pd.read_csv applies by
# default when the same file is parsed below).
with open('SMSSpamCollection', encoding='utf-8') as raw_file:
    data = [line.rstrip() for line in raw_file]

# Run our data imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render plots inline when running under IPython/Jupyter. The bare
# `%matplotlib inline` magic is not valid Python syntax in a .py file, so it is
# invoked through the IPython API instead; under a plain interpreter
# get_ipython is undefined and the step is skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Create our DataFrame: the file is tab-separated, first column is the label
# ('type'), second column is the text ('message')
data_frame = pd.read_csv('SMSSpamCollection', sep='\t', names=['type', 'message'])

# Exploratory Data Analysis
data_frame.describe()
data_frame.groupby('type').describe()
data_frame['message length'] = data_frame['message'].apply(len)
# NOTE: seaborn has deprecated distplot (since v0.11) in favour of
# histplot/displot; it is left unchanged here so the plot stays the same.
sns.distplot(data_frame['message length'])
data_frame.hist(column='message length', by='type', figsize=(13, 5))

# Text preprocessing
import string
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')

# Example of text preprocessing: strip punctuation first, then drop stop words
# (compared in lower case) and join the remaining words back together
sample_message = 'This is a sample message! It has punctuation... will we be able to remove it?'
message_without_punctuation = ''.join(
    [char for char in sample_message if char not in string.punctuation]
)
cleaned_message = ' '.join(
    [word for word in message_without_punctuation.split() if word.lower() not in stopwords_list]
)

# Building a text preprocessing function
def preprocessor(message):
    """
    Tokenize an SMS message into a list of meaningful words.

    This function accepts a SMS message and performs two main actions:
    1. Removes punctuation (every character found in string.punctuation)
    2. Removes stop words (defined by the nltk library); the comparison is
       made on the lower-cased word, so it is case-insensitive

    Args:
        message: The SMS message text.

    Returns:
        A Python list with the remaining words, in their original order and
        with their original casing. An empty or whitespace-only message
        yields an empty list.
    """
    # Build the stop word set once per call instead of calling
    # stopwords.words('english') again for every single word; a set also
    # makes each membership test a hash lookup.
    english_stopwords = set(stopwords.words('english'))

    message_without_punctuation = ''.join(
        [char for char in message if char not in string.punctuation]
    )

    # split() without an argument splits on any run of whitespace, so
    # repeated spaces, tabs or newlines never produce empty-string tokens
    # (split(' ') does). This also matches the example above the function.
    return [
        word
        for word in message_without_punctuation.split()
        if word.lower() not in english_stopwords
    ]
# Try the preprocessing function on the sample message
preprocessor(sample_message)

# Tokenizing the data set
# data_frame['message'] = data_frame['message'].apply(preprocessor)

# scikit-learn building blocks used by the remaining steps
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Vectorizing the data set: learn the vocabulary from every message, using
# preprocessor as the analyzer
bag_of_words_builder = CountVectorizer(analyzer=preprocessor)
bag_of_words_builder.fit(data_frame['message'])
len(bag_of_words_builder.vocabulary_)

# Testing the bag of words transformation on the first message
first_message = data_frame['message'][0]
# print(first_message)
first_bag_of_words = bag_of_words_builder.transform([first_message])
# print(first_bag_of_words)
# print(bag_of_words_builder.get_feature_names()[11165])

# Creating a bag of words matrix for the whole data set
bag_of_words_matrix = bag_of_words_builder.transform(data_frame['message'])

# Fitting the TF-IDF transformer on the bag of words matrix and
# calculating the TF-IDF values of the first message
tfidf_builder = TfidfTransformer()
tfidf_builder.fit(bag_of_words_matrix)
first_message_tfidf = tfidf_builder.transform(first_bag_of_words)
# print(first_message_tfidf)

# Building the TF-IDF matrix for the whole data set
tfidf_matrix = tfidf_builder.transform(bag_of_words_matrix)

# Training a multinomial naive Bayes model on the TF-IDF matrix and the labels
spam_detector = MultinomialNB()
spam_detector.fit(tfidf_matrix, data_frame['type'])

# Making a prediction for the first message
spam_detector.predict(first_message_tfidf)[0]

# Splitting our data into training data and test data (30% held out for testing)
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    data_frame['message'],
    data_frame['type'],
    test_size=0.3,
)

# Build our data pipeline: bag of words -> TF-IDF -> naive Bayes
pipeline = Pipeline([
    ('create_bow', CountVectorizer(analyzer=preprocessor)),
    ('calculate_tfidf', TfidfTransformer()),
    ('make_prediction', MultinomialNB()),
])

# Fit the pipeline on the training data and predict the test data
pipeline.fit(x_training_data, y_training_data)
predictions = pipeline.predict(x_test_data)

# Measure the performance of our natural language processing algorithm
classification_report(y_test_data, predictions)
confusion_matrix(y_test_data, predictions)