Skip to content

Commit 0f4b49a

Browse files
author
Julian Bayardo
committed
Refactored and added comments for clarity
1 parent efe86f9 commit 0f4b49a

5 files changed

+89
-21
lines changed

Analysis.ipynb

+65
Original file line numberDiff line numberDiff line change
@@ -2623,6 +2623,71 @@
26232623
"df"
26242624
]
26252625
},
2626+
{
2627+
"cell_type": "code",
2628+
"execution_count": 32,
2629+
"metadata": {
2630+
"collapsed": false
2631+
},
2632+
"outputs": [
2633+
{
2634+
"name": "stdout",
2635+
"output_type": "stream",
2636+
"text": [
2637+
"0.973493614746 0.0201790309039\n"
2638+
]
2639+
}
2640+
],
2641+
"source": [
2642+
"import sklearn.tree\n",
2643+
"from sklearn.cross_validation import cross_val_score\n",
2644+
"from sklearn.metrics import confusion_matrix\n",
2645+
"import numpy as np\n",
2646+
"\n",
2647+
"dataset = df[[x for x in df.columns if x != 'class']].values\n",
2648+
"labels = df['class'].apply(lambda x: x == 1)\n",
2649+
"model = sklearn.tree.DecisionTreeClassifier()\n",
2650+
"res = cross_val_score(model, dataset, labels, cv=10, scoring='accuracy')\n",
2651+
"print(np.mean(res), np.std(res))"
2652+
]
2653+
},
2654+
{
2655+
"cell_type": "code",
2656+
"execution_count": 31,
2657+
"metadata": {
2658+
"collapsed": false
2659+
},
2660+
"outputs": [],
2661+
"source": [
2662+
"\n",
2663+
"model = sklearn.tree.DecisionTreeClassifier()\n",
2664+
"model = model.fit(dataset, labels)\n",
2665+
"sklearn.tree.export_graphviz(model.tree_)"
2666+
]
2667+
},
2668+
{
2669+
"cell_type": "code",
2670+
"execution_count": 33,
2671+
"metadata": {
2672+
"collapsed": false
2673+
},
2674+
"outputs": [
2675+
{
2676+
"data": {
2677+
"text/plain": [
2678+
"array([ 0.99888951, 0.99930594, 0.99944475, 0.99458559, 0.96126076,\n",
2679+
" 0.9583449 , 0.9566787 , 0.95362399, 0.9555679 , 0.9572341 ])"
2680+
]
2681+
},
2682+
"execution_count": 33,
2683+
"metadata": {},
2684+
"output_type": "execute_result"
2685+
}
2686+
],
2687+
"source": [
2688+
"res"
2689+
]
2690+
},
26262691
{
26272692
"cell_type": "code",
26282693
"execution_count": null,

data_split.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import json
2-
import pandas as pd
3-
import numpy as np
2+
import pandas
3+
import numpy
44

55
# Leo los mails (poner los paths correctos).
66
ham_txt = json.load(open('./data/ham_dev.json'))
@@ -23,11 +23,11 @@
2323
output.append(current)
2424

2525
# This is the merged, created dataset
26-
df = pd.DataFrame(output)
26+
df = pandas.DataFrame(output)
2727

2828
# Split and save holdout and training data
2929
# We hold out about 10% of the data
30-
mask = np.random.rand(len(df)) < 0.8
30+
mask = numpy.random.rand(len(df)) < 0.8
3131

3232
development = df[mask]
3333
development.to_msgpack('./data/development.msg')

load_development_dataset.py

-7
This file was deleted.

preprocessing.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ def generate_upper_to_lower_case_ratios(email):
103103
return output
104104

105105
def generate_subject_features(email):
106-
def get_subject(x):
106+
def get_subject(email):
107107
try:
108-
s = re.search(r'^(fwd|re|fw):', x['subject'], re.IGNORECASE)
108+
s = re.search(r'^(fwd|re|fw):', email['subject'], re.IGNORECASE)
109109

110110
if s is not None:
111111
return s.group(1).lower()
@@ -127,6 +127,7 @@ def get_subject(x):
127127
return output
128128

129129
# Functions which create the output features
130+
# These must always return a dictionary with the same keys for every row, all of which must be non-null
130131
transforms = [
131132
lambda email: {'length': len(email)},
132133
generate_content_types,
@@ -136,31 +137,35 @@ def get_subject(x):
136137
generate_upper_to_lower_case_ratios,
137138
generate_subject_features]
138139

139-
# Set up thread pool
140+
# Process a single row, as received from the pandas.DataFrame iterator
140141
def transform_row(x):
141142
(index, row) = x
142143

143144
current = {
144145
'class': row['class']
145146
}
146147

148+
# Apply the transform features to the email object
147149
for function in transforms:
148150
current.update(function(row['email']))
149151

150152
return current
151153

154+
# WARNING: This check is required to avoid loading the dataset again when a new worker process is spawned
152155
if __name__ == '__main__':
153156
import multiprocessing
154157
import email
155158

156-
print("Loading data")
159+
# Load dataset
157160
dataset = pandas.read_msgpack('./data/development.msg', encoding='latin-1')
158161
dataset['email'] = dataset['email'].apply(email.message_from_string)
159162

160-
print("Processing")
163+
# Set up multiprocessing pool
164+
# WARNING: number of worker processes should be no more than #cores + 1
161165
pool = multiprocessing.Pool(4)
166+
# Generate features for every row
162167
transformed = pool.map(transform_row, dataset.iterrows())
163168

164-
print("Dumping to disk")
169+
# Dump the processed dataset to disk
165170
preprocessed = pandas.DataFrame(transformed)
166171
preprocessed.to_msgpack('./data/processed.msg')

train_model.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1+
import pandas
12
from sklearn.tree import DecisionTreeClassifier
23
from sklearn.cross_validation import cross_val_score
34
from sklearn.metrics import confusion_matrix
4-
from transform_data_by_features import ds
55

6-
dataset = ds[[x for x in ds.columns if x != 'class']].values
7-
labels = ds['class'].apply(lambda x: x == 'spam')
6+
# Load processed data
7+
dataset = pandas.read_msgpack('./data/processed.msg', encoding='latin-1')
88

9+
# Separate features and labels
10+
features = dataset[[x for x in dataset.columns if x != 'class']].values
11+
labels = dataset['class'].apply(lambda x: x == 1)
12+
13+
# Train model
914
model = DecisionTreeClassifier()
10-
res = cross_val_score(model, dataset, labels, cv=10, scoring='roc_auc')
15+
res = cross_val_score(model, features, labels, cv=10, scoring='roc_auc')
1116
print(np.mean(res), np.std(res))

0 commit comments

Comments
 (0)