Commit fb2e49c

committed
Generating features with count vectorizer
1 parent 0f4b49a commit fb2e49c

File tree

1 file changed

+281
-0
lines changed


TextFeatures.ipynb

+281
@@ -0,0 +1,281 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas\n",
"import email\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import html2text\n",
"\n",
"dataset = pandas.read_msgpack('./data/development.msg', encoding='latin-1')\n",
"dataset['email'] = dataset['email'].apply(email.message_from_string)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"corpus = []\n",
"for (index, row) in dataset.iterrows():\n",
"    if row['class'] == 'spam':\n",
"        body = ''\n",
"        for part in row['email'].walk():\n",
"            if part.get_content_type() == 'text/plain':\n",
"                body = part.get_payload()\n",
"            elif part.get_content_type() == 'text/html':\n",
"                body = html2text.html2text(part.get_payload())\n",
"\n",
"        corpus.append(body)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "MemoryError",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mMemoryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-e950e1df90b3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mpca\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mRandomizedPCA\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn_components\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m50\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwhiten\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mpca_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpca\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwords_model\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32mc:\\python35\\lib\\site-packages\\scipy\\sparse\\compressed.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 920\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 921\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 922\u001b[0m \u001b[1;31m##############################################################\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[0;32mc:\\python35\\lib\\site-packages\\scipy\\sparse\\coo.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mB\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_process_toarray_args\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0mfortran\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf_contiguous\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfortran\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mc_contiguous\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[0;32mc:\\python35\\lib\\site-packages\\scipy\\sparse\\base.py\u001b[0m in \u001b[0;36m_process_toarray_args\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m 1007\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1008\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1009\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1010\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1011\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__numpy_ufunc__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpos\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMemoryError\u001b[0m: "
]
}
],
"source": [
"import io\n",
"\n",
"vectorizer = CountVectorizer(min_df=1)\n",
"words_model = vectorizer.fit_transform(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"from sklearn.decomposition import TruncatedSVD\n",
"pca = TruncatedSVD(n_components=200)\n",
"pca_model = pca.fit_transform(words_model)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0., 0., 0., ..., 0., 0., 0.],\n",
" [ 0., 0., 0., ..., 0., 0., 0.],\n",
" [ 0., 0., 0., ..., 0., 0., 0.],\n",
" ..., \n",
" [ 0., 0., 0., ..., 0., 0., 0.],\n",
" [ 0., 0., 0., ..., 0., 0., 0.],\n",
" [ 0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pca.transform(vectorizer.transform([corpus[0]]))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36091\n",
"200\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.pipeline import FeatureUnion\n",
"from sklearn.pipeline import Pipeline\n",
"import collections\n",
"import numpy as np\n",
"\n",
"\n",
"class FeatureTransformer(BaseEstimator, TransformerMixin):\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def generate(self, x):\n",
"        return x\n",
"\n",
"    def transform(self, data):\n",
"        features = []\n",
"\n",
"        for entry in data:\n",
"            features.append(self.generate(entry))\n",
"\n",
"        return features\n",
"\n",
"# Converts textual emails to python email objects\n",
"class EmailGenerator(BaseEstimator, TransformerMixin):\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def transform(self, data):\n",
"        import email\n",
"        data['email'] = data['email'].apply(email.message_from_string)\n",
"        return data\n",
"\n",
"class ContentTypeGenerator(BaseEstimator, TransformerMixin):\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def transform(self, data):\n",
"        check = ['x-world', 'application', 'text', 'text/plain', 'text/html', 'video', 'audio', 'image', 'drawing', 'model', 'multipart', 'x-conference', 'i-world', 'music', 'message', 'x-music', 'www', 'chemical', 'paleovu', 'windows', 'xgl']\n",
"        features = []\n",
"\n",
"        for entry in data:\n",
"            email = entry['email']\n",
"            output = collections.defaultdict(bool)\n",
"\n",
"            for part in email.walk():\n",
"                ct = part.get_content_type()\n",
"\n",
"                for kind in check:\n",
"                    output['has_' + kind] |= ct.startswith(kind)\n",
"\n",
"            features.append(output)\n",
"\n",
"        return features\n",
"\n",
"\n",
"class EmailCountsGenerator(BaseEstimator, TransformerMixin):\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def transform(self, data):\n",
"        features = []\n",
"\n",
"        for entry in data:\n",
"            email = str(entry['email'])\n",
"\n",
"            output = {\n",
"                'length': len(email),\n",
"                'spaces': email.count(' '),\n",
"                'newlines': email.count('\\n')\n",
"            }\n",
"\n",
"            features.append(output)\n",
"\n",
"        return features\n",
"\n",
"class ContentTypeGenerator(BaseEstimator, TransformerMixin):\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def transform(self, data):\n",
"        features = []\n",
"\n",
"        for entry in data:\n",
"            email = entry['email']\n",
"            output = {}\n",
"\n",
"            features.append(output)\n",
"\n",
"        return features\n",
"\n",
"class FeatureGeneratorBase(BaseEstimator, TransformerMixin):\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, x, y=None):\n",
"        return self\n",
"\n",
"    def transform(self, data):\n",
"        features = np.recarray(shape=(len(data),),\n",
"                               dtype=[('subject', object), ('body', object)])\n",
"\n",
"        return features\n",
"\n",
"\n",
"pipeline = Pipeline([\n",
"    ('transform_email', EmailGenerator()),\n",
"    ('features', FeatureUnion(\n",
"        transformer_list=[\n",
"            ('content_type_features', ContentTypeGenerator())\n",
"        ]\n",
"    ))\n",
"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
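
For context, the notebook's feature-generation approach is: build a sparse bag-of-words matrix over the spam bodies with CountVectorizer, then reduce it to 200 components with TruncatedSVD, which accepts the sparse matrix directly (the earlier RandomizedPCA attempt visible in the MemoryError traceback needed a dense toarray() copy and ran out of memory). A minimal standalone sketch of the same steps, using a small hypothetical corpus in place of the spam bodies collected in the second cell:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Hypothetical stand-in for the spam bodies gathered into the notebook's corpus list
corpus = ["cheap pills, buy now", "win a free prize now", "re: meeting notes attached"]

vectorizer = CountVectorizer(min_df=1)
counts = vectorizer.fit_transform(corpus)    # sparse document-term count matrix

svd = TruncatedSVD(n_components=2)           # the notebook uses n_components=200
features = svd.fit_transform(counts)         # dense (n_documents, n_components) array

# New documents are projected with transform(); note the input is a list of strings
new_features = svd.transform(vectorizer.transform([corpus[0]]))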
