-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare-newsgroups.py
executable file
·80 lines (64 loc) · 2.29 KB
/
prepare-newsgroups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
import numpy as np
import spacy
from os.path import join
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from caipi import load, dump
# spaCy pipeline shared by the whole script; it is loaded once at import
# time with the 'parser' and 'ner' components disabled.
SPACY = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# Coarse part-of-speech tags a token must carry to be kept by simplify().
POS_TAGS = {
    'ADJ',
    'ADV',
    'NOUN',
    'VERB',
}
def simplify(line):
    """Return the lemmas of the content words in *line*, joined by spaces.

    The text is processed with the module-level ``SPACY`` pipeline.  A token
    contributes its lemma only when its coarse POS tag is in ``POS_TAGS``
    and its lemma is not the literal string ``'-PRON-'``.  Token order is
    preserved; a text with no qualifying token yields the empty string.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in SPACY(line)
        if tok.pos_ in POS_TAGS and tok.lemma_ != '-PRON-'
    ]
    return ' '.join(kept_lemmas)
# Fetch the two newsgroups that form the binary task, with headers and
# footers stripped from every post.
categories = ['alt.atheism', 'soc.religion.christian']
twenty = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=['headers', 'footers'],
    random_state=0,
)

# Reduce every post to its space-separated content-word lemmas.
docs = list(map(simplify, twenty.data))
# --- Stage 1: pick a small vocabulary with an L1-regularised linear model ---

# TF-IDF over the lemmatised documents; case is left untouched
# (lowercase=False) and the matrix is converted to a dense array.
vectorizer = TfidfVectorizer(lowercase=False)
X = vectorizer.fit_transform(docs).toarray()
y = twenty.target

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
try:
    vocabulary = np.array(vectorizer.get_feature_names_out())
except AttributeError:
    vocabulary = np.array(vectorizer.get_feature_names())

feature_selector = SGDClassifier(penalty='l1', random_state=0)
feature_selector.fit(X, y)
print('feature_selector acc =', feature_selector.score(X, y))

# Keep the words whose absolute coefficient is at least 1, ordered from the
# largest magnitude to the smallest.
coef = np.abs(feature_selector.coef_.ravel())
selected_indices = [i for i in coef.argsort()[::-1] if coef[i] >= 1]
selected_words = vocabulary[selected_indices]
print('# words =', len(vocabulary))
print('# selected words =', len(selected_words))
# --- Stage 2: build one word-level relevance mask per non-empty document ---

# Build the lookup once: testing `word in <ndarray>` scans the whole array
# for every word, whereas a set answers in constant time.
selected_lookup = set(selected_words.tolist())

docs2 = []  # documents that still contain at least one word
rats = []   # per-document masks of shape (1, n_words); 1 marks a selected word
keep = []   # indices into `docs` / `y` of the documents that were kept
for i, doc in enumerate(docs):
    words = doc.split()
    if not words:
        # Nothing survived lemmatisation/filtering; drop the document.
        continue
    # A distinct name is used for the position so the document index `i`
    # appended to `keep` below is never shadowed.
    indices = [pos for pos, word in enumerate(words)
               if word in selected_lookup]
    # NOTE: no %-formatting is applied, so '%%' is printed literally.
    print('%% relevant =', len(indices) / len(words))
    mask = np.zeros((1, len(words)))
    mask[0, indices] = 1
    print(mask)
    rats.append(mask)
    docs2.append(doc)
    keep.append(i)

# Labels restricted to the documents that were kept.
y2 = y[keep]
# --- Stage 3: refit on the reduced vocabulary and report training accuracy ---

# TF-IDF restricted to the selected words, computed on the kept documents.
vectorizer = TfidfVectorizer(
    lowercase=False,
    vocabulary=selected_words,
)
X2 = vectorizer.fit_transform(docs2).toarray()

# Same L1-penalised classifier as before; fit() returns the estimator itself.
feature_selector = SGDClassifier(penalty='l1', random_state=0).fit(X2, y2)
print('feature_selector acc =', feature_selector.score(X2, y2))
# Bundle the labels, the lemmatised documents and their word-level masks,
# then hand the bundle to caipi's dump() for storage under data/.
dataset = dict(
    y=y2,
    docs=docs2,
    explanations=rats,
)
dump(join('data', 'newsgroups.pickle'), dataset)