-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare-reviews.py
executable file
·148 lines (114 loc) · 4.16 KB
/
prepare-reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
import re
import pickle
import numpy as np
import spacy
from os import listdir
from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from caipi import load, dump
# Cap on how many documents to read per sentiment class.  float('inf')
# means "no cap": the `k >= N_DOCUMENTS_PER_CLASS` guard in read_docs()
# is then always False.  (The original used np.nan, which only "worked"
# because every comparison against NaN is False -- inf states the intent.)
N_DOCUMENTS_PER_CLASS = float('inf')
METHOD = 'global'

# Make sure to download the dataset from:
#
# http://cs.jhu.edu/~ozaidan/rationales
#
# and uncompress it in data/review_polarity_rationales/

# Tagger-only pipeline: we need POS tags and lemmas, not parsing or NER.
SPACY = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Content-word POS tags kept by simplify().
POS_TAGS = {'ADJ', 'ADV', 'NOUN', 'VERB'}
# Rationale-annotation tag names used in the Zaidan et al. dataset,
# and their bracketed literal forms (e.g. '<POS>', '</NEG>').
RAT_TAGS = {'POS', '/POS', 'NEG', '/NEG'}
RAT_TAGS2 = {'<' + tag + '>' for tag in RAT_TAGS}
# One rationale span, e.g. "<POS> great movie </POS>".
# NOTE(review): opening/closing polarities are not required to match.
REGEX = re.compile(r'<(POS|NEG)> (?P<rationale>[^<>]*) </(POS|NEG)>')

# NOTE 'oscar winner'
# XXX what about negation? it would need POS tagging maybe
def simplify(line):
    """Reduce *line* to a space-joined sequence of content-word lemmas.

    Keeps the lemmas of adjectives, adverbs, nouns and verbs (skipping
    spaCy's '-PRON-' pronoun placeholder) and preserves rationale tags
    such as '<POS>' or '</NEG>' -- which the tokenizer splits into the
    three tokens '<', 'POS', '>' -- by re-assembling them.

    :param line: raw review text, possibly containing rationale markup.
    :returns: the kept tokens joined by single spaces.
    """
    tokens = SPACY(line)
    valid_lemmas = []
    for i, token in enumerate(tokens):
        if token.pos_ in POS_TAGS and token.lemma_ != '-PRON-':
            valid_lemmas.append(token.lemma_)
        # Re-assemble a rationale tag split into '<', NAME, '>'.  The
        # bounds check avoids the IndexError the original raised when the
        # tag name was the final token (tokens[i+1]), and the accidental
        # wrap-around of tokens[i-1] when i == 0.
        if (token.text in RAT_TAGS and
                0 < i < len(tokens) - 1 and
                tokens[i - 1].text == '<' and
                tokens[i + 1].text == '>'):
            valid_lemmas.append('<' + token.text + '>')
    return ' '.join(valid_lemmas)
def process_rats(line):
    """Strip rationale markup from *line* and build per-rationale masks.

    :param line: simplified review text where rationales are delimited by
        '<POS>'/'</POS>' or '<NEG>'/'</NEG>' tags (see REGEX).
    :returns: ``(clean_line, masks)`` where ``clean_line`` is *line* with
        the tag tokens removed, and ``masks`` is a (n_rationales, n_words)
        0/1 array whose row r flags the words of ``clean_line`` covered by
        the r-th rationale.  If the line has no rationales, returns
        ``(line, None)``.
    """
    matches = list(REGEX.finditer(line))
    if not matches:
        return line, None

    # Character offsets at which a segment starts, flagged with whether
    # that segment lies inside a rationale.  Pad with non-rationale
    # sentinels at the line's ends so consecutive pairs tile the line.
    ranges = []
    for match in matches:
        ranges.extend([(match.start(), True), (match.end(), False)])
    if ranges[0] != (0, True):
        ranges.insert(0, (0, False))
    if ranges[-1] != (len(line), False):
        ranges.append((len(line), False))

    # First pass: split each segment into words (dropping literal tag
    # tokens) and record the word-index span of each rationale segment.
    valid_words, spans = [], []
    for (s, is_rationale), (e, _) in zip(ranges, ranges[1:]):
        segment_words = [word for word in line[s:e].strip().split()
                         if word not in RAT_TAGS2]
        if is_rationale:
            spans.append((len(valid_words),
                          len(valid_words) + len(segment_words)))
        valid_words.extend(segment_words)

    # Second pass: one mask row per rationale, with columns sized to the
    # *cleaned* word count.  (The original sized columns by the tagged
    # line's word count, leaving spurious always-zero trailing columns.)
    masks = np.zeros((len(matches), len(valid_words)))
    for j, (a, b) in enumerate(spans):
        masks[j, a:b] = 1
    return ' '.join(valid_words), masks
def read_docs(base_path, label):
    """Read and preprocess up to N_DOCUMENTS_PER_CLASS review files.

    Each file under *base_path* is simplified to content-word lemmas by
    simplify() and split into clean text plus rationale masks by
    process_rats().

    :param base_path: directory holding one review per file.
    :param label: class label of this directory (+1 / -1).  Currently
        unused; kept for call-site compatibility.
    :returns: pair ``(docs, rats)`` -- list of cleaned documents and the
        parallel list of mask arrays (or None when a doc has no
        rationales).
    """
    docs, rats = [], []
    rel_paths = sorted(listdir(base_path))
    for k, rel_path in enumerate(rel_paths):
        if k >= N_DOCUMENTS_PER_CLASS:
            break
        print('processing {}/{} {}'.format(k + 1, len(rel_paths), rel_path))
        # The Zaidan et al. dataset ships as latin-1 text.
        # (Dropped the original's unused local `n` parsed from the name.)
        with open(join(base_path, rel_path), encoding='latin-1') as fp:
            doc = simplify(fp.read().strip())
        doc, masks = process_rats(doc)
        docs.append(doc)
        rats.append(masks)
    return docs, rats
import sys

# threshold=np.nan is rejected by numpy >= 1.14 (ValueError); sys.maxsize
# is numpy's documented way to disable array truncation when printing.
np.set_printoptions(threshold=sys.maxsize)

try:
    print('Loading...')
    y, docs, rats = load('reviews.pickle')
except Exception:
    # Cache miss (or unreadable cache): preprocess the raw dataset.  The
    # original used a bare `except:`; Exception at least lets Ctrl-C and
    # SystemExit propagate.
    print('Reading documents...')
    pos_docs, pos_rats = read_docs(join('data', 'review_polarity_rationales', 'withRats_pos'), +1)
    neg_docs, neg_rats = read_docs(join('data', 'review_polarity_rationales', 'withRats_neg'), -1)
    print('Saving...')
    y = np.array([+1] * len(pos_docs) + [-1] * len(neg_docs))
    docs = pos_docs + neg_docs
    rats = pos_rats + neg_rats
    dump('reviews.pickle', (y, docs, rats))

# Fit an L1 linear model on TF-IDF features and keep the top 20% of the
# vocabulary by |coefficient| as the "relevant" words.
vectorizer = TfidfVectorizer(lowercase=False)
X = vectorizer.fit_transform(docs).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the new
# API, fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = np.array(vectorizer.get_feature_names_out())
else:
    vocabulary = np.array(vectorizer.get_feature_names())
model = SGDClassifier(penalty='l1', random_state=0).fit(X, y)
coef = np.abs(model.coef_.ravel())
selected = coef.argsort()[-len(vocabulary) // 5:]
relevant_words = set(vocabulary[selected])
print('feature selector acc =', model.score(X, y))
print('# words =', len(vocabulary))
print('# relevant words =', len(relevant_words))

# Replace the human rationales with one machine "explanation" per doc:
# a 0/1 mask over its words flagging members of the relevant vocabulary.
rats = []
for doc in docs:
    words = doc.split()
    relevant_indices = [i for i in range(len(words))
                        if words[i] in relevant_words]
    print('# relevant in doc =', len(relevant_indices))
    mask = np.zeros((1, len(words)))
    mask[0, relevant_indices] = 1
    rats.append(mask)

dataset = {
    'y': y,
    'docs': docs,
    'explanations': rats,
}
with open(join('data', 'review_polarity_rationales.pickle'), 'wb') as fp:
    pickle.dump(dataset, fp)