## NMF, LDA portion of CMSC 726 Final Project
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer  # only for now, we should have word2vec
from sklearn.decomposition import NMF, LatentDirichletAllocation
import time
from scipy.sparse import csr_matrix
import nimfa
from word2vec_gen import *
from csv import DictReader, DictWriter

kTARGET_FIELD = 'correctAnswer'
kTEXT_FIELD = 'question'
kID_FIELD = 'id'
kA = 'answerA'
kB = 'answerB'
kC = 'answerC'
kD = 'answerD'

def topics_per_sent(exposures, topics, numtop):
    # map each sentence (row of the exposure matrix) to its numtop strongest topic vectors;
    # topics is now passed in explicitly instead of being read as a global
    topics_per_sentence = dict()
    for ii in range(exposures.shape[0]):  # for all sentences
        sentexp = exposures[ii]
        if ii in (100, 734, 2000):
            print(sentexp)  # MEANT TO TEST HOW SPARSE EXPOSURES ARE
        top_topics = sentexp.argsort()[:-numtop - 1:-1]
        topics_per_sentence[ii] = topics[top_topics]
    return topics_per_sentence
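
# Hypothetical usage sketch (this helper is meant to be called from the logreg file,
# per the note in main below); exposures is (n_sentences x n_topics) and topics is
# (n_topics x n_words):
#   sent_tops = topics_per_sent(exposures, topics, numtop=3)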

def export_topics(out_file, topics, feature_names, n_top_words=20):
    # write the n_top_words highest-weighted words for each topic, one block per topic
    # (previously the cap lived in a separate ii_max parameter and n_top_words was unused)
    output = open(out_file, 'w', encoding='utf-8')
    for top_id in range(topics.shape[0]):
        output.write("==========\t%d\t==========\n" % (top_id))
        topic = topics[top_id, :]
        ii = 0
        for word_id in reversed(np.argsort(topic)):
            ii += 1
            output.write("%s\t%g\n" % (feature_names[word_id], topic[word_id]))
            if n_top_words > 0 and ii >= n_top_words:
                break
    output.close()

def word_topics(out_file, topics, feature_names, top_num=25):
    # first, build a list of dictionaries, one row per word
    rows = [dict() for x in range(len(feature_names))]
    # loop through all words - the column order of feature_names and the count vectorizer match
    for ii in range(len(feature_names)):
        word = feature_names[ii]
        topic_vals = topics[:, ii]
        # indices into the topic vector, sorted by descending weight, capped at top_num
        top_id = list(np.argsort(topic_vals)[::-1][:top_num])
        rows[ii] = {'word': word, 'topicid': top_id, 'topicval': list(topic_vals[top_id])}
    keys = ['word', 'topicid', 'topicval']
    with open(out_file, 'w', newline='', encoding='utf-8') as output:
        f = DictWriter(output, keys)
        f.writeheader()
        f.writerows(rows)  # writerows, since rows is a list of dicts
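
# The resulting CSV has one row per word, e.g. (hypothetical values):
#   word,topicid,topicval
#   atom,"[3, 17, 0]","[4.2, 1.1, 0.9]"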

def print_top_words(model, feature_names, n_top_words):
    # print the n_top_words strongest words of each fitted topic
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

class Featurizer:
    # thin wrapper around a bag-of-words CountVectorizer
    def __init__(self):
        self.vectorizer = CountVectorizer(analyzer='word')

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)

    def feature_names(self):
        return self.vectorizer.get_feature_names()

# main function
if __name__ == "__main__":
    # inputfiletrain = "data/sci_train.csv"
    # inputfiletest = "data/sci_test.csv"
    ## USING WIKI CORPUS
    inputfiletrain = "data/wiki_corpus.txt"
    # inputvocab = "data/vocab.txt"
    train = list(open(inputfiletrain, 'r', encoding='utf-8'))
    # vocab = list(open(inputvocab, 'r', encoding='utf-8'))
    print(len(train))
    N = len(train)
    feat = Featurizer()
    x_train = feat.train_feature(train)
    # print(x_train[0])

    #### MAIN PARAMETER ##########
    n_topics = 20  # for now
    ##################################
    feat_names = feat.feature_names()

    ## PARAMETERS FOR NMF
    # # n_components = # of topics guessed
    # # tol - default is 1e-4, tightened here for greater granularity of the solution; run time is OK
    # # max_iter - default 200
    # # random_state - if we want to seed with 1, 2, 3, etc.
    # # alpha - the constant regularization coefficient, applied to both W and H;
    # #   it scales how much regularization is done: 0.1 means not much, 1 means a lot
    # # l1_ratio - what is typically 'alpha' in Elastic Net discussions: 0 means ridge, 1 means LASSO
    # current set-up --> no Elastic Net params (alpha = 0)
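    # # For reference, a sketch of how alpha and l1_ratio combine (assuming the
    # # sklearn-0.17-era NMF API used here): the penalty added to the loss
    # # 0.5 * ||X - WH||_F^2 is
    # #   alpha * l1_ratio * (||W||_1 + ||H||_1)
    # #   + 0.5 * alpha * (1 - l1_ratio) * (||W||_F^2 + ||H||_F^2)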

    nmf = NMF(n_components=n_topics, tol=1e-6, max_iter=500, alpha=0.0, l1_ratio=0.0)
    # print_top_words(nmf, feat_names, 10)
    exposures = nmf.fit_transform(x_train)  # fit once; fit_transform both fits and returns W
    topics = nmf.components_
    print(topics.shape)
    print()

    # ## SAVING TO FILES NOW; do topics_per_sent in the logreg file
    np.savetxt("topics_nmf_noreg.csv", topics, delimiter=",")
    np.savetxt("exposures_nmf_noreg.csv", exposures, delimiter=",")
    # calling the topic - top words function
    export_topics("nmf_noreg_topicwords.txt", topics, feat_names, 20)
    word_topics("nmf_noreg_words.csv", topics, feat_names, 25)
    ## WRITING TO FILE WITH TOP WORDS IN TOPICS, ALONG WITH THE VALUE IN THE BETA/TOPICS VECTOR
    print("#\n Done NMF, sklearn, no EN \n #")

    nmf = NMF(n_components=n_topics, tol=1e-6, max_iter=500, alpha=0.9, l1_ratio=1.0)
    # print_top_words(nmf, feat_names, 10)
    exposures = nmf.fit_transform(x_train)
    topics = nmf.components_
    np.savetxt("topics_nmf_lasso.csv", topics, delimiter=",")
    np.savetxt("exposures_nmf_lasso.csv", exposures, delimiter=",")
    export_topics("nmf_lasso_topicwords.txt", topics, feat_names, 20)
    word_topics("nmf_lasso_words.csv", topics, feat_names, 25)
    print("#\n Done NMF, sklearn, LASSO, a lot of regularization\n #")

    nmf = NMF(n_components=n_topics, tol=1e-6, max_iter=500, alpha=0.9, l1_ratio=0.0)
    # print_top_words(nmf, feat_names, 20)
    exposures = nmf.fit_transform(x_train)
    topics = nmf.components_
    np.savetxt("topics_nmf_ridge.csv", topics, delimiter=",")
    np.savetxt("exposures_nmf_ridge.csv", exposures, delimiter=",")
    export_topics("nmf_ridge_topicwords.txt", topics, feat_names, 20)
    word_topics("nmf_ridge_words.csv", topics, feat_names, 25)
    print("#\n Done NMF, sklearn, RIDGE, a lot of regularization\n #")

    nmf = NMF(n_components=n_topics, tol=1e-6, max_iter=500, alpha=0.9, l1_ratio=0.5)
    # print_top_words(nmf, feat_names, 20)
    exposures = nmf.fit_transform(x_train)
    topics = nmf.components_
    np.savetxt("topics_nmf_en.csv", topics, delimiter=",")
    np.savetxt("exposures_nmf_en.csv", exposures, delimiter=",")
    export_topics("nmf_en_topicwords.txt", topics, feat_names, 20)
    word_topics("nmf_en_words.csv", topics, feat_names, 25)
    print("#\n Done NMF, sklearn, EN = even, less regularization\n #")

    #######################################################################################
    # ## LDA part
    # # n_topics --> number of topics, K
    # # doc_topic_prior --> alpha parameter of the document-topic distribution (assumed the same for
    # #   each topic, since we assume topics are evenly distributed)
    # #   WANT THIS TO BE SMALL (SO SPARSE, since we assume a small # of topics, or really 1, per sentence)
    # # topic_word_prior --> prior of the beta dist, i.e. the word distribution for each topic;
    # #   both defaults are 1/n_topics, which is fairly sparse, but this will be important to tune.
    # #   maybe we should assume doc_topic is more sparse than topic_word?
    # # max_iter --> default 10, set to 100
    # # batch_size --> used only for online learning, default 128; the number of documents per
    # #   online step, so not totally stochastic
    # # mean_change_tol --> default 1e-3, stopping criterion for updating the doc-topic distribution
    # # max_doc_update_iter --> cap if you stop at a certain iteration instead of on tolerance, default 100
    lda = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=0.05, topic_word_prior=0.05,
                                    learning_method='online', max_iter=100, mean_change_tol=1e-5,
                                    max_doc_update_iter=500)
    lda.fit(x_train)
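    # Optional diagnostic sketch (not part of the original pipeline): sklearn's LDA
    # exposes perplexity(); lower is better, useful when tuning the priors above.
    # print("LDA perplexity: %g" % lda.perplexity(x_train))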
    # print_top_words(lda, feat_names, 20)  # 20 top words
    exposures = lda.transform(x_train)
    topics = lda.components_
    np.savetxt("topics_lda_sparse.csv", topics, delimiter=",")
    np.savetxt("exposures_lda_sparse.csv", exposures, delimiter=",")
    export_topics("lda_sparse_topicwords.txt", topics, feat_names, 20)
    word_topics("lda_sparse_words.csv", topics, feat_names, 25)
    print("# \n Done LDA part, strict sparsity of prior \n #")

    lda = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=0.25, topic_word_prior=0.05,
                                    learning_method='online', max_iter=100, mean_change_tol=1e-5,
                                    max_doc_update_iter=500)
    lda.fit(x_train)
    # print_top_words(lda, feat_names, 20)  # 20 top words
    exposures = lda.transform(x_train)
    topics = lda.components_
    np.savetxt("topics_lda_non_sparse.csv", topics, delimiter=",")
    np.savetxt("exposures_lda_non_sparse.csv", exposures, delimiter=",")
    export_topics("lda_non_topicwords.txt", topics, feat_names, 20)
    word_topics("lda_non_words.csv", topics, feat_names, 25)
    print("# \n Done LDA part, little sparsity of prior \n #")

    # ######################################################
    # #### KL divergence tries
    # # NMF KL - NIMFA style
    klnmf = nimfa.Nmf(x_train.T, max_iter=100, n_run=1, rank=n_topics, update='divergence', objective='div')
    klnmf_fit = klnmf()
    # here, W is M x K and H is K x N; W is the basis, H the abundances, so take transposes
    topics = klnmf_fit.basis()
    print(type(topics))
    exposures = klnmf_fit.coef()
    # convert the scipy sparse factors to dense numpy arrays
    # (the previous bare .todense() calls discarded their results and were no-ops)
    topics = topics.toarray()
    exposures = exposures.toarray()
    # transpose in order to match the matrix dimensions of the previous methods
    topics = np.transpose(topics)
    exposures = np.transpose(exposures)
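    # Optional sanity check sketch (follows the shape reasoning in the comments above):
    # topics should now be (n_topics x n_words) and exposures (n_sentences x n_topics).
    assert topics.shape == (n_topics, len(feat_names))
    assert exposures.shape == (N, n_topics)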
    np.savetxt("topics_klnmf.csv", topics, delimiter=",")
    np.savetxt("exposures_klnmf.csv", exposures, delimiter=",")
    export_topics("nimfa_topicwords.txt", topics, feat_names, 20)
    word_topics("nimfa_kl_words.csv", topics, feat_names, 20)
    print(" # \n Done nimfa, KL divergence \n #")