doc2vectrain.py
# Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' tokenizer data

# data = ["I love machine learning. Its awesome.", "I love coding in python", "I love building chatbots", "they chat amazingly well"]

# Read the training corpus: one paragraph per line
data = []
with open("paragraphs.txt", "r") as f:
    for line in f:
        data.append(line.strip())

# Tag each paragraph with its line index so Doc2Vec learns one vector per document
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
               for i, _d in enumerate(data)]

max_epochs = 100
vec_size = 200
alpha = 0.025

# dm=1 selects the distributed-memory (PV-DM) training algorithm
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")