Batchedquestions (#55)
* Initial commit

* Update README.md

* Update README.md

* added simple crawler by scrapy

* simple crawler

* simple crawler

* simple crawler

* simple crawler

* crawler moved to /scripts

* crawler moved to /scripts

* pipeline added for crawler which makes corpus file in standard format (each sentence on one line)

* analogy test added.

* corpus moved

* little changes in crawler

* changes in analogy similarity package

* helpers.py moved
question.txt removed
loadAnalogyDataset changed to load csv format

* analogy tester tested with dummy embedding model.

* removed IDE stuff

* changed crawler readme

* changed crawler readme

* option parser added to analogy_test script

* unknown words replaced by mean vector

* batched version tested

* analogy readme changed
abb4s authored and sehsanm committed Jan 3, 2019
1 parent a67051d commit 47b86f2
Showing 10 changed files with 262 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -105,3 +105,5 @@ venv.bak/

.scripts/crawler/corpus.txt

.vscode/**

1 change: 1 addition & 0 deletions code/analogy/helpers.py
@@ -5,3 +5,4 @@ def write_result_to_file(data_set_name , totals , corrects,path):
f.write(":dataset "+data_set_name+"\n")
for cat in totals:
f.write(cat+" "+str(totals[cat])+","+str(corrects[cat])+"\n")

72 changes: 42 additions & 30 deletions code/analogy/similarity.py
@@ -1,6 +1,10 @@
import numpy as np
from sklearn.preprocessing import normalize
from math import sqrt
from tqdm import tqdm
def batched(X,batchsize):
    s=[iter(X)]*batchsize
    return zip(*s)
def _cosinDistance(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

@@ -34,7 +38,7 @@ def getKNear(r1,r2,r3,model,thershold,method):
    return result


def getKNearBatch(X,model,method,thershold):
def getKNearBatch(X,model,method,thershold,batchsize):
    V=model.vectors
    # X: matrix of questions. the columns hold the string words w1,w2,w3; each row is
    # a single query
@@ -43,38 +47,46 @@ def getKNearBatch(X,model,method,thershold):
    # method : Cosine,Euclidean,PairDirection

    # for cosine distance :
    A=np.array(np.zeros((len(X),len(V[0]))))
    B=np.array(np.zeros((len(X),len(V[0]))))
    C=np.array(np.zeros((len(X),len(V[0]))))
    A=np.array(np.zeros((batchsize,len(V[0]))))
    B=np.array(np.zeros((batchsize,len(V[0]))))
    C=np.array(np.zeros((batchsize,len(V[0]))))
    meanVector=np.mean(model.vectors,axis=0)
    if method=="Cosine":
        for i,w in enumerate(X[:,0]):
            A[i]=model.getVec(w)
        for i,w in enumerate(X[:,1]):
            try:
                B[i]=model.getVec(w)
            except:
                print("\n have error for word: ",w)

        for i,w in enumerate(X[:,2]):
            C[i]=model.getVec(w)
        D=B-A+C
        Rw=[] # result answer words (k nearest neighbors to v2-v1+v3) for each question
        for c,b in enumerate(batched(X,batchsize)):
            print(c,"/",len(X)/batchsize)
            for i,x in enumerate(b):
                try:
                    A[i]=model.getVec(x[0])
                except:
                    A[i]=meanVector
                try:
                    B[i]=model.getVec(x[1])
                except:
                    B[i]=meanVector
                try:
                    C[i]=model.getVec(x[2])
                except:
                    C[i]=meanVector
            D=B-A+C

        # now D is an M(number of questions) x d(embedding dimension) matrix.
        # each row of D is a vector representing w2-w1+w3. we want the cosine distance of each
        # row against all vectors of the vocabulary; the result is an M(number of questions) x V(vocabulary size) matrix,
        # and then we select the K items of each row with the largest cosine similarity, giving an MxK matrix.
            # now D is an M(number of questions) x d(embedding dimension) matrix.
            # each row of D is a vector representing w2-w1+w3. we want the cosine distance of each
            # row against all vectors of the vocabulary; the result is an M(number of questions) x V(vocabulary size) matrix,
            # and then we select the K items of each row with the largest cosine similarity, giving an MxK matrix.

        # for computing cosine distance first we should normalize all vectors.
        D=np.array(D)
        V=np.array(V)
        nD=normalize(D,axis=1)
        nV=normalize(V,axis=1)
        R=np.matmul(nV,nD.T)
        R=R.argsort(axis=0)
        R=R[-thershold:,:]
        Rw=[]
        for r in R:
            Rw.append([model.words[int(id)] for id in r])
            # for computing cosine distance first we should normalize all vectors.
            D=np.array(D)
            V=np.array(V)
            nD=normalize(D,axis=1)
            nV=normalize(V,axis=1)
            R=np.matmul(nV,nD.T)
            R=R.argsort(axis=0)
            R=R[-thershold:,:].T
            bRw=[]
            for r in R:
                bRw.append([model.words[int(id)] for id in r])
            Rw+=bRw
        return Rw
    elif method=="PairDirection":
        for w,i in enumerate(X[0]):
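A note on the batched() helper added above: zip(*[iter(X)]*batchsize) yields only complete batches, so any questions left over after the last full batch are silently skipped (with a batch size of 50, up to 49 questions per dataset). A standalone sketch of that behaviour, using plain Python only:

def batched(X, batchsize):
    # same idiom as in similarity.py: zip() stops as soon as the shared
    # iterator is exhausted, so a trailing partial batch is dropped
    s = [iter(X)] * batchsize
    return zip(*s)

questions = list(range(7))            # pretend we have 7 analogy questions
print(list(batched(questions, 3)))    # [(0, 1, 2), (3, 4, 5)] -- question 6 is never evaluated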
193 changes: 191 additions & 2 deletions code/models/models.py
@@ -1,2 +1,191 @@
def loadmodel(path):
    return

"""
auther Mohamad M. Jafari
This file contains a CLASS for loading and useing embedding models!
Essential materials for dealing with [word2vec, gensim] models is implemented!
There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.
"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase
import io
class W2V():
    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary # list of vocab
        self.vectors = np.asarray(vectors) # numpy array, each row contains one word vector
                                           # (corresponding to vocab list)
        self._wordDict={vocabulary[i]:vectors[i] for i in range(0,len(vocabulary))}

    # magic method to get the vector at a given index
    def __getitem__(self, index):
        return self.vectors[index,:]
    # magic method to check whether a word is in the vocabulary
    def __contains__(self, word):
        return word in self.vocabulary

    # delete a word
    def __delitem__(self, word):
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    # length
    def __len__(self):
        return len(self.vocabulary)
    # iterator
    def __iter__(self):
        for w in self.vocabulary:
            yield w, self[self.vocabulary.index(w)]


    def getVec(self,word):
        return self.wordDict[word]
    @property
    def words(self):
        return self.vocabulary
    @property
    def wordDict(self):
        return self._wordDict
    @property
    def shape(self):
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        if ord == 2:
            ord = None # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            print(v)
        else:
            v = word
        dist = lambda v1, v2 : dot(v1, v2)/(norm(v1)*norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v,:], vectors[x,:]) for x in range(0, len(vectors))]
        return(sorted(distances, reverse=True)[1:1+k])
    @staticmethod
    def from_text(fname, encoding=False):
        words = []
        vectors = []
        if encoding:
            with open(fname, 'r', encoding="utf-8") as fin:
                for line in fin.readlines():
                    line = line.split(" ")
                    word, vector = line[0], [float(x) for x in line[1:]]
                    words.append(word)
                    vectors.append(vector)
            return W2V(vocabulary=words, vectors=vectors)
        else:
            with open(fname, 'r') as fin:
                for line in fin:
                    line = line.split(" ")
                    try:
                        if(len(line)>0):
                            word, vector = line[0], [float(x) for x in line[1:]]
                    except:
                        print("error in loading model in models.py by reading this ",line)
                        print("exited by error")
                        exit(0)
                    # word, vector = line[0], [float(x) for x in line[1:]]
                    words.append(word)
                    vectors.append(vector)
            return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def fasttext_from_text(fname):
        words = []
        vectors = []
        fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            vectors.append([float(t) for t in tokens[1:]])

        return W2V(vocabulary=words, vectors=vectors)
    @staticmethod
    def from_bin(fname):
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # to implement
    @staticmethod
    def to_word2vec(w, fname, binary=False):
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            for word, vector in zip(w.vocabulary.words, w.vectors):
                if binary:
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tostring()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
            vec, voc = wtov["vectors"], wtov["vocabulary"]
            return W2V(vocabulary=voc, vectors=vec)

    def save(self, fname):
        vec = self.vectors
        voc = self.vocabulary
        model = {"vectors":vec, "vocabulary":voc}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# w2vt = W2V.from_text(glove50dt, encoding=True)
# w2vt.save("test")
# w2vt = W2V.from_W2V("test")

#w2vb = W2V.from_bin(googb)
# print(w2vt.vocabulary[0])
# print(w2vt.vectors[0])
if __name__ == "__main__":
    vocab_size = 100
    embedding_dim = 300
    # create our simple test case!
    vocabulary = [''.join(choice(ascii_lowercase) for i in range(10))\
                  for j in range(0, vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # test our methods!
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    # print(my_w2v[vocabulary.index(vocabulary[10])])
    # print(my_w2v.shape)
    # print(my_w2v.words)
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    # print(len(my_w2v))
    # for word, vector in my_w2v:
    #     print(word, vector)
    tmp = my_w2v.vectors
    my_w2v.normalize_words(ord=2, inplace=True)
    # print(my_w2v.vectors==tmp)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))
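A minimal usage sketch for the W2V class above, assuming it is importable from code/models/models.py (the import path here is an assumption) and using the small data/models/sample.vec file added in this commit:

from models import W2V  # assumed import; adjust to the actual package layout

# sample.vec stores one word per line followed by its vector components
model = W2V.from_text("data/models/sample.vec", encoding=True)

print(model.shape)                            # (6, 2) for the sample file
print("تهران" in model)                       # membership check via __contains__
print(model.getVec("تهران"))                  # [4.0, 3.0]
print(model.nearest_neighbors("تهران", k=2))  # cosine similarities of the two nearest words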

3 changes: 1 addition & 2 deletions data/analogy/analogy.csv
@@ -1,5 +1,4 @@
CATEGORY,WORD1,WORD2,WORD3,TARGET
semantic-capitals,ایران,تهران,چین,پکن
semantic-capitals,ایران,تهران,چین,پکن
semantic-capitals,ایران,تهران,هند,دهلی‌نو
semantic-capitals,ایران,تهران,ژاپن,توکیو
semantic-capitals,ایران,تهران,فیلیپین,مانیل
6 changes: 6 additions & 0 deletions data/models/sample.vec
@@ -0,0 +1,6 @@
ایران 1 1
تهران 4 3
انگلیس 10 12
لندن 13 14
فرانسه 40 50
پاریس 43 52
10 changes: 10 additions & 0 deletions results/analogy/analogy.csv
@@ -0,0 +1,10 @@
:dataset analogy.csv
semantic-capitals 4691,3514
semantic-Whole to part 420,68
syntactic-Comparative Adjectives 2756,1927
syntactic-antonym 506,239
semantic-family 600,349
semantic-currency 6006,1298
semantic-capitals 1,1
syntactic-Superlative Adjectives 2756,1611
syntactic-verb 1964,708
2 changes: 2 additions & 0 deletions results/analogy/test.csv
@@ -0,0 +1,2 @@
:dataset test.csv
semantic-capitals 100,44
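Both result files follow the format produced by write_result_to_file in code/analogy/helpers.py: a `:dataset <name>` header followed by one `<category> <total>,<correct>` line per category. A small sketch (a hypothetical helper, not part of this commit) for turning such a file into accuracy percentages:

def read_accuracies(path):
    # parse "<category> <total>,<correct>" lines into {category: accuracy}
    accuracies = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(":dataset"):
                continue
            category, counts = line.rsplit(" ", 1)
            total, correct = (int(x) for x in counts.split(","))
            accuracies[category] = correct / total if total else 0.0
    return accuracies

for cat, acc in read_accuracies("results/analogy/analogy.csv").items():
    print(cat, format(acc, ".1%"))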
2 changes: 1 addition & 1 deletion scripts/test/README.md
@@ -19,4 +19,4 @@ options discriptions can be find by : `python3 -m scripts.test.analogy_test -h`

For now, similarity.getKnear() implements finding the k nearest vectors based on cosine, Euclidean, and pair-direction distances, but it is not an efficient method because it computes the distance of every word in the vocabulary to the vector d (which for each question equals v3+v2-v1).

similarity.getKnearBatch() implements finding the k nearest vectors based on cosine distance only. The method is expected to be fast because it uses matrix multiplication to compute distances, but it is not memory-efficient since it has to load the whole model (1-4 GB in size). The problem seems to be solved in https://github.com/kudkudak/word-embeddings-benchmarks/blob/master/web/analogy.py: they define each batch as a small subset of the model, e.g. 300 vectors per batch, find the nearest vector in each batch to (v3+v2-v1), and report that. So if we want the k nearest vectors we can divide our model into len(model.vectors)/k batches and find the nearest vector in each batch.
similarity.getKnearBatch() implements finding the k nearest vectors based on cosine distance only. The method is expected to be fast because it uses matrix multiplication to compute distances. It divides the questions into batches of 50 (see the sketch below).
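A minimal NumPy sketch of the batched cosine search described above (illustrative only; `queries` and `embeddings` are placeholder names, and the real implementation lives in `similarity.getKNearBatch`):

import numpy as np

def top_k_batch(queries, embeddings, k):
    # queries: (B, d) matrix of v2-v1+v3 vectors for one batch of questions
    # embeddings: (V, d) matrix of all vocabulary vectors
    q = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    e = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = q @ e.T                           # (B, V) cosine similarities
    return np.argsort(sims, axis=1)[:, -k:]  # indices of the k most similar vocabulary words

# toy example: one batch of 50 questions against a 1000-word, 300-dimensional vocabulary
rng = np.random.default_rng(0)
ids = top_k_batch(rng.normal(size=(50, 300)), rng.normal(size=(1000, 300)), k=10)
print(ids.shape)  # (50, 10)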
10 changes: 6 additions & 4 deletions scripts/test/analogy_test.py
@@ -48,15 +48,17 @@ def print_inplace(txt):
if(options.method=='batch'):
    X=[r["words"] for r in dataset['rows']]
    X=np.array(X)
    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold)
    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold,50)
    result=np.array(result)
    for i,row in enumerate(dataset["rows"]):

    # for i,row in enumerate(dataset["rows"]):
    rows=dataset['rows']
    for i,row in enumerate(result):
        row=rows[i]
        if row["category"] not in totals:
            totals[row["category"]]=0
            corrects[row["category"]]=0
        totals[row["category"]] = totals[row["category"]] + 1
        if(row["words"][3] in result[:,i]):
        if(row["words"][3] in result[i,:]):
            corrects[row["category"]]+=1
    write_result_to_file(dataset["name"] , totals , corrects,"results/analogy/"+dataset["name"])
else:
