Batchedquestions (#55)
* Initial commit

* Update README.md

* Update README.md

* added simple crawler by scrapy

* simple crawler

* simple crawler

* simple crawler

* simple crawler

* crawler moved to /scripts

* crawler moved to /scripts

* pipeline added for crawler which makes corpus file in standard format (each sentence on one line)

* analogy test added.

* corpus moved

* little changes in crawler

* changes in analogy similarity package

* helpers.py moved
question.txt removed
loadAnalogyDataset changed to load csv format

* analogy tester tested with dummy embedding model.

* removed IDE stuff

* changed crawler readme

* changed crawler readme

* option parser added to analogy_test script

* unknown words replaced by mean vector

* batched version tested

* analogy readme changed
abb4s authored and sehsanm committed Jan 3, 2019
1 parent a67051d commit 47b86f2
Showing 10 changed files with 262 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -105,3 +105,5 @@ venv.bak/

.scripts/crawler/corpus.txt

.vscode/**

1 change: 1 addition & 0 deletions code/analogy/helpers.py
@@ -5,3 +5,4 @@ def write_result_to_file(data_set_name , totals , corrects,path):
f.write(":dataset "+data_set_name+"\n")
for cat in totals:
f.write(cat+" "+str(totals[cat])+","+str(corrects[cat])+"\n")

72 changes: 42 additions & 30 deletions code/analogy/similarity.py
@@ -1,6 +1,10 @@
import numpy as np
from sklearn.preprocessing import normalize
from math import sqrt
from tqdm import tqdm
def batched(X,batchsize):
    s=[iter(X)]*batchsize
    return zip(*s)
def _cosinDistance(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

@@ -34,7 +38,7 @@ def getKNear(r1,r2,r3,model,thershold,method):
    return result


def getKNearBatch(X,model,method,thershold):
def getKNearBatch(X,model,method,thershold,batchsize):
    V=model.vectors
    # X: matrix of questions. the columns hold the string words w1,w2,w3; each row is
    # a single query
@@ -43,38 +47,46 @@ def getKNearBatch(X,model,method,thershold):
    # method : Cosine,Euclidean,PairDirection

    # for cosine distance :
    A=np.array(np.zeros((len(X),len(V[0]))))
    B=np.array(np.zeros((len(X),len(V[0]))))
    C=np.array(np.zeros((len(X),len(V[0]))))
    A=np.array(np.zeros((batchsize,len(V[0]))))
    B=np.array(np.zeros((batchsize,len(V[0]))))
    C=np.array(np.zeros((batchsize,len(V[0]))))
    meanVector=np.mean(model.vectors,axis=0)
    if method=="Cosine":
        for i,w in enumerate(X[:,0]):
            A[i]=model.getVec(w)
        for i,w in enumerate(X[:,1]):
            try:
                B[i]=model.getVec(w)
            except:
                print("\n have error for word: ",w)

        for i,w in enumerate(X[:,2]):
            C[i]=model.getVec(w)
        D=B-A+C
        Rw=[] # result answer words (k nearest neighbors to v2-v1+v3) for each question
        for c,b in enumerate(batched(X,batchsize)):
            print(c,"/",len(X)/batchsize)
            for i,x in enumerate(b):
                try:
                    A[i]=model.getVec(x[0])
                except:
                    A[i]=meanVector
                try:
                    B[i]=model.getVec(x[1])
                except:
                    B[i]=meanVector
                try:
                    C[i]=model.getVec(x[2])
                except:
                    C[i]=meanVector
            D=B-A+C

        # now D is an M(number of questions) x d(embedding dimension) matrix.
        # each row of D is a vector representing w2-w1+w3. we want the cosine distance of each
        # row against all vectors of the vocabulary; the result is an M(number of questions) x V(vocabulary size) matrix,
        # and then we select the K items of each row with the largest cosine similarity, giving an MxK matrix.
            # now D is an M(number of questions) x d(embedding dimension) matrix.
            # each row of D is a vector representing w2-w1+w3. we want the cosine distance of each
            # row against all vectors of the vocabulary; the result is an M(number of questions) x V(vocabulary size) matrix,
            # and then we select the K items of each row with the largest cosine similarity, giving an MxK matrix.

        # for computing cosine distance first we should normalize all vectors.
        D=np.array(D)
        V=np.array(V)
        nD=normalize(D,axis=1)
        nV=normalize(V,axis=1)
        R=np.matmul(nV,nD.T)
        R=R.argsort(axis=0)
        R=R[-thershold:,:]
        Rw=[]
        for r in R:
            Rw.append([model.words[int(id)] for id in r])
            # for computing cosine distance first we should normalize all vectors.
            D=np.array(D)
            V=np.array(V)
            nD=normalize(D,axis=1)
            nV=normalize(V,axis=1)
            R=np.matmul(nV,nD.T)
            R=R.argsort(axis=0)
            R=R[-thershold:,:].T
            bRw=[]
            for r in R:
                bRw.append([model.words[int(id)] for id in r])
            Rw+=bRw
        return Rw
    elif method=="PairDirection":
        for w,i in enumerate(X[0]):
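A note on the batched() helper added above: zip(*[iter(X)]*batchsize) yields only complete batches, so any questions left over after the last full batch are silently skipped (with a batch size of 50, up to 49 questions per dataset). A standalone sketch of that behaviour, using plain Python only:

def batched(X, batchsize):
    # same idiom as in similarity.py: zip() stops as soon as the shared
    # iterator is exhausted, so a trailing partial batch is dropped
    s = [iter(X)] * batchsize
    return zip(*s)

questions = list(range(7))            # pretend we have 7 analogy questions
print(list(batched(questions, 3)))    # [(0, 1, 2), (3, 4, 5)] -- question 6 is never evaluated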
193 changes: 191 additions & 2 deletions code/models/models.py
@@ -1,2 +1,191 @@
def loadmodel(path):
    return

"""
auther Mohamad M. Jafari
This file contains a CLASS for loading and useing embedding models!
Essential materials for dealing with [word2vec, gensim] models is implemented!
There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.
"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase
import io
class W2V():
    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary # list of vocab
        self.vectors = np.asarray(vectors) # numpy array, each row contains one word vector
                                           # (corresponding to vocab list)
        self._wordDict={vocabulary[i]:vectors[i] for i in range(0,len(vocabulary))}

    # magic method to get the vector at a given index
    def __getitem__(self, index):
        return self.vectors[index,:]
    # magic method to check whether a word is in the vocabulary
    def __contains__(self, word):
        return word in self.vocabulary

    # delete a word
    def __delitem__(self, word):
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    # length
    def __len__(self):
        return len(self.vocabulary)
    # iterator
    def __iter__(self):
        for w in self.vocabulary:
            yield w, self[self.vocabulary.index(w)]


    def getVec(self,word):
        return self.wordDict[word]
    @property
    def words(self):
        return self.vocabulary
    @property
    def wordDict(self):
        return self._wordDict
    @property
    def shape(self):
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        if ord == 2:
            ord = None # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            print(v)
        else:
            v = word
        dist = lambda v1, v2 : dot(v1, v2)/(norm(v1)*norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v,:], vectors[x,:]) for x in range(0, len(vectors))]
        return(sorted(distances, reverse=True)[1:1+k])
    @staticmethod
    def from_text(fname, encoding=False):
        words = []
        vectors = []
        if encoding:
            with open(fname, 'r', encoding="utf-8") as fin:
                for line in fin.readlines():
                    line = line.split(" ")
                    word, vector = line[0], [float(x) for x in line[1:]]
                    words.append(word)
                    vectors.append(vector)
            return W2V(vocabulary=words, vectors=vectors)
        else:
            with open(fname, 'r') as fin:
                for line in fin:
                    line = line.split(" ")
                    try:
                        if(len(line)>0):
                            word, vector = line[0], [float(x) for x in line[1:]]
                    except:
                        print("error in loading model in models.py by reading this ",line)
                        print("exited by error")
                        exit(0)
                    # word, vector = line[0], [float(x) for x in line[1:]]
                    words.append(word)
                    vectors.append(vector)
            return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def fasttext_from_text(fname):
        words = []
        vectors = []
        fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            vectors.append([float(t) for t in tokens[1:]])

        return W2V(vocabulary=words, vectors=vectors)
    @staticmethod
    def from_bin(fname):
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # to implement
    @staticmethod
    def to_word2vec(w, fname, binary=False):
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            for word, vector in zip(w.vocabulary.words, w.vectors):
                if binary:
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tostring()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
            vec, voc = wtov["vectors"], wtov["vocabulary"]
            return W2V(vocabulary=voc, vectors=vec)

    def save(self, fname):
        vec = self.vectors
        voc = self.vocabulary
        model = {"vectors":vec, "vocabulary":voc}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# w2vt = W2V.from_text(glove50dt, encoding=True)
# w2vt.save("test")
# w2vt = W2V.from_W2V("test")

#w2vb = W2V.from_bin(googb)
# print(w2vt.vocabulary[0])
# print(w2vt.vectors[0])
if __name__ == "__main__":
    vocab_size = 100
    embedding_dim = 300
    # create our simple test case!
    vocabulary = [''.join(choice(ascii_lowercase) for i in range(10))\
                  for j in range(0, vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # test our methods!
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    # print(my_w2v[vocabulary.index(vocabulary[10])])
    # print(my_w2v.shape)
    # print(my_w2v.words)
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    # print(len(my_w2v))
    # for word, vector in my_w2v:
    #     print(word, vector)
    tmp = my_w2v.vectors
    my_w2v.normalize_words(ord=2, inplace=True)
    # print(my_w2v.vectors==tmp)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))
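A minimal usage sketch for the W2V class above, assuming it is importable from code/models/models.py (the import path here is an assumption) and using the small data/models/sample.vec file added in this commit:

from models import W2V  # assumed import; adjust to the actual package layout

# sample.vec stores one word per line followed by its vector components
model = W2V.from_text("data/models/sample.vec", encoding=True)

print(model.shape)                            # (6, 2) for the sample file
print("تهران" in model)                       # membership check via __contains__
print(model.getVec("تهران"))                  # [4.0, 3.0]
print(model.nearest_neighbors("تهران", k=2))  # cosine similarities of the two nearest words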

3 changes: 1 addition & 2 deletions data/analogy/analogy.csv
@@ -1,5 +1,4 @@
CATEGORY,WORD1,WORD2,WORD3,TARGET
semantic-capitals,ایران,تهران,چین,پکن
semantic-capitals,ایران,تهران,چین,پکن
semantic-capitals,ایران,تهران,هند,دهلی‌نو
semantic-capitals,ایران,تهران,ژاپن,توکیو
semantic-capitals,ایران,تهران,فیلیپین,مانیل
6 changes: 6 additions & 0 deletions data/models/sample.vec
@@ -0,0 +1,6 @@
ایران 1 1
تهران 4 3
انگلیس 10 12
لندن 13 14
فرانسه 40 50
پاریس 43 52
10 changes: 10 additions & 0 deletions results/analogy/analogy.csv
@@ -0,0 +1,10 @@
:dataset analogy.csv
semantic-capitals 4691,3514
semantic-Whole to part 420,68
syntactic-Comparative Adjectives 2756,1927
syntactic-antonym 506,239
semantic-family 600,349
semantic-currency 6006,1298
semantic-capitals 1,1
syntactic-Superlative Adjectives 2756,1611
syntactic-verb 1964,708
2 changes: 2 additions & 0 deletions results/analogy/test.csv
@@ -0,0 +1,2 @@
:dataset test.csv
semantic-capitals 100,44
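Both result files follow the format produced by write_result_to_file in code/analogy/helpers.py: a `:dataset <name>` header followed by one `<category> <total>,<correct>` line per category. A small sketch (a hypothetical helper, not part of this commit) for turning such a file into accuracy percentages:

def read_accuracies(path):
    # parse "<category> <total>,<correct>" lines into {category: accuracy}
    accuracies = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(":dataset"):
                continue
            category, counts = line.rsplit(" ", 1)
            total, correct = (int(x) for x in counts.split(","))
            accuracies[category] = correct / total if total else 0.0
    return accuracies

for cat, acc in read_accuracies("results/analogy/analogy.csv").items():
    print(cat, format(acc, ".1%"))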
2 changes: 1 addition & 1 deletion scripts/test/README.md
@@ -19,4 +19,4 @@ options discriptions can be find by : `python3 -m scripts.test.analogy_test -h`

For now, similarity.getKnear() implements finding the k nearest vectors based on cosine, Euclidean, and pair-direction distances, but it is not an efficient method because it computes the distance of every word in the vocabulary to the vector d (which for each question equals v3+v2-v1).

similarity.getKnearBatch() implements finding the k nearest vectors based on cosine distance only. The method is expected to be fast because it uses matrix multiplication to compute distances, but it is not memory-efficient since it has to load the whole model (1-4 GB in size). The problem seems to be solved in https://github.com/kudkudak/word-embeddings-benchmarks/blob/master/web/analogy.py: they define each batch as a small subset of the model, e.g. 300 vectors per batch, find the nearest vector in each batch to (v3+v2-v1), and report that. So if we want the k nearest vectors we can divide our model into len(model.vectors)/k batches and find the nearest vector in each batch.
similarity.getKnearBatch() implements finding the k nearest vectors based on cosine distance only. The method is expected to be fast because it uses matrix multiplication to compute distances. It divides the questions into batches of 50 (see the sketch below).
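A minimal NumPy sketch of the batched cosine search described above (illustrative only; `queries` and `embeddings` are placeholder names, and the real implementation lives in `similarity.getKNearBatch`):

import numpy as np

def top_k_batch(queries, embeddings, k):
    # queries: (B, d) matrix of v2-v1+v3 vectors for one batch of questions
    # embeddings: (V, d) matrix of all vocabulary vectors
    q = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    e = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = q @ e.T                           # (B, V) cosine similarities
    return np.argsort(sims, axis=1)[:, -k:]  # indices of the k most similar vocabulary words

# toy example: one batch of 50 questions against a 1000-word, 300-dimensional vocabulary
rng = np.random.default_rng(0)
ids = top_k_batch(rng.normal(size=(50, 300)), rng.normal(size=(1000, 300)), k=10)
print(ids.shape)  # (50, 10)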
10 changes: 6 additions & 4 deletions scripts/test/analogy_test.py
@@ -48,15 +48,17 @@ def print_inplace(txt):
if(options.method=='batch'):
    X=[r["words"] for r in dataset['rows']]
    X=np.array(X)
    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold)
    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold,50)
    result=np.array(result)
    for i,row in enumerate(dataset["rows"]):

    # for i,row in enumerate(dataset["rows"]):
    rows=dataset['rows']
    for i,row in enumerate(result):
        row=rows[i]
        if row["category"] not in totals:
            totals[row["category"]]=0
            corrects[row["category"]]=0
        totals[row["category"]] = totals[row["category"]] + 1
        if(row["words"][3] in result[:,i]):
        if(row["words"][3] in result[i,:]):
            corrects[row["category"]]+=1
    write_result_to_file(dataset["name"] , totals , corrects,"results/analogy/"+dataset["name"])
else:
