From 47b86f2443a08a87ca1de951af535c96f3f85754 Mon Sep 17 00:00:00 2001 From: abbas Date: Thu, 3 Jan 2019 13:36:17 +0330 Subject: [PATCH] Batchedquestions (#55) * Initial commit * Update README.md * Update README.md * added simlpe crawler by scrapy * simple crawler * simple crawler * simple crawler * simple crawler * crawler moved to /scripts * crawler moved to /scripts * pipline added for crawler which make corpus file in standard format(each sentence in on line) * analogy test added. * corpus moved * little changes in crawler * changes in analogy similarity package * helpers.py moved question.txt removed loadAnalogyDataset changed to load csv format` * analogy tester tested with dummy embedding model. * removed ide stuffs * changed crawler readme * changed crawler readme * option parser added to analogy_test script * unknown words replaced by mean vector * bathced version tested * analogy readme changed --- .gitignore | 2 + code/analogy/helpers.py | 1 + code/analogy/similarity.py | 72 +++++++------ code/models/models.py | 193 ++++++++++++++++++++++++++++++++++- data/analogy/analogy.csv | 3 +- data/models/sample.vec | 6 ++ results/analogy/analogy.csv | 10 ++ results/analogy/test.csv | 2 + scripts/test/README.md | 2 +- scripts/test/analogy_test.py | 10 +- 10 files changed, 262 insertions(+), 39 deletions(-) create mode 100644 data/models/sample.vec create mode 100644 results/analogy/analogy.csv create mode 100644 results/analogy/test.csv diff --git a/.gitignore b/.gitignore index 107a764..4d9bdf3 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,5 @@ venv.bak/ .scripts/crawler/corpus.txt +.vscode/** + diff --git a/code/analogy/helpers.py b/code/analogy/helpers.py index 1ba9252..04244b1 100644 --- a/code/analogy/helpers.py +++ b/code/analogy/helpers.py @@ -5,3 +5,4 @@ def write_result_to_file(data_set_name , totals , corrects,path): f.write(":dataset "+data_set_name+"\n") for cat in totals: f.write(cat+" "+str(totals[cat])+","+str(corrects[cat])+"\n") + diff --git a/code/analogy/similarity.py b/code/analogy/similarity.py index 90c1f75..c503f28 100644 --- a/code/analogy/similarity.py +++ b/code/analogy/similarity.py @@ -1,6 +1,10 @@ import numpy as np from sklearn.preprocessing import normalize from math import sqrt +from tqdm import tqdm +def batched(X,batchsize): + s=[iter(X)]*batchsize + return zip(*s) def _cosinDistance(v1,v2): return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2)) @@ -34,7 +38,7 @@ def getKNear(r1,r2,r3,model,thershold,method): return result -def getKNearBatch(X,model,method,thershold): +def getKNearBatch(X,model,method,thershold,batchsize): V=model.vectors # X: matrice of questions. 
in each column is a list of string words w1,w2,w3.each row is # a single query @@ -43,38 +47,46 @@ def getKNearBatch(X,model,method,thershold): # method : Cosine,Euclidean,PairDirection # for cosine distance : - A=np.array(np.zeros((len(X),len(V[0])))) - B=np.array(np.zeros((len(X),len(V[0])))) - C=np.array(np.zeros((len(X),len(V[0])))) + A=np.array(np.zeros((batchsize,len(V[0])))) + B=np.array(np.zeros((batchsize,len(V[0])))) + C=np.array(np.zeros((batchsize,len(V[0])))) + meanVector=np.mean(model.vectors,axis=0) if method=="Cosine": - for i,w in enumerate(X[:,0]): - A[i]=model.getVec(w) - for i,w in enumerate(X[:,1]): - try: - B[i]=model.getVec(w) - except: - print("\n have error for word: ",w) - - for i,w in enumerate(X[:,2]): - C[i]=model.getVec(w) - D=B-A+C + Rw=[] # result answer words( k neares neighbors to v2-v1+v3) to each question + for c,b in enumerate(batched(X,batchsize)): + print(c,"/",len(X)/batchsize) + for i,x in enumerate(b): + try: + A[i]=model.getVec(x[0]) + except: + A[i]=meanVector + try: + B[i]=model.getVec(x[1]) + except: + B[i]=meanVector + try: + C[i]=model.getVec(x[2]) + except: + C[i]=meanVector + D=B-A+C - # now D is a d(embedding dimension)xM(number of questions) matrice. - # each row of D is a vector representing w2-w1+w3. we want to get cosine distance of each - # row by all vectors of vocabulary. the result should be a matrice by M(number of questions)xV(vocabulary size) - # and then we select K minimum item of each row which make matrice to have MxK dimension. + # now D is a d(embedding dimension)xM(number of questions) matrice. + # each row of D is a vector representing w2-w1+w3. we want to get cosine distance of each + # row by all vectors of vocabulary. the result should be a matrice by M(number of questions)xV(vocabulary size) + # and then we select K minimum item of each row which make matrice to have MxK dimension. - # for computing cosine distance first we should normalize all vectors. - D=np.array(D) - V=np.array(V) - nD=normalize(D,axis=1) - nV=normalize(V,axis=1) - R=np.matmul(nV,nD.T) - R=R.argsort(axis=0) - R=R[-thershold:,:] - Rw=[] - for r in R: - Rw.append([model.words[int(id)] for id in r]) + # for computing cosine distance first we should normalize all vectors. + D=np.array(D) + V=np.array(V) + nD=normalize(D,axis=1) + nV=normalize(V,axis=1) + R=np.matmul(nV,nD.T) + R=R.argsort(axis=0) + R=R[-thershold:,:].T + bRw=[] + for r in R: + bRw.append([model.words[int(id)] for id in r]) + Rw+=bRw return Rw elif method=="PairDirection": for w,i in enumerate(X[0]): diff --git a/code/models/models.py b/code/models/models.py index 244ebef..59f5408 100644 --- a/code/models/models.py +++ b/code/models/models.py @@ -1,2 +1,191 @@ -def loadmodel(path): - return \ No newline at end of file + +""" + auther Mohamad M. Jafari + + This file contains a CLASS for loading and useing embedding models! + + Essential materials for dealing with [word2vec, gensim] models is implemented! + + There are some extra facilities to perform bunch of other operation on models, + like reordering, changeing format and etc. 
+ +""" + + + +import numpy as np +import pickle +import gensim +from numpy import dot +from numpy.linalg import norm +from random import choice +from string import ascii_lowercase +import io +class W2V(): + def __init__(self, vocabulary, vectors): + self.vocabulary = vocabulary # list of vocab + self.vectors = np.asarray(vectors) # numpy array, each row contains one word vector + # (corresponding to vocab list) + self._wordDict={vocabulary[i]:vectors[i] for i in range(0,len(vocabulary))} + + # magic method for reach vector corresponding to word + def __getitem__(self, index): + return self.vectors[index,:] + # magic method of containing special word + def __contains__(self, word): + return word in self.vocabulary + + # delition word + def __delitem__(self, word): + index = self.vocabulary.index(word) + del self.vocabulary[index] + self.vectors = np.delete(self.vectors, index, 0) + + # length + def __len__(self): + return len(self.vocabulary) + # iterator + def __iter__(self): + for w in self.vocabulary: + yield w, self[vocabulary.index(w)] + + + def getVec(self,word): + return self.wordDict[word] + @property + def words(self): + return self.vocabulary + @property + def wordDict(self): + return self._wordDict + @property + def shape(self): + return self.vectors.shape + + def normalize_words(self, ord=2, inplace=False): + if ord == 2: + ord = None # numpy uses this flag to indicate l2. + vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1) + if inplace: + self.vectors = vectors.T + return self + return W2V(vectors=vectors.T, vocabulary=self.vocabulary) + + def nearest_neighbors(self, word, k=1): + if isinstance(word, str): + assert word in self, "invalid word!" + v = self.vocabulary.index(word) + print(v) + else: + v = word + dist = lambda v1, v2 : dot(v1, v2)/(norm(v1)*norm(v2)) + vectors = self.vectors + distances = [dist(vectors[v,:], vectors[x,:]) for x in range(0, len(vectors))] + return(sorted(distances, reverse=True)[1:1+k]) + @staticmethod + def from_text(fname, encoding=False): + words = [] + vectors = [] + if encoding: + with open(fname, 'r', encoding="utf-8") as fin: + for line in fin.readlines(): + line = line.split(" ") + word, vector = line[0], [float(x) for x in line[1:]] + words.append(word) + vectors.append(vector) + return W2V(vocabulary=words, vectors=vectors) + else: + with open(fname, 'r') as fin: + for line in fin: + line = line.split(" ") + try: + if(len(line)>0): + word, vector = line[0], [float(x) for x in line[1:]] + except: + print("error in loading model in modles.py by reading this ",line) + print("exited by error") + exit(0) + # word, vector = line[0], [float(x) for x in line[1:]] + words.append(word) + vectors.append(vector) + return W2V(vocabulary=words, vectors=vectors) + + @staticmethod + def fasttext_from_text(fname): + words = [] + vectors = [] + fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') + n, d = map(int, fin.readline().split()) + data = {} + for line in fin: + tokens = line.rstrip().split(' ') + words.append(tokens[0]) + vectors.append([float(t) for t in tokens[1:]]) + + return W2V(vocabulary=words, vectors=vectors) + @staticmethod + def from_bin(fname): + model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True) +# to implement + @staticmethod + def to_word2vec(w, fname, binary=False): + with open(fname, 'wb') as fout: + header = "%s %s\n" % w.vectors.shape + fout.write(header.encode("utf-8")) + for word, vector in zip(w.vocabulary.words, w.vectors): + if binary: + line = 
word.encode("utf-8") + b" " + vector.astype("float32").tostring() + fout.write(line) + else: + line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector)) + fout.write(line.encode("utf-8")) + + @staticmethod + def from_W2V(fname): + with open(fname, 'rb') as fin: + wtov = pickle.load(fin) + vec, voc = wtov["vectors"], wtov["vocabulary"] + return W2V(vocabulary=voc, vectors=vec) + + def save(self, fname): + vec = self.vectors + voc = self.vocabulary + model = {"vectors":vec, "vocabulary":voc} + with open(fname, 'wb') as fout: + pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL) + +glove50dt = "glove.6B.50d.txt" +wikifab = "wiki.fa.bin" +googb = "GoogleNews-vectors-negative300.bin" + +# w2vt = W2V.from_text(glove50dt, encoding=True) +# w2vt.save("test") +# w2vt = W2V.from_W2V("test") + +#w2vb = W2V.from_bin(googb) +# print(w2vt.vocabulary[0]) +# print(w2vt.vectors[0]) +if __name__ == "__main__": + vocab_size = 100 + embedding_dim = 300 + # create our simple test case! + vocabulary = [''.join(choice(ascii_lowercase) for i in range(10))\ + for j in range(0, vocab_size)] + vectors = np.random.random((vocab_size, embedding_dim)) + # test our methods! + my_w2v = W2V(vocabulary, vectors) + my_w2v.save("model") + my_w2v = W2V.from_W2V("model") +# print(my_w2v[vocabulary.index(vocabulary[10])]) +# print(my_w2v.shape) +# print(my_w2v.words) + del my_w2v[vocabulary[10]] + print(my_w2v.shape) +# print(len(my_w2v)) +# for word, vector in my_w2v: +# print(word, vector) + tmp = my_w2v.vectors + my_w2v.normalize_words(ord=2, inplace=True) +# print(my_w2v.vectors==tmp) + print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10))) + diff --git a/data/analogy/analogy.csv b/data/analogy/analogy.csv index ab66c1c..3dbe18b 100644 --- a/data/analogy/analogy.csv +++ b/data/analogy/analogy.csv @@ -1,5 +1,4 @@ -CATEGORY,WORD1,WORD2,WORD3,TARGET -semantic-capitals,ایران,تهران,چین,پکن +semantic-capitals,ایران,تهران,چین,پکن semantic-capitals,ایران,تهران,هند,دهلی‌نو semantic-capitals,ایران,تهران,ژاپن,توکیو semantic-capitals,ایران,تهران,فیلیپین,مانیل diff --git a/data/models/sample.vec b/data/models/sample.vec new file mode 100644 index 0000000..45dc0bb --- /dev/null +++ b/data/models/sample.vec @@ -0,0 +1,6 @@ +ایران 1 1 +تهران 4 3 +انگلیس 10 12 +لندن 13 14 +فرانسه 40 50 +پاریس 43 52 \ No newline at end of file diff --git a/results/analogy/analogy.csv b/results/analogy/analogy.csv new file mode 100644 index 0000000..822414b --- /dev/null +++ b/results/analogy/analogy.csv @@ -0,0 +1,10 @@ +:dataset analogy.csv +semantic-capitals 4691,3514 +semantic-Whole to part 420,68 +syntactic-Comparative Adjectives 2756,1927 +syntactic-antonym 506,239 +semantic-family 600,349 +semantic-currency 6006,1298 +semantic-capitals 1,1 +syntactic-Superlative Adjectives 2756,1611 +syntactic-verb 1964,708 diff --git a/results/analogy/test.csv b/results/analogy/test.csv new file mode 100644 index 0000000..3a8ed4d --- /dev/null +++ b/results/analogy/test.csv @@ -0,0 +1,2 @@ +:dataset test.csv +semantic-capitals 100,44 diff --git a/scripts/test/README.md b/scripts/test/README.md index ed7c806..25d123e 100644 --- a/scripts/test/README.md +++ b/scripts/test/README.md @@ -19,4 +19,4 @@ options discriptions can be find by : `python3 -m scripts.test.analogy_test -h` for now similarity.getKnear() implement finding k nearest vectors based on cosine , euclidean and pair direction distances . but it is not a efficient method because it compute distance of each word in vocabulary by vector d(which for each question equal to v3+v2-v1). 
-similarity.getKnearBatch() implement finding k nearest vectors just based on cosine distance. the method is expected to be fast because of using matrix multiplication to compute distances. but it is not efficient in memory usage since it should load all of the model (with size of 1-4G). the problem seems be solved in https://github.com/kudkudak/word-embeddings-benchmarks/blob/master/web/analogy.py. they solve the problem by defining each batch as small subset of model like 300 vector in each batch and they find nearest vector in each batch to (v3+v2-v1) and they report that.so if we want k nearest vector we can divide our model to len(model.vectors)/k batches and find the nearest vector in each batch.
+similarity.getKNearBatch() implements finding the k nearest vectors based only on cosine distance. The method is expected to be fast because it uses matrix multiplication to compute the distances. It splits the questions into batches of 50.
diff --git a/scripts/test/analogy_test.py b/scripts/test/analogy_test.py
index 2727c14..a806de4 100644
--- a/scripts/test/analogy_test.py
+++ b/scripts/test/analogy_test.py
@@ -48,15 +48,17 @@ def print_inplace(txt):
 if(options.method=='batch'):
     X=[r["words"] for r in dataset['rows']]
     X=np.array(X)
-    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold)
+    result=similarity.getKNearBatch(X,model,"Cosine",options.thereshold,50)
     result=np.array(result)
-    for i,row in enumerate(dataset["rows"]):
-
+    # for i,row in enumerate(dataset["rows"]):
+    rows=dataset['rows']
+    for i,row in enumerate(result):
+        row=rows[i]
         if row["category"] not in totals:
             totals[row["category"]]=0
             corrects[row["category"]]=0
         totals[row["category"]] = totals[row["category"]] + 1
-        if(row["words"][3] in result[:,i]):
+        if(row["words"][3] in result[i,:]):
             corrects[row["category"]]+=1
     write_result_to_file(dataset["name"] , totals , corrects,"results/analogy/"+dataset["name"])
 else:
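
Note (not part of the patch): the batched evaluation this PR implements can be summarised in a short standalone sketch: build the query matrix D = B - A + C for each batch of questions, substitute the mean vector for out-of-vocabulary words, normalise D and the vocabulary matrix, take their dot product to get cosine similarities, and count a question as correct when the target word appears among the k highest-scoring vocabulary entries. The names below (`batched_analogy_accuracy`, `vectors`, `word_to_index`) are illustrative assumptions, not the repository's API; they stand in for model.vectors / model.getVec() and the rows loaded by analogy_test.py.

```python
# Illustrative sketch only -- not the repository's code or API.
import numpy as np

def batched_analogy_accuracy(questions, vectors, word_to_index, k=10, batch_size=50):
    """questions: list of (w1, w2, w3, target) tuples.
    vectors: (V, d) array of embeddings; word_to_index maps word -> row index."""
    index_to_word = {i: w for w, i in word_to_index.items()}
    mean_vec = vectors.mean(axis=0)                       # fallback for unknown words
    # Normalise the vocabulary once, so cosine similarity reduces to a dot product.
    nv = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    def lookup(word):
        idx = word_to_index.get(word)
        return vectors[idx] if idx is not None else mean_vec

    correct = 0
    for start in range(0, len(questions), batch_size):
        batch = questions[start:start + batch_size]
        A = np.stack([lookup(q[0]) for q in batch])
        B = np.stack([lookup(q[1]) for q in batch])
        C = np.stack([lookup(q[2]) for q in batch])
        D = B - A + C                                     # one analogy query per row
        nd = D / np.linalg.norm(D, axis=1, keepdims=True)
        sims = nd @ nv.T                                  # (batch, V) cosine similarities
        topk = np.argsort(sims, axis=1)[:, -k:]           # k most similar vocabulary indices per query
        for neighbours, question in zip(topk, batch):
            if question[3] in {index_to_word[i] for i in neighbours}:
                correct += 1
    return correct / len(questions)
```

One design difference worth noting: plain slicing keeps the final partial batch, whereas the `zip(*[iter(X)]*batchsize)` trick used by `batched()` in similarity.py silently drops any questions that do not fill a complete batch.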