From d48cd0f9f79733047534df619afa5e9e26b94f2b Mon Sep 17 00:00:00 2001
From: Zahra Majabadi
Date: Wed, 13 Feb 2019 14:13:17 +0330
Subject: [PATCH] word2vec analogy test (#63)

* Semantic Similarity Dataset
* change format
* rename folder
* word2vec analogy test
* minor change
* readme added
---
 code/analogy/anology_test.py       | 67 ++++++++++++++++++++++++++++++
 results/word2vec-analogy/README.md |  4 ++
 2 files changed, 71 insertions(+)
 create mode 100755 code/analogy/anology_test.py
 create mode 100644 results/word2vec-analogy/README.md

diff --git a/code/analogy/anology_test.py b/code/analogy/anology_test.py
new file mode 100755
index 0000000..1aa89f9
--- /dev/null
+++ b/code/analogy/anology_test.py
@@ -0,0 +1,67 @@
+import gensim
+from scipy.spatial.distance import cosine, euclidean
+import numpy as np
+
+# Number of nearest neighbours searched for the expected word.
+TOPN = 10
+
+model = gensim.models.Word2Vec.load("w2v_farsi.model")
+
+
+def load_analogies(path):
+    """Read analogy groups from a comma-separated file.
+
+    Fields 1-3 of each line are the words A, B and C; field 4 is the
+    word expected closest to ``B - A + C``.  Field 0 is not used.
+    Lines with fewer than five fields (e.g. a trailing blank line)
+    are skipped instead of raising ``IndexError``.
+
+    Returns a ``(data, ground_truth)`` pair of equally long lists.
+    """
+    data = []
+    ground_truth = []
+    with open(path, "r", encoding="UTF-8") as f:
+        for line in f:
+            fields = line.split(",")
+            if len(fields) < 5:
+                continue
+            data.append(fields[1:4])
+            ground_truth.append(fields[4].strip())
+    return data, ground_truth
+
+
+def lookup_vectors(keyed_vectors, words):
+    """Return the vectors of *words*, or ``None`` if any is unknown."""
+    try:
+        return [keyed_vectors[word] for word in words]
+    except KeyError:
+        # Only out-of-vocabulary lookups are expected here; any other
+        # error is a real problem and must not be swallowed.
+        return None
+
+
+def is_hit(keyed_vectors, words, expected):
+    """Tell whether *expected* is among the TOPN neighbours of B - A + C.
+
+    *words* is the ``[A, B, C]`` triple.  Groups containing an
+    out-of-vocabulary word count as a miss.
+    """
+    vectors = lookup_vectors(keyed_vectors, words)
+    if vectors is None:
+        return False
+    a, b, c = vectors
+    target = np.add(np.subtract(b, a), c)
+    # Query the KeyedVectors directly, consistent with the lookups above;
+    # Word2Vec.similar_by_vector is a deprecated forwarder.
+    neighbours = keyed_vectors.similar_by_vector(target, topn=TOPN)
+    # Compare by value: ``is`` tests object identity and is practically
+    # never true for two separately created strings.
+    return any(expected == word.strip() for word, _ in neighbours)
+
+
+data, groundTruth = load_analogies("data.csv")
+
+tp = sum(is_hit(model.wv, row, truth) for row, truth in zip(data, groundTruth))
+
+# An empty data file yields 0.0 rather than ZeroDivisionError.
+print('accuracy = ', tp/len(data) if data else 0.0)
\ No newline at end of file
diff --git a/results/word2vec-analogy/README.md b/results/word2vec-analogy/README.md
new file mode 100644
index 0000000..d406d0f
--- /dev/null
+++ b/results/word2vec-analogy/README.md
@@ -0,0 +1,4 @@
+**The result of analogy test on word2vec model**
+The model was trained on Wikipedia.
+The test was run on analogy.csv, which contains 19716 groups.
+The accuracy is **0.27429498884155**