-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create temporary SimilarityUtil.py (#38)
* Create temporary SimilarityUtil.py .این فقط یک فایل موقت است ، لطفا بررسی کنید و چیزهایی که مدنظرتون هست را اعلام کنید .یک سری ابهامات در ورودی وجود دارد که فردا حضوری بهتون میگم * fix SimilarityUtil.py this code is tested. * Create README.md * Update README.md * Create test_su.py this file is test for SimilarityUtil.py * sim_test.csv this is a similarity dataset test for use of SimilarityUtil.py. sim_test.csv used in test_su.py * data model this is a word2vec datamodel that used in test_su.py for load model using the LoadModel.py library created by Mohammad.M jafari
- Loading branch information
Showing
5 changed files
with
105 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Before you use this file, be sure the following dependencies installed: | ||
<li>pip install pandas</li> | ||
<li>pip install numpy</li> | ||
<li>pip install scipy</li> | ||
<br> | ||
An example of how you can use this file is placed in scripts/similarity/test/test_su.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import pandas as pd | ||
import numpy as np | ||
from scipy.stats import spearmanr, pearsonr | ||
|
||
|
||
|
||
def load_data(dataset_path, has_header=False): | ||
if has_header: | ||
dataset = pd.read_csv(dataset_path, header=0) | ||
else: | ||
dataset = pd.read_csv(dataset_path, header=None) | ||
return dataset | ||
|
||
|
||
def words_similarity(wordslist1, wordslist2, model, method='C'): | ||
""" | ||
:param wordslist1: numpy array of words | ||
:param wordslist2: numpy array of words | ||
:param model: object of model instance of LoadModel | ||
:param method: 'C' for cosine_similarity and 'E' for Euclidean_distance , default is 'C' | ||
:return: similarity rate between words in wordslist1 and wordslist2 | ||
""" | ||
similarity_of_model = [] | ||
model = model.normalize_words() # use normalize_words() from LoadModel package of Mehdi Jafari | ||
if method == 'C': | ||
for w1, w2 in zip(wordslist1, wordslist2): | ||
wv1 = model.get_vector(w1) # get word vector w1 from model | ||
wv2 = model.get_vector(w2) # get word vector w2 from model | ||
cos_sim = np.dot(wv1, wv2) / (np.linalg.norm(wv1) * np.linalg.norm(wv2)) | ||
similarity_of_model.append(cos_sim) | ||
|
||
elif method == 'E': | ||
for w1, w2 in zip(wordslist1, wordslist2): | ||
wv1 = model.get_vector(w1) # get word vector w1 from model | ||
wv2 = model.get_vector(w2) # get word vector w2 from model | ||
dist = np.linalg.norm(wv1 - wv2) | ||
similarity_of_model.append(dist) | ||
|
||
else: | ||
raise ValueError("method not correct") | ||
|
||
return similarity_of_model | ||
|
||
|
||
|
||
def correlation_coefficient(sim_dataset_path, model, sim_columns=[], has_header=False, method='s'): | ||
""" | ||
:param sim_dataset_path:(type:string) dataset csv file path | ||
:param model:(type:object) object of model instance of LoadModel | ||
:param sim_columns:(type:list of integer) list of index of columns, index should started of 3 (exp:[3,4,7]) | ||
default is empty list means that get all columns of similarities | ||
:param has_header:(type:Boolean) default is False , if dataset has header, set has_header to True | ||
:param method:(type:char) Correlation coefficient method, if 's':Spearman and 'p':Pearson | ||
:return:(list of integer) Spearman or Pearson correlation coefficient respect to sim_columns | ||
""" | ||
dataset = load_data(sim_dataset_path, has_header) | ||
|
||
num_of_sim_columns = dataset.shape[1] | ||
list_of_words1 = dataset.iloc[:, 0] | ||
list_of_words2 = dataset.iloc[:, 1] | ||
|
||
sim_of_model = words_similarity(list_of_words1, list_of_words2, model) | ||
|
||
if not sim_columns: # if sim_columns is empty | ||
sim_columns = list(range(3, num_of_sim_columns + 1)) | ||
else: | ||
if(max(sim_columns) > num_of_sim_columns): | ||
raise ValueError("maximum index of columns is", num_of_sim_columns) | ||
elif(min(sim_columns) < 3): | ||
raise ValueError("minimum index of columns is", 3) | ||
|
||
|
||
corr_coe_rates = [] | ||
if (method == 's') : # spearman correlation coefficient | ||
for i in sim_columns: | ||
sim_of_dataset = dataset.iloc[:, i - 1] | ||
spearman = spearmanr(np.array(sim_of_model), np.array(sim_of_dataset)) | ||
corr_coe_rates.append(spearman[0]) | ||
elif(method == 'p') : # pearson correlation coefficient | ||
for i in sim_columns: | ||
sim_of_dataset = dataset.iloc[:, i - 1] | ||
pearson = pearsonr(np.array(sim_of_model), np.array(sim_of_dataset)) | ||
corr_coe_rates.append(pearson[0]) | ||
else: | ||
raise ValueError("method not correct") | ||
|
||
return corr_coe_rates | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
dcemfybpfr,epetxaiqir,3,4 | ||
jsfjogtavf,mqudzmlzrk,4,-1 | ||
jsfjogtavf,mqudzmlzrk,4,3.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
import SimilarityUtil as su | ||
import LoadModel | ||
|
||
my_w2v = LoadModel.W2V.from_W2V("mymodel") # mymodel file is a word2vec file placed inside this directory | ||
result = su.correlation_coefficient("sim_test.csv", my_w2v, [3,4]) # sim_test.csv placed inside this directory | ||
print(res) | ||
# The result should be [-1.0, 0.8660254037844387] |