diff --git a/scripts/similarity/README.md b/scripts/similarity/README.md
new file mode 100644
index 0000000..024b490
--- /dev/null
+++ b/scripts/similarity/README.md
@@ -0,0 +1,6 @@
+Before using this module, make sure the following dependencies are installed:
+
+pip install pandas
+pip install numpy
+pip install scipy
+
+An example of how to use this module can be found in scripts/similarity/test/test_su.py
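+
+A minimal usage sketch (the file paths are hypothetical; see scripts/similarity/test/test_su.py for a runnable example):
+
+```python
+import SimilarityUtil as su
+import LoadModel
+
+model = LoadModel.W2V.from_W2V("path/to/word2vec_model")               # hypothetical model path
+rates = su.correlation_coefficient("path/to/sim_dataset.csv", model,   # hypothetical dataset path
+                                   sim_columns=[3, 4])
+print(rates)  # one correlation coefficient per entry of sim_columns
+```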
diff --git a/scripts/similarity/SimilarityUtil.py b/scripts/similarity/SimilarityUtil.py
new file mode 100644
index 0000000..806212f
--- /dev/null
+++ b/scripts/similarity/SimilarityUtil.py
@@ -0,0 +1,89 @@
+import pandas as pd
+import numpy as np
+from scipy.stats import spearmanr, pearsonr
+
+
+def load_data(dataset_path, has_header=False):
+    """Load a similarity dataset from a CSV file into a pandas DataFrame."""
+    if has_header:
+        dataset = pd.read_csv(dataset_path, header=0)
+    else:
+        dataset = pd.read_csv(dataset_path, header=None)
+    return dataset
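+
+# Illustration only: load_data("scripts/similarity/test/sim_test.csv") returns a DataFrame whose
+# first two columns are the word pair and whose remaining columns are similarity ratings.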
+
+
+def words_similarity(wordslist1, wordslist2, model, method='C'):
+    """
+    :param wordslist1: numpy array of words
+    :param wordslist2: numpy array of words
+    :param model: model instance created with LoadModel
+    :param method: 'C' for cosine similarity, 'E' for Euclidean distance; default is 'C'
+    :return: list of similarity scores (cosine similarity) or distances (Euclidean distance)
+             for the word pairs in wordslist1 and wordslist2
+    """
+    similarity_of_model = []
+    model = model.normalize_words()  # use normalize_words() from the LoadModel package of Mehdi Jafari
+    if method == 'C':
+        for w1, w2 in zip(wordslist1, wordslist2):
+            wv1 = model.get_vector(w1)  # get the word vector of w1 from the model
+            wv2 = model.get_vector(w2)  # get the word vector of w2 from the model
+            cos_sim = np.dot(wv1, wv2) / (np.linalg.norm(wv1) * np.linalg.norm(wv2))
+            similarity_of_model.append(cos_sim)
+
+    elif method == 'E':
+        for w1, w2 in zip(wordslist1, wordslist2):
+            wv1 = model.get_vector(w1)  # get the word vector of w1 from the model
+            wv2 = model.get_vector(w2)  # get the word vector of w2 from the model
+            dist = np.linalg.norm(wv1 - wv2)
+            similarity_of_model.append(dist)
+
+    else:
+        raise ValueError("method must be 'C' (cosine similarity) or 'E' (Euclidean distance)")
+
+    return similarity_of_model
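+
+# Worked example (illustration only): with the toy vectors below the two methods give
+#   wv1, wv2 = np.array([1.0, 0.0]), np.array([0.0, 1.0])
+#   cosine similarity:  np.dot(wv1, wv2) / (np.linalg.norm(wv1) * np.linalg.norm(wv2)) -> 0.0
+#   Euclidean distance: np.linalg.norm(wv1 - wv2) -> sqrt(2) ~= 1.414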
+
+
+def correlation_coefficient(sim_dataset_path, model, sim_columns=[], has_header=False, method='s'):
+    """
+    :param sim_dataset_path: (type: string) path to the dataset CSV file
+    :param model: (type: object) model instance created with LoadModel
+    :param sim_columns: (type: list of integer) 1-based indices of the similarity-rating columns;
+                        indices start at 3 (e.g. [3, 4, 7]); the default empty list means
+                        all similarity columns are used
+    :param has_header: (type: boolean) default is False; set to True if the dataset has a header row
+    :param method: (type: char) correlation coefficient method: 's' for Spearman, 'p' for Pearson
+
+    :return: (list of float) Spearman or Pearson correlation coefficients, one per entry of sim_columns
+    """
+    dataset = load_data(sim_dataset_path, has_header)
+
+    num_of_columns = dataset.shape[1]
+    list_of_words1 = dataset.iloc[:, 0]
+    list_of_words2 = dataset.iloc[:, 1]
+
+    sim_of_model = words_similarity(list_of_words1, list_of_words2, model)
+
+    if not sim_columns:  # if sim_columns is empty, use every similarity column
+        sim_columns = list(range(3, num_of_columns + 1))
+    else:
+        if max(sim_columns) > num_of_columns:
+            raise ValueError("maximum index of columns is {}".format(num_of_columns))
+        elif min(sim_columns) < 3:
+            raise ValueError("minimum index of columns is 3")
+
+    corr_coe_rates = []
+    if method == 's':  # Spearman correlation coefficient
+        for i in sim_columns:
+            sim_of_dataset = dataset.iloc[:, i - 1]
+            spearman = spearmanr(np.array(sim_of_model), np.array(sim_of_dataset))
+            corr_coe_rates.append(spearman[0])
+    elif method == 'p':  # Pearson correlation coefficient
+        for i in sim_columns:
+            sim_of_dataset = dataset.iloc[:, i - 1]
+            pearson = pearsonr(np.array(sim_of_model), np.array(sim_of_dataset))
+            corr_coe_rates.append(pearson[0])
+    else:
+        raise ValueError("method must be 's' (Spearman) or 'p' (Pearson)")
+
+    return corr_coe_rates
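+
+# Illustrative sketch: in a row such as "word1,word2,3.1,4.2" the word pair occupies columns 1-2 and
+# the human ratings occupy columns 3-4, so sim_columns=[3, 4] compares the model against both rating
+# columns (the file path below is hypothetical):
+#   rates = correlation_coefficient("sim_dataset.csv", my_model, sim_columns=[3, 4], method='s')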
+
diff --git a/scripts/similarity/test/mymodel b/scripts/similarity/test/mymodel
new file mode 100644
index 0000000..c38bea1
Binary files /dev/null and b/scripts/similarity/test/mymodel differ
diff --git a/scripts/similarity/test/sim_test.csv b/scripts/similarity/test/sim_test.csv
new file mode 100644
index 0000000..8b61ff5
--- /dev/null
+++ b/scripts/similarity/test/sim_test.csv
@@ -0,0 +1,3 @@
+dcemfybpfr,epetxaiqir,3,4
+jsfjogtavf,mqudzmlzrk,4,-1
+jsfjogtavf,mqudzmlzrk,4,3.4
diff --git a/scripts/similarity/test/test_su.py b/scripts/similarity/test/test_su.py
new file mode 100644
index 0000000..2798b4c
--- /dev/null
+++ b/scripts/similarity/test/test_su.py
@@ -0,0 +1,7 @@
+import SimilarityUtil as su
+import LoadModel
+
+my_w2v = LoadModel.W2V.from_W2V("mymodel") # mymodel file is a word2vec file placed inside this directory
+result = su.correlation_coefficient("sim_test.csv", my_w2v, [3, 4])  # sim_test.csv is placed inside this directory
+print(result)
+# The result should be [-1.0, 0.8660254037844387]