diff --git a/scripts/similarity/README.md b/scripts/similarity/README.md new file mode 100644 index 0000000..024b490 --- /dev/null +++ b/scripts/similarity/README.md @@ -0,0 +1,6 @@ +Before you use this file, be sure the following dependencies are installed: +
  • pip install pandas
  • +
  • pip install numpy
  • +
  • pip install scipy
  • +
    # An example of how you can use this file is placed in scripts/similarity/test/test_su.py
# diff --git a/scripts/similarity/SimilarityUtil.py b/scripts/similarity/SimilarityUtil.py
# new file mode 100644
# index 0000000..806212f
# --- /dev/null
# +++ b/scripts/similarity/SimilarityUtil.py
# @@ -0,0 +1,89 @@
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr


def load_data(dataset_path, has_header=False):
    """
    Read a CSV dataset into a pandas DataFrame.

    :param dataset_path: path of the csv file
    :param has_header: if True the first row is used as column names,
                       otherwise every row is treated as data
    :return: pandas DataFrame with the content of the file
    """
    return pd.read_csv(dataset_path, header=0 if has_header else None)


def _cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors."""
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))


def _euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors."""
    return np.linalg.norm(vector1 - vector2)


def words_similarity(wordslist1, wordslist2, model, method='C'):
    """
    Score every pair of words taken in lockstep from the two word lists.

    :param wordslist1: iterable of words (e.g. numpy array or pandas Series)
    :param wordslist2: iterable of words, paired element-wise with wordslist1
    :param model: object providing normalize_words() and get_vector(word)
                  (an instance of LoadModel in the example script)
    :param method: 'C' for cosine similarity, 'E' for Euclidean distance, default is 'C'
    :return: list with one score per word pair; note that for 'E' the value
             is a distance (smaller means closer), not a similarity
    :raises ValueError: if method is neither 'C' nor 'E'
    """
    # Reject an unknown method before doing any model work.
    if method == 'C':
        score = _cosine_similarity
    elif method == 'E':
        score = _euclidean_distance
    else:
        raise ValueError("method not correct")

    # Vector lookups go through the object returned by normalize_words().
    model = model.normalize_words()
    return [
        score(model.get_vector(w1), model.get_vector(w2))
        for w1, w2 in zip(wordslist1, wordslist2)
    ]


def correlation_coefficient(sim_dataset_path, model, sim_columns=None, has_header=False, method='s'):
    """
    Correlate the model's cosine similarities with the scores stored in a dataset.

    The first two columns of the csv file hold the word pairs; the remaining
    columns hold reference similarity scores.

    :param sim_dataset_path:(type:string) dataset csv file path
    :param model:(type:object) object providing normalize_words() and get_vector(word)
    :param sim_columns:(type:list of integer) 1-based indexes of the score columns,
                        starting at 3 (exp:[3,4,7]); None or an empty list means
                        every column from the third one onwards
    :param has_header:(type:Boolean) default is False, if dataset has header, set has_header to True
    :param method:(type:char) correlation coefficient method, 's':Spearman and 'p':Pearson

    :return:(list of float) Spearman or Pearson correlation coefficient, one per entry of sim_columns
    :raises ValueError: if method is unknown or a column index is out of range
    """
    # Pick the correlation function first so a bad method fails fast.
    if method == 's':
        correlate = spearmanr
    elif method == 'p':
        correlate = pearsonr
    else:
        raise ValueError("method not correct")

    dataset = load_data(sim_dataset_path, has_header)
    num_of_columns = dataset.shape[1]

    if not sim_columns:  # None or empty: use every score column
        sim_columns = list(range(3, num_of_columns + 1))
    elif max(sim_columns) > num_of_columns:
        raise ValueError("maximum index of columns is {}".format(num_of_columns))
    elif min(sim_columns) < 3:
        raise ValueError("minimum index of columns is 3")

    # Model scores are the same for every column, so compute them once.
    sim_of_model = np.array(words_similarity(dataset.iloc[:, 0], dataset.iloc[:, 1], model))

    # sim_columns is 1-based, iloc is 0-based; element [0] of the result is
    # the correlation coefficient.
    return [
        correlate(sim_of_model, np.array(dataset.iloc[:, column - 1]))[0]
        for column in sim_columns
    ]

# diff --git a/scripts/similarity/test/mymodel b/scripts/similarity/test/mymodel
# new file mode 100644
# index 0000000..c38bea1
# Binary files /dev/null and b/scripts/similarity/test/mymodel differ
# diff --git a/scripts/similarity/test/sim_test.csv b/scripts/similarity/test/sim_test.csv
# new file mode 100644
# index 0000000..8b61ff5
# --- /dev/null
# +++ b/scripts/similarity/test/sim_test.csv
# @@ -0,0 +1,3 @@
# +dcemfybpfr,epetxaiqir,3,4
# +jsfjogtavf,mqudzmlzrk,4,-1
# +jsfjogtavf,mqudzmlzrk,4,3.4
# diff --git a/scripts/similarity/test/test_su.py b/scripts/similarity/test/test_su.py
# new file mode 100644
# index 0000000..2798b4c
# --- /dev/null
# +++ b/scripts/similarity/test/test_su.py
# @@ -0,0 +1,7 @@
import SimilarityUtil as su
import LoadModel

# mymodel file is a word2vec file placed inside this directory
my_w2v = LoadModel.W2V.from_W2V("mymodel")
# sim_test.csv is placed inside this directory; [3, 4] selects its third and
# fourth columns (1-based) as the reference scores
result = su.correlation_coefficient("sim_test.csv", my_w2v, [3, 4])
# Print the variable that was actually assigned above (the original printed
# the undefined name `res`, which raises NameError).
print(result)
# The result should be [-1.0, 0.8660254037844387]