-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlamda_mart_Run_time_New_Exp.py
204 lines (181 loc) · 8.62 KB
/
lamda_mart_Run_time_New_Exp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from lamda_mart_Run_time import runLamadaMart_All_Categories_Sub_Cat_Experiment_Setup
from Data_Preparation_For_Learning import compute_Kendall_SubCat_Experiment_Setup,compute_Kendall_SubCat_Experiment_Setup_TQRank_Amazon
from ranking_metrics import *
import os
from alogrithms import mergeSort
from New_Data_Prepration_For_Learning import Prepare_Training_Testing_Data_Sub_Cat_Experiment_Setup_Amazon
from Calc_NDCG import nDCGScoresBased
from Data_Preparation_For_Learning import Adjust_Training_Testing_Data_Per_Cat_From_All_Cat_Data_Regression_Amazon
import subprocess
import math
def compute_Kendall_SubCat_Experiment_Setup_Amazon(base_predictions_directory,drive,R_path,first_k,correlation_type):
    """Run the rank-correlation evaluation with the Amazon experiment paths.

    Builds the Amazon-specific input paths under *drive*, reads the testing
    query ids, and delegates the actual computation to
    ``compute_Kendall_SubCat_Experiment_Setup``.

    Args:
        base_predictions_directory: Forwarded unchanged to the delegate.
        drive: Prefix prepended to every experiment path (e.g. ``"d:/"``).
        R_path: Forwarded unchanged; the call sites in this file pass the
            path of the ``Rscript`` executable.
        first_k: Forwarded unchanged; the call sites pass ``-1`` and ``10``.
            NOTE(review): presumably "all items" vs. "top 10" -- confirm
            against the delegate.
        correlation_type: Forwarded unchanged; per the call-site comment,
            ``1`` selects Kendall and ``0`` selects Spearman.

    Returns:
        None.
    """
    print("Computing Kendall's Tau for predicted values")

    # Raw strings keep the Windows backslashes literal without relying on
    # Python tolerating unknown escape sequences such as "\Y" or "\E".
    all_product_indices = drive + r"\Yassien_PhD\categories/"
    query_map_code_file_path = drive + r"\Yassien_PhD\Experiment_6\Queries_Per_Product_Category_Num_Pro_GT_8/query_code_map.txt"
    testing_queries_path = drive + r"\Yassien_PhD\Experiment_6\Randomized_Queryset/testing.txt"
    coded_queries_directory = drive + r"\Yassien_PhD\Experiment_6\Queries_Per_Product_Category_Num_Pro_GT_8_Coded/"
    query_cat_map_file_path = drive + r"\Yassien_PhD\Experiment_6\Randomized_Queryset/query_cat_map_test.txt"

    categories_with_large_products = [
        "Industrial & Scientific", "Jewelry", "Arts, Crafts & Sewing", "Toys & Games",
        "Video Games", "Computers & Accessories", "Software", "Cell Phones & Accessories",
        "Electronics",
    ]

    # The query id is the first tab-separated field of each line.
    with open(testing_queries_path, 'r') as fp:
        testing_queries = [line.split('\t')[0] for line in fp]

    compute_Kendall_SubCat_Experiment_Setup(base_predictions_directory, drive, R_path, first_k, all_product_indices,
                                            query_map_code_file_path, testing_queries,
                                            coded_queries_directory, categories_with_large_products,
                                            query_cat_map_file_path, correlation_type)
    return
def compute_NDCG_For_list(estimated_rank, relevancy,
                          ndcg_file_path=r"D:\Yassien_PhD\Experiment_6\Train_Test_Category_TQ_Target_Sub_Cat_Setup/ndcg.r",
                          rscript_path=r"C:\Program Files\R\R-3.3.2/bin/Rscript.exe"):
    """Compute NDCG for one ranked list by running R's StatRank package.

    Writes a small R script that calls ``Evaluation.NDCG(EstimatedRank,
    RelevanceLevel)``, runs it with ``Rscript`` and parses the number that R
    prints after its ``[1]`` marker.

    Args:
        estimated_rank: Values written to the R vector ``EstimatedRank``.
            Only the first ``len(relevancy)`` entries are used.
        relevancy: Values written to the R vector ``RelevanceLevel``.
        ndcg_file_path: Where the generated R script is written (overwritten
            on every call). Defaults to the original hard-coded location.
        rscript_path: Path of the ``Rscript`` executable. Defaults to the
            original hard-coded location.

    Returns:
        The value printed by R as a float (``nan`` if R prints ``NaN``).

    Raises:
        ValueError: If *estimated_rank* has fewer entries than *relevancy*,
            or the text after ``[1]`` is not a number.
        IndexError: If the R output contains no ``[1]`` result.
        subprocess.CalledProcessError: If ``Rscript`` exits with an error.
    """
    count = len(relevancy)
    if len(estimated_rank) < count:
        raise ValueError("estimated_rank has fewer entries than relevancy")

    estimated_csv = ",".join(str(value) for value in estimated_rank[:count])
    relevancy_csv = ",".join(str(value) for value in relevancy)
    script_text = (
        "library(StatRank)\n"
        "EstimatedRank <- c(" + estimated_csv + ")\n"
        "RelevanceLevel <- c(" + relevancy_csv + ")\n"
        "Evaluation.NDCG(EstimatedRank, RelevanceLevel)\n"
    )

    # The context manager guarantees the script is flushed and closed before
    # Rscript reads it, even if the write fails part-way.
    with open(ndcg_file_path, 'w') as script_file:
        script_file.write(script_text)

    # R's stderr (package start-up messages, warnings) is discarded.
    with open(os.devnull, 'w') as devnull:
        raw_output = subprocess.check_output([rscript_path, ndcg_file_path], stderr=devnull)

    # R prints a scalar as "[1] <value>"; take the first token after the marker.
    text = raw_output.decode("utf-8", "replace")
    return float(text.split("[1]")[1].split()[0])
def _read_rank_pairs(file_path):
    """Read a tab-separated file into a list of ``(column0, column1)`` int pairs."""
    pairs = []
    with open(file_path, 'r') as fp:
        for line in fp:
            columns = line.split('\t')
            pairs.append((int(columns[0]), int(columns[1].split('\n')[0])))
    return pairs


def compute_Ranking_Metrics_SubCat_Experiment_Setup_Amazon(base_predictions_directory):
    """Print per-query NDCG and the mean NDCG for the ``R_Difference`` files.

    Every file in ``<base_predictions_directory>R_Difference/`` is treated as
    one query; the query id is the file name up to its first ``.``. Each line
    holds two tab-separated integers. The code uses column 0 for the
    relevance vector and column 1 for the estimated-rank vector.
    NOTE(review): presumably column 0 is the true rank and column 1 the
    predicted rank -- confirm against the code that writes these files.

    Queries with fewer than 10 items are skipped. If R returns NaN for a
    query, the score is recomputed once on the first half of the lists; a
    query that is still NaN is left out of the average.

    Args:
        base_predictions_directory: Directory (with trailing separator) that
            contains the ``R_Difference/`` sub-directory.

    Returns:
        None. Results are printed to stdout.
    """
    r_directory = base_predictions_directory + "R_Difference/"
    new_avg_ndcg = 0
    num_queries = 0
    print("qid\tndcg@10")

    for file_name in os.listdir(r_directory):
        qid = file_name.split('.')[0]
        ranks = _read_rank_pairs(r_directory + file_name)
        mergeSort(ranks)

        list_size = len(ranks)
        if list_size < 10:
            continue

        # Both vectors are flipped to (size + 1 - rank) before being handed to R.
        estimated_rank = [list_size + 1 - item[1] for item in ranks]
        relevancy = [list_size + 1 - item[0] for item in ranks]

        new_ndcg = compute_NDCG_For_list(estimated_rank, relevancy)
        if math.isnan(new_ndcg):
            print("problem with " + str(new_ndcg))
            print(len(relevancy))
            print(relevancy)
            print(estimated_rank)
            # Retry once on the first half of the list.
            max_size = len(estimated_rank) // 2
            estimated_rank = estimated_rank[:max_size]
            relevancy = relevancy[:max_size]
            new_ndcg = compute_NDCG_For_list(estimated_rank, relevancy)
            print("tried to fix " + str(new_ndcg))
            if math.isnan(new_ndcg):
                continue

        new_avg_ndcg += new_ndcg
        print(qid + "\t" + str(round(new_ndcg, 3)) + " " + str(len(relevancy)))
        num_queries += 1

    # Without this guard an empty directory (or only short/NaN queries)
    # would end in a ZeroDivisionError.
    if num_queries == 0:
        print("NDCG not computed: no query produced a valid score")
        return

    new_avg_ndcg /= num_queries
    print("NDCG " + str(round(new_avg_ndcg, 3)))
    return
# ---------------------------------------------------------------------------
# Experiment driver: configure the paths, then run the selected steps.
# Raw strings keep the Windows backslashes literal without relying on Python
# tolerating unknown escape sequences such as "\Y" or "\y".
# ---------------------------------------------------------------------------
drive = "d:/"
amazon_base_learning_directory = r"\Yassien_PhD\Experiment_6\Train_Test_Category_TQ_Target_Sub_Cat_Setup/"
yelp_base_learning_directory = r"\Yassien_PhD\yelp_dataset_challenge_academic_dataset\Experiment_3\Train_Test_Category_With_Time_Interval_TQ_Target_Sub_Cat_Setup/"
base_learning_directory = drive + amazon_base_learning_directory
dataset_type = "amazon"
exp_type = "ranknet"
# Alternative library location:
#learning_lib_directory =drive+"Yassien_PhD\Experiment 2\Lamda_Java/"
learning_lib_directory = r"C:\RankLib_New_Code/trunk/bin/"
# Rscript location per machine; only one assignment should be active.
#R_path = "C:\Program Files\R\R-3.2.2/bin/Rscript.exe" # RMIT
R_path = r"C:\Program Files\R\R-3.3.2/bin/Rscript.exe"  # Laptop

print("Starting The Learning process")
# Pipeline steps; uncomment the ones to run.
#Adjust_Training_Testing_Data_Per_Cat_From_All_Cat_Data_Regression_Amazon(drive)
#Prepare_Training_Testing_Data_Sub_Cat_Experiment_Setup_Amazon(drive)
#runLamadaMart_All_Categories_Sub_Cat_Experiment_Setup(base_learning_directory,learning_lib_directory,exp_type)
# Last parameter: 1 for Kendall, 0 for Spearman.
compute_Kendall_SubCat_Experiment_Setup_Amazon(base_learning_directory, drive, R_path, -1, 0)
#compute_Kendall_SubCat_Experiment_Setup_TQRank_Amazon(base_learning_directory,drive,R_path,-1)
#compute_Ranking_Metrics_SubCat_Experiment_Setup_Amazon(base_learning_directory)
compute_Kendall_SubCat_Experiment_Setup_Amazon(base_learning_directory, drive, R_path, 10, 0)
print("Done")