-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCreate_Mixed_Training.py
135 lines (120 loc) · 5.06 KB
/
Create_Mixed_Training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def merge_sort(l):
    """Return the items of *l* in ascending order of ``item[1]``.

    The sequence is split in half, each half is sorted recursively, and the
    two sorted halves are combined with :func:`merge`.

    Args:
        l: A sliceable sequence whose items are indexable; ``item[1]`` is the
            sort key.

    Returns:
        A new sorted list, or *l* itself (not a copy) when it holds fewer
        than two items.
    """
    length = len(l)
    if length <= 1:
        return l
    # Floor division gives the split point directly as an int; no float
    # round-trip and no per-call debug output.
    cut = length // 2
    left = merge_sort(l[:cut])
    right = merge_sort(l[cut:])
    return merge(left, right)
def merge(left, right):
    """Combine two sequences into one list ordered by ``item[1]``.

    Items are taken from the front of whichever input currently has the
    smaller key; when the keys are equal the item from *left* is taken first.
    Once one input is exhausted the rest of the other is appended unchanged.

    Args:
        left: First sequence of indexable items.
        right: Second sequence of indexable items.

    Returns:
        A new list containing every item of *left* and *right*.
    """
    merged = []
    li = 0
    ri = 0
    left_size = len(left)
    right_size = len(right)

    while li < left_size and ri < right_size:
        left_item = left[li]
        right_item = right[ri]
        if left_item[1] <= right_item[1]:
            merged.append(left_item)
            li += 1
        else:
            merged.append(right_item)
            ri += 1

    # At most one of these two slices is non-empty.
    merged.extend(left[li:])
    merged.extend(right[ri:])
    return merged
def mergeSort(alist):
    """Sort *alist* in place in ascending order of ``item[1]``.

    The list is copied into two halves, each half is sorted recursively, and
    the halves are merged back into *alist*. When two keys are equal the item
    from the upper half is written first, so items with equal keys end up in
    the reverse of their original order.

    Args:
        alist: A mutable list of indexable items; modified in place.

    Returns:
        None.
    """
    size = len(alist)
    if size <= 1:
        return

    half = size // 2
    lower = alist[:half]
    upper = alist[half:]
    mergeSort(lower)
    mergeSort(upper)

    a = b = out = 0
    lower_size, upper_size = len(lower), len(upper)

    # Merge while both halves still have items; strict "<" means a tie is
    # resolved in favour of the upper half.
    while a < lower_size and b < upper_size:
        if lower[a][1] < upper[b][1]:
            alist[out] = lower[a]
            a += 1
        else:
            alist[out] = upper[b]
            b += 1
        out += 1

    # Copy whatever is left over (only one of the two loops does any work).
    for leftover in lower[a:]:
        alist[out] = leftover
        out += 1
    for leftover in upper[b:]:
        alist[out] = leftover
        out += 1
def _rank_query_lines(lines, query_id, num_prod_per_query):
    """Return the output lines for one query, lowest target value first."""
    rows = [line.rstrip("\n").split(" ") for line in lines]
    targets = [int(row[0]) for row in rows]
    # Ascending target; equal targets are emitted in reverse input order,
    # matching the tie order of the in-place mergeSort this replaces.
    order = sorted(range(len(rows)), key=lambda pos: (targets[pos], -pos))
    return [
        "{} qid:{} {}\n".format(
            num_prod_per_query - 1 - rank, query_id, " ".join(rows[pos][2:])
        )
        for rank, pos in enumerate(order)
    ]


def create_mixed_queries(total_num_queries,cat_percent_dict,categories,num_prod_per_query,sourceDirct,destDirect):
    """Build a mixed-category training file and copy it to train/valid files.

    For every category, ``<sourceDirct><category>.txt`` is read and each run of
    ``num_prod_per_query`` consecutive lines is treated as one query. Within a
    query the lines are ordered by the integer in their first space-separated
    token (ascending) and rewritten as::

        <relevance> qid:<query_id> <token 3> <token 4> ...

    where ``relevance`` is ``num_prod_per_query - 1`` for the first line of the
    ordered query and decreases by one per line, and ``query_id`` counts up
    from 0 across all categories. The second input token is discarded
    (presumably the source file's own qid -- confirm against the data).
    A trailing group with fewer than ``num_prod_per_query`` lines is ignored.

    The result is written to
    ``<destDirect>training_<total>__<pct1>_<pct2>_....txt`` and, once that file
    is closed, copied to ``<destDirect>train.txt`` and ``<destDirect>valid.txt``.

    Args:
        total_num_queries: Overall number of queries to aim for.
        cat_percent_dict: Maps each category name to its percentage share.
        categories: Category names, in the order they are processed and in
            which their percentages appear in the output file name.
        num_prod_per_query: Number of consecutive lines forming one query.
        sourceDirct: Directory prefix (including trailing separator) of the
            per-category input files.
        destDirect: Directory prefix (including trailing separator) for the
            output files.

    Returns:
        None.

    Raises:
        ValueError: If ``num_prod_per_query`` is less than 1, or if the first
            token of a line in a complete query is not an integer.
        OSError: If a needed input file or the destination cannot be opened.
    """
    import shutil

    if num_prod_per_query < 1:
        raise ValueError("num_prod_per_query must be at least 1")

    print("This procedure create a number of mixed queries to be used for training")
    percent_suffix = "_".join(str(cat_percent_dict[cat]) for cat in categories)
    output_file_path = (
        destDirect + "training_" + str(total_num_queries) + "__" + percent_suffix + ".txt"
    )

    query_id = 0
    # The context manager guarantees the file is flushed and closed before it
    # is copied below; copying an open, buffered file can lose its tail.
    with open(output_file_path, 'w') as filehandle:
        for cat in categories:
            quota = int((cat_percent_dict[cat] / 100) * total_num_queries)
            print(cat + " " + str(cat_percent_dict[cat]) + "% makes " + str(quota))
            print("------------------------------------------------------------------------------------")
            if quota <= 0:
                # Nothing is wanted from this category, so its source file
                # does not need to exist.
                continue

            cat_path = sourceDirct + str(cat) + ".txt"
            written = 0
            pending = []
            with open(cat_path, 'r') as fp:
                for line in fp:
                    # Every line belongs to a query; flush as soon as the
                    # group is full so no boundary line is dropped and the
                    # final complete group in the file is not lost.
                    pending.append(line)
                    if len(pending) < num_prod_per_query:
                        continue
                    filehandle.writelines(
                        _rank_query_lines(pending, query_id, num_prod_per_query)
                    )
                    query_id += 1
                    written += 1
                    pending = []
                    if written == quota:
                        break

    shutil.copy2(output_file_path, destDirect + 'train.txt')
    shutil.copy2(output_file_path, destDirect + 'valid.txt')
    return
# ---------------------------------------------------------------------------
# Script configuration and entry point.
# ---------------------------------------------------------------------------

# Overall number of queries requested across all categories.
total_num_queries = 150

# Percentage share of the queries per category.
cat_percent_dict = {
    'Arts, Crafts & Sewing': 0,
    'Industrial & Scientific': 0,
    'Jewelry': 0,
    'Toys & Games': 0,
    'Video Games': 0,
    'Computers & Accessories': 0,
    'Software': 33,
    'Cell Phones & Accessories': 33,
    'Electronics': 34,
}

# Processing order of the categories; also fixes the order of the percentages
# in the generated file name.
categories = [
    "Arts, Crafts & Sewing",
    "Industrial & Scientific",
    "Jewelry",
    "Toys & Games",
    "Video Games",
    "Computers & Accessories",
    "Software",
    "Cell Phones & Accessories",
    "Electronics",
]

# Raw strings: the Windows paths contain backslashes that must not be read as
# escape sequences. Both prefixes end with "/" because file names are appended
# by plain string concatenation.
sourceDirct = r"C:\Yassien_RMIT PhD\Datasets\TruthDiscovery_Datasets\Web data Amazon reviews/Unique_Products_Stanford_three\Experiment 2\All_Categories_Data_25_Basic_Features_With_10_Time_Interval_TQ_Target_For_Ranking/"
destDirect = r"C:\Yassien_RMIT PhD\Datasets\TruthDiscovery_Datasets\Web data Amazon reviews/Unique_Products_Stanford_three\Experiment 2\Mixed_Queries_For_Training/"

# Number of consecutive input lines that make up one query.
num_prod_per_query = 10

create_mixed_queries(total_num_queries,cat_percent_dict,categories,num_prod_per_query,sourceDirct,destDirect)