-
Notifications
You must be signed in to change notification settings - Fork 75
/
Copy pathpreprocess.py
155 lines (147 loc) · 6.2 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# script to preprocess data into balanced dataset or generate data
import json
from random import choice, randint
from modules.scraper import get_player_names
from data.text_data import percent_list
# Function to generate randomized instance of name
def funnel_name(name):
    """Return a randomly chosen way of referring to a player.

    One of three forms is picked with equal probability: the first
    whitespace-separated word of ``name``, the second word, or ``name``
    unchanged.

    Args:
        name: Player name as a string of whitespace-separated words.

    Returns:
        The selected form. When the requested word does not exist (a
        single-word name such as "Nene", or an empty string) the full
        ``name`` is returned instead of raising ``IndexError``.
    """
    inst = randint(1, 3)
    name_split = name.split()
    # Forms 1 and 2 map to word index 0 and 1; form 3 is the full name.
    # The length guard covers names that are too short for the chosen form.
    if inst < 3 and inst <= len(name_split):
        return name_split[inst - 1]
    return name
# Function to generate rank queries
def generate_rank_queries(samples):
    """Generate randomized player-comparison queries labelled ``rank``.

    Each produced entry is a line of the form ``"<question>,rank\n"``. One of
    five question families is chosen at random per entry; the players named
    in it are drawn via ``funnel_name``.

    Args:
        samples: Number of query lines to produce.

    Returns:
        A list of ``samples`` query strings.
    """
    # NOTE(review): presumably a list of full player names for the 2020
    # season -- confirm against modules.scraper.get_player_names.
    player_pool = get_player_names(2020)
    skills = ["shoot", "rebound", "steal", "assist", "throw", "catch", "play"]
    # Only used by the bare "<A> or <B>" family.
    legends = ["Michael Jordan", "Kobe Bryant", "Lebron James", "Magic Johnson", "Larry Bird"]

    def draw_pair(pool):
        # Two independent draws, so the same player can come up twice.
        first = funnel_name(choice(pool))
        second = funnel_name(choice(pool))
        return first, second

    lines = []
    # Every pass appends exactly one line, so the list length is the counter.
    while len(lines) < samples:
        family = randint(0, 4)

        if family == 4:
            left, right = draw_pair(legends)
            lines.append("{} or {},rank\n".format(left, right))
            continue

        left, right = draw_pair(player_pool)
        if family == 0:
            skill = choice(skills)
            lines.append(
                "who is a better {}er between {} and {},rank\n".format(skill, left, right)
            )
        elif family == 1:
            candidates = [
                "could {} beat {} in 1v1,rank\n".format(left, right),
                "could {} beat {} in one on one,rank\n".format(left, right),
                "could {} beat {} in basketball,rank\n".format(left, right),
            ]
            lines.append(choice(candidates))
        elif family == 2:
            skill = choice(skills)
            candidates = [
                "who is better at {}ing {} or {},rank\n".format(skill, left, right),
                "is {} or {} better at {}ing,rank\n".format(left, right, skill),
            ]
            lines.append(choice(candidates))
        else:
            # family == 3: fantasy-pick phrasing.
            candidates = [
                "should I pick {} or {},rank\n".format(left, right),
                "should I put {} or {} on my fantasy team,rank\n".format(left, right),
                "who should I pick for fantasy,rank\n",
            ]
            lines.append(choice(candidates))
    return lines
# Function to generate stat queries
def generate_stat_queries(samples):
    """Generate randomized statistics queries labelled ``stat``.

    Each produced entry is a line of the form ``"<question>,stat\n"``. The
    question family is picked from a ``randint(0, 6)`` draw: values 0-1 give
    "how many" questions, 2-4 give per-player percentage questions, 5 gives
    league best/worst questions and 6 gives league percentage questions.

    Args:
        samples: Number of query lines to produce.

    Returns:
        A list of ``samples`` query strings.
    """
    # NOTE(review): presumably a list of full player names for the 2020
    # season -- confirm against modules.scraper.get_player_names.
    names = get_player_names(2020)
    stat_list = ["shot", "rebound", "steal", "assist", "turnover", "point"]
    query_list = []
    for _ in range(samples):
        randq = randint(0, 6)
        if randq < 2:
            name = funnel_name(choice(names))
            action = choice(stat_list)
            candidates = [
                "how many {}s does {} put up,stat\n".format(action, name),
                "how many {}s does {} have per game,stat\n".format(action, name),
                "how many {}s did {},stat\n".format(action, name),
            ]
        elif randq < 5:
            name = funnel_name(choice(names))
            action = choice(stat_list)
            candidates = [
                # "{}s" is the (apostrophe-less) possessive of the player, so
                # the name must come first: "what is Kobes shot percentage".
                # The original passed (action, name) and produced
                # "what is shots Kobe percentage".
                "what is {}s {} percentage,stat\n".format(name, action),
                "what is {} {}s {},stat\n".format(name, action, "%"),
                "what was {}s {} percentage in {},stat\n".format(
                    name, action, str(randint(1988, 2020))
                ),
                "who had the best {} in {},stat\n".format(action, str(randint(1988, 2020))),
            ]
        elif randq == 5:
            action = choice(stat_list)
            candidates = [
                "who is the best {}er in the league,stat\n".format(action),
                "who is the worst {}er there is,stat\n".format(action),
                "who has the most {}s per game,stat\n".format(action),
                "who is the best {}er,stat\n".format(action),
            ]
        else:
            # randq == 6: league-wide percentage questions.
            action = choice(stat_list)
            candidates = [
                "who has the highest {} percentage in the nba,stat\n".format(action),
                "who has the highest {} percentage,stat\n".format(action),
                "who has the best {}{},stat\n".format(action, "%"),
                "who has the highest {}{} around,stat\n".format(action, "%"),
            ]
        query_list.append(choice(candidates))
    return query_list
# Function to generate info queries
def generate_info_queries(samples):
    """Generate randomized questions about the assistant, labelled ``info``.

    Each produced entry is a line of the form ``"<question>,info\n"``. The
    question family is picked from a ``randint(0, 6)`` draw: values 0-2 give
    "what ..." questions, 3 gives "who ..." questions, and 4, 5 and 6 give the
    fixed "where", "how" and "when" questions respectively.

    Args:
        samples: Number of query lines to produce.

    Returns:
        A list of ``samples`` query strings.
    """
    verb_list = ["do", "know"]
    query_list = []
    for _ in range(samples):
        randq = randint(0, 6)
        if randq <= 2:
            verb = choice(verb_list)
            candidates = [
                # Previously built but never added to the candidates, so this
                # question could not appear in the generated data.
                "what are you,info\n",
                "what do you {},info\n".format(verb),
                "what can you {},info\n".format(verb),
            ]
            query_list.append(choice(candidates))
        elif randq == 3:
            candidates = ["who are you,info\n", "who made you,info\n"]
            query_list.append(choice(candidates))
        elif randq == 4:
            query_list.append("where are you,info\n")
        elif randq == 5:
            query_list.append("how are you,info\n")
        else:
            # randq == 6
            query_list.append("when were you made,info\n")
    return query_list
def main(samples=1500, path="data/query.csv"):
    """Generate a balanced query set and append it to a CSV file.

    The same number of rank, stat and info queries is generated, then written
    interleaved (stat, rank, info, stat, rank, info, ...) so the three labels
    alternate in the file.

    Args:
        samples: Number of queries to generate per label.
        path: File the lines are appended to; existing content is kept
            because the file is opened in append mode.
    """
    ranked_queries = generate_rank_queries(samples)
    stat_queries = generate_stat_queries(samples)
    info_queries = generate_info_queries(samples)
    # The context manager closes the file even if a write fails part-way.
    with open(path, "a") as query_csv:
        for trio in zip(stat_queries, ranked_queries, info_queries):
            query_csv.writelines(trio)


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()