This repository was archived by the owner on Jan 27, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
138 lines (111 loc) · 4.39 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from elasticsearch import Elasticsearch
from typing import *
from user import User, ratings, load_users
from cluster import cluster
import pickle
# Positive infinity; negated below to seed running-maximum searches.
inf = float('inf')
# Module-wide Elasticsearch client, constructed with the library's default settings.
es = Elasticsearch()
def avg(A):
    """Return the arithmetic mean of ``A``, or 0 when ``A`` has no elements."""
    count = len(A)
    # Guard the division: an empty collection averages to 0 by convention here.
    return sum(A) / count if count else 0
def movie_score_avg(m, max_BM25, max_usr, max_avg, w_BM25=1, w_usr=1, w_avg=1):
    """Combine a movie's BM25, user and average scores into one weighted score.

    Each raw score is divided by its corresponding maximum, multiplied by its
    weight, and the weighted components are summed and divided by the total
    weight actually in play. When ``m['usr_score']`` is ``None`` the user
    component and its weight are left out entirely.

    Args:
        m: Mapping with the keys ``'BM25_score'``, ``'avg_score'`` and
            ``'usr_score'`` (the latter may be ``None``).
        max_BM25: Maximum BM25 score used for normalisation.
        max_usr: Maximum user score used for normalisation.
        max_avg: Maximum average score used for normalisation.
        w_BM25: Weight of the BM25 component.
        w_usr: Weight of the user component.
        w_avg: Weight of the average-rating component.

    Returns:
        The weighted mean of the normalised components. A component whose
        maximum is zero or ``None`` contributes 0 instead of raising, and the
        result is 0 when the total weight is zero.
    """
    def _scaled(weight, value, maximum):
        # A zero/None maximum leaves nothing to normalise against (e.g. no hit
        # has any rating), so the component contributes nothing.
        if not maximum:
            return 0
        return weight * value / maximum

    has_usr = m['usr_score'] is not None
    n_BM25 = _scaled(w_BM25, m['BM25_score'], max_BM25)
    n_avg = _scaled(w_avg, m['avg_score'], max_avg)
    n_usr = _scaled(w_usr, m['usr_score'], max_usr) if has_usr else 0
    w = w_BM25 + w_avg + (w_usr if has_usr else 0)
    if not w:
        return 0
    return (n_BM25 + n_avg + n_usr) / w
def get_usr_ratings(userID):
    """Fetch the hits stored for ``userID`` in the ``ratings`` index.

    Returns the raw hit list (``['hits']['hits']``) of the search response;
    the request asks for at most 10000 hits.
    """
    user_clause = {'query': userID, 'minimum_should_match': '100%'}
    request = {
        'query': {'match': {'userID': user_clause}},
        'size': 10000,
    }
    response = es.search(index='ratings', body=request)
    return response['hits']['hits']
def get_usr_rating_from_elastic(movieID, userID):
    """Return the rating ``userID`` gave ``movieID``, or ``None`` if absent.

    Scans the hits returned by ``get_usr_ratings`` and yields the rating of
    the first one whose ``movieID`` field equals ``movieID``.
    """
    matching_ratings = (
        hit['_source']['rating']
        for hit in get_usr_ratings(userID)
        if hit['_source']['movieID'] == movieID
    )
    # First match wins; None signals that the user never rated the movie.
    return next(matching_ratings, None)
def get_usr_rating(movie, user):
    """Return a rating of ``movie`` for ``user`` plus a tag naming its source.

    Lookup order:
      1. The user's own rating in ``user.movie_ratings`` -> tag ``'USER'``.
      2. The mean of the ratings given by the members of ``user.cluster``
         -> tag ``'CLUS'``.
      3. ``user.estimate(movie_id)`` when nobody in the cluster rated the
         movie -> tag ``'NETW'`` (presumably a learned estimator; see
         ``User`` in user.py).

    Args:
        movie: Mapping with at least an ``'id'`` key.
        user: Object exposing ``movie_ratings`` (mapping movie id -> rating),
            ``cluster`` (iterable of objects with ``movie_ratings``) and
            ``estimate(movie_id)``.

    Returns:
        A ``(score, source_tag)`` tuple.
    """
    movie_id = movie['id']

    # Users have their ratings saved on the User class of user.py.
    try:
        return float(user.movie_ratings[movie_id]), 'USER'
    except KeyError:
        pass  # Not rated by the user; fall back to the cluster.

    # Accumulate across ALL cluster members: the counters must live outside
    # the loop so they are neither reset per member nor left undefined when
    # the cluster is empty.
    total = count = 0
    for member in user.cluster:
        # Direct lookup instead of scanning every key of the member's ratings.
        if movie_id in member.movie_ratings:
            total += member.movie_ratings[movie_id]
            count += 1

    if count == 0:
        # No rating found even inside the cluster: defer to the estimator.
        return user.estimate(movie_id), 'NETW'
    return total / count, 'CLUS'
def get_avg_rating_from_elastic(movieID):
    """Return the mean rating of ``movieID`` over the ``ratings`` index.

    Requests at most 10000 hits whose ``movieID`` field matches and averages
    their ``rating`` values with ``avg`` (0 when there are no hits).
    """
    movie_clause = {'query': movieID, 'minimum_should_match': '100%'}
    request = {
        'query': {'match': {'movieID': movie_clause}},
        'size': 10000,
    }
    hits = es.search(index='ratings', body=request)['hits']['hits']
    scores = []
    for hit in hits:
        scores.append(hit['_source']['rating'])
    return avg(scores)
def get_avg_rating(movie):
    """Return the mean of all ratings recorded for ``movie['id']``.

    ``ratings`` is the pandas table imported from user.py; rows are selected
    by their ``movieId`` column and the ``rating`` column is averaged.
    """
    is_this_movie = ratings['movieId'] == movie['id']
    movie_rows = ratings[is_this_movie]
    return avg(movie_rows['rating'])
def search_BM25(q: str, size: int = 10000) -> Tuple[list, float]:
    """Run a fuzzy multi-field search for ``q`` against the ``movies`` index.

    Every returned document is the hit's ``_source`` dict, extended in place
    with ``'id'`` (the hit ``_id`` as an int) and ``'BM25_score'`` (the hit
    ``_score``).

    Returns:
        ``(documents, max_score)`` where ``max_score`` is taken from the
        response's ``['hits']['max_score']``.
    """
    multi_match = {
        'query': q,
        'type': 'best_fields',
        'fields': ['title', 'genres', 'year'],
        'operator': 'AND',
        'fuzziness': 'AUTO',
    }
    response = es.search(
        index='movies',
        body={'query': {'multi_match': multi_match}, 'size': size},
    )
    found = response['hits']

    documents = []
    for hit in found['hits']:
        doc = hit['_source']
        doc['id'] = int(hit['_id'])
        doc['BM25_score'] = hit['_score']
        documents.append(doc)
    return documents, found['max_score']
def personalized_search(query: str, user: User, limit: int = 10):
    """Search movies for ``query`` and rank them for ``user``.

    Every BM25 hit is annotated in place with:
      * ``'usr_score'`` / ``'usr_score_from_cluster'`` - the rating and source
        tag returned by ``get_usr_rating``;
      * ``'avg_score'`` - the value returned by ``get_avg_rating``;
      * ``'normalized_score'`` - the blend computed by ``movie_score_avg``.

    The user component's weight depends on where the rating came from:
    the user's own rating (``'USER'``) weighs 15, a cluster average
    (``'CLUS'``) 8 and an estimate (``'NETW'``) 3; BM25 weighs 4 and the
    average rating 6.

    Args:
        query: Free-text search string.
        user: The user the ranking is personalised for.
        limit: Maximum number of results to return.

    Returns:
        The annotated hits sorted by ``'normalized_score'``, best first,
        truncated to ``limit`` entries.

    Raises:
        KeyError: If ``get_usr_rating`` reports a source tag that has no
            weight assigned (previously this silently reused the weight of
            the preceding movie).
    """
    usr_weights = {'USER': 15, 'CLUS': 8, 'NETW': 3}

    res, max_BM25 = search_BM25(query)

    # First pass: attach the raw scores and track the maxima used to
    # normalise them.
    max_usr = max_avg = -inf
    for movie in res:
        usr_score, source = get_usr_rating(movie, user)
        movie['usr_score'], movie['usr_score_from_cluster'] = usr_score, source
        # movie_score_avg accepts a missing (None) user score, so a None must
        # not take part in the maximum either.
        if usr_score is not None and usr_score > max_usr:
            max_usr = usr_score
        movie['avg_score'] = get_avg_rating(movie)
        if movie['avg_score'] > max_avg:
            max_avg = movie['avg_score']

    # Second pass: blend the components now that the maxima are known.
    for movie in res:
        w_usr = usr_weights[movie['usr_score_from_cluster']]
        movie['normalized_score'] = movie_score_avg(
            movie, max_BM25, max_usr, max_avg, w_usr=w_usr, w_BM25=4, w_avg=6)

    ranked = sorted(res, key=lambda m: m['normalized_score'], reverse=True)
    # Slicing already clamps to the list length.
    return ranked[:limit]
# Load every user once and assign them to clusters before serving queries.
users = load_users()
cluster(users)

# Interactive loop: read a query and a 1-based user number, print the ranking.
while True:
    print()
    print('=' * 130)
    try:
        query = input('Search: ')
        usr = int(input("User Number: "))
    except ValueError:
        print('User Number must be an integer.')
        continue
    except (EOFError, KeyboardInterrupt):
        # End of input / Ctrl-C: leave the loop instead of dying with a traceback.
        print()
        break

    # User numbers are 1-based; without this check 0 would silently select the
    # last user (index -1) and a too-large number would raise IndexError.
    # NOTE(review): assumes `users` is a sequence supporting len() - confirm
    # against load_users().
    if not 1 <= usr <= len(users):
        print('User Number must be between 1 and %d.' % len(users))
        continue

    for i in personalized_search(query, users[usr - 1]):
        print("%60s (%s) - OVERALL: %.3f | BM25: %4.2f, USR: %.2f-%s, AVG: %.2f" % (
            i['title'], i['year'], i['normalized_score'], i['BM25_score'],
            i['usr_score'], i['usr_score_from_cluster'], i['avg_score']))