"""
Reynir: Natural language processing for Icelandic
Search module
Copyright (C) 2016 Vilhjálmur Þorsteinsson
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
This module implements a search mechanism. The Search class parses
a search string into list of word stems and creates a topic vector from it,
which is then used in a similarity query to find related articles.
"""
from datetime import datetime, timedelta
from settings import Settings
from scraperdb import Root, Article
from similar import SimilarityClient
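
# Usage note: Search is used via its classmethods without instantiation, e.g.
# Search.list_similar_to_terms(session, terms, n), where session is an SQLAlchemy
# session obtained from scraperdb and terms is a list of (stem, category) tuples.
# An illustrative sketch is included at the bottom of this file.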

class Search:

    """ This class wraps search queries to the similarity server
        via the similarity client. """

    # Similarity query client
    similarity_client = None

    def __init__(self):
        """ This class is normally not instantiated """
        pass

    @classmethod
    def _connect(cls):
        """ Ensure that the client is connected, if possible """
        if cls.similarity_client is None:
            cls.similarity_client = SimilarityClient()

    @classmethod
    def list_similar_to_article(cls, session, uuid, n):
        """ List n articles that are similar to the article with the given id """
        cls._connect()
        # Ask for a few extra articles, since the original article and
        # near-duplicates are filtered out in list_articles() below.
        # Returns a list of tuples: (article_id, similarity)
        result = cls.similarity_client.list_similar_to_article(uuid, n = n + 5)
        result = result.get("articles", [])
        # Convert the result tuples into article descriptors
        return cls.list_articles(session, result, n)

    @classmethod
    def list_similar_to_topic(cls, session, topic_vector, n):
        """ List n articles that are similar to the given topic vector """
        cls._connect()
        # Returns a list of tuples: (article_id, similarity)
        result = cls.similarity_client.list_similar_to_topic(topic_vector, n = n + 5)
        result = result.get("articles", [])
        # Convert the result tuples into article descriptors
        return cls.list_articles(session, result, n)

    @classmethod
    def list_similar_to_terms(cls, session, terms, n):
        """ List n articles that are similar to the given terms. The
            terms are expected to be a list of (stem, category) tuples. """
        cls._connect()
        # Returns a list of tuples: (article_id, similarity)
        result = cls.similarity_client.list_similar_to_terms(terms, n = n + 5)
        # Convert the result tuples into article descriptors
        articles = result.get("articles", [])
        # Obtain the search term weights
        weights = result.get("weights", [])
        return dict(weights = weights,
            articles = cls.list_articles(session, articles, n))
    @classmethod
    def list_articles(cls, session, result, n):
        """ Convert similarity result tuples into article descriptors """
        similar = []
        for sid, similarity in result:
            if similarity > 0.9999:
                # The original article (or at least a verbatim copy of it)
                continue
            q = session.query(Article).join(Root).filter(Article.id == sid)
            sa = q.one_or_none()
            if sa and sa.heading and sa.heading.strip(): # Skip articles without headings
                # Similarity in percent
                spercent = 100.0 * similarity
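
                # De-duplication: before this article is added to the result list,
                # the entries already collected are checked for one that is almost
                # certainly the same story (same root domain, timestamp within ten
                # minutes and nearly identical similarity); in that case only the
                # newer of the two is kept. The two helpers below implement this.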
                def is_probably_same_as(last):
                    """ Return True if the current article is probably the same as
                        the one already described in the last object """
                    if last["domain"] != sa.root.domain:
                        # Another root domain: can't be the same content
                        return False
                    if abs(last["ts"] - sa.timestamp) > timedelta(minutes = 10):
                        # More than 10 minutes timestamp difference
                        return False
                    # Quite similar: probably the same article
                    ratio = (spercent / last["similarity"])
                    if ratio > 0.993:
                        if Settings.DEBUG:
                            print("Rejecting {0}, domain {1}, ts {2} because of similarity with {3}, {4}, {5}; ratio is {6:.3f}"
                                .format(sa.heading, sa.root.domain, sa.timestamp,
                                    last["heading"], last["domain"], last["ts"], ratio))
                        return True
                    return False
                def gen_similar():
                    """ Generate the entries in the result list that are probably
                        the same as the one we are considering """
                    for ix, p in enumerate(similar):
                        if is_probably_same_as(p):
                            yield (ix, p)

                d = dict(heading = sa.heading, url = sa.url,
                    uuid = sid, domain = sa.root.domain,
                    ts = sa.timestamp, ts_text = sa.timestamp.isoformat()[0:10],
                    similarity = spercent
                )
                # Don't add another article with practically the same similarity
                # as the previous one, as it is very probably a duplicate
                same = next(gen_similar(), None)
                if same is None:
                    # No similar article
                    similar.append(d)
                    if len(similar) == n:
                        # Enough articles: we're done
                        break
                elif d["ts"] > same[1]["ts"]:
                    # Similar article, and the one we're considering is newer:
                    # replace the one in the list
                    if Settings.DEBUG:
                        print("Replacing: {0} ({1:.2f})".format(sa.heading, spercent))
                    similar[same[0]] = d
                else:
                    # Similar article, and the previous one is newer:
                    # drop the one we're considering
                    if Settings.DEBUG:
                        print("Ignoring: {0} ({1:.2f})".format(sa.heading, spercent))
                    pass

        if Settings.DEBUG and similar:
            print("Similar list is:\n {0}".format("\n ".join(str(s) for s in similar)))
        return similar
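

# ----------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module. It assumes that a
# similarity server is running and that scraperdb provides a SessionContext
# context manager for obtaining a database session; the example term and its
# category code are likewise only placeholders. Adapt the session handling to
# however sessions are actually created in this repository.

if __name__ == "__main__":
    from scraperdb import SessionContext  # assumed helper, see note above
    with SessionContext() as session:
        # Ask for the 10 articles most similar to the given (stem, category) terms
        res = Search.list_similar_to_terms(session, [("forseti", "no")], 10)
        for art in res["articles"]:
            print("{0:5.1f}% {1}".format(art["similarity"], art["heading"]))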