-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautocomplete.py
70 lines (64 loc) · 2.63 KB
/
autocomplete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from encoder import Encoder
from generictrie import Trie, Node
import logging
import pyewts
from index_builder import get_index
# Module-level pyewts converter instance, created once at import time.
# NOTE(review): not referenced in the visible part of this file — confirm
# whether other modules import it before removing.
EWTSCONVERTER = pyewts.pyewts()
#logging.basicConfig(level=logging.DEBUG)
def get_proper_start_i(query_s):
    """
    Return the index of the first character of query_s to be considered.

    The intent is to skip prefixes that should be ignored, such as anything
    non-Tibetan plus the prefixes listed in
    https://github.com/buda-base/lds-pdi/blob/master/src/main/java/io/bdrc/ldspdi/rest/controllers/ReconciliationController.java#L186

    This is currently a placeholder: no prefix detection is done and the
    whole query is always kept.
    """
    # nothing is stripped yet, so the query starts at its first character
    return 0
def max_lev_dst_for_query_s(query_s):
    """
    Return the maximum Levenshtein distance allowed for a query string,
    based only on its length in characters:
       fewer than 3 characters -> 0
       3 to 8 characters       -> 1
       9 characters or more    -> 2
    """
    query_len = len(query_s)
    if query_len >= 9:
        return 2
    return 1 if query_len >= 3 else 0
def auto_complete(query_s, res_limit=10, index_name="bo_general"):
    """
    The main function. Returns a list of at most res_limit results in the
    following format:
       {
          "res": "<ignored>foo </ignored>ba<suggested>r</suggested>",
          "lang": "bo",
          "category": "Person"
       }

    query_s: the query string to complete
    res_limit: maximum number of results returned. None means no additional
       cap. The suggestions come from get_top_10_suffixes, so presumably no
       more than 10 are ever available (to be confirmed against the trie).
    index_name: name passed to get_index(). Results from "bo_general" are
       tagged with lang "bo", results from any other index with "bo-x-ewts".
    """
    first_c_idx = get_proper_start_i(query_s)
    unprefixed_query = query_s[first_c_idx:]
    index = get_index(index_name)
    query_tokens, final_token_candidates_encoded = index.tokenize_query(unprefixed_query)
    # the decoded / code point lists below are only built for debug output,
    # so skip that work entirely when DEBUG is not enabled on the root logger
    debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
    logging.debug("query tokens: %s", query_tokens)
    if debug_enabled:
        logging.debug("final token candidates: %s", [index.encoder.decode(ord(i)) for i in final_token_candidates_encoded])
    encoded_query = index.encoder.encode_list(query_tokens)
    if debug_enabled:
        logging.debug("encoded query: '%s' (%s)", encoded_query, [ord(c) for c in encoded_query])
    max_lev_dst = max_lev_dst_for_query_s(query_s)
    suggestions = index.trie.get_top_10_suffixes(encoded_query, final_token_candidates_encoded, max_lev_dst=max_lev_dst)
    # the part of the query that was skipped is echoed back, marked as ignored
    base_res_str = ""
    if first_c_idx > 0:
        base_res_str = "<ignored>" + query_s[:first_c_idx] + "</ignored>"
    lng = "bo" if index_name == "bo_general" else "bo-x-ewts"
    res = []
    for encoded_suffix, encoded_category, _score, encoded_prefix, _lev_dst in suggestions:
        # honor res_limit, which was previously accepted but never applied
        if res_limit is not None and len(res) >= res_limit:
            break
        category = index.cat_encoder.decode(encoded_category)
        prefix = index.encoder.decode_string(encoded_prefix)
        suffix = index.encoder.decode_string(encoded_suffix)
        res_str = base_res_str + prefix
        # an empty suffix means there is nothing to suggest beyond the prefix
        if suffix:
            res_str += "<suggested>" + suffix + "</suggested>"
        res.append({
            "res": res_str,
            "lang": lng,
            "category": category
        })
    return res
if __name__ == "__main__":
    # Manual smoke test: complete an EWTS query against the EWTS index.
    # Tibetan-script queries against the default index can be tried instead:
    #   print(auto_complete("བཀའ་འགྱུར།"))
    #   print(auto_complete("བཀའ་འགྱ"))
    demo_results = auto_complete("bka' gyu", index_name="ewts_general")
    print(demo_results)