Skip to content

Commit

Permalink
feat: add wiki title search
Browse files Browse the repository at this point in the history
  • Loading branch information
cir9no committed Dec 26, 2024
1 parent 54a0e17 commit 88e6383
Showing 1 changed file with 89 additions and 10 deletions.
99 changes: 89 additions & 10 deletions seasearch/index_store/wiki_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,20 @@ class WikiIndex(object):
'wiki_id': {
'type': 'keyword',
},
'doc_uuid':{
'doc_uuid': {
'type': 'keyword',
},
'title': {
'type': 'text',
'highlightable': True,
'fields': {
'ngram': {
'type': 'text',
'index': True,
'analyzer': 'seafile_wiki_ngram_analyzer',
},
},
},
'content': {
'type': 'text',
'highlightable': True,
Expand Down Expand Up @@ -83,6 +94,7 @@ def _make_match_query(field, key_word, **kw):

searches = []
searches.append(_make_match_query('content', keyword, **match_query_kwargs))
searches.append(_make_match_query('title', keyword, **match_query_kwargs))
searches.append({
'match': {
'content.ngram': {
Expand All @@ -91,6 +103,14 @@ def _make_match_query(field, key_word, **kw):
}
}
})
searches.append({
'match': {
'title.ngram': {
'query': keyword,
'minimum_should_match': '80%',
}
}
})
return searches

def create_index_if_missing(self, index_name):
Expand All @@ -116,6 +136,12 @@ def get_wiki_content(self, wiki_id, obj_id):

return content.strip()

def get_wiki_title(self, doc_uuid, conf):
    """Look up the page name for ``doc_uuid`` in a wiki config dict.

    Scans ``conf['pages']`` in order and returns the ``name`` of the first
    page whose ``docUuid`` equals ``doc_uuid``. Returns None when no page
    matches.
    """
    matching_names = (
        entry['name'] for entry in conf['pages'] if entry['docUuid'] == doc_uuid
    )
    return next(matching_names, None)

def get_wiki_conf(self, wiki_id, commit_id=None):
# Get wiki config dict
conf_path = posixpath.join(WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME)
Expand All @@ -131,6 +157,26 @@ def get_wiki_conf(self, wiki_id, commit_id=None):
f = fs_mgr.load_seafile(wiki_id, 1, file_id)
return json.loads(f.get_content().decode())

@staticmethod
def get_updated_title_uuids(old_conf, new_conf):
    """Return the doc UUIDs of pages that are new or renamed in new_conf.

    A page is reported when its ``id`` is absent from ``old_conf['pages']``,
    or when its ``name`` differs from the name of the page with the same
    ``id`` in ``old_conf``. The returned UUIDs are taken from new_conf.

    This is a static method because it reads no instance state and is
    invoked as ``self.get_updated_title_uuids(old_cfg, new_cfg)``. Without
    the decorator the instance would be bound to ``old_conf`` and the call
    would raise TypeError.

    Args:
        old_conf: wiki config dict with a 'pages' list (previous version).
        new_conf: wiki config dict with a 'pages' list (current version).

    Returns:
        set: the ``docUuid`` values of the added or renamed pages.
    """
    old_pages = {page['id']: page for page in old_conf['pages']}
    new_pages = {page['id']: page for page in new_conf['pages']}

    return {
        new_page['docUuid']
        for new_id, new_page in new_pages.items()
        if new_id not in old_pages or new_page['name'] != old_pages[new_id]['name']
    }

def get_uuid_path_mapping(self, config):
"""Determine the UUID-PATH mapping for extracting unremoved or deleted wiki pages
"""
Expand All @@ -153,20 +199,21 @@ def extract_ids_from_navigation(navigation_items, navigation_ids):

return uuid_to_path, rm_uuid_to_path

def add_files(self, index_name, wiki_id, files, uuid_path, commit_id):
def add_files(self, index_name, wiki_id, files, uuid_path, commit_id, updated_title_uuids, wiki_conf):
bulk_add_params = []

def bulk_add():
    """Send the queued bulk entries to the index, then empty the queue."""
    if not bulk_add_params:
        # Nothing is queued, so no request is made.
        return
    self.seasearch_api.bulk(index_name, bulk_add_params)
    bulk_add_params.clear()

def process_file(doc_uuid, content):
def process_file(doc_uuid, content, title):
index_info = {'index': {'_index': index_name, '_id': doc_uuid}}
doc_info = {
'wiki_id': wiki_id,
'doc_uuid': doc_uuid,
'content': content
'content': content,
'title': title,
}
bulk_add_params.extend([index_info, doc_info])
if len(bulk_add_params) >= SEASEARCH_WIKI_BULK_ADD_LIMIT:
Expand All @@ -179,14 +226,28 @@ def process_file(doc_uuid, content):
if self.size_cap is not None and int(size) >= int(self.size_cap):
continue
doc_uuid = path.split('/')[2]
if content := self.get_wiki_content(wiki_id, obj_id):
process_file(doc_uuid, content)
# Drop doc_uuid from updated_title_uuids when the file itself is indexed here,
# so a page whose title and content both changed is not processed twice.
updated_title_uuids.discard(doc_uuid)
content = self.get_wiki_content(wiki_id, obj_id)
title = self.get_wiki_title(doc_uuid, wiki_conf)
process_file(doc_uuid, content, title)

# Recovered files
for doc_uuid, path in uuid_path.items():
file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, path)
if content := self.get_wiki_content(wiki_id, file_id):
process_file(doc_uuid, content)
title = self.get_wiki_title(doc_uuid, wiki_conf)
content = self.get_wiki_content(wiki_id, file_id)
process_file(doc_uuid, content, title)

get_path_by_uuid = lambda uuid: next((page['path'] for page in wiki_conf['pages'] if page['docUuid'] == uuid), None)
# For the case: only title is updated
for doc_uuid in updated_title_uuids:
f_path = get_path_by_uuid(doc_uuid)
file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, f_path)
content = self.get_wiki_content(wiki_id, file_id)
title = self.get_wiki_title(doc_uuid, wiki_conf)
process_file(doc_uuid, content,title)
bulk_add()

def delete_files(self, index_name, dirs, doc_uuids):
Expand Down Expand Up @@ -231,13 +292,29 @@ def update(self, index_name, wiki_id, old_commit_id, new_commit_id):

need_added_files = added_files + modified_files

# Check whether any wiki page title may have changed.
# A modified wiki config file is a necessary but not sufficient condition for a title change.
wiki_conf_path = posixpath.join(WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME)
is_wiki_title_modified = any(wiki_conf_path == tup[0] for tup in need_added_files)

if is_wiki_title_modified:
need_updated_title_uuids = self.get_updated_title_uuids(old_cfg, new_cfg)
else:
need_updated_title_uuids = set()

recently_restore_uuid_to_path = {
uuid: path
for uuid, path in curr_uuid_paths.items()
if uuid in prev_recycled_uuid_paths
}
self.add_files(
index_name, wiki_id, need_added_files, recently_restore_uuid_to_path, new_commit_id
index_name,
wiki_id,
need_added_files,
recently_restore_uuid_to_path,
new_commit_id,
need_updated_title_uuids,
new_cfg
)

def search_wiki(self, wiki, keyword, start=0, size=10):
Expand All @@ -256,7 +333,7 @@ def search_wiki(self, wiki, keyword, start=0, size=10):
"highlight": {
"pre_tags": ["<mark>"],
"post_tags": ["</mark>"],
"fields": {"content": {}},
"fields": {"content": {}, "title": {}},
},
}
index_name = WIKI_INDEX_PREFIX + wiki
Expand Down Expand Up @@ -285,6 +362,8 @@ def search_wiki(self, wiki, keyword, start=0, size=10):
}
if highlight_content := hit.get('highlight', {}).get('content', [None])[0]:
r.update(content=highlight_content)
if highlight_title := hit.get('highlight', {}).get('title', [None])[0]:
r.update(title=highlight_title)
wiki_content.append(r)

return wiki_content
Expand Down

0 comments on commit 88e6383

Please sign in to comment.