Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

search: improve with CompositeSuggestQueryParser #151

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions invenio_users_resources/services/users/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@
SortParam,
)
from invenio_records_resources.services.records.queryparser import (
CompositeSuggestQueryParser,
FieldValueMapper,
QueryParser,
SearchFieldTransformer,
SuggestQueryParser,
)
from luqum.tree import Word

Expand Down Expand Up @@ -68,9 +68,16 @@ class UserSearchOptions(SearchOptions, SearchOptionsMixin):
# The user search needs to be highly restricted to avoid leaking
# account information, hence do not edit here unless you are
# absolutely sure what you're doing.
suggest_parser_cls = SuggestQueryParser.factory(
suggest_parser_cls = CompositeSuggestQueryParser.factory(
tree_transformer_cls=SearchFieldTransformer,
fields=["username^2", "email^2", "profile.full_name^3", "profile.affiliations"],
fields=[
"username.keyword^2",
"username^2",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️
Searching for a username with a dash is still not working well.
For instance, searching for "one-two" seems to search for usernames starting with "one" and usernames starting with "two", and therefore does not find anything.

Copy link
Contributor

@kpsherva kpsherva Dec 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the issue: the text field is split on "-", see https://github.com/inveniosoftware/invenio-users-resources/blob/master/invenio_users_resources/records/mappings/os-v2/users/user-v3.0.0.json#L132
If you search by username.keyword, it should work.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome, thanks a lot @kpsherva !
2 months later 😅, I confirm that this fixes the issue.

I modified the tests to include a username with a dash, and I added a test showing that usernames with dashes can now be searched properly.

"email.keyword^2",
"email^2",
"profile.full_name^3",
"profile.affiliations",
],
# Only public emails because hidden emails are stored in email_hidden field.
allow_list=["username", "email"],
mapping={
Expand All @@ -81,7 +88,6 @@ class UserSearchOptions(SearchOptions, SearchOptionsMixin):
"name": "profile.full_name",
},
type="most_fields", # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
fuzziness="AUTO", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fuzziness is applied in CompositeSuggestQueryParser.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then the most_fields can also be removed, see PR

)

params_interpreters_cls = [
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def users_data():
"""Data for users."""
return [
{
"username": "pubres",
"username": "pub-res",
"email": "[email protected]",
"profile": {
"full_name": "Tim Smith",
Expand Down Expand Up @@ -322,7 +322,7 @@ def user_pub(users):
@pytest.fixture(scope="module")
def user_pubres(users):
"""User tjs (public/restricted)."""
return users["pubres"]
return users["pub-res"]


@pytest.fixture(scope="module")
Expand Down
2 changes: 1 addition & 1 deletion tests/resources/test_resources_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_read_self_serialization(client, headers, users, user_pub):

@pytest.mark.parametrize(
"username,public_email",
[("pub", True), ("pubres", False)],
[("pub", True), ("pub-res", False)],
)
def test_read_anon_serialization(client, headers, users, username, public_email):
"""Read public user as anon."""
Expand Down
44 changes: 28 additions & 16 deletions tests/services/users/test_service_users.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
# Copyright (C) 2022-2024 CERN.
#
# Invenio-Users-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand Down Expand Up @@ -98,24 +98,36 @@
assert res["hits"]["total"] == 0


USERNAME_JOSE = ["pub"]
USERNAME_TIM = ["pub-res"]
USERNAME_BOTH = USERNAME_JOSE + USERNAME_TIM


#
# Read
@pytest.mark.parametrize(
"query",
"query,expected_usernames",
[
"CERN",
"Jose CERN",
"Jose AND CERN",
"Tim",
"Tim CERN",
"Jose",
"Jos",
"[email protected]",
"pub",
("CERN", USERNAME_BOTH),
("Jose", USERNAME_JOSE),
("Jos", USERNAME_JOSE),
("Jose CERN", USERNAME_JOSE),
("Tim", USERNAME_TIM),
("Tim CERN", USERNAME_TIM),
("[email protected]", USERNAME_JOSE),
("[email protected]", USERNAME_JOSE),
("pub@inveniosoft", USERNAME_JOSE),
("pub", USERNAME_BOTH),
("pub-res", USERNAME_TIM),
("re", USERNAME_TIM),
("res", USERNAME_TIM),
],
)
def test_user_search_field(user_service, user_pub, query):
def test_user_search_field(user_service, user_pub, query, expected_usernames):
"""Make sure certain fields ARE searchable."""
res = user_service.search(user_pub.identity, suggest=query).to_dict()
assert res["hits"]["total"] > 0
usernames = [entry["username"] for entry in res["hits"]["hits"]]
assert sorted(usernames) == expected_usernames

Check failure on line 130 in tests/services/users/test_service_users.py

View workflow job for this annotation

GitHub Actions / Python / Tests (3.9, postgresql14, opensearch2)

test_user_search_field[pub@inveniosoft-expected_usernames8] AssertionError: assert [] == ['pub'] Right contains one more item: 'pub' Full diff: + [] - [ - 'pub', - ]

Check failure on line 130 in tests/services/users/test_service_users.py

View workflow job for this annotation

GitHub Actions / Python / Tests (3.12, postgresql14, opensearch2)

test_user_search_field[pub@inveniosoft-expected_usernames8] AssertionError: assert [] == ['pub'] Right contains one more item: 'pub' Full diff: + [] - [ - 'pub', - ]


#
Expand All @@ -127,7 +139,7 @@
assert res["username"] == "pub"
assert res["email"] == user_pub.email
res = user_service.read(anon_identity, user_pubres.id).to_dict()
assert res["username"] == "pubres"
assert res["username"] == "pub-res"
assert "email" not in res
pytest.raises(
# TODO: Should be mapped to a 404
Expand All @@ -139,7 +151,7 @@


@pytest.mark.parametrize(
"username,can_read", [("pub", True), ("pubres", True), ("res", False)]
"username,can_read", [("pub", True), ("pub-res", True), ("res", False)]
)
def test_read_avatar_with_anon(user_service, anon_identity, users, username, can_read):
"""Anonymous users can read avatar single *public* user."""
Expand All @@ -160,7 +172,7 @@
"username",
[
"pub",
"pubres",
"pub-res",
"res",
],
)
Expand Down
Loading