Skip to content

Commit 5bc7697

Browse files
authored
group results by document_id (#19)
1 parent a89ef04 commit 5bc7697

File tree

12 files changed

+425
-61
lines changed

12 files changed

+425
-61
lines changed

poetry.lock

+66-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "salinic"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "Search abstraction layer"
55
authors = ["Eugen Ciur <[email protected]>"]
66
readme = "README.md"
@@ -10,6 +10,7 @@ python = ">=3.11, <4.0"
1010
pydantic = "^2.7"
1111
requests = "^2.31.0"
1212
xapianpy = {version = "1.4.22.post2406040406", optional = true}
13+
glom = "^23.5.0"
1314

1415
[tool.poetry.extras]
1516
xapian = ["xapianpy"]

salinic/backends/solr/client.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,30 @@ def __init__(self, url: URL):
1616

1717
class ClientRW(Base):
1818

19-
def search(self, sq: SearchQuery):
19+
def search(self, sq: SearchQuery, user_id: str | None = None):
2020
payload = {
21-
'q': sq.query.original_query
21+
'q': sq.query.original_query,
22+
'group': 'true',
23+
'group.field': 'document_id',
24+
'rows': sq.rows,
25+
'start': sq.start,
26+
'group.limit': sq.group_limit,
27+
'group.offset': sq.group_offset,
28+
'group.sort': 'page_number asc'
2229
}
2330

31+
if user_id:
32+
payload['q'] = f"{payload['q']} AND user_id:{user_id}"
33+
2434
response = requests.get(
2535
self.http_select_url,
2636
params=payload
2737
)
38+
logger.debug(payload)
39+
40+
result = response.json()
2841

29-
return response.json()
42+
return result
3043

3144
def add(self, some_dict):
3245
# change data specific for add

salinic/backends/solr/index.py

+125-23
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import json
22
import logging
33

4+
from glom import glom
45
from pydantic import BaseModel
56

67
from salinic.field import Field
78
from salinic.query import SearchQuery
8-
from salinic.utils import filter_keys, first, trim_suffixes
9+
from salinic.schema import Document, Folder, Page
10+
from salinic.utils import first
911

1012
logger = logging.getLogger(__name__)
1113

@@ -15,31 +17,131 @@ def __init__(self, client, schema):
1517
self.client = client
1618
self.schema = schema
1719

18-
def search(self, sq: SearchQuery):
19-
result = self.client.search(sq)
20-
if result['response']['numFound'] == 0:
20+
def search(
21+
self,
22+
sq: SearchQuery,
23+
user_id: str | None = None
24+
) -> list[Document | Folder]:
25+
"""Query index
26+
27+
Solr results are grouped by the `document_id` field: this way
28+
all folder entries will be part of the group with `document_id=null`,
29+
while all page entities will be grouped per document, i.e.
30+
pages which belong together are all in the same group.
31+
32+
{
33+
"responseHeader":{
34+
...
35+
"grouped":{
36+
"document_id":{
37+
"matches":26,
38+
"groups":[{
39+
"groupValue":null,
40+
"doclist":{"numFound":4,"start":0,"numFoundExact":true,"docs":[
41+
{
42+
"id":"0b663599-32b1-4396-8dbe-ae7cd327cec6",
43+
"lang":"en",
44+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
45+
"entity_type":"folder",
46+
"title_txt_en":"A2 updated",
47+
"_version_":1801539995817738240},
48+
{
49+
"id":"768c6841-d37a-4d02-857f-ab7eaf69b27e",
50+
"lang":"en",
51+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
52+
"entity_type":"folder",
53+
"title_txt_en":".inbox",
54+
"_version_":1801539995692957696}]
55+
}},
56+
{
57+
"groupValue":"9bc57688-302e-4e1f-840a-c747dcccb362",
58+
"doclist":{"numFound":5,"start":0,"numFoundExact":true,"docs":[
59+
{
60+
"id":"a6e4916f-dea6-414b-aa38-f5b9ea375725",
61+
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
62+
"lang":"en",
63+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
64+
"page_number":1,
65+
"entity_type":"page",
66+
"title_txt_en":"brother_004603.pdf",
67+
"_version_":1801539996374532096},
68+
{
69+
"id":"72f6ca9e-af4b-4235-a56c-a62508e24efe",
70+
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
71+
"lang":"en",
72+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
73+
"page_number":2,
74+
"entity_type":"page",
75+
"title_txt_en":"brother_004603.pdf",
76+
"_version_":1801539996403892224}]
77+
}},
78+
{
79+
"groupValue":"200b0201-cfcd-43df-b41f-f1732568a0d2",
80+
"doclist":{"numFound":2,"start":0,"numFoundExact":true,"docs":[
81+
{
82+
"id":"9fa936e6-fe94-46bf-ad01-d8591cc290d4",
83+
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
84+
"lang":"en",
85+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
86+
"page_number":1,
87+
"entity_type":"page",
88+
"title_txt_en":"brother_004598.pdf",
89+
"_version_":1801539995874361344},
90+
{
91+
"id":"c364994c-eab5-4c6a-842a-6f40537f7a2e",
92+
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
93+
"lang":"en",
94+
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
95+
"page_number":2,
96+
"entity_type":"page",
97+
"title_txt_en":"brother_004598.pdf",
98+
"_version_":1801539995910012928}]
99+
}}
100+
]}}}
101+
"""
102+
result = self.client.search(sq, user_id)
103+
grouped = glom(result, 'grouped.document_id')
104+
if glom(grouped, 'matches') == 0:
21105
return []
22106

23-
docs_list = [
24-
trim_suffixes(doc) for doc in result['response']['docs']
25-
]
26-
docs = [
27-
filter_keys(some_doc, ['_version_'])
28-
for some_doc in docs_list
29-
]
30-
31107
result = []
32-
for doc in docs:
33-
attrs = {}
34-
for field_name, value in doc.items():
35-
if '_orig_' in field_name:
36-
continue
37-
if self.schema.needs_transform(self.schema, field_name):
38-
attrs[field_name] = json.loads(doc[f'{field_name}_orig_'])
39-
else:
40-
attrs[field_name] = value
41-
42-
result.append(sq.entity(**attrs))
108+
for group in glom(grouped, 'groups'):
109+
if glom(group, 'groupValue'):
110+
# groupValue != null => document
111+
document_id = glom(group, 'groupValue')
112+
title = ''
113+
lang = 'en'
114+
tags = []
115+
pages = []
116+
for page in glom(group, 'doclist.docs'):
117+
lang = page.get('lang', 'en')
118+
title = page.get(f'title_txt_{lang}', None)
119+
text = page.get(f'text_txt_{lang}', None)
120+
tags = page.get('tags', [])
121+
p = Page(
122+
id=page['id'],
123+
page_number=page['page_number'],
124+
text=text
125+
)
126+
pages.append(p)
127+
item = Document(
128+
id=document_id,
129+
title=title,
130+
lang=lang,
131+
pages=pages,
132+
tags=tags,
133+
)
134+
result.append(item)
135+
else:
136+
for folder in glom(group, 'doclist.docs'):
137+
lang = folder.get('lang', 'en')
138+
title = folder.get(f'title_txt_{lang}', None)
139+
item = Folder(
140+
id=folder['id'],
141+
title=title,
142+
tags=folder.get('tags', []),
143+
)
144+
result.append(item)
43145

44146
return result
45147

salinic/backends/solr/schema_manager.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from typing import List, Tuple
33

4-
from salinic.field import Field, NumericField
4+
from salinic.field import Field, NumericField, StringField
55
from salinic.utils import first
66

77
from .types import CopyFieldDump, FieldDump, FieldType
@@ -109,6 +109,10 @@ def _normal_fields(self) -> List[Tuple[str, FieldDump]]:
109109

110110
if isinstance(field_instance, NumericField):
111111
_type = FieldType.pint
112+
elif isinstance(field_instance, StringField):
113+
_type = FieldType.string
114+
elif field_instance.group:
115+
_type = FieldType.text_gen_sort
112116
else:
113117
_type = FieldType.text_general
114118

salinic/backends/solr/types.py

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ def to_dash(string: str) -> str:
1111

1212
class FieldType(str, Enum):
1313
text_general = 'text_general'
14+
text_gen_sort = 'text_gen_sort'
15+
string = 'string'
1416
pint = 'pint'
1517

1618

salinic/backends/xapian/index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def remove(self, **kwargs):
135135

136136
self.client.delete_document(id_term)
137137

138-
def search(self, sq: SearchQuery):
138+
def search(self, sq: SearchQuery, user_id: str | None = None):
139139
results = []
140140

141141
if str(sq.query.free_text):

salinic/field.py

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ class Field(NamedTuple):
99
default: any = None
1010
multi_value: bool = False
1111
multi_lang: bool = False
12+
# enables grouping by this field
13+
group: bool = False
1214

1315

1416
class KeywordField(Field):
@@ -23,6 +25,10 @@ class NumericField(Field):
2325
pass
2426

2527

28+
class StringField(Field):
29+
pass
30+
31+
2632
class IdField(Field):
2733
pass
2834

salinic/index.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ def __init__(self, engine, schema):
88
self.engine = engine
99
self.schema = schema
1010

11-
def search(self, sq: SearchQuery):
12-
return self.backend.search(sq)
11+
def search(self, sq: SearchQuery, user_id: str | None = None):
12+
return self.backend.search(sq, user_id)
1313

1414

1515
class IndexRO(IndexBase):

0 commit comments

Comments
 (0)