Skip to content

Commit 5901222

Browse files
authored
do not group docs / paginate search results (#20)
1 parent 1e73500 commit 5901222

File tree

8 files changed

+290
-387
lines changed

8 files changed

+290
-387
lines changed

CHANGELOG.md

+5
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,10 @@
11
# Changelog
22

3+
## 0.6.0 - 2024-08-05
4+
5+
- Do **not** group search results by `document_id`
6+
- Paginate search results
7+
38
## 0.3.10 - 2024-04-21
49

510
- Add extra logging/debug statements

poetry.lock

+137-124
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
[tool.poetry]
22
name = "salinic"
3-
version = "0.5.0"
3+
version = "0.6.0"
44
description = "Search abstraction layer"
55
authors = ["Eugen Ciur <[email protected]>"]
66
readme = "README.md"
77

88
[tool.poetry.dependencies]
99
python = ">=3.11, <4.0"
10-
pydantic = "^2.7"
10+
pydantic = "^2.8"
1111
requests = "^2.31.0"
1212
xapianpy = {version = "1.4.22.post2406040406", optional = true}
1313
glom = "^23.5.0"

salinic/backends/solr/client.py

+2-7
Original file line number | Diff line number | Diff line change
@@ -19,13 +19,8 @@ class ClientRW(Base):
1919
def search(self, sq: SearchQuery, user_id: str | None = None):
2020
payload = {
2121
'q': sq.query.original_query,
22-
'group': 'true',
23-
'group.field': 'document_id',
24-
'rows': sq.rows,
25-
'start': sq.start,
26-
'group.limit': sq.group_limit,
27-
'group.offset': sq.group_offset,
28-
'group.sort': 'page_number asc'
22+
'rows': sq.page_size,
23+
'start': sq.page_size * (sq.page_number - 1),
2924
}
3025

3126
if user_id:

salinic/backends/solr/index.py

+37-119
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
from salinic.field import Field
88
from salinic.query import SearchQuery
9-
from salinic.schema import Document, Folder, Page
9+
from salinic.schema import DocumentPage, Folder, PaginatedResponse
1010
from salinic.utils import first
1111

1212
logger = logging.getLogger(__name__)
@@ -21,129 +21,47 @@ def search(
2121
self,
2222
sq: SearchQuery,
2323
user_id: str | None = None
24-
) -> list[Document | Folder]:
25-
"""Query index
26-
27-
Solr results are grouped by `document_id` field: this way
28-
all folder entries will be part of group with `document_id=null`,
29-
while all page entities will be grouped per document i.e.
30-
pages which belong together are all in the same group.
31-
32-
{
33-
"responseHeader":{
34-
...
35-
"grouped":{
36-
"document_id":{
37-
"matches":26,
38-
"groups":[
39-
"groupValue":null,
40-
"doclist":{"numFound":4,"start":0,"numFoundExact":true,"docs":[
41-
{
42-
"id":"0b663599-32b1-4396-8dbe-ae7cd327cec6",
43-
"lang":"en",
44-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
45-
"entity_type":"folder",
46-
"title_txt_en":"A2 updated",
47-
"_version_":1801539995817738240},
48-
{
49-
"id":"768c6841-d37a-4d02-857f-ab7eaf69b27e",
50-
"lang":"en",
51-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
52-
"entity_type":"folder",
53-
"title_txt_en":".inbox",
54-
"_version_":1801539995692957696}]
55-
}},
56-
{
57-
"groupValue":"9bc57688-302e-4e1f-840a-c747dcccb362",
58-
"doclist":{"numFound":5,"start":0,"numFoundExact":true,"docs":[
59-
{
60-
"id":"a6e4916f-dea6-414b-aa38-f5b9ea375725",
61-
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
62-
"lang":"en",
63-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
64-
"page_number":1,
65-
"entity_type":"page",
66-
"title_txt_en":"brother_004603.pdf",
67-
"_version_":1801539996374532096},
68-
{
69-
"id":"72f6ca9e-af4b-4235-a56c-a62508e24efe",
70-
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
71-
"lang":"en",
72-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
73-
"page_number":2,
74-
"entity_type":"page",
75-
"title_txt_en":"brother_004603.pdf",
76-
"_version_":1801539996403892224},]
77-
}},
78-
{
79-
"groupValue":"200b0201-cfcd-43df-b41f-f1732568a0d2",
80-
"doclist":{"numFound":2,"start":0,"numFoundExact":true,"docs":[
81-
{
82-
"id":"9fa936e6-fe94-46bf-ad01-d8591cc290d4",
83-
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
84-
"lang":"en",
85-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
86-
"page_number":1,
87-
"entity_type":"page",
88-
"title_txt_en":"brother_004598.pdf",
89-
"_version_":1801539995874361344},
90-
{
91-
"id":"c364994c-eab5-4c6a-842a-6f40537f7a2e",
92-
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
93-
"lang":"en",
94-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
95-
"page_number":2,
96-
"entity_type":"page",
97-
"title_txt_en":"brother_004598.pdf",
98-
"_version_":1801539995910012928}]
99-
}},
100-
}}]}}}
101-
"""
24+
) -> PaginatedResponse:
25+
"""Query index"""
10226
result = self.client.search(sq, user_id)
103-
grouped = glom(result, 'grouped.document_id')
104-
if glom(grouped, 'matches') == 0:
105-
return []
106-
107-
result = []
108-
for group in glom(grouped, 'groups'):
109-
if glom(group, 'groupValue'):
110-
# groupValue != null => document
111-
document_id = glom(group, 'groupValue')
112-
title = ''
113-
lang = 'en'
114-
tags = []
115-
pages = []
116-
for page in glom(group, 'doclist.docs'):
117-
lang = page.get('lang', 'en')
118-
title = page.get(f'title_txt_{lang}', None)
119-
text = page.get(f'text_txt_{lang}', None)
120-
tags = page.get('tags', [])
121-
p = Page(
122-
id=page['id'],
123-
page_number=page['page_number'],
124-
text=text
125-
)
126-
pages.append(p)
127-
item = Document(
128-
id=document_id,
27+
items = glom(result, 'response.docs')
28+
total_found = glom(result, 'response.numFound')
29+
start = glom(result, 'response.start')
30+
page_number = int(start / sq.page_size) + 1
31+
num_pages = int(total_found / sq.page_size) + 1
32+
returned_list = []
33+
34+
for item in items:
35+
if document_id := item.get('document_id', None):
36+
lang = item.get('lang', 'en')
37+
title = item.get(f'title_txt_{lang}', lang)
38+
tags = item.get('tags', [])
39+
dp = DocumentPage(
40+
id=item['id'],
41+
page_number=item['page_number'],
42+
document_id=document_id,
12943
title=title,
13044
lang=lang,
131-
pages=pages,
132-
tags=tags,
45+
tags=tags
13346
)
134-
result.append(item)
47+
returned_list.append(dp)
13548
else:
136-
for folder in glom(group, 'doclist.docs'):
137-
lang = folder.get('lang', 'en')
138-
title = folder.get(f'title_txt_{lang}', None)
139-
item = Folder(
140-
id=folder['id'],
141-
title=title,
142-
tags=folder.get('tags', []),
143-
)
144-
result.append(item)
145-
146-
return result
49+
lang = item.get('lang', 'en')
50+
title = item.get(f'title_txt_{lang}', lang)
51+
folder = Folder(
52+
id=item['id'],
53+
title=title,
54+
lang=lang,
55+
tags=item.get('tags', []),
56+
)
57+
returned_list.append(folder)
58+
59+
return PaginatedResponse(
60+
page_size=sq.page_size,
61+
page_number=page_number,
62+
num_pages=num_pages,
63+
items=returned_list
64+
)
14765

14866

14967
class IndexRW(Base):

salinic/query.py

+6-12
Original file line number | Diff line number | Diff line change
@@ -169,26 +169,20 @@ def __repr__(self):
169169

170170
class SearchQuery:
171171
query: Query
172-
rows: int
173-
start: int
174-
group_limit: int
175-
group_offset: int
172+
page_number: int # starts with 1
173+
page_size: int
176174

177175
def __init__(
178176
self,
179177
entity,
180178
q: str,
181-
rows: int = 100,
182-
start: int = 0,
183-
group_limit: int = 100,
184-
group_offset: int = 0
179+
page_size: int = 50,
180+
page_number: int = 1
185181
):
186182
self.entity = entity
187183
self.query = Query(q)
188-
self.rows = rows
189-
self.start = start
190-
self.group_limit = group_limit
191-
self.group_offset = group_offset
184+
self.page_size = page_size
185+
self.page_number = page_number
192186

193187
def __str__(self):
194188
return f"SearchQuery(query={self.query}, entity={self.entity})"

salinic/schema.py

+17-14
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
from uuid import UUID
21

32
from pydantic import BaseModel, ConfigDict, model_serializer
43

@@ -85,29 +84,33 @@ def needs_transform(self, field_name):
8584
return hasattr(self, f'get_idx_value__{field_name}')
8685

8786

88-
class Page(BaseModel):
89-
id: UUID
90-
page_number: int
91-
text: str | None = None
92-
93-
94-
class Document(BaseModel):
95-
id: UUID
87+
class SearchResultItem(BaseModel):
88+
id: str
9689
title: str
9790
lang: str
9891
tags: list[str] = []
99-
pages: list[Page]
92+
93+
94+
class DocumentPage(SearchResultItem):
95+
page_number: int
96+
document_id: str
10097
entity_type: str = 'document'
10198

10299
def __hash__(self):
103100
return hash(self.model_dump_json())
104101

105102

106-
class Folder(BaseModel):
107-
id: UUID
108-
title: str
109-
tags: list[str] = []
103+
class Folder(SearchResultItem):
110104
entity_type: str = 'folder'
111105

112106
def __hash__(self):
113107
return hash(self.model_dump_json())
108+
109+
110+
class PaginatedResponse(BaseModel):
111+
page_size: int
112+
page_number: int
113+
num_pages: int
114+
items: list[Folder | DocumentPage]
115+
116+
model_config = ConfigDict(from_attributes=True)

0 commit comments

Comments
 (0)