Skip to content

Commit 41dc912

Browse files
committed
Do not group Solr search results by document (return one DocumentPage per matching page)
1 parent 1e73500 commit 41dc912

File tree

6 files changed

+105
-304
lines changed

6 files changed

+105
-304
lines changed

poetry.lock

+1-66
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ python = ">=3.11, <4.0"
1010
pydantic = "^2.7"
1111
requests = "^2.31.0"
1212
xapianpy = {version = "1.4.22.post2406040406", optional = true}
13-
glom = "^23.5.0"
1413

1514
[tool.poetry.extras]
1615
xapian = ["xapianpy"]

salinic/backends/solr/client.py

-5
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,8 @@ class ClientRW(Base):
1919
def search(self, sq: SearchQuery, user_id: str | None = None):
2020
payload = {
2121
'q': sq.query.original_query,
22-
'group': 'true',
23-
'group.field': 'document_id',
2422
'rows': sq.rows,
2523
'start': sq.start,
26-
'group.limit': sq.group_limit,
27-
'group.offset': sq.group_offset,
28-
'group.sort': 'page_number asc'
2924
}
3025

3126
if user_id:

salinic/backends/solr/index.py

+26-119
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import json
22
import logging
33

4-
from glom import glom
54
from pydantic import BaseModel
65

76
from salinic.field import Field
87
from salinic.query import SearchQuery
9-
from salinic.schema import Document, Folder, Page
8+
from salinic.schema import DocumentPage, Folder
109
from salinic.utils import first
1110

1211
logger = logging.getLogger(__name__)
@@ -21,129 +20,37 @@ def search(
2120
self,
2221
sq: SearchQuery,
2322
user_id: str | None = None
24-
) -> list[Document | Folder]:
25-
"""Query index
26-
27-
Solr results are grouped by `document_id` field: this way
28-
all folder entries will be part of group with `document_id=null`,
29-
while all page entities will be grouped per document i.e.
30-
pages which belong together are all in the same group.
31-
32-
{
33-
"responseHeader":{
34-
...
35-
"grouped":{
36-
"document_id":{
37-
"matches":26,
38-
"groups":[
39-
"groupValue":null,
40-
"doclist":{"numFound":4,"start":0,"numFoundExact":true,"docs":[
41-
{
42-
"id":"0b663599-32b1-4396-8dbe-ae7cd327cec6",
43-
"lang":"en",
44-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
45-
"entity_type":"folder",
46-
"title_txt_en":"A2 updated",
47-
"_version_":1801539995817738240},
48-
{
49-
"id":"768c6841-d37a-4d02-857f-ab7eaf69b27e",
50-
"lang":"en",
51-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
52-
"entity_type":"folder",
53-
"title_txt_en":".inbox",
54-
"_version_":1801539995692957696}]
55-
}},
56-
{
57-
"groupValue":"9bc57688-302e-4e1f-840a-c747dcccb362",
58-
"doclist":{"numFound":5,"start":0,"numFoundExact":true,"docs":[
59-
{
60-
"id":"a6e4916f-dea6-414b-aa38-f5b9ea375725",
61-
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
62-
"lang":"en",
63-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
64-
"page_number":1,
65-
"entity_type":"page",
66-
"title_txt_en":"brother_004603.pdf",
67-
"_version_":1801539996374532096},
68-
{
69-
"id":"72f6ca9e-af4b-4235-a56c-a62508e24efe",
70-
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
71-
"lang":"en",
72-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
73-
"page_number":2,
74-
"entity_type":"page",
75-
"title_txt_en":"brother_004603.pdf",
76-
"_version_":1801539996403892224},]
77-
}},
78-
{
79-
"groupValue":"200b0201-cfcd-43df-b41f-f1732568a0d2",
80-
"doclist":{"numFound":2,"start":0,"numFoundExact":true,"docs":[
81-
{
82-
"id":"9fa936e6-fe94-46bf-ad01-d8591cc290d4",
83-
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
84-
"lang":"en",
85-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
86-
"page_number":1,
87-
"entity_type":"page",
88-
"title_txt_en":"brother_004598.pdf",
89-
"_version_":1801539995874361344},
90-
{
91-
"id":"c364994c-eab5-4c6a-842a-6f40537f7a2e",
92-
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
93-
"lang":"en",
94-
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
95-
"page_number":2,
96-
"entity_type":"page",
97-
"title_txt_en":"brother_004598.pdf",
98-
"_version_":1801539995910012928}]
99-
}},
100-
}}]}}}
101-
"""
23+
) -> list[DocumentPage | Folder]:
24+
"""Query index"""
10225
result = self.client.search(sq, user_id)
103-
grouped = glom(result, 'grouped.document_id')
104-
if glom(grouped, 'matches') == 0:
105-
return []
106-
107-
result = []
108-
for group in glom(grouped, 'groups'):
109-
if glom(group, 'groupValue'):
110-
# groupValue != null => document
111-
document_id = glom(group, 'groupValue')
112-
title = ''
113-
lang = 'en'
114-
tags = []
115-
pages = []
116-
for page in glom(group, 'doclist.docs'):
117-
lang = page.get('lang', 'en')
118-
title = page.get(f'title_txt_{lang}', None)
119-
text = page.get(f'text_txt_{lang}', None)
120-
tags = page.get('tags', [])
121-
p = Page(
122-
id=page['id'],
123-
page_number=page['page_number'],
124-
text=text
125-
)
126-
pages.append(p)
127-
item = Document(
128-
id=document_id,
26+
items = result['docs']
27+
returned_list = []
28+
for item in items:
29+
if document_id := item.get('document_id', None):
30+
lang = item.get('lang', 'en')
31+
title = item.get(f'title_txt_{lang}', lang)
32+
tags = item.get('tags', [])
33+
dp = DocumentPage(
34+
id=item['id'],
35+
page_number=item['page_number'],
36+
document_id=document_id,
12937
title=title,
13038
lang=lang,
131-
pages=pages,
132-
tags=tags,
39+
tags=tags
13340
)
134-
result.append(item)
41+
returned_list.append(dp)
13542
else:
136-
for folder in glom(group, 'doclist.docs'):
137-
lang = folder.get('lang', 'en')
138-
title = folder.get(f'title_txt_{lang}', None)
139-
item = Folder(
140-
id=folder['id'],
141-
title=title,
142-
tags=folder.get('tags', []),
143-
)
144-
result.append(item)
43+
lang = item.get('lang', 'en')
44+
title = item.get(f'title_txt_{lang}', lang)
45+
folder = Folder(
46+
id=item['id'],
47+
title=title,
48+
lang=lang,
49+
tags=item.get('tags', []),
50+
)
51+
returned_list.append(folder)
14552

146-
return result
53+
return returned_list
14754

14855

14956
class IndexRW(Base):

salinic/schema.py

+8-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from uuid import UUID
21

32
from pydantic import BaseModel, ConfigDict, model_serializer
43

@@ -85,28 +84,23 @@ def needs_transform(self, field_name):
8584
return hasattr(self, f'get_idx_value__{field_name}')
8685

8786

88-
class Page(BaseModel):
89-
id: UUID
90-
page_number: int
91-
text: str | None = None
92-
93-
94-
class Document(BaseModel):
95-
id: UUID
87+
class SearchResultItem(BaseModel):
88+
id: str
9689
title: str
9790
lang: str
9891
tags: list[str] = []
99-
pages: list[Page]
92+
93+
94+
class DocumentPage(SearchResultItem):
95+
page_number: int
96+
document_id: str
10097
entity_type: str = 'document'
10198

10299
def __hash__(self):
103100
return hash(self.model_dump_json())
104101

105102

106-
class Folder(BaseModel):
107-
id: UUID
108-
title: str
109-
tags: list[str] = []
103+
class Folder(SearchResultItem):
110104
entity_type: str = 'folder'
111105

112106
def __hash__(self):

0 commit comments

Comments
 (0)