Skip to content

Commit 64d6646

Browse files
committed
process solr grouped results
1 parent afc76c5 commit 64d6646

File tree

9 files changed

+410
-61
lines changed

9 files changed

+410
-61
lines changed

poetry.lock

+66-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ python = ">=3.11, <4.0"
1010
pydantic = "^2.7"
1111
requests = "^2.31.0"
1212
xapianpy = {version = "1.4.22.post2406040406", optional = true}
13+
glom = "^23.5.0"
1314

1415
[tool.poetry.extras]
1516
xapian = ["xapianpy"]

salinic/backends/solr/client.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,30 @@ def __init__(self, url: URL):
1616

1717
class ClientRW(Base):
1818

19-
def search(self, sq: SearchQuery, user_id: str | None = None):
    """Run a grouped search against Solr and return the decoded response.

    Hits are grouped by the `document_id` field and, inside every group,
    sorted by `page_number` ascending. `sq.rows` / `sq.start` are sent as
    `rows` / `start`, and `sq.group_limit` / `sq.group_offset` as
    `group.limit` / `group.offset`.

    :param sq: search query; `sq.query.original_query` is sent as `q`.
    :param user_id: when given, only entries whose `user_id` field equals
        this value are returned.
    :return: the Solr response body parsed from JSON.
    """
    payload = {
        'q': sq.query.original_query,
        'group': 'true',
        'group.field': 'document_id',
        'rows': sq.rows,
        'start': sq.start,
        'group.limit': sq.group_limit,
        'group.offset': sq.group_offset,
        'group.sort': 'page_number asc',
    }

    if user_id:
        # Restrict by owner with a filter query rather than by appending
        # "AND user_id:..." to `q`: when the user's query contains its own
        # boolean operators (e.g. "foo OR bar") the appended clause may end
        # up applying to only a part of the expression, which would expose
        # other users' entries. `fq` is applied independently of `q`.
        # Backslashes and double quotes are escaped so the value cannot
        # break out of the quoted term.
        escaped = str(user_id).replace('\\', '\\\\').replace('"', '\\"')
        payload['fq'] = f'user_id:"{escaped}"'

    # Log before issuing the request so the payload is visible even when
    # the request itself fails.
    logger.debug(payload)

    response = requests.get(
        self.http_select_url,
        params=payload
    )

    return response.json()
3043

3144
def add(self, some_dict):
3245
# change data specific for add

salinic/backends/solr/index.py

+124-25
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import json
22
import logging
33

4+
from glom import glom
45
from pydantic import BaseModel
56

67
from salinic.field import Field
78
from salinic.query import SearchQuery
8-
from salinic.utils import filter_keys, first, trim_suffixes
9+
from salinic.schema import Document, Folder, Page
10+
from salinic.utils import first
911

1012
logger = logging.getLogger(__name__)
1113

@@ -15,34 +17,131 @@ def __init__(self, client, schema):
1517
self.client = client
1618
self.schema = schema
1719

18-
def search(
    self,
    sq: SearchQuery,
    user_id: str | None = None
) -> list[Document | Folder]:
    """Query the index and return the matching documents and folders.

    Solr results are grouped by the `document_id` field: this way all
    folder entries are part of the group with `document_id=null`, while
    page entries are grouped per document, i.e. pages which belong
    together are all in the same group.

    Abridged example of the response being processed::

        {
          "responseHeader": {...},
          "grouped": {
            "document_id": {
              "matches": 26,
              "groups": [
                {
                  "groupValue": null,
                  "doclist": {"numFound": 4, "start": 0, "docs": [
                    {
                      "id": "0b663599-32b1-4396-8dbe-ae7cd327cec6",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "entity_type": "folder",
                      "title_txt_en": "A2 updated",
                      "_version_": 1801539995817738240
                    }
                  ]}
                },
                {
                  "groupValue": "9bc57688-302e-4e1f-840a-c747dcccb362",
                  "doclist": {"numFound": 5, "start": 0, "docs": [
                    {
                      "id": "a6e4916f-dea6-414b-aa38-f5b9ea375725",
                      "document_id": "9bc57688-302e-4e1f-840a-c747dcccb362",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "page_number": 1,
                      "entity_type": "page",
                      "title_txt_en": "brother_004603.pdf",
                      "_version_": 1801539996374532096
                    }
                  ]}
                }
              ]
            }
          }
        }

    :param sq: search query handed to the client unchanged.
    :param user_id: optional owner restriction, handed to the client
        unchanged.
    :return: one `Document` per group with a non-null `groupValue`
        (holding that group's pages) and one `Folder` per entry of the
        null group, in the order Solr returned the groups. Empty list
        when `matches` is 0.
    """
    response = self.client.search(sq, user_id)
    grouped = glom(response, 'grouped.document_id')
    if glom(grouped, 'matches') == 0:
        return []

    result = []
    for group in glom(grouped, 'groups'):
        document_id = glom(group, 'groupValue')
        entries = glom(group, 'doclist.docs')

        if document_id:
            # groupValue != null => the entries are pages of one document
            title = ''
            lang = 'en'
            tags = []
            pages = []
            for page in entries:
                # Document-level attributes are repeated on every page
                # entry; the values of the last page win.
                lang = page.get('lang', 'en')
                # `Document.title` is a required `str`: fall back to an
                # empty string (not None) when the entry has no title in
                # its language, otherwise model validation fails.
                title = page.get(f'title_txt_{lang}', '')
                tags = page.get('tags', [])
                pages.append(
                    Page(
                        id=page['id'],
                        page_number=page['page_number'],
                        text=page.get(f'text_txt_{lang}', None)
                    )
                )
            result.append(
                Document(
                    id=document_id,
                    title=title,
                    lang=lang,
                    pages=pages,
                    tags=tags,
                )
            )
        else:
            # groupValue == null => every entry is a folder of its own
            for folder in entries:
                lang = folder.get('lang', 'en')
                result.append(
                    Folder(
                        id=folder['id'],
                        # same `str` requirement as for Document.title
                        title=folder.get(f'title_txt_{lang}', ''),
                        tags=folder.get('tags', []),
                    )
                )

    return result
48147

salinic/backends/xapian/index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def remove(self, **kwargs):
135135

136136
self.client.delete_document(id_term)
137137

138-
def search(self, sq: SearchQuery):
138+
def search(self, sq: SearchQuery, user_id: str | None = None):
139139
results = []
140140

141141
if str(sq.query.free_text):

salinic/index.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ def __init__(self, engine, schema):
88
self.engine = engine
99
self.schema = schema
1010

11-
def search(self, sq: SearchQuery, user_id: str | None = None):
    """Forward the search to the backend and return its result as is.

    :param sq: search query, passed through unchanged.
    :param user_id: optional user id, passed through unchanged.
    """
    backend = self.backend
    return backend.search(sq, user_id)
1313

1414

1515
class IndexRO(IndexBase):

salinic/query.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,26 @@ def __repr__(self):
169169

170170
class SearchQuery:
    """Search request: the parsed query plus paging parameters.

    `rows` / `start` and `group_limit` / `group_offset` are plain integers
    stored on the instance; the Solr client sends them as `rows`, `start`,
    `group.limit` and `group.offset` respectively.
    """
    query: Query
    rows: int
    start: int
    group_limit: int
    group_offset: int

    def __init__(
        self,
        entity,
        q: str,
        rows: int = 100,
        start: int = 0,
        group_limit: int = 100,
        group_offset: int = 0
    ):
        """Store `entity` and the paging values; wrap `q` in a `Query`."""
        self.entity = entity
        self.query = Query(q)
        self.rows, self.start = rows, start
        self.group_limit, self.group_offset = group_limit, group_offset

    def __str__(self):
        # Paging values are intentionally not part of the representation.
        text = f"SearchQuery(query={self.query}, entity={self.entity})"
        return text

salinic/schema.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from uuid import UUID
2+
13
from pydantic import BaseModel, ConfigDict, model_serializer
24

35
from .field import Field, IdField
@@ -81,3 +83,31 @@ class Index(Schema):
8183

8284
def needs_transform(self, field_name):
8385
return hasattr(self, f'get_idx_value__{field_name}')
86+
87+
88+
class Page(BaseModel):
    """One page of a document as found in the search results."""

    # identifier of the page entry
    id: UUID
    # position of the page inside its document
    page_number: int
    # text of the page; stays None when no text is supplied
    text: str | None = None
92+
93+
94+
class Document(BaseModel):
    """Document search result together with its pages."""

    # identifier of the document
    id: UUID
    title: str
    lang: str
    # optional; an empty list when not supplied
    tags: list[str] = []
    pages: list[Page]
    # fixed discriminator telling documents apart from folders
    entity_type: str = 'document'

    def __hash__(self):
        # Hash the JSON dump of the model so instances are hashable.
        serialized = self.model_dump_json()
        return hash(serialized)
104+
105+
106+
class Folder(BaseModel):
    """Folder search result."""

    # identifier of the folder
    id: UUID
    title: str
    # optional; an empty list when not supplied
    tags: list[str] = []
    # fixed discriminator telling folders apart from documents
    entity_type: str = 'folder'

    def __hash__(self):
        # Hash the JSON dump of the model so instances are hashable.
        serialized = self.model_dump_json()
        return hash(serialized)

0 commit comments

Comments
 (0)