1
1
import json
2
2
import logging
3
3
4
- from glom import glom
5
4
from pydantic import BaseModel
6
5
7
6
from salinic .field import Field
8
7
from salinic .query import SearchQuery
9
- from salinic .schema import Document , Folder , Page
8
+ from salinic .schema import DocumentPage , Folder
10
9
from salinic .utils import first
11
10
12
11
logger = logging .getLogger (__name__ )
@@ -21,129 +20,37 @@ def search(
21
20
self ,
22
21
sq : SearchQuery ,
23
22
user_id : str | None = None
24
- ) -> list [Document | Folder ]:
25
- """Query index
26
-
27
- Solr results are grouped by `document_id` field: this way
28
- all folder entries will be part of group with `document_id=null`,
29
- while all page entities will be grouped per document i.e.
30
- pages which belong together are all in the same group.
31
-
32
- {
33
- "responseHeader":{
34
- ...
35
- "grouped":{
36
- "document_id":{
37
- "matches":26,
38
- "groups":[
39
- "groupValue":null,
40
- "doclist":{"numFound":4,"start":0,"numFoundExact":true,"docs":[
41
- {
42
- "id":"0b663599-32b1-4396-8dbe-ae7cd327cec6",
43
- "lang":"en",
44
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
45
- "entity_type":"folder",
46
- "title_txt_en":"A2 updated",
47
- "_version_":1801539995817738240},
48
- {
49
- "id":"768c6841-d37a-4d02-857f-ab7eaf69b27e",
50
- "lang":"en",
51
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
52
- "entity_type":"folder",
53
- "title_txt_en":".inbox",
54
- "_version_":1801539995692957696}]
55
- }},
56
- {
57
- "groupValue":"9bc57688-302e-4e1f-840a-c747dcccb362",
58
- "doclist":{"numFound":5,"start":0,"numFoundExact":true,"docs":[
59
- {
60
- "id":"a6e4916f-dea6-414b-aa38-f5b9ea375725",
61
- "document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
62
- "lang":"en",
63
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
64
- "page_number":1,
65
- "entity_type":"page",
66
- "title_txt_en":"brother_004603.pdf",
67
- "_version_":1801539996374532096},
68
- {
69
- "id":"72f6ca9e-af4b-4235-a56c-a62508e24efe",
70
- "document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
71
- "lang":"en",
72
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
73
- "page_number":2,
74
- "entity_type":"page",
75
- "title_txt_en":"brother_004603.pdf",
76
- "_version_":1801539996403892224},]
77
- }},
78
- {
79
- "groupValue":"200b0201-cfcd-43df-b41f-f1732568a0d2",
80
- "doclist":{"numFound":2,"start":0,"numFoundExact":true,"docs":[
81
- {
82
- "id":"9fa936e6-fe94-46bf-ad01-d8591cc290d4",
83
- "document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
84
- "lang":"en",
85
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
86
- "page_number":1,
87
- "entity_type":"page",
88
- "title_txt_en":"brother_004598.pdf",
89
- "_version_":1801539995874361344},
90
- {
91
- "id":"c364994c-eab5-4c6a-842a-6f40537f7a2e",
92
- "document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
93
- "lang":"en",
94
- "user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
95
- "page_number":2,
96
- "entity_type":"page",
97
- "title_txt_en":"brother_004598.pdf",
98
- "_version_":1801539995910012928}]
99
- }},
100
- }}]}}}
101
- """
23
+ ) -> list [DocumentPage | Folder ]:
24
+ """Query index"""
102
25
result = self .client .search (sq , user_id )
103
- grouped = glom (result , 'grouped.document_id' )
104
- if glom (grouped , 'matches' ) == 0 :
105
- return []
106
-
107
- result = []
108
- for group in glom (grouped , 'groups' ):
109
- if glom (group , 'groupValue' ):
110
- # groupValue != null => document
111
- document_id = glom (group , 'groupValue' )
112
- title = ''
113
- lang = 'en'
114
- tags = []
115
- pages = []
116
- for page in glom (group , 'doclist.docs' ):
117
- lang = page .get ('lang' , 'en' )
118
- title = page .get (f'title_txt_{ lang } ' , None )
119
- text = page .get (f'text_txt_{ lang } ' , None )
120
- tags = page .get ('tags' , [])
121
- p = Page (
122
- id = page ['id' ],
123
- page_number = page ['page_number' ],
124
- text = text
125
- )
126
- pages .append (p )
127
- item = Document (
128
- id = document_id ,
26
+ items = result ['docs' ]
27
+ returned_list = []
28
+ for item in items :
29
+ if document_id := item .get ('document_id' , None ):
30
+ lang = item .get ('lang' , 'en' )
31
+ title = item .get (f'title_txt_{ lang } ' , lang )
32
+ tags = item .get ('tags' , [])
33
+ dp = DocumentPage (
34
+ id = item ['id' ],
35
+ page_number = item ['page_number' ],
36
+ document_id = document_id ,
129
37
title = title ,
130
38
lang = lang ,
131
- pages = pages ,
132
- tags = tags ,
39
+ tags = tags
133
40
)
134
- result .append (item )
41
+ returned_list .append (dp )
135
42
else :
136
- for folder in glom ( group , 'doclist.docs' ):
137
- lang = folder .get (' lang' , 'en' )
138
- title = folder . get ( f'title_txt_ { lang } ' , None )
139
- item = Folder (
140
- id = folder [ 'id' ] ,
141
- title = title ,
142
- tags = folder .get ('tags' , []),
143
- )
144
- result .append (item )
43
+ lang = item . get ( 'lang' , 'en' )
44
+ title = item .get (f'title_txt_ { lang } ' , lang )
45
+ folder = Folder (
46
+ id = item [ 'id' ],
47
+ title = title ,
48
+ lang = lang ,
49
+ tags = item .get ('tags' , []),
50
+ )
51
+ returned_list .append (folder )
145
52
146
- return result
53
+ return returned_list
147
54
148
55
149
56
class IndexRW (Base ):
0 commit comments