1
1
import json
2
2
import logging
3
3
4
+ from glom import glom
4
5
from pydantic import BaseModel
5
6
6
7
from salinic .field import Field
7
8
from salinic .query import SearchQuery
8
- from salinic .utils import filter_keys , first , trim_suffixes
9
+ from salinic .schema import Document , Folder , Page
10
+ from salinic .utils import first
9
11
10
12
logger = logging .getLogger (__name__ )
11
13
@@ -15,34 +17,131 @@ def __init__(self, client, schema):
15
17
self .client = client
16
18
self .schema = schema
17
19
18
def search(
    self,
    sq: SearchQuery,
    user_id: str | None = None
) -> list[Document | Folder]:
    """Query the index and return the matching documents and folders.

    The Solr response is expected to be grouped by the `document_id`
    field. Folder entries carry no `document_id`, so they all land in
    the group whose `groupValue` is null; page entries are grouped per
    document, i.e. pages which belong together are in the same group.

    Abridged example of the response shape this method reads:

        {
          "responseHeader": {"...": "..."},
          "grouped": {
            "document_id": {
              "matches": 26,
              "groups": [
                {
                  "groupValue": null,
                  "doclist": {"numFound": 4, "start": 0, "docs": [
                    {
                      "id": "0b663599-32b1-4396-8dbe-ae7cd327cec6",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "entity_type": "folder",
                      "title_txt_en": "A2 updated",
                      "_version_": 1801539995817738240
                    },
                    {
                      "id": "768c6841-d37a-4d02-857f-ab7eaf69b27e",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "entity_type": "folder",
                      "title_txt_en": ".inbox",
                      "_version_": 1801539995692957696
                    }
                  ]}
                },
                {
                  "groupValue": "9bc57688-302e-4e1f-840a-c747dcccb362",
                  "doclist": {"numFound": 5, "start": 0, "docs": [
                    {
                      "id": "a6e4916f-dea6-414b-aa38-f5b9ea375725",
                      "document_id": "9bc57688-302e-4e1f-840a-c747dcccb362",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "page_number": 1,
                      "entity_type": "page",
                      "title_txt_en": "brother_004603.pdf",
                      "_version_": 1801539996374532096
                    },
                    {
                      "id": "72f6ca9e-af4b-4235-a56c-a62508e24efe",
                      "document_id": "9bc57688-302e-4e1f-840a-c747dcccb362",
                      "lang": "en",
                      "user_id": "4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
                      "page_number": 2,
                      "entity_type": "page",
                      "title_txt_en": "brother_004603.pdf",
                      "_version_": 1801539996403892224
                    }
                  ]}
                }
              ]
            }
          }
        }

    Args:
        sq: the search query, passed through to the client unchanged.
        user_id: passed through to the client as second argument
            (presumably restricts results to one user -- confirm
            against the client implementation).

    Returns:
        One `Folder` per entry of the null group and one `Document`
        (with its `Page` list) per non-null group, in the order the
        groups appear in the response. Empty list when the response
        reports zero matches.
    """
    response = self.client.search(sq, user_id)
    grouped = glom(response, 'grouped.document_id')
    if glom(grouped, 'matches') == 0:
        return []

    result = []
    for group in glom(grouped, 'groups'):
        # Read the group key once; a falsy value (null) marks folders.
        document_id = glom(group, 'groupValue')
        entries = glom(group, 'doclist.docs')

        if not document_id:
            # groupValue == null => every entry is a folder
            for folder in entries:
                lang = folder.get('lang', 'en')
                result.append(
                    Folder(
                        id=folder['id'],
                        title=folder.get(f'title_txt_{lang}', None),
                        tags=folder.get('tags', []),
                    )
                )
            continue

        # groupValue != null => entries are the pages of one document.
        # Document level attributes are taken from the page entries;
        # each page overwrites them, so the last page in the group wins.
        # NOTE(review): a page without a `title_txt_<lang>` field sets
        # the title to None -- confirm Document accepts that.
        title = ''
        lang = 'en'
        tags = []
        pages = []
        for page in entries:
            lang = page.get('lang', 'en')
            title = page.get(f'title_txt_{lang}', None)
            tags = page.get('tags', [])
            pages.append(
                Page(
                    id=page['id'],
                    page_number=page['page_number'],
                    text=page.get(f'text_txt_{lang}', None),
                )
            )
        result.append(
            Document(
                id=document_id,
                title=title,
                lang=lang,
                pages=pages,
                tags=tags,
            )
        )

    return result
48
147
0 commit comments