Skip to content

Commit aab3c28

Browse files
authored
Use bit arrays for predicate matching in search. (#8684)
1 parent 4afd947 commit aab3c28

File tree

8 files changed

+625
-69
lines changed

8 files changed

+625
-69
lines changed

app/bin/tools/search_benchmark.dart

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Future<void> main(List<String> args) async {
2525

2626
// NOTE: please add more queries to this list, especially if there is a performance bottleneck.
2727
final queries = [
28+
'sdk:dart',
29+
'sdk:flutter platform:android',
30+
'is:flutter-favorite',
2831
'chart',
2932
'json',
3033
'camera',

app/lib/search/mem_index.dart

Lines changed: 46 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import 'package:collection/collection.dart';
1010
import 'package:logging/logging.dart';
1111
import 'package:meta/meta.dart';
1212
import 'package:pub_dev/service/topics/models.dart';
13+
import 'package:pub_dev/third_party/bit_array/bit_array.dart';
1314

1415
import '../shared/utils.dart' show boundedList;
1516
import 'models.dart';
@@ -30,10 +31,9 @@ class InMemoryPackageIndex {
3031
late final TokenIndex<IndexedApiDocPage> _apiSymbolIndex;
3132
late final _scorePool = ScorePool(_packageNameIndex._packageNames);
3233

33-
/// Maps the tag strings to a list of document index values
34-
/// (`PackageDocument doc.tags -> List<_documents.indexOf(doc)>`).
35-
final _tagDocumentIndices = <String, List<int>>{};
36-
final _documentTagIds = <List<int>>[];
34+
/// Maps each tag string to a bit array where bit `i` is set when the document at index `i` has that tag.
35+
/// - (`PackageDocument doc.tags -> BitArray(List<_documents.indexOf(doc)>)`).
36+
final _tagBitArrays = <String, BitArray>{};
3737

3838
/// Adjusted score takes the overall score and transforms
3939
/// it linearly into the [0.4-1.0] range.
@@ -66,12 +66,11 @@ class InMemoryPackageIndex {
6666
_nameToIndex[doc.package] = i;
6767

6868
// transform tags into numerical IDs
69-
final tagIds = <int>[];
7069
for (final tag in doc.tags) {
71-
_tagDocumentIndices.putIfAbsent(tag, () => []).add(i);
70+
_tagBitArrays
71+
.putIfAbsent(tag, () => BitArray(_documents.length))
72+
.setBit(i);
7273
}
73-
tagIds.sort();
74-
_documentTagIds.add(tagIds);
7574

7675
final apiDocPages = doc.apiDocPages;
7776
if (apiDocPages != null) {
@@ -138,66 +137,58 @@ class InMemoryPackageIndex {
138137

139138
PackageSearchResult search(ServiceSearchQuery query) {
140139
// prevent any work if offset is outside of the range
141-
if ((query.offset ?? 0) > _documents.length) {
140+
if ((query.offset ?? 0) >= _documents.length) {
142141
return PackageSearchResult.empty();
143142
}
144143
return _scorePool.withScore(
145-
value: 1.0,
144+
value: 0.0,
146145
fn: (score) {
147146
return _search(query, score);
148147
},
149148
);
150149
}
151150

152151
PackageSearchResult _search(
153-
ServiceSearchQuery query, IndexedScore<String> packageScores) {
154-
// filter on package prefix
155-
if (query.parsedQuery.packagePrefix != null) {
156-
final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
157-
packageScores.retainWhere(
158-
(i, _) => _documents[i].packageNameLowerCased.startsWith(prefix),
159-
);
160-
}
152+
ServiceSearchQuery query,
153+
IndexedScore<String> packageScores,
154+
) {
155+
// TODO: implement pooling of this object similarly to [ScorePool].
156+
final packages = BitArray(_documents.length)
157+
..setRange(0, _documents.length);
161158

162159
// filter on tags
163160
final combinedTagsPredicate =
164161
query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
165162
if (combinedTagsPredicate.isNotEmpty) {
166163
for (final entry in combinedTagsPredicate.entries) {
167-
final docIndexes = _tagDocumentIndices[entry.key];
168-
164+
final tagBits = _tagBitArrays[entry.key];
169165
if (entry.value) {
170-
// predicate is required, zeroing the gaps between index values
171-
if (docIndexes == null) {
172-
// the predicate is required, no document will match it
166+
if (tagBits == null) {
167+
// the predicate is not matched by any document
173168
return PackageSearchResult.empty();
174169
}
175-
176-
for (var i = 0; i < docIndexes.length; i++) {
177-
if (i == 0) {
178-
packageScores.fillRange(0, docIndexes[i], 0.0);
179-
continue;
180-
}
181-
packageScores.fillRange(docIndexes[i - 1] + 1, docIndexes[i], 0.0);
182-
}
183-
packageScores.fillRange(docIndexes.last + 1, _documents.length, 0.0);
170+
packages.and(tagBits);
184171
} else {
185-
// predicate is prohibited, zeroing the values
186-
187-
if (docIndexes == null) {
188-
// the predicate is prohibited, no document has it, always a match
172+
if (tagBits == null) {
173+
// negative predicate without index means all documents are matched
189174
continue;
190175
}
191-
for (final i in docIndexes) {
192-
packageScores.setValue(i, 0.0);
193-
}
176+
packages.andNot(tagBits);
194177
}
195178
}
196179
}
197180

181+
// filter on package prefix
182+
if (query.parsedQuery.packagePrefix != null) {
183+
final prefix = query.parsedQuery.packagePrefix!.toLowerCase();
184+
packages.clearWhere(
185+
(i) => !_documents[i].packageNameLowerCased.startsWith(prefix),
186+
);
187+
}
188+
198189
// filter on dependency
199190
if (query.parsedQuery.hasAnyDependency) {
200-
packageScores.removeWhere((i, _) {
191+
packages.clearWhere((i) {
201192
final doc = _documents[i];
202193
if (doc.dependencies.isEmpty) return true;
203194
for (final dependency in query.parsedQuery.allDependencies) {
@@ -213,22 +204,29 @@ class InMemoryPackageIndex {
213204

214205
// filter on points
215206
if (query.minPoints != null && query.minPoints! > 0) {
216-
packageScores.removeWhere(
217-
(i, _) => _documents[i].grantedPoints < query.minPoints!);
207+
packages
208+
.clearWhere((i) => _documents[i].grantedPoints < query.minPoints!);
218209
}
219210

220211
// filter on updatedDuration
221212
final updatedDuration = query.parsedQuery.updatedDuration;
222213
if (updatedDuration != null && updatedDuration > Duration.zero) {
223214
final now = clock.now();
224-
packageScores.removeWhere(
225-
(i, _) => now.difference(_documents[i].updated) > updatedDuration);
215+
packages.clearWhere(
216+
(i) => now.difference(_documents[i].updated) > updatedDuration);
217+
}
218+
219+
// TODO: find a better way to handle predicate-only filtering and scoring
220+
for (final index in packages.asIntIterable()) {
221+
if (index >= _documents.length) break;
222+
packageScores.setValue(index, 1.0);
226223
}
227224

228225
// do text matching
229226
final parsedQueryText = query.parsedQuery.text;
230227
final textResults = _searchText(
231228
packageScores,
229+
packages,
232230
parsedQueryText,
233231
textMatchExtent: query.textMatchExtent ?? TextMatchExtent.api,
234232
);
@@ -362,6 +360,7 @@ class InMemoryPackageIndex {
362360

363361
_TextResults? _searchText(
364362
IndexedScore<String> packageScores,
363+
BitArray packages,
365364
String? text, {
366365
required TextMatchExtent textMatchExtent,
367366
}) {
@@ -372,12 +371,14 @@ class InMemoryPackageIndex {
372371
final sw = Stopwatch()..start();
373372
final words = splitForQuery(text);
374373
if (words.isEmpty) {
374+
// packages.clearAll();
375375
packageScores.fillRange(0, packageScores.length, 0);
376376
return _TextResults.empty();
377377
}
378378

379379
final matchName = textMatchExtent.shouldMatchName();
380380
if (!matchName) {
381+
// packages.clearAll();
381382
packageScores.fillRange(0, packageScores.length, 0);
382383
return _TextResults.empty(
383384
errorMessage:
@@ -394,12 +395,6 @@ class InMemoryPackageIndex {
394395
return aborted;
395396
}
396397

397-
// Multiple words are scored separately, and then the individual scores
398-
// are multiplied. We can use a package filter that is applied after each
399-
// word to reduce the scope of the later words based on the previous results.
400-
/// However, API docs search should be filtered on the original list.
401-
final indexedPositiveList = packageScores.toIndexedPositiveList();
402-
403398
final matchDescription = textMatchExtent.shouldMatchDescription();
404399
final matchReadme = textMatchExtent.shouldMatchReadme();
405400
final matchApi = textMatchExtent.shouldMatchApi();
@@ -435,7 +430,7 @@ class InMemoryPackageIndex {
435430
if (value < 0.01) continue;
436431

437432
final doc = symbolPages.keys[i];
438-
if (!indexedPositiveList[doc.index]) continue;
433+
if (!packages[doc.index]) continue;
439434

440435
// skip if the previously found pages are better than the current one
441436
final pages =

app/lib/search/token_index.dart

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -227,24 +227,6 @@ class IndexedScore<K> {
227227
_values.fillRange(start, end, fillValue);
228228
}
229229

230-
void removeWhere(bool Function(int index, K key) fn) {
231-
for (var i = 0; i < length; i++) {
232-
if (isNotPositive(i)) continue;
233-
if (fn(i, _keys[i])) {
234-
_values[i] = 0.0;
235-
}
236-
}
237-
}
238-
239-
void retainWhere(bool Function(int index, K key) fn) {
240-
for (var i = 0; i < length; i++) {
241-
if (isNotPositive(i)) continue;
242-
if (!fn(i, _keys[i])) {
243-
_values[i] = 0.0;
244-
}
245-
}
246-
}
247-
248230
void multiplyAllFrom(IndexedScore other) {
249231
multiplyAllFromValues(other._values);
250232
}

app/lib/third_party/bit_array/LICENSE

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Copyright 2018, the project authors. All rights reserved.
2+
Redistribution and use in source and binary forms, with or without
3+
modification, are permitted provided that the following conditions are
4+
met:
5+
6+
* Redistributions of source code must retain the above copyright
7+
notice, this list of conditions and the following disclaimer.
8+
* Redistributions in binary form must reproduce the above
9+
copyright notice, this list of conditions and the following
10+
disclaimer in the documentation and/or other materials provided
11+
with the distribution.
12+
* Neither the name of the project nor the names of its
13+
contributors may be used to endorse or promote products derived
14+
from this software without specific prior written permission.
15+
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Note: this library is vendored from `package:bit_array`.

0 commit comments

Comments
 (0)