Skip to content

Commit ac90517

Browse files
Adding 3-ary LongHeap to speed up collectors like TopDoc*Collectors (#15140)
1 parent f0d3bbf commit ac90517

File tree

5 files changed

+437
-16
lines changed

5 files changed

+437
-16
lines changed

lucene/CHANGES.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ Improvements
122122

123123
Optimizations
124124
---------------------
125-
(No changes)
125+
* GITHUB#15140: Optimize TopScoreDocCollector with TernaryLongHeap for improved performance over Binary-LongHeap. (Ramakrishna Chilaka)
126126

127127
Bug Fixes
128128
---------------------

lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import java.io.IOException;
2020
import org.apache.lucene.index.LeafReaderContext;
21-
import org.apache.lucene.util.LongHeap;
21+
import org.apache.lucene.util.TernaryLongHeap;
2222

2323
/**
2424
* A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link
@@ -33,15 +33,15 @@
3333
public class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
3434

3535
private final ScoreDoc after;
36-
private final LongHeap heap;
36+
private final TernaryLongHeap heap;
3737
final int totalHitsThreshold;
3838
final MaxScoreAccumulator minScoreAcc;
3939

4040
// prevents instantiation
4141
TopScoreDocCollector(
4242
int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) {
4343
super(null);
44-
this.heap = new LongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
44+
this.heap = new TernaryLongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
4545
this.after = after;
4646
this.totalHitsThreshold = totalHitsThreshold;
4747
this.minScoreAcc = minScoreAcc;

lucene/core/src/java/org/apache/lucene/util/LongHeap.java

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
* A min heap that stores longs; a primitive priority queue that like all priority queues maintains
2323
* a partial ordering of its elements such that the least element can always be found in constant
2424
* time. Put()'s and pop()'s require log(size). This heap provides unbounded growth via {@link
25-
* #push(long)}, and bounded-size insertion based on its nominal maxSize via {@link
25+
* #push(long)}, and bounded-size insertion based on its initial capacity via {@link
2626
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27-
* value of the heap.
27+
* value of the heap. LongHeap implements a 2-ary (binary) heap.
2828
*
2929
* @lucene.internal
3030
*/
3131
public final class LongHeap {
3232

33-
private final int maxSize;
33+
private final int initialCapacity;
3434

3535
private long[] heap;
3636
private int size = 0;
@@ -50,19 +50,21 @@ public LongHeap(int size, long initialValue) {
5050
/**
5151
* Create an empty priority queue of the configured initial size.
5252
*
53-
* @param maxSize the maximum size of the heap, or if negative, the initial size of an unbounded
54-
* heap
53+
* @param initialCapacity the initial capacity of the heap
5554
*/
56-
public LongHeap(int maxSize) {
55+
public LongHeap(int initialCapacity) {
5756
final int heapSize;
58-
if (maxSize < 1 || maxSize >= ArrayUtil.MAX_ARRAY_LENGTH) {
57+
if (initialCapacity < 1 || initialCapacity >= ArrayUtil.MAX_ARRAY_LENGTH) {
5958
// Throw exception to prevent confusing OOME:
6059
throw new IllegalArgumentException(
61-
"maxSize must be > 0 and < " + (ArrayUtil.MAX_ARRAY_LENGTH - 1) + "; got: " + maxSize);
60+
"initialCapacity must be > 0 and < "
61+
+ (ArrayUtil.MAX_ARRAY_LENGTH - 1)
62+
+ "; got: "
63+
+ initialCapacity);
6264
}
6365
// NOTE: we add +1 because all access to heap is 1-based not 0-based. heap[0] is unused.
64-
heapSize = maxSize + 1;
65-
this.maxSize = maxSize;
66+
heapSize = initialCapacity + 1;
67+
this.initialCapacity = initialCapacity;
6668
this.heap = new long[heapSize];
6769
}
6870

@@ -83,13 +85,13 @@ public long push(long element) {
8385

8486
/**
8587
* Adds a value to a LongHeap in log(size) time. If the number of values would exceed the heap's
86-
* maxSize, the least value is discarded.
88+
* initialCapacity, the least value is discarded.
8789
*
8890
* @return whether the value was added (unless the heap is full, or the new value is less than the
8991
* top value)
9092
*/
9193
public boolean insertWithOverflow(long value) {
92-
if (size >= maxSize) {
94+
if (size >= initialCapacity) {
9395
if (value < heap[1]) {
9496
return false;
9597
}
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.util;
18+
19+
import java.util.Arrays;
20+
21+
/**
22+
* A ternary min heap that stores longs; a primitive priority queue that like all priority queues
23+
* maintains a partial ordering of its elements such that the least element can always be found in
24+
* constant time. Put()'s and pop()'s require log_3(size). This heap provides unbounded growth via
25+
* {@link #push(long)}, and bounded-size insertion based on its nominal initial capacity via {@link
26+
* #insertWithOverflow(long)}. The heap is a min heap, meaning that the top element is the lowest
27+
* value of the heap. TernaryLongHeap implements 3-ary heap.
28+
*
29+
* @lucene.internal
30+
*/
31+
public final class TernaryLongHeap {
32+
33+
private final int initialCapacity;
34+
35+
private long[] heap;
36+
private int size = 0;
37+
private static final int ARITY = 3;
38+
39+
/**
40+
* Constructs a heap with specified size and initializes all elements with the given value.
41+
*
42+
* @param size the number of elements to initialize in the heap.
43+
* @param initialValue the value to fill the heap with.
44+
*/
45+
public TernaryLongHeap(int size, long initialValue) {
46+
this(size <= 0 ? 1 : size);
47+
Arrays.fill(heap, 1, size + 1, initialValue);
48+
this.size = size;
49+
}
50+
51+
/**
52+
* Create an empty priority queue of the configured initial size.
53+
*
54+
* @param initialCapacity the initial capacity of the heap
55+
*/
56+
public TernaryLongHeap(int initialCapacity) {
57+
if (initialCapacity < 1 || initialCapacity >= ArrayUtil.MAX_ARRAY_LENGTH) {
58+
// Throw exception to prevent confusing OOME:
59+
throw new IllegalArgumentException(
60+
"initialCapacity must be > 0 and < "
61+
+ (ArrayUtil.MAX_ARRAY_LENGTH - 1)
62+
+ "; got: "
63+
+ initialCapacity);
64+
}
65+
// NOTE: we add +1 because all access to heap is 1-based not 0-based. heap[0] is unused.
66+
final int heapSize = initialCapacity + 1;
67+
this.initialCapacity = initialCapacity;
68+
this.heap = new long[heapSize];
69+
}
70+
71+
/**
72+
* Adds a value in log(size) time. Grows unbounded as needed to accommodate new values.
73+
*
74+
* @return the new 'top' element in the queue.
75+
*/
76+
public long push(long element) {
77+
size++;
78+
if (size == heap.length) {
79+
heap = ArrayUtil.grow(heap, (size * 3 + 1) / 2);
80+
}
81+
heap[size] = element;
82+
TernaryLongHeap.upHeap(heap, size, ARITY);
83+
return heap[1];
84+
}
85+
86+
/**
87+
* Adds a value to an TernaryLongHeap in log(size) time. If the number of values would exceed the
88+
* heap's initialCapacity, the least value is discarded.
89+
*
90+
* @return whether the value was added (unless the heap is full, or the new value is less than the
91+
* top value)
92+
*/
93+
public boolean insertWithOverflow(long value) {
94+
if (size >= initialCapacity) {
95+
if (value < heap[1]) {
96+
return false;
97+
}
98+
updateTop(value);
99+
return true;
100+
}
101+
push(value);
102+
return true;
103+
}
104+
105+
/**
106+
* Returns the least element of the TernaryLongHeap in constant time. It is up to the caller to
107+
* verify that the heap is not empty; no checking is done, and if no elements have been added, 0
108+
* is returned.
109+
*/
110+
public long top() {
111+
return heap[1];
112+
}
113+
114+
/**
115+
* Removes and returns the least element of the PriorityQueue in log(size) time.
116+
*
117+
* @throws IllegalStateException if the TernaryLongHeap is empty.
118+
*/
119+
public long pop() {
120+
if (size > 0) {
121+
long result = heap[1]; // save first value
122+
heap[1] = heap[size]; // move last to first
123+
size--;
124+
TernaryLongHeap.downHeap(heap, 1, size, ARITY); // adjust heap
125+
return result;
126+
} else {
127+
throw new IllegalStateException("The heap is empty");
128+
}
129+
}
130+
131+
/**
132+
* Replace the top of the pq with {@code newTop}. Should be called when the top value changes.
133+
* Still log(n) worst case, but it's at least twice as fast to
134+
*
135+
* <pre class="prettyprint">
136+
* pq.updateTop(value);
137+
* </pre>
138+
*
139+
* <p>instead of
140+
*
141+
* <pre class="prettyprint">
142+
* pq.pop();
143+
* pq.push(value);
144+
* </pre>
145+
*
146+
* <p>Calling this method on an empty TernaryLongHeap has no visible effect.
147+
*
148+
* @param value the new element that is less than the current top.
149+
* @return the new 'top' element after shuffling the heap.
150+
*/
151+
public long updateTop(long value) {
152+
heap[1] = value;
153+
TernaryLongHeap.downHeap(heap, 1, size, ARITY);
154+
return heap[1];
155+
}
156+
157+
/** Returns the number of elements currently stored in the PriorityQueue. */
158+
public int size() {
159+
return size;
160+
}
161+
162+
/** Removes all entries from the PriorityQueue. */
163+
public void clear() {
164+
size = 0;
165+
}
166+
167+
public void pushAll(TernaryLongHeap other) {
168+
for (int i = 1; i <= other.size; i++) {
169+
push(other.heap[i]);
170+
}
171+
}
172+
173+
/**
174+
* Return the element at the ith location in the heap array. Use for iterating over elements when
175+
* the order doesn't matter. Note that the valid arguments range from [1, size].
176+
*/
177+
public long get(int i) {
178+
return heap[i];
179+
}
180+
181+
/**
182+
* This method returns the internal heap array.
183+
*
184+
* @lucene.internal
185+
*/
186+
// pkg-private for testing
187+
long[] getHeapArray() {
188+
return heap;
189+
}
190+
191+
/**
192+
* Restores heap order by moving an element up the heap until it finds its proper position. Works
193+
* with heaps of any arity (number of children per node).
194+
*
195+
* @param heap the heap array (1-based indexing)
196+
* @param i the index of the element to move up
197+
* @param arity the number of children each node can have
198+
*/
199+
static void upHeap(long[] heap, int i, int arity) {
200+
final long value = heap[i]; // save bottom value
201+
while (i > 1) {
202+
// parent formula for 1-based indexing
203+
final int parent = ((i - 2) / arity) + 1;
204+
final long parentVal = heap[parent];
205+
if (value >= parentVal) break;
206+
heap[i] = parentVal; // shift parent down
207+
i = parent;
208+
}
209+
heap[i] = value; // install saved value
210+
}
211+
212+
/**
213+
* Restores heap order by moving an element down the heap until it finds its proper position.
214+
* Works with heaps of any arity (number of children per node).
215+
*
216+
* @param heap the heap array (1-based indexing)
217+
* @param i the index of the element to move down
218+
* @param size the current size of the heap
219+
* @param arity the number of children each node can have
220+
*/
221+
static void downHeap(long[] heap, int i, int size, int arity) {
222+
long value = heap[i]; // save top value
223+
for (; ; ) {
224+
// first child formula for 1-based indexing
225+
int firstChild = arity * (i - 1) + 2;
226+
if (firstChild > size) break; // i is a leaf
227+
228+
int lastChild = Math.min(firstChild + arity - 1, size);
229+
230+
// find the smallest child in [firstChild, lastChild]
231+
int best = firstChild;
232+
long bestVal = heap[firstChild];
233+
234+
for (int c = firstChild + 1; c <= lastChild; c++) {
235+
final long v = heap[c];
236+
if (v < bestVal) {
237+
bestVal = v;
238+
best = c;
239+
}
240+
}
241+
242+
if (bestVal >= value) break;
243+
244+
heap[i] = bestVal;
245+
i = best;
246+
}
247+
heap[i] = value; // install saved value
248+
}
249+
}

0 commit comments

Comments
 (0)