Move HitQueue in TopScoreDocCollector to a LongHeap (#14714)

gf2121 · web-flow · commit a309bd6da9b8 · 2025-06-06T14:18:15.000+08:00
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -144,6 +144,8 @@ Optimizations
 
 * GITHUB#14674: Optimize AbstractKnnVectorQuery#createBitSet with intoBitset. (Guo Feng)
 
+* GITHUB#14714: Move HitQueue in TopScoreDocCollector to a LongHeap. (Guo Feng)
+
 * GITHUB#14720: Cache high-order bits of hashcode to speed up BytesRefHash. (Pan Guixin)
 
 * GITHUB#14753: Implement IndexedDISI#docIDRunEnd. (Ge Song)
diff --git a/lucene/core/src/java/org/apache/lucene/search/DocScoreEncoder.java b/lucene/core/src/java/org/apache/lucene/search/DocScoreEncoder.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search;
+
+import org.apache.lucene.util.NumericUtils;
+
+/**
+ * Encodes a {@code (doc, score)} pair as a single {@code long} whose natural order is the same as
+ * {@code Comparator.<ScoreDoc>comparing(sd -> sd.score, Float::compare)
+ * .thenComparing(Comparator.<ScoreDoc>comparingInt(sd -> sd.doc).reversed())}: a higher score
+ * yields a larger code and, among equal scores, the smaller doc id yields the larger code.
+ */
+final class DocScoreEncoder {
+
+  /**
+   * Code of the pair {@code (Integer.MAX_VALUE, Float.NEGATIVE_INFINITY)}. {@link
+   * TopScoreDocCollector} pre-fills its heap with this value and treats it as a sentinel.
+   */
+  static final long LEAST_COMPETITIVE_CODE = encode(Integer.MAX_VALUE, Float.NEGATIVE_INFINITY);
+
+  // Static helpers only; never instantiated.
+  private DocScoreEncoder() {}
+
+  /**
+   * Packs the sortable-int form of {@code score} into the upper 32 bits and {@code
+   * Integer.MAX_VALUE - docId} into the lower 32 bits.
+   *
+   * @param docId the document id
+   * @param score the score of the document
+   * @return the encoded pair
+   */
+  static long encode(int docId, float score) {
+    final long scoreBits = ((long) NumericUtils.floatToSortableInt(score)) << 32;
+    // Mask to 32 bits so that an overflowed (negative) difference cannot sign-extend into the
+    // score half. For docId >= 0 the difference is non-negative and the mask is a no-op.
+    final long docBits = (Integer.MAX_VALUE - docId) & 0xFFFFFFFFL;
+    return scoreBits | docBits;
+  }
+
+  /** Returns the score stored in the upper 32 bits of {@code value}. */
+  static float toScore(long value) {
+    return NumericUtils.sortableIntToFloat((int) (value >>> 32));
+  }
+
+  /** Returns the doc id stored in the lower 32 bits of {@code value}. */
+  static int docId(long value) {
+    return Integer.MAX_VALUE - ((int) value);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java
@@ -22,10 +22,10 @@
 /** Maintains the maximum score and its corresponding document id concurrently */
 final class MaxScoreAccumulator {
   // we use 2^10-1 to check the remainder with a bitwise operation
-  static final int DEFAULT_INTERVAL = 0x3ff;
+  private static final int DEFAULT_INTERVAL = 0x3ff;
 
   // scores are always positive
-  final LongAccumulator acc = new LongAccumulator(MaxScoreAccumulator::maxEncode, Long.MIN_VALUE);
+  final LongAccumulator acc = new LongAccumulator(Math::max, Long.MIN_VALUE);
 
   // non-final and visible for tests
   long modInterval;
@@ -34,35 +34,8 @@ final class MaxScoreAccumulator {
     this.modInterval = DEFAULT_INTERVAL;
   }
 
-  /**
-   * Return the max encoded docId and score found in the two longs, following the encoding in {@link
-   * #accumulate}.
-   */
-  private static long maxEncode(long v1, long v2) {
-    float score1 = Float.intBitsToFloat((int) (v1 >> 32));
-    float score2 = Float.intBitsToFloat((int) (v2 >> 32));
-    int cmp = Float.compare(score1, score2);
-    if (cmp == 0) {
-      // tie-break on the minimum doc base
-      return (int) v1 < (int) v2 ? v1 : v2;
-    } else if (cmp > 0) {
-      return v1;
-    }
-    return v2;
-  }
-
-  void accumulate(int docId, float score) {
-    assert docId >= 0 && score >= 0;
-    long encode = (((long) Float.floatToIntBits(score)) << 32) | docId;
-    acc.accumulate(encode);
-  }
-
-  public static float toScore(long value) {
-    return Float.intBitsToFloat((int) (value >> 32));
-  }
-
-  public static int docId(long value) {
-    return (int) value;
+  void accumulate(long code) {
+    acc.accumulate(code);
   }
 
   long getRaw() {
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopDocsCollector.java
@@ -153,18 +153,25 @@ public TopDocs topDocs(int start, int howMany) {
     howMany = Math.min(size - start, howMany);
     ScoreDoc[] results = new ScoreDoc[howMany];
 
-    // pq's pop() returns the 'least' element in the queue, therefore need
-    // to discard the first ones, until we reach the requested range.
+    // Prune the least competitive hits until we reach the requested range.
     // Note that this loop will usually not be executed, since the common usage
     // should be that the caller asks for the last howMany results. However it's
     // needed here for completeness.
-    for (int i = pq.size() - start - howMany; i > 0; i--) {
-      pq.pop();
-    }
+    pruneLeastCompetitiveHitsTo(start + howMany);
 
     // Get the requested results from pq.
     populateResults(results, howMany);
 
     return newTopDocs(results, start);
   }
+
+  /**
+   * Pops entries off {@code pq} until at most {@code keep} of them remain. {@link #topDocs(int,
+   * int)} calls this right before {@link #populateResults} so that the queue holds no hits beyond
+   * the requested range.
+   */
+  protected void pruneLeastCompetitiveHitsTo(int keep) {
+    int surplus = pq.size() - keep;
+    while (surplus > 0) {
+      pq.pop();
+      surplus--;
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
@@ -367,7 +367,7 @@ protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOExcepti
       long maxMinScore = minScoreAcc.getRaw();
       float score;
       if (maxMinScore != Long.MIN_VALUE
-          && (score = MaxScoreAccumulator.toScore(maxMinScore)) > minCompetitiveScore) {
+          && (score = DocScoreEncoder.toScore(maxMinScore)) > minCompetitiveScore) {
         scorer.setMinCompetitiveScore(score);
         minCompetitiveScore = score;
         totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
@@ -384,7 +384,7 @@ protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
         minCompetitiveScore = minScore;
         totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
         if (minScoreAcc != null) {
-          minScoreAcc.accumulate(docBase, minScore);
+          minScoreAcc.accumulate(DocScoreEncoder.encode(docBase, minScore));
         }
       }
     }
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.LongHeap;
 
 /**
  * A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link
@@ -32,31 +33,20 @@
 public class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
 
   private final ScoreDoc after;
+  private final LongHeap heap;
   final int totalHitsThreshold;
   final MaxScoreAccumulator minScoreAcc;
 
   // prevents instantiation
   TopScoreDocCollector(
       int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) {
-    super(new HitQueue(numHits, true));
+    super(null);
+    this.heap = new LongHeap(numHits, DocScoreEncoder.LEAST_COMPETITIVE_CODE);
     this.after = after;
     this.totalHitsThreshold = totalHitsThreshold;
     this.minScoreAcc = minScoreAcc;
   }
 
-  @Override
-  protected int topDocsSize() {
-    // Note: this relies on sentinel values having Integer.MAX_VALUE as a doc ID.
-    int[] validTopHitCount = new int[1];
-    pq.forEach(
-        scoreDoc -> {
-          if (scoreDoc.doc != Integer.MAX_VALUE) {
-            validTopHitCount[0]++;
-          }
-        });
-    return validTopHitCount[0];
-  }
-
   @Override
   protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
     return results == null
@@ -86,9 +76,8 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept
     return new LeafCollector() {
 
       private Scorable scorer;
-      // HitQueue implements getSentinelObject to return a ScoreDoc, so we know
-      // that at this point top() is already initialized.
-      private ScoreDoc pqTop = pq.top();
+      private long topCode = heap.top();
+      private float topScore = DocScoreEncoder.toScore(topCode);
       private float minCompetitiveScore;
 
       @Override
@@ -121,7 +110,7 @@ public void collect(int doc) throws IOException {
           return;
         }
 
-        if (score <= pqTop.score) {
+        if (score <= topScore) {
           // Note: for queries that match lots of hits, this is the common case: most hits are not
           // competitive.
           if (hitCountSoFar == totalHitsThreshold + 1) {
@@ -139,9 +128,9 @@ public void collect(int doc) throws IOException {
       }
 
       private void collectCompetitiveHit(int doc, float score) throws IOException {
-        pqTop.doc = doc + docBase;
-        pqTop.score = score;
-        pqTop = pq.updateTop();
+        final long code = DocScoreEncoder.encode(doc + docBase, score);
+        topCode = heap.updateTop(code);
+        topScore = DocScoreEncoder.toScore(topCode);
         updateMinCompetitiveScore(scorer);
       }
 
@@ -152,8 +141,8 @@ private void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException
           // since we tie-break on doc id and collect in doc id order we can require
           // the next float if the global minimum score is set on a document id that is
           // smaller than the ids in the current leaf
-          float score = MaxScoreAccumulator.toScore(maxMinScore);
-          score = docBase >= MaxScoreAccumulator.docId(maxMinScore) ? Math.nextUp(score) : score;
+          float score = DocScoreEncoder.toScore(maxMinScore);
+          score = docBase >= DocScoreEncoder.docId(maxMinScore) ? Math.nextUp(score) : score;
           if (score > minCompetitiveScore) {
             scorer.setMinCompetitiveScore(score);
             minCompetitiveScore = score;
@@ -168,19 +157,45 @@ private void updateMinCompetitiveScore(Scorable scorer) throws IOException {
           // pqTop is never null since TopScoreDocCollector fills the priority queue with sentinel
           // values if the top element is a sentinel value, its score will be -Infty and the below
           // logic is still valid
-          float localMinScore = Math.nextUp(pqTop.score);
+          float localMinScore = Math.nextUp(topScore);
           if (localMinScore > minCompetitiveScore) {
             scorer.setMinCompetitiveScore(localMinScore);
             totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
             minCompetitiveScore = localMinScore;
             if (minScoreAcc != null) {
               // we don't use the next float but we register the document id so that other leaves or
               // leaf partitions can require it if they are after the current maximum
-              minScoreAcc.accumulate(pqTop.doc, pqTop.score);
+              minScoreAcc.accumulate(topCode);
             }
           }
         }
       }
     };
   }
+
+  /** Counts the heap entries whose code differs from the sentinel fill value. */
+  @Override
+  protected int topDocsSize() {
+    int realHits = 0;
+    // Heap slots are read from index 1 up to and including heap.size().
+    for (int slot = 1; slot <= heap.size(); slot++) {
+      if (heap.get(slot) == DocScoreEncoder.LEAST_COMPETITIVE_CODE) {
+        continue;
+      }
+      realHits++;
+    }
+    return realHits;
+  }
+
+  /**
+   * Pops {@code howMany} codes off the heap and decodes each into a {@link ScoreDoc}, writing them
+   * into {@code results} from the last slot down to the first.
+   */
+  @Override
+  protected void populateResults(ScoreDoc[] results, int howMany) {
+    int slot = howMany;
+    while (--slot >= 0) {
+      final long code = heap.pop();
+      final int doc = DocScoreEncoder.docId(code);
+      final float score = DocScoreEncoder.toScore(code);
+      results[slot] = new ScoreDoc(doc, score);
+    }
+  }
+
+  /** Pops entries off the long heap until at most {@code keep} of them remain. */
+  @Override
+  protected void pruneLeastCompetitiveHitsTo(int keep) {
+    int surplus = heap.size() - keep;
+    while (surplus-- > 0) {
+      heap.pop();
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/LongHeap.java b/lucene/core/src/java/org/apache/lucene/util/LongHeap.java
@@ -16,6 +16,8 @@
  */
 package org.apache.lucene.util;
 
+import java.util.Arrays;
+
 /**
  * A min heap that stores longs; a primitive priority queue that like all priority queues maintains
  * a partial ordering of its elements such that the least element can always be found in constant
@@ -33,6 +35,18 @@ public final class LongHeap {
   private long[] heap;
   private int size = 0;
 
+  /**
+   * Constructs a heap that starts out full: it holds {@code size} entries, every one equal to
+   * {@code initialValue}. Since all entries are equal, the filled array already satisfies the heap
+   * ordering and needs no further arrangement.
+   *
+   * @param size the number of elements to initialize in the heap.
+   * @param initialValue the value to fill the heap with.
+   */
+  public LongHeap(int size, long initialValue) {
+    // NOTE(review): presumably the single-argument constructor allocates the backing array and
+    // validates size; confirm against its definition.
+    this(size);
+    // Index 0 is left untouched; the entries occupy indices 1..size.
+    Arrays.fill(heap, 1, size + 1, initialValue);
+    this.size = size;
+  }
+
   /**
    * Create an empty priority queue of the configured initial size.
    *
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocScoreEncoder.java b/lucene/core/src/test/org/apache/lucene/search/TestDocScoreEncoder.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search;
+
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/** Checks that {@link DocScoreEncoder} round-trips its inputs and orders codes as documented. */
+public class TestDocScoreEncoder extends LuceneTestCase {
+
+  /** Random score bit patterns paired with random non-negative doc ids. */
+  public void testRandom() {
+    for (int i = 0; i < 1000; i++) {
+      doAssert(
+          Float.intBitsToFloat(random().nextInt()),
+          random().nextInt(Integer.MAX_VALUE),
+          Float.intBitsToFloat(random().nextInt()),
+          random().nextInt(Integer.MAX_VALUE));
+    }
+  }
+
+  /** Random scores on the same doc id: only the score may decide the order. */
+  public void testSameDoc() {
+    for (int i = 0; i < 1000; i++) {
+      doAssert(
+          Float.intBitsToFloat(random().nextInt()), 1, Float.intBitsToFloat(random().nextInt()), 1);
+    }
+  }
+
+  /** Identical scores on random doc ids: only the doc id may decide the order. */
+  public void testSameScore() {
+    for (int i = 0; i < 1000; i++) {
+      doAssert(1f, random().nextInt(Integer.MAX_VALUE), 1f, random().nextInt(Integer.MAX_VALUE));
+    }
+  }
+
+  /**
+   * Encodes both pairs, verifies that doc id and score decode unchanged, and verifies that the
+   * codes compare by score first and then by doc id in reverse. Pairs containing a NaN score are
+   * skipped.
+   */
+  private void doAssert(float score1, int doc1, float score2, int doc2) {
+    if (Float.isNaN(score1) || Float.isNaN(score2)) {
+      return;
+    }
+
+    long code1 = DocScoreEncoder.encode(doc1, score1);
+    long code2 = DocScoreEncoder.encode(doc2, score2);
+
+    assertEquals(doc1, DocScoreEncoder.docId(code1));
+    assertEquals(doc2, DocScoreEncoder.docId(code2));
+    assertEquals(score1, DocScoreEncoder.toScore(code1), 0f);
+    assertEquals(score2, DocScoreEncoder.toScore(code2), 0f);
+
+    // The encoder documents its order in terms of Float.compare, which distinguishes -0f from 0f.
+    // The < and > operators treat those two as equal, so branching on them would send such a pair
+    // into the equal-score branches and fail spuriously.
+    int scoreCmp = Float.compare(score1, score2);
+    if (scoreCmp < 0) {
+      assertTrue(code1 < code2);
+    } else if (scoreCmp > 0) {
+      assertTrue(code1 > code2);
+    } else if (doc1 == doc2) {
+      assertEquals(code1, code2);
+    } else {
+      // Equal scores: the smaller doc id must produce the larger code.
+      assertEquals(code1 > code2, doc1 < doc2);
+    }
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java

Original file line number	Diff line number	Diff line change
`@@ -367,7 +367,7 @@ protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOExcepti`
`367`	`367`	`long maxMinScore = minScoreAcc.getRaw();`
`368`	`368`	`float score;`
`369`	`369`	`if (maxMinScore != Long.MIN_VALUE`
`370`		`- && (score = MaxScoreAccumulator.toScore(maxMinScore)) > minCompetitiveScore) {`
	`370`	`+ && (score = DocScoreEncoder.toScore(maxMinScore)) > minCompetitiveScore) {`
`371`	`371`	`scorer.setMinCompetitiveScore(score);`
`372`	`372`	`minCompetitiveScore = score;`
`373`	`373`	`totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;`
`@@ -384,7 +384,7 @@ protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {`
`384`	`384`	`minCompetitiveScore = minScore;`
`385`	`385`	`totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;`
`386`	`386`	`if (minScoreAcc != null) {`
`387`		`- minScoreAcc.accumulate(docBase, minScore);`
	`387`	`+ minScoreAcc.accumulate(DocScoreEncoder.encode(docBase, minScore));`
`388`	`388`	`}`
`389`	`389`	`}`
`390`	`390`	`}`