diff --git a/src/main/java/it/cavallium/dbengine/lucene/MaxScoreAccumulator.java b/src/main/java/it/cavallium/dbengine/lucene/MaxScoreAccumulator.java index 57bbac5..8d7c92c 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/MaxScoreAccumulator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/MaxScoreAccumulator.java @@ -20,13 +20,19 @@ package it.cavallium.dbengine.lucene; import java.util.Objects; import java.util.concurrent.atomic.LongAccumulator; -/** Maintains the maximum score and its corresponding document id concurrently */ +/** + * Maintains the maximum score and its corresponding document id concurrently. + * + * This class must mirror these changes: + * + * Lucene MaxScoreAccumulator changes on GitHub + */ public final class MaxScoreAccumulator { // we use 2^10-1 to check the remainder with a bitwise operation static final int DEFAULT_INTERVAL = 0x3ff; // scores are always positive - final LongAccumulator acc = new LongAccumulator(Long::max, Long.MIN_VALUE); + final LongAccumulator acc = new LongAccumulator(MaxScoreAccumulator::maxEncode, Long.MIN_VALUE); // non-final and visible for tests public long modInterval; @@ -35,9 +41,26 @@ public final class MaxScoreAccumulator { this.modInterval = DEFAULT_INTERVAL; } - public void accumulate(int docID, float score) { - assert docID >= 0 && score >= 0; - long encode = (((long) Float.floatToIntBits(score)) << 32) | docID; + /** + * Return the max encoded DocAndScore in a way that is consistent with {@link + * DocAndScore#compareTo}. + */ + private static long maxEncode(long v1, long v2) { + float score1 = Float.intBitsToFloat((int) (v1 >> 32)); + float score2 = Float.intBitsToFloat((int) (v2 >> 32)); + int cmp = Float.compare(score1, score2); + if (cmp == 0) { + // tie-break on the minimum doc base + return (int) v1 < (int) v2 ? 
v1 : v2; + } else if (cmp > 0) { + return v1; + } + return v2; + } + + public void accumulate(int docBase, float score) { + assert docBase >= 0 && score >= 0; + long encode = (((long) Float.floatToIntBits(score)) << 32) | docBase; acc.accumulate(encode); } @@ -47,16 +70,16 @@ public final class MaxScoreAccumulator { return null; } float score = Float.intBitsToFloat((int) (value >> 32)); - int docID = (int) value; - return new DocAndScore(docID, score); + int docBase = (int) value; + return new DocAndScore(docBase, score); } public static class DocAndScore implements Comparable { - public final int docID; + public final int docBase; public final float score; - public DocAndScore(int docID, float score) { - this.docID = docID; + public DocAndScore(int docBase, float score) { + this.docBase = docBase; this.score = score; } @@ -64,7 +87,14 @@ public final class MaxScoreAccumulator { public int compareTo(DocAndScore o) { int cmp = Float.compare(score, o.score); if (cmp == 0) { - return Integer.compare(docID, o.docID); + // tie-break on the minimum doc base + // For a given minimum competitive score, we want to know the first segment + // where this score occurred, hence the reverse order here. + // On segments with a lower docBase, any document whose score is greater + // than or equal to this score would be competitive, while on segments with a + // higher docBase, documents need to have a strictly greater score to be + // competitive since we tie break on doc ID. 
+ return Integer.compare(o.docBase, docBase); } return cmp; } @@ -74,17 +104,17 @@ public final class MaxScoreAccumulator { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; DocAndScore result = (DocAndScore) o; - return docID == result.docID && Float.compare(result.score, score) == 0; + return docBase == result.docBase && Float.compare(result.score, score) == 0; } @Override public int hashCode() { - return Objects.hash(docID, score); + return Objects.hash(docBase, score); } @Override public String toString() { - return "DocAndScore{" + "docID=" + docID + ", score=" + score + '}'; + return "DocAndScore{" + "docBase=" + docBase + ", score=" + score + '}'; } } } \ No newline at end of file diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullFieldDocCollector.java b/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullFieldDocCollector.java index 7683659..f76794e 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullFieldDocCollector.java +++ b/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullFieldDocCollector.java @@ -51,6 +51,9 @@ import reactor.core.publisher.Flux; *

See the {@link #create(LLTempLMDBEnv, Sort, int, int)} (org.apache.lucene.search.Sort, int, int)} method for instantiating a * TopFieldCollector. * + * This class must mirror these changes: + * + * Lucene TopFieldCollector changes on GitHub */ public abstract class LMDBFullFieldDocCollector extends FullDocsCollector, LLSlotDoc, LLFieldDoc> { diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullScoreDocCollector.java b/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullScoreDocCollector.java index 909d3a8..6dcef78 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullScoreDocCollector.java +++ b/src/main/java/it/cavallium/dbengine/lucene/collector/LMDBFullScoreDocCollector.java @@ -48,6 +48,10 @@ import org.jetbrains.annotations.Nullable; * *

NOTE: The values {@link Float#NaN} and {@link Float#NEGATIVE_INFINITY} are not valid * scores. This collector will not properly collect hits with such scores. + * + * This class must mirror these changes: + * + * Lucene TopScoreDocCollector changes on GitHub */ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector, LLScoreDoc, LLScoreDoc> { @@ -73,14 +77,15 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector maxMinScore.docID ? Math.nextUp(maxMinScore.score) : maxMinScore.score; + docBase >= maxMinScore.docBase ? Math.nextUp(maxMinScore.score) : maxMinScore.score; if (score > minCompetitiveScore) { assert hitsThresholdChecker.isThresholdReached(true); scorer.setMinCompetitiveScore(score); @@ -306,7 +311,7 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector { private final IArray docIDs; private final boolean enableSkipping; // if skipping functionality should be enabled @@ -55,9 +58,9 @@ public class DocComparator extends FieldComparator { @Override public LeafFieldComparator getLeafComparator(LeafReaderContext context) { - // TODO: can we "map" our docIDs to the current - // reader? saves having to then subtract on every - // compare call + // TODO: can we "map" our docIDs to the current + // reader? saves having to then subtract on every + // compare call return new DocLeafComparator(context); } diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/DoubleComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/DoubleComparator.java index 34a2b2a..6ca96a3 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/DoubleComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/DoubleComparator.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator; /** * Comparator based on {@link Double#compare} for {@code numHits}. 
This comparator provides a * skipping functionality - an iterator that can skip over non-competitive documents. + * Based on {@link org.apache.lucene.search.comparators.DoubleComparator} */ public class DoubleComparator extends NumericComparator { private final IArray values; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/FloatComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/FloatComparator.java index dd1b401..6776333 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/FloatComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/FloatComparator.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator; /** * Comparator based on {@link Float#compare} for {@code numHits}. This comparator provides a * skipping functionality – an iterator that can skip over non-competitive documents. + * Based on {@link org.apache.lucene.search.comparators.FloatComparator} */ public class FloatComparator extends NumericComparator { private final IArray values; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/IntComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/IntComparator.java index 9cccc6c..21fa126 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/IntComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/IntComparator.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator; /** * Comparator based on {@link Integer#compare} for {@code numHits}. This comparator provides a * skipping functionality – an iterator that can skip over non-competitive documents. 
+ * Based on {@link org.apache.lucene.search.comparators.IntComparator} */ public class IntComparator extends NumericComparator { private final IArray values; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/LongComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/LongComparator.java index b5d6e8d..ca2b610 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/LongComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/LongComparator.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator; /** * Comparator based on {@link Long#compare} for {@code numHits}. This comparator provides a skipping * functionality – an iterator that can skip over non-competitive documents. + * Based on {@link org.apache.lucene.search.comparators.LongComparator} */ public class LongComparator extends NumericComparator { private final IArray values; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/MinDocIterator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/MinDocIterator.java index 15a7bba..6f6bf5d 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/MinDocIterator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/MinDocIterator.java @@ -20,7 +20,10 @@ package it.cavallium.dbengine.lucene.comparators; import java.io.IOException; import org.apache.lucene.search.DocIdSetIterator; -/** Docs iterator that starts iterating from a configurable minimum document */ +/** + * Docs iterator that starts iterating from a configurable minimum document + * Based on {@link org.apache.lucene.search.comparators.MinDocIterator} + * */ public class MinDocIterator extends DocIdSetIterator { final int segmentMinDoc; final int maxDoc; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/MultiLeafFieldComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/MultiLeafFieldComparator.java index cb73204..e21a189 100644 --- 
a/src/main/java/it/cavallium/dbengine/lucene/comparators/MultiLeafFieldComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/MultiLeafFieldComparator.java @@ -21,6 +21,9 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Scorable; +/** + * Based on {@code org.apache.lucene.search.MultiLeafFieldComparator} + */ public final class MultiLeafFieldComparator implements LeafFieldComparator { private final LeafFieldComparator[] comparators; diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/NumericComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/NumericComparator.java index 721e6ab..3dd2ee6 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/NumericComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/NumericComparator.java @@ -35,6 +35,13 @@ import org.apache.lucene.util.DocIdSetBuilder; /** * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping * functionality – an iterator that can skip over non-competitive documents. + * + *

Parameter {@code field} provided in the constructor is used as a field name in the default + * implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve + * doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script), + * but in this case you must override both of these methods. + * + * Based on {@link org.apache.lucene.search.comparators.NumericComparator} */ public abstract class NumericComparator extends FieldComparator { protected final T missingValue; @@ -92,7 +99,7 @@ public abstract class NumericComparator extends FieldComparato public NumericLeafComparator(LeafReaderContext context) throws IOException { this.docValues = getNumericDocValues(context, field); - this.pointValues = canSkipDocuments ? context.reader().getPointValues(field) : null; + this.pointValues = canSkipDocuments ? getPointValues(context, field) : null; if (pointValues != null) { FieldInfo info = context.reader().getFieldInfos().fieldInfo(field); if (info == null || info.getPointDimensionCount() == 0) { @@ -130,12 +137,44 @@ public abstract class NumericComparator extends FieldComparato } } - /** Retrieves the NumericDocValues for the field in this segment */ - protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) + /** + * Retrieves the NumericDocValues for the field in this segment + * + *

If you override this method, you must also override {@link + * #getPointValues(LeafReaderContext, String)}. This class uses sort optimization that leverages + * points to filter out non-competitive matches, which relies on the assumption that points and + * doc values record the same information. + * + * @param context – reader context + * @param field - field name + * @return numeric doc values for the field in this segment. + * @throws IOException If there is a low-level I/O error + */ + protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) throws IOException { return DocValues.getNumeric(context.reader(), field); } + /** + * Retrieves point values for the field in this segment + * + *

If you override this method, you must also override {@link + * #getNumericDocValues(LeafReaderContext, String)}. This class uses sort optimization that + * leverages points to filter out non-competitive matches, which relies on the assumption that + * points and doc values record the same information. Return {@code null} if no points + * implementation is available; in that case sort optimization with points will be disabled. + * + * @param context – reader context + * @param field - field name + * @return point values for the field in this segment if they are available or {@code null} if + * sort optimization with points should be disabled. + * @throws IOException If there is a low-level I/O error + */ + protected PointValues getPointValues(LeafReaderContext context, String field) + throws IOException { + return context.reader().getPointValues(field); + } + @Override public void setBottom(int slot) throws IOException { queueFull = true; // if we are setting bottom, it means that we have collected enough hits diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/RelevanceComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/RelevanceComparator.java index 11414b3..743f768 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/RelevanceComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/RelevanceComparator.java @@ -38,6 +38,7 @@ import org.apache.lucene.util.BytesRefBuilder; * Sorts by descending relevance. NOTE: if you are sorting only by descending relevance and then secondarily by * ascending docID, performance is faster using {@link org.apache.lucene.search.TopScoreDocCollector} directly (which {@link * org.apache.lucene.search.IndexSearcher#search(Query, int)} uses when no {@link org.apache.lucene.search.Sort} is specified). 
+ * Based on {@link org.apache.lucene.search.FieldComparator.RelevanceComparator} */ public final class RelevanceComparator extends FieldComparator implements LeafFieldComparator { diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/TermOrdValComparator.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/TermOrdValComparator.java index 057f594..9a717d8 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/comparators/TermOrdValComparator.java +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/TermOrdValComparator.java @@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRefBuilder; * using the ordinals. For medium to large results, this comparator will be much faster than * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets * it may be slower. + * Based on {@link org.apache.lucene.search.FieldComparator.TermOrdValComparator} */ public class TermOrdValComparator extends FieldComparator implements LeafFieldComparator { /* Ords for each slot. diff --git a/src/main/java/it/cavallium/dbengine/lucene/comparators/package-info.java b/src/main/java/it/cavallium/dbengine/lucene/comparators/package-info.java new file mode 100644 index 0000000..abb4d8b --- /dev/null +++ b/src/main/java/it/cavallium/dbengine/lucene/comparators/package-info.java @@ -0,0 +1,6 @@ +/** + * This package must mirror these changes: + * + * Lucene comparators changes on GitHub + */ +package it.cavallium.dbengine.lucene.comparators; \ No newline at end of file