Update comparators

This commit is contained in:
Andrea Cavalli 2021-11-16 23:19:13 +01:00
parent 891255e18e
commit 3d7e80b4ec
14 changed files with 125 additions and 27 deletions

View File

@ -20,13 +20,19 @@ package it.cavallium.dbengine.lucene;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.atomic.LongAccumulator; import java.util.concurrent.atomic.LongAccumulator;
/** Maintains the maximum score and its corresponding document id concurrently */ /**
* Maintains the maximum score and its corresponding document id concurrently.
*
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/94b66c0ed279fe23656d451fecd56fdfd106e1ea/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java">
* Lucene MaxScoreAccumulator changes on GitHub</a>
*/
public final class MaxScoreAccumulator { public final class MaxScoreAccumulator {
// we use 2^10-1 to check the remainder with a bitwise operation // we use 2^10-1 to check the remainder with a bitwise operation
static final int DEFAULT_INTERVAL = 0x3ff; static final int DEFAULT_INTERVAL = 0x3ff;
// scores are always positive // scores are always positive
final LongAccumulator acc = new LongAccumulator(Long::max, Long.MIN_VALUE); final LongAccumulator acc = new LongAccumulator(MaxScoreAccumulator::maxEncode, Long.MIN_VALUE);
// non-final and visible for tests // non-final and visible for tests
public long modInterval; public long modInterval;
@ -35,9 +41,26 @@ public final class MaxScoreAccumulator {
this.modInterval = DEFAULT_INTERVAL; this.modInterval = DEFAULT_INTERVAL;
} }
public void accumulate(int docID, float score) { /**
assert docID >= 0 && score >= 0; * Return the max encoded DocAndScore in a way that is consistent with {@link
long encode = (((long) Float.floatToIntBits(score)) << 32) | docID; * DocAndScore#compareTo}.
*/
private static long maxEncode(long v1, long v2) {
float score1 = Float.intBitsToFloat((int) (v1 >> 32));
float score2 = Float.intBitsToFloat((int) (v2 >> 32));
int cmp = Float.compare(score1, score2);
if (cmp == 0) {
// tie-break on the minimum doc base
return (int) v1 < (int) v2 ? v1 : v2;
} else if (cmp > 0) {
return v1;
}
return v2;
}
public void accumulate(int docBase, float score) {
assert docBase >= 0 && score >= 0;
long encode = (((long) Float.floatToIntBits(score)) << 32) | docBase;
acc.accumulate(encode); acc.accumulate(encode);
} }
@ -47,16 +70,16 @@ public final class MaxScoreAccumulator {
return null; return null;
} }
float score = Float.intBitsToFloat((int) (value >> 32)); float score = Float.intBitsToFloat((int) (value >> 32));
int docID = (int) value; int docBase = (int) value;
return new DocAndScore(docID, score); return new DocAndScore(docBase, score);
} }
public static class DocAndScore implements Comparable<DocAndScore> { public static class DocAndScore implements Comparable<DocAndScore> {
public final int docID; public final int docBase;
public final float score; public final float score;
public DocAndScore(int docID, float score) { public DocAndScore(int docBase, float score) {
this.docID = docID; this.docBase = docBase;
this.score = score; this.score = score;
} }
@ -64,7 +87,14 @@ public final class MaxScoreAccumulator {
public int compareTo(DocAndScore o) { public int compareTo(DocAndScore o) {
int cmp = Float.compare(score, o.score); int cmp = Float.compare(score, o.score);
if (cmp == 0) { if (cmp == 0) {
return Integer.compare(docID, o.docID); // tie-break on the minimum doc base
// For a given minimum competitive score, we want to know the first segment
// where this score occurred, hence the reverse order here.
// On segments with a lower docBase, any document whose score is greater
// than or equal to this score would be competitive, while on segments with a
// higher docBase, documents need to have a strictly greater score to be
// competitive since we tie break on doc ID.
return Integer.compare(o.docBase, docBase);
} }
return cmp; return cmp;
} }
@ -74,17 +104,17 @@ public final class MaxScoreAccumulator {
if (this == o) return true; if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false; if (o == null || getClass() != o.getClass()) return false;
DocAndScore result = (DocAndScore) o; DocAndScore result = (DocAndScore) o;
return docID == result.docID && Float.compare(result.score, score) == 0; return docBase == result.docBase && Float.compare(result.score, score) == 0;
} }
@Override @Override
public int hashCode() { public int hashCode() {
return Objects.hash(docID, score); return Objects.hash(docBase, score);
} }
@Override @Override
public String toString() { public String toString() {
return "DocAndScore{" + "docID=" + docID + ", score=" + score + '}'; return "DocAndScore{" + "docBase=" + docBase + ", score=" + score + '}';
} }
} }
} }

View File

@ -51,6 +51,9 @@ import reactor.core.publisher.Flux;
* <p>See the {@link #create(LLTempLMDBEnv, Sort, int, int)} (org.apache.lucene.search.Sort, int, int)} method for instantiating a * <p>See the {@link #create(LLTempLMDBEnv, Sort, int, int)} (org.apache.lucene.search.Sort, int, int)} method for instantiating a
* TopFieldCollector. * TopFieldCollector.
* *
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java">
* Lucene TopFieldCollector changes on GitHub</a>
*/ */
public abstract class LMDBFullFieldDocCollector public abstract class LMDBFullFieldDocCollector
extends FullDocsCollector<LMDBPriorityQueue<LLSlotDoc>, LLSlotDoc, LLFieldDoc> { extends FullDocsCollector<LMDBPriorityQueue<LLSlotDoc>, LLSlotDoc, LLFieldDoc> {

View File

@ -48,6 +48,10 @@ import org.jetbrains.annotations.Nullable;
* *
* <p><b>NOTE</b>: The values {@link Float#NaN} and {@link Float#NEGATIVE_INFINITY} are not valid * <p><b>NOTE</b>: The values {@link Float#NaN} and {@link Float#NEGATIVE_INFINITY} are not valid
* scores. This collector will not properly collect hits with such scores. * scores. This collector will not properly collect hits with such scores.
*
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java">
* Lucene TopScoreDocCollector changes on GitHub</a>
*/ */
public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPriorityQueue<LLScoreDoc>, LLScoreDoc, LLScoreDoc> { public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPriorityQueue<LLScoreDoc>, LLScoreDoc, LLScoreDoc> {
@ -73,14 +77,15 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
public LeafCollector getLeafCollector(LeafReaderContext context) { public LeafCollector getLeafCollector(LeafReaderContext context) {
// reset the minimum competitive score // reset the minimum competitive score
docBase = context.docBase; docBase = context.docBase;
minCompetitiveScore = 0f;
return new ScorerLeafCollector() { return new ScorerLeafCollector() {
@Override @Override
public void setScorer(Scorable scorer) throws IOException { public void setScorer(Scorable scorer) throws IOException {
super.setScorer(scorer); super.setScorer(scorer);
minCompetitiveScore = 0f; if (minScoreAcc == null) {
updateMinCompetitiveScore(scorer); updateMinCompetitiveScore(scorer);
if (minScoreAcc != null) { } else {
updateGlobalMinCompetitiveScore(scorer); updateGlobalMinCompetitiveScore(scorer);
} }
} }
@ -280,7 +285,7 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
// the next float if the global minimum score is set on a document id that is // the next float if the global minimum score is set on a document id that is
// smaller than the ids in the current leaf // smaller than the ids in the current leaf
float score = float score =
docBase > maxMinScore.docID ? Math.nextUp(maxMinScore.score) : maxMinScore.score; docBase >= maxMinScore.docBase ? Math.nextUp(maxMinScore.score) : maxMinScore.score;
if (score > minCompetitiveScore) { if (score > minCompetitiveScore) {
assert hitsThresholdChecker.isThresholdReached(true); assert hitsThresholdChecker.isThresholdReached(true);
scorer.setMinCompetitiveScore(score); scorer.setMinCompetitiveScore(score);
@ -306,7 +311,7 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
// we don't use the next float but we register the document // we don't use the next float but we register the document
// id so that other leaves can require it if they are after // id so that other leaves can require it if they are after
// the current maximum // the current maximum
minScoreAcc.accumulate(pqTop.doc(), pqTop.score()); minScoreAcc.accumulate(docBase, pqTop.score());
} }
} }
} }

View File

@ -30,7 +30,10 @@ import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable; import org.apache.lucene.search.Scorable;
/** Comparator that sorts by asc _doc */ /**
* Comparator that sorts by asc _doc.
* <p>Based on {@link org.apache.lucene.search.comparators.DocComparator}.
* */
public class DocComparator extends FieldComparator<Integer> { public class DocComparator extends FieldComparator<Integer> {
private final IArray<Integer> docIDs; private final IArray<Integer> docIDs;
private final boolean enableSkipping; // if skipping functionality should be enabled private final boolean enableSkipping; // if skipping functionality should be enabled

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/** /**
* Comparator based on {@link Double#compare} for {@code numHits}. This comparator provides a * Comparator based on {@link Double#compare} for {@code numHits}. This comparator provides a
* skipping functionality - an iterator that can skip over non-competitive documents. * skipping functionality - an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.DoubleComparator}
*/ */
public class DoubleComparator extends NumericComparator<Double> { public class DoubleComparator extends NumericComparator<Double> {
private final IArray<Double> values; private final IArray<Double> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/** /**
* Comparator based on {@link Float#compare} for {@code numHits}. This comparator provides a * Comparator based on {@link Float#compare} for {@code numHits}. This comparator provides a
* skipping functionality an iterator that can skip over non-competitive documents. * skipping functionality an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.FloatComparator}
*/ */
public class FloatComparator extends NumericComparator<Float> { public class FloatComparator extends NumericComparator<Float> {
private final IArray<Float> values; private final IArray<Float> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/** /**
* Comparator based on {@link Integer#compare} for {@code numHits}. This comparator provides a * Comparator based on {@link Integer#compare} for {@code numHits}. This comparator provides a
* skipping functionality an iterator that can skip over non-competitive documents. * skipping functionality an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.IntComparator}
*/ */
public class IntComparator extends NumericComparator<Integer> { public class IntComparator extends NumericComparator<Integer> {
private final IArray<Integer> values; private final IArray<Integer> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/** /**
* Comparator based on {@link Long#compare} for {@code numHits}. This comparator provides a skipping * Comparator based on {@link Long#compare} for {@code numHits}. This comparator provides a skipping
* functionality an iterator that can skip over non-competitive documents. * functionality an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.LongComparator}
*/ */
public class LongComparator extends NumericComparator<Long> { public class LongComparator extends NumericComparator<Long> {
private final IArray<Long> values; private final IArray<Long> values;

View File

@ -20,7 +20,10 @@ package it.cavallium.dbengine.lucene.comparators;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
/** Docs iterator that starts iterating from a configurable minimum document */ /**
* Docs iterator that starts iterating from a configurable minimum document.
* <p>Based on {@link org.apache.lucene.search.comparators.MinDocIterator}.
* */
public class MinDocIterator extends DocIdSetIterator { public class MinDocIterator extends DocIdSetIterator {
final int segmentMinDoc; final int segmentMinDoc;
final int maxDoc; final int maxDoc;

View File

@ -21,6 +21,9 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable; import org.apache.lucene.search.Scorable;
/**
* Based on Lucene's {@code org.apache.lucene.search.MultiLeafFieldComparator}.
*/
public final class MultiLeafFieldComparator implements LeafFieldComparator { public final class MultiLeafFieldComparator implements LeafFieldComparator {
private final LeafFieldComparator[] comparators; private final LeafFieldComparator[] comparators;

View File

@ -35,6 +35,13 @@ import org.apache.lucene.util.DocIdSetBuilder;
/** /**
* Abstract numeric comparator for comparing numeric values. This comparator provides a skipping * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping
* functionality an iterator that can skip over non-competitive documents. * functionality an iterator that can skip over non-competitive documents.
*
* <p>Parameter {@code field} provided in the constructor is used as a field name in the default
* implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
* doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
* but in this case you must override both of these methods.
*
* Based on {@link org.apache.lucene.search.comparators.NumericComparator}
*/ */
public abstract class NumericComparator<T extends Number> extends FieldComparator<T> { public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
protected final T missingValue; protected final T missingValue;
@ -92,7 +99,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
public NumericLeafComparator(LeafReaderContext context) throws IOException { public NumericLeafComparator(LeafReaderContext context) throws IOException {
this.docValues = getNumericDocValues(context, field); this.docValues = getNumericDocValues(context, field);
this.pointValues = canSkipDocuments ? context.reader().getPointValues(field) : null; this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
if (pointValues != null) { if (pointValues != null) {
FieldInfo info = context.reader().getFieldInfos().fieldInfo(field); FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
if (info == null || info.getPointDimensionCount() == 0) { if (info == null || info.getPointDimensionCount() == 0) {
@ -130,12 +137,44 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
} }
} }
/** Retrieves the NumericDocValues for the field in this segment */ /**
* Retrieves the NumericDocValues for the field in this segment
*
* <p>If you override this method, you must also override {@link
* #getPointValues(LeafReaderContext, String)}. This class uses sort optimization that leverages
* points to filter out non-competitive matches, which relies on the assumption that points and
* doc values record the same information.
*
* @param context reader context
* @param field - field name
* @return numeric doc values for the field in this segment.
* @throws IOException If there is a low-level I/O error
*/
protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field)
throws IOException { throws IOException {
return DocValues.getNumeric(context.reader(), field); return DocValues.getNumeric(context.reader(), field);
} }
/**
* Retrieves point values for the field in this segment
*
* <p>If you override this method, you must also override {@link
* #getNumericDocValues(LeafReaderContext, String)}. This class uses sort optimization that
* leverages points to filter out non-competitive matches, which relies on the assumption that
* points and doc values record the same information. Return {@code null} if no points
* implementation is available; in this case sort optimization with points will be disabled.
*
* @param context reader context
* @param field - field name
* @return point values for the field in this segment if they are available or {@code null} if
* sort optimization with points should be disabled.
* @throws IOException If there is a low-level I/O error
*/
protected PointValues getPointValues(LeafReaderContext context, String field)
throws IOException {
return context.reader().getPointValues(field);
}
@Override @Override
public void setBottom(int slot) throws IOException { public void setBottom(int slot) throws IOException {
queueFull = true; // if we are setting bottom, it means that we have collected enough hits queueFull = true; // if we are setting bottom, it means that we have collected enough hits

View File

@ -38,6 +38,7 @@ import org.apache.lucene.util.BytesRefBuilder;
* Sorts by descending relevance. NOTE: if you are sorting only by descending relevance and then secondarily by * Sorts by descending relevance. NOTE: if you are sorting only by descending relevance and then secondarily by
* ascending docID, performance is faster using {@link org.apache.lucene.search.TopScoreDocCollector} directly (which {@link * ascending docID, performance is faster using {@link org.apache.lucene.search.TopScoreDocCollector} directly (which {@link
* org.apache.lucene.search.IndexSearcher#search(Query, int)} uses when no {@link org.apache.lucene.search.Sort} is specified). * org.apache.lucene.search.IndexSearcher#search(Query, int)} uses when no {@link org.apache.lucene.search.Sort} is specified).
* Based on {@link org.apache.lucene.search.FieldComparator.RelevanceComparator}
*/ */
public final class RelevanceComparator extends FieldComparator<Float> implements LeafFieldComparator { public final class RelevanceComparator extends FieldComparator<Float> implements LeafFieldComparator {

View File

@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRefBuilder;
* using the ordinals. For medium to large results, this comparator will be much faster than * using the ordinals. For medium to large results, this comparator will be much faster than
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
* it may be slower. * it may be slower.
* Based on {@link org.apache.lucene.search.FieldComparator.TermOrdValComparator}
*/ */
public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator { public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
/* Ords for each slot. /* Ords for each slot.

View File

@ -0,0 +1,6 @@
/**
* This package must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/comparators">
* Lucene comparators changes on GitHub</a>
*/
package it.cavallium.dbengine.lucene.comparators;