Update comparators

This commit is contained in:
Andrea Cavalli 2021-11-16 23:19:13 +01:00
parent 891255e18e
commit 3d7e80b4ec
14 changed files with 125 additions and 27 deletions

View File

@ -20,13 +20,19 @@ package it.cavallium.dbengine.lucene;
import java.util.Objects;
import java.util.concurrent.atomic.LongAccumulator;
/** Maintains the maximum score and its corresponding document id concurrently */
/**
* Maintains the maximum score and its corresponding document id concurrently
*
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/94b66c0ed279fe23656d451fecd56fdfd106e1ea/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java">
* Lucene MaxScoreAccumulator changes on GitHub</a>
*/
public final class MaxScoreAccumulator {
// we use 2^10-1 to check the remainder with a bitwise operation
static final int DEFAULT_INTERVAL = 0x3ff;
// scores are always positive
final LongAccumulator acc = new LongAccumulator(Long::max, Long.MIN_VALUE);
final LongAccumulator acc = new LongAccumulator(MaxScoreAccumulator::maxEncode, Long.MIN_VALUE);
// non-final and visible for tests
public long modInterval;
@ -35,9 +41,26 @@ public final class MaxScoreAccumulator {
this.modInterval = DEFAULT_INTERVAL;
}
public void accumulate(int docID, float score) {
assert docID >= 0 && score >= 0;
long encode = (((long) Float.floatToIntBits(score)) << 32) | docID;
/**
* Returns the greater of two encoded values, ordering them in a way that is consistent with
* {@link DocAndScore#compareTo}.
*
* <p>Each encoded value carries the float bits of a score in its upper 32 bits and a doc base in
* its lower 32 bits. Scores are compared with {@link Float#compare}; when they are equal, the
* value with the smaller doc base is returned.
*
* @param v1 first encoded value
* @param v2 second encoded value
* @return {@code v1} or {@code v2}, whichever ranks higher
*/
private static long maxEncode(long v1, long v2) {
// decode the scores from the upper 32 bits of each value
float score1 = Float.intBitsToFloat((int) (v1 >> 32));
float score2 = Float.intBitsToFloat((int) (v2 >> 32));
int cmp = Float.compare(score1, score2);
if (cmp == 0) {
// tie-break on the minimum doc base
// the cast to int keeps only the lower 32 bits, i.e. the doc base
return (int) v1 < (int) v2 ? v1 : v2;
} else if (cmp > 0) {
return v1;
}
return v2;
}
/**
* Packs the given doc base and score into a single {@code long} and feeds it to the accumulator.
*
* @param docBase doc base to record; asserted to be non-negative
* @param score score to record; asserted to be non-negative
*/
public void accumulate(int docBase, float score) {
assert docBase >= 0 && score >= 0;
// score bits go into the upper 32 bits, the doc base into the lower 32 bits
long encode = (((long) Float.floatToIntBits(score)) << 32) | docBase;
acc.accumulate(encode);
}
@ -47,16 +70,16 @@ public final class MaxScoreAccumulator {
return null;
}
float score = Float.intBitsToFloat((int) (value >> 32));
int docID = (int) value;
return new DocAndScore(docID, score);
int docBase = (int) value;
return new DocAndScore(docBase, score);
}
public static class DocAndScore implements Comparable<DocAndScore> {
public final int docID;
public final int docBase;
public final float score;
public DocAndScore(int docID, float score) {
this.docID = docID;
public DocAndScore(int docBase, float score) {
this.docBase = docBase;
this.score = score;
}
@ -64,7 +87,14 @@ public final class MaxScoreAccumulator {
public int compareTo(DocAndScore o) {
int cmp = Float.compare(score, o.score);
if (cmp == 0) {
return Integer.compare(docID, o.docID);
// tie-break on the minimum doc base
// For a given minimum competitive score, we want to know the first segment
// where this score occurred, hence the reverse order here.
// On segments with a lower docBase, any document whose score is greater
// than or equal to this score would be competitive, while on segments with a
// higher docBase, documents need to have a strictly greater score to be
// competitive since we tie break on doc ID.
return Integer.compare(o.docBase, docBase);
}
return cmp;
}
@ -74,17 +104,17 @@ public final class MaxScoreAccumulator {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
DocAndScore result = (DocAndScore) o;
return docID == result.docID && Float.compare(result.score, score) == 0;
return docBase == result.docBase && Float.compare(result.score, score) == 0;
}
@Override
public int hashCode() {
return Objects.hash(docID, score);
return Objects.hash(docBase, score);
}
@Override
public String toString() {
return "DocAndScore{" + "docID=" + docID + ", score=" + score + '}';
return "DocAndScore{" + "docBase=" + docBase + ", score=" + score + '}';
}
}
}

View File

@ -51,6 +51,9 @@ import reactor.core.publisher.Flux;
* <p>See the {@link #create(LLTempLMDBEnv, Sort, int, int)} method for instantiating a
* TopFieldCollector.
*
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java">
* Lucene TopFieldCollector changes on GitHub</a>
*/
public abstract class LMDBFullFieldDocCollector
extends FullDocsCollector<LMDBPriorityQueue<LLSlotDoc>, LLSlotDoc, LLFieldDoc> {

View File

@ -48,6 +48,10 @@ import org.jetbrains.annotations.Nullable;
*
* <p><b>NOTE</b>: The values {@link Float#NaN} and {@link Float#NEGATIVE_INFINITY} are not valid
* scores. This collector will not properly collect hits with such scores.
*
* <p>This class must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java">
* Lucene TopScoreDocCollector changes on GitHub</a>
*/
public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPriorityQueue<LLScoreDoc>, LLScoreDoc, LLScoreDoc> {
@ -73,14 +77,15 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
public LeafCollector getLeafCollector(LeafReaderContext context) {
// reset the minimum competitive score
docBase = context.docBase;
minCompetitiveScore = 0f;
return new ScorerLeafCollector() {
@Override
public void setScorer(Scorable scorer) throws IOException {
super.setScorer(scorer);
minCompetitiveScore = 0f;
if (minScoreAcc == null) {
updateMinCompetitiveScore(scorer);
if (minScoreAcc != null) {
} else {
updateGlobalMinCompetitiveScore(scorer);
}
}
@ -280,7 +285,7 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
// the next float if the global minimum score is set on a document id that is
// smaller than the ids in the current leaf
float score =
docBase > maxMinScore.docID ? Math.nextUp(maxMinScore.score) : maxMinScore.score;
docBase >= maxMinScore.docBase ? Math.nextUp(maxMinScore.score) : maxMinScore.score;
if (score > minCompetitiveScore) {
assert hitsThresholdChecker.isThresholdReached(true);
scorer.setMinCompetitiveScore(score);
@ -306,7 +311,7 @@ public abstract class LMDBFullScoreDocCollector extends FullDocsCollector<LMDBPr
// we don't use the next float but we register the document
// id so that other leaves can require it if they are after
// the current maximum
minScoreAcc.accumulate(pqTop.doc(), pqTop.score());
minScoreAcc.accumulate(docBase, pqTop.score());
}
}
}

View File

@ -30,7 +30,10 @@ import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
/** Comparator that sorts by asc _doc */
/**
* Comparator that sorts by asc _doc
* Based on {@link org.apache.lucene.search.comparators.DocComparator}
* */
public class DocComparator extends FieldComparator<Integer> {
private final IArray<Integer> docIDs;
private final boolean enableSkipping; // if skipping functionality should be enabled

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/**
* Comparator based on {@link Double#compare} for {@code numHits}. This comparator provides a
* skipping functionality - an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.DoubleComparator}
*/
public class DoubleComparator extends NumericComparator<Double> {
private final IArray<Double> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/**
* Comparator based on {@link Float#compare} for {@code numHits}. This comparator provides a
* skipping functionality - an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.FloatComparator}
*/
public class FloatComparator extends NumericComparator<Float> {
private final IArray<Float> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/**
* Comparator based on {@link Integer#compare} for {@code numHits}. This comparator provides a
* skipping functionality - an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.IntComparator}
*/
public class IntComparator extends NumericComparator<Integer> {
private final IArray<Integer> values;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.LeafFieldComparator;
/**
* Comparator based on {@link Long#compare} for {@code numHits}. This comparator provides a skipping
* functionality - an iterator that can skip over non-competitive documents.
* Based on {@link org.apache.lucene.search.comparators.LongComparator}
*/
public class LongComparator extends NumericComparator<Long> {
private final IArray<Long> values;

View File

@ -20,7 +20,10 @@ package it.cavallium.dbengine.lucene.comparators;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
/** Docs iterator that starts iterating from a configurable minimum document */
/**
* Docs iterator that starts iterating from a configurable minimum document
* Based on {@link org.apache.lucene.search.comparators.MinDocIterator}
* */
public class MinDocIterator extends DocIdSetIterator {
final int segmentMinDoc;
final int maxDoc;

View File

@ -21,6 +21,9 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
/**
* Based on Lucene's {@code org.apache.lucene.search.MultiLeafFieldComparator}
*/
public final class MultiLeafFieldComparator implements LeafFieldComparator {
private final LeafFieldComparator[] comparators;

View File

@ -35,6 +35,13 @@ import org.apache.lucene.util.DocIdSetBuilder;
/**
* Abstract numeric comparator for comparing numeric values. This comparator provides a skipping
* functionality - an iterator that can skip over non-competitive documents.
*
* <p>Parameter {@code field} provided in the constructor is used as a field name in the default
* implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
* doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
* but in this case you must override both of these methods.
*
* Based on {@link org.apache.lucene.search.comparators.NumericComparator}
*/
public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
protected final T missingValue;
@ -92,7 +99,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
public NumericLeafComparator(LeafReaderContext context) throws IOException {
this.docValues = getNumericDocValues(context, field);
this.pointValues = canSkipDocuments ? context.reader().getPointValues(field) : null;
this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
if (pointValues != null) {
FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
if (info == null || info.getPointDimensionCount() == 0) {
@ -130,12 +137,44 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
}
}
/** Retrieves the NumericDocValues for the field in this segment */
/**
* Retrieves the NumericDocValues for the field in this segment.
*
* <p>If you override this method, you must also override {@link
* #getPointValues(LeafReaderContext, String)}. This class uses a sort optimization that leverages
* points to filter out non-competitive matches, which relies on the assumption that points and
* doc values record the same information.
*
* @param context reader context
* @param field field name
* @return numeric doc values for the field in this segment.
* @throws IOException If there is a low-level I/O error
*/
protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field)
throws IOException {
// default implementation: read the numeric doc values of the field from this segment's reader
return DocValues.getNumeric(context.reader(), field);
}
/**
* Retrieves point values for the field in this segment.
*
* <p>If you override this method, you must also override {@link
* #getNumericDocValues(LeafReaderContext, String)}. This class uses a sort optimization that
* leverages points to filter out non-competitive matches, which relies on the assumption that
* points and doc values record the same information. Return {@code null} if no points
* implementation is available; in this case sort optimization with points will be disabled.
*
* @param context reader context
* @param field field name
* @return point values for the field in this segment if they are available or {@code null} if
* sort optimization with points should be disabled.
* @throws IOException If there is a low-level I/O error
*/
protected PointValues getPointValues(LeafReaderContext context, String field)
throws IOException {
// default implementation: delegate to this segment's reader
return context.reader().getPointValues(field);
}
@Override
public void setBottom(int slot) throws IOException {
queueFull = true; // if we are setting bottom, it means that we have collected enough hits

View File

@ -38,6 +38,7 @@ import org.apache.lucene.util.BytesRefBuilder;
* Sorts by descending relevance. NOTE: if you are sorting only by descending relevance and then secondarily by
* ascending docID, performance is faster using {@link org.apache.lucene.search.TopScoreDocCollector} directly (which {@link
* org.apache.lucene.search.IndexSearcher#search(Query, int)} uses when no {@link org.apache.lucene.search.Sort} is specified).
* Based on {@link org.apache.lucene.search.FieldComparator.RelevanceComparator}
*/
public final class RelevanceComparator extends FieldComparator<Float> implements LeafFieldComparator {

View File

@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRefBuilder;
* using the ordinals. For medium to large results, this comparator will be much faster than
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
* it may be slower.
* Based on {@link org.apache.lucene.search.FieldComparator.TermOrdValComparator}
*/
public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
/* Ords for each slot.

View File

@ -0,0 +1,6 @@
/**
* This package must mirror these changes:
* <a href="https://github.com/apache/lucene/commits/main/lucene/core/src/java/org/apache/lucene/search/comparators">
* Lucene comparators changes on GitHub</a>
*/
package it.cavallium.dbengine.lucene.comparators;