298 lines
8.2 KiB
Java
298 lines
8.2 KiB
Java
|
package it.cavallium.dbengine.lucene.comparators;
|
||
|
|
||
|
import it.cavallium.dbengine.database.disk.LLTempLMDBEnv;
|
||
|
import it.cavallium.dbengine.lucene.ByteArrayCodec;
|
||
|
import it.cavallium.dbengine.lucene.BytesRefCodec;
|
||
|
import it.cavallium.dbengine.lucene.FloatCodec;
|
||
|
import it.cavallium.dbengine.lucene.IArray;
|
||
|
import it.cavallium.dbengine.lucene.IntCodec;
|
||
|
import it.cavallium.dbengine.lucene.LMDBArray;
|
||
|
import java.io.IOException;
|
||
|
import java.util.Arrays;
|
||
|
import org.apache.lucene.index.DocValues;
|
||
|
import org.apache.lucene.index.LeafReaderContext;
|
||
|
import org.apache.lucene.index.SortedDocValues;
|
||
|
import org.apache.lucene.search.FieldComparator;
|
||
|
import org.apache.lucene.search.LeafFieldComparator;
|
||
|
import org.apache.lucene.search.Scorable;
|
||
|
import org.apache.lucene.util.BytesRef;
|
||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||
|
|
||
|
/**
|
||
|
* Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
|
||
|
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
|
||
|
* string to their relative ordinal positions (using the index returned by {@link
|
||
|
* org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
|
||
|
* using the ordinals. For medium to large results, this comparator will be much faster than
|
||
|
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
|
||
|
* it may be slower.
|
||
|
*/
|
||
|
public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
|
||
|
/* Ords for each slot.
|
||
|
@lucene.internal */
|
||
|
final IArray<Integer> ords;
|
||
|
|
||
|
/* Values for each slot.
|
||
|
@lucene.internal */
|
||
|
final IArray<byte[]> values;
|
||
|
|
||
|
/* Which reader last copied a value into the slot. When
|
||
|
we compare two slots, we just compare-by-ord if the
|
||
|
readerGen is the same; else we must compare the
|
||
|
values (slower).
|
||
|
@lucene.internal */
|
||
|
final IArray<Integer> readerGen;
|
||
|
|
||
|
/* Gen of current reader we are on.
|
||
|
@lucene.internal */
|
||
|
int currentReaderGen = -1;
|
||
|
|
||
|
/* Current reader's doc ord/values.
|
||
|
@lucene.internal */
|
||
|
SortedDocValues termsIndex;
|
||
|
|
||
|
private final String field;
|
||
|
|
||
|
/* Bottom slot, or -1 if queue isn't full yet
|
||
|
@lucene.internal */
|
||
|
int bottomSlot = -1;
|
||
|
|
||
|
/* Bottom ord (same as ords[bottomSlot] once bottomSlot
|
||
|
is set). Cached for faster compares.
|
||
|
@lucene.internal */
|
||
|
int bottomOrd;
|
||
|
|
||
|
/* True if current bottom slot matches the current
|
||
|
reader.
|
||
|
@lucene.internal */
|
||
|
boolean bottomSameReader;
|
||
|
|
||
|
/* Bottom value (same as values[bottomSlot] once
|
||
|
bottomSlot is set). Cached for faster compares.
|
||
|
@lucene.internal */
|
||
|
byte[] bottomValue;
|
||
|
|
||
|
/** Set by setTopValue. */
|
||
|
byte[] topValue;
|
||
|
|
||
|
boolean topSameReader;
|
||
|
int topOrd;
|
||
|
|
||
|
/** -1 if missing values are sorted first, 1 if they are sorted last */
|
||
|
final int missingSortCmp;
|
||
|
|
||
|
/** Which ordinal to use for a missing value. */
|
||
|
final int missingOrd;
|
||
|
|
||
|
/** Creates this, sorting missing values first. */
|
||
|
public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field) {
|
||
|
this(env, numHits, field, false);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
|
||
|
* put missing values at the end.
|
||
|
*/
|
||
|
public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field, boolean sortMissingLast) {
|
||
|
ords = new LMDBArray<>(env, new IntCodec(), numHits, 0);
|
||
|
values = new LMDBArray<>(env, new ByteArrayCodec(), numHits, null);
|
||
|
readerGen = new LMDBArray<>(env, new IntCodec(), numHits, 0);
|
||
|
this.field = field;
|
||
|
if (sortMissingLast) {
|
||
|
missingSortCmp = 1;
|
||
|
missingOrd = Integer.MAX_VALUE;
|
||
|
} else {
|
||
|
missingSortCmp = -1;
|
||
|
missingOrd = -1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private int getOrdForDoc(int doc) throws IOException {
|
||
|
if (termsIndex.advanceExact(doc)) {
|
||
|
return termsIndex.ordValue();
|
||
|
} else {
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public int compare(int slot1, int slot2) {
|
||
|
if ((int) readerGen.getOrDefault(slot2, 0) == readerGen.getOrDefault(slot1, 0)) {
|
||
|
return ords.getOrDefault(slot1, 0) - ords.getOrDefault(slot2, 0);
|
||
|
}
|
||
|
|
||
|
final var val1 = values.get(slot1);
|
||
|
final var val2 = values.get(slot2);
|
||
|
if (val1 == null) {
|
||
|
if (val2 == null) {
|
||
|
return 0;
|
||
|
}
|
||
|
return missingSortCmp;
|
||
|
} else if (val2 == null) {
|
||
|
return -missingSortCmp;
|
||
|
}
|
||
|
return Arrays.compare(val1, val2);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public int compareBottom(int doc) throws IOException {
|
||
|
assert bottomSlot != -1;
|
||
|
int docOrd = getOrdForDoc(doc);
|
||
|
if (docOrd == -1) {
|
||
|
docOrd = missingOrd;
|
||
|
}
|
||
|
if (bottomSameReader) {
|
||
|
// ord is precisely comparable, even in the equal case
|
||
|
return bottomOrd - docOrd;
|
||
|
} else if (bottomOrd >= docOrd) {
|
||
|
// the equals case always means bottom is > doc
|
||
|
// (because we set bottomOrd to the lower bound in
|
||
|
// setBottom):
|
||
|
return 1;
|
||
|
} else {
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public void copy(int slot, int doc) throws IOException {
|
||
|
int ord = getOrdForDoc(doc);
|
||
|
if (ord == -1) {
|
||
|
ord = missingOrd;
|
||
|
values.reset(slot);
|
||
|
} else {
|
||
|
assert ord >= 0;
|
||
|
values.set(slot, copyBytes(termsIndex.lookupOrd(ord)));
|
||
|
}
|
||
|
ords.set(slot, ord);
|
||
|
readerGen.set(slot, currentReaderGen);
|
||
|
}
|
||
|
|
||
|
private byte[] copyBytes(BytesRef lookupOrd) {
|
||
|
if (lookupOrd == null) return null;
|
||
|
return Arrays.copyOfRange(lookupOrd.bytes, lookupOrd.offset, lookupOrd.length);
|
||
|
}
|
||
|
|
||
|
/** Retrieves the SortedDocValues for the field in this segment */
|
||
|
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
|
||
|
throws IOException {
|
||
|
return DocValues.getSorted(context.reader(), field);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
|
||
|
termsIndex = getSortedDocValues(context, field);
|
||
|
currentReaderGen++;
|
||
|
|
||
|
if (topValue != null) {
|
||
|
// Recompute topOrd/SameReader
|
||
|
int ord = termsIndex.lookupTerm(new BytesRef(topValue));
|
||
|
if (ord >= 0) {
|
||
|
topSameReader = true;
|
||
|
topOrd = ord;
|
||
|
} else {
|
||
|
topSameReader = false;
|
||
|
topOrd = -ord - 2;
|
||
|
}
|
||
|
} else {
|
||
|
topOrd = missingOrd;
|
||
|
topSameReader = true;
|
||
|
}
|
||
|
// System.out.println(" getLeafComparator topOrd=" + topOrd + " topSameReader=" +
|
||
|
// topSameReader);
|
||
|
|
||
|
if (bottomSlot != -1) {
|
||
|
// Recompute bottomOrd/SameReader
|
||
|
setBottom(bottomSlot);
|
||
|
}
|
||
|
|
||
|
return this;
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public void setBottom(final int bottom) throws IOException {
|
||
|
bottomSlot = bottom;
|
||
|
|
||
|
bottomValue = values.get(bottomSlot);
|
||
|
if (currentReaderGen == readerGen.getOrDefault(bottomSlot, 0)) {
|
||
|
bottomOrd = ords.getOrDefault(bottomSlot, 0);
|
||
|
bottomSameReader = true;
|
||
|
} else {
|
||
|
if (bottomValue == null) {
|
||
|
// missingOrd is null for all segments
|
||
|
assert ords.getOrDefault(bottomSlot, 0) == missingOrd;
|
||
|
bottomOrd = missingOrd;
|
||
|
bottomSameReader = true;
|
||
|
readerGen.set(bottomSlot, currentReaderGen);
|
||
|
} else {
|
||
|
final int ord = termsIndex.lookupTerm(new BytesRef(bottomValue));
|
||
|
if (ord < 0) {
|
||
|
bottomOrd = -ord - 2;
|
||
|
bottomSameReader = false;
|
||
|
} else {
|
||
|
bottomOrd = ord;
|
||
|
// exact value match
|
||
|
bottomSameReader = true;
|
||
|
readerGen.set(bottomSlot, currentReaderGen);
|
||
|
ords.set(bottomSlot, bottomOrd);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public void setTopValue(BytesRef value) {
|
||
|
// null is fine: it means the last doc of the prior
|
||
|
// search was missing this value
|
||
|
topValue = copyBytes(value);
|
||
|
// System.out.println("setTopValue " + topValue);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public BytesRef value(int slot) {
|
||
|
return getBytesRef(values.get(slot));
|
||
|
}
|
||
|
|
||
|
private BytesRef getBytesRef(byte[] bytes) {
|
||
|
if (bytes == null) return null;
|
||
|
return new BytesRef(bytes);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public int compareTop(int doc) throws IOException {
|
||
|
|
||
|
int ord = getOrdForDoc(doc);
|
||
|
if (ord == -1) {
|
||
|
ord = missingOrd;
|
||
|
}
|
||
|
|
||
|
if (topSameReader) {
|
||
|
// ord is precisely comparable, even in the equal
|
||
|
// case
|
||
|
// System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
|
||
|
return topOrd - ord;
|
||
|
} else if (ord <= topOrd) {
|
||
|
// the equals case always means doc is < value
|
||
|
// (because we set lastOrd to the lower bound)
|
||
|
return 1;
|
||
|
} else {
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public int compareValues(BytesRef val1, BytesRef val2) {
|
||
|
if (val1 == null) {
|
||
|
if (val2 == null) {
|
||
|
return 0;
|
||
|
}
|
||
|
return missingSortCmp;
|
||
|
} else if (val2 == null) {
|
||
|
return -missingSortCmp;
|
||
|
}
|
||
|
return val1.compareTo(val2);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public void setScorer(Scorable scorer) {}
|
||
|
}
|