CavalliumDBEngine/src/main/java/it/cavallium/dbengine/lucene/comparators/TermOrdValComparator.java

package it.cavallium.dbengine.lucene.comparators;

import it.cavallium.dbengine.database.disk.LLTempLMDBEnv;
import it.cavallium.dbengine.lucene.ByteArrayCodec;
import it.cavallium.dbengine.lucene.BytesRefCodec;
import it.cavallium.dbengine.lucene.FloatCodec;
import it.cavallium.dbengine.lucene.IArray;
import it.cavallium.dbengine.lucene.IntCodec;
import it.cavallium.dbengine.lucene.LMDBArray;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

/**
 * Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
 * {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
 * string to their relative ordinal positions (using the index returned by {@link
 * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
 * using the ordinals. For medium to large results, this comparator will be much faster than
 * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
 * it may be slower.
 *
 * <p>The per-slot state (ordinals, term bytes and reader generations) is kept in {@link IArray}
 * instances created as {@link LMDBArray}s on the supplied {@link LLTempLMDBEnv}.
 *
 * <p>NOTE(review): this class never releases the arrays it creates; confirm that their lifecycle
 * is owned by the environment or by the caller.
 */
public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
	/* Ords for each slot.
	@lucene.internal */
	final IArray<Integer> ords;

	/* Values (raw term bytes) for each slot; no value is stored for a missing term.
	@lucene.internal */
	final IArray<byte[]> values;

	/* Which reader last copied a value into the slot. When
	we compare two slots, we just compare-by-ord if the
	readerGen is the same; else we must compare the
	values (slower).
	@lucene.internal */
	final IArray<Integer> readerGen;

	/* Gen of current reader we are on. Incremented by every getLeafComparator call.
	@lucene.internal */
	int currentReaderGen = -1;

	/* Current reader's doc ord/values.
	@lucene.internal */
	SortedDocValues termsIndex;

	/** Name of the sorted doc-values field this comparator reads. */
	private final String field;

	/* Bottom slot, or -1 if queue isn't full yet
	@lucene.internal */
	int bottomSlot = -1;

	/* Bottom ord (same as ords[bottomSlot] once bottomSlot
	is set).  Cached for faster compares.
	@lucene.internal */
	int bottomOrd;

	/* True if current bottom slot matches the current
	reader.
	@lucene.internal */
	boolean bottomSameReader;

	/* Bottom value (same as values[bottomSlot] once
	 bottomSlot is set).  Cached for faster compares.
	@lucene.internal */
	byte[] bottomValue;

	/** Set by setTopValue. */
	byte[] topValue;

	/** True if {@link #topValue} resolved to an exact ordinal in the current reader. */
	boolean topSameReader;

	/** Ordinal (or lower-bound ordinal) of {@link #topValue} in the current reader. */
	int topOrd;

	/** -1 if missing values are sorted first, 1 if they are sorted last */
	final int missingSortCmp;

	/** Which ordinal to use for a missing value. */
	final int missingOrd;

	/** Creates this, sorting missing values first. */
	public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field) {
		this(env, numHits, field, false);
	}

	/**
	 * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
	 * put missing values at the end.
	 *
	 * @param env environment on which the per-slot arrays are created
	 * @param numHits number of slots passed to each per-slot array
	 * @param field name of the sorted doc-values field to sort by
	 * @param sortMissingLast {@code true} to sort documents without a value last, {@code false} to
	 *     sort them first
	 */
	public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field, boolean sortMissingLast) {
		ords = new LMDBArray<>(env, new IntCodec(), numHits, 0);
		values = new LMDBArray<>(env, new ByteArrayCodec(), numHits, null);
		readerGen = new LMDBArray<>(env, new IntCodec(), numHits, 0);
		this.field = field;
		if (sortMissingLast) {
			missingSortCmp = 1;
			missingOrd = Integer.MAX_VALUE;
		} else {
			missingSortCmp = -1;
			missingOrd = -1;
		}
	}

	/**
	 * Returns the ordinal of {@code doc} in the current segment, or -1 when the document has no
	 * value for the field.
	 */
	private int getOrdForDoc(int doc) throws IOException {
		if (termsIndex.advanceExact(doc)) {
			return termsIndex.ordValue();
		} else {
			return -1;
		}
	}

	/**
	 * Compares two slots. Ordinals are compared directly when both slots were filled from the same
	 * reader generation; otherwise the stored term bytes are compared, with missing values ordered
	 * according to {@link #missingSortCmp}.
	 */
	@Override
	public int compare(int slot1, int slot2) {
		if ((int) readerGen.getOrDefault(slot2, 0) == readerGen.getOrDefault(slot1, 0)) {
			return Integer.compare(ords.getOrDefault(slot1, 0), ords.getOrDefault(slot2, 0));
		}

		final var val1 = values.get(slot1);
		final var val2 = values.get(slot2);
		if (val1 == null) {
			if (val2 == null) {
				return 0;
			}
			return missingSortCmp;
		} else if (val2 == null) {
			return -missingSortCmp;
		}
		// Terms are ordered by unsigned byte value (as BytesRef.compareTo does in compareValues),
		// so the comparison must be unsigned to agree with the ordinal-based path above.
		return Arrays.compareUnsigned(val1, val2);
	}

	/** Compares the current bottom slot with {@code doc} of the current segment. */
	@Override
	public int compareBottom(int doc) throws IOException {
		assert bottomSlot != -1;
		int docOrd = getOrdForDoc(doc);
		if (docOrd == -1) {
			docOrd = missingOrd;
		}
		if (bottomSameReader) {
			// ord is precisely comparable, even in the equal case
			return Integer.compare(bottomOrd, docOrd);
		} else if (bottomOrd >= docOrd) {
			// the equals case always means bottom is > doc
			// (because we set bottomOrd to the lower bound in
			// setBottom):
			return 1;
		} else {
			return -1;
		}
	}

	/**
	 * Stores the ordinal, term bytes and current reader generation of {@code doc} into {@code slot}.
	 * For a document without a value the slot's value is reset and {@link #missingOrd} is stored.
	 */
	@Override
	public void copy(int slot, int doc) throws IOException {
		int ord = getOrdForDoc(doc);
		if (ord == -1) {
			ord = missingOrd;
			values.reset(slot);
		} else {
			assert ord >= 0;
			values.set(slot, copyBytes(termsIndex.lookupOrd(ord)));
		}
		ords.set(slot, ord);
		readerGen.set(slot, currentReaderGen);
	}

	/**
	 * Returns a private copy of the bytes referenced by {@code lookupOrd}, or {@code null} when the
	 * reference itself is {@code null}.
	 */
	private byte[] copyBytes(BytesRef lookupOrd) {
		if (lookupOrd == null) return null;
		// copyOfRange takes an exclusive END INDEX, not a length: the valid region of a BytesRef is
		// [offset, offset + length).
		return Arrays.copyOfRange(lookupOrd.bytes, lookupOrd.offset, lookupOrd.offset + lookupOrd.length);
	}

	/** Retrieves the SortedDocValues for the field in this segment */
	protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
			throws IOException {
		return DocValues.getSorted(context.reader(), field);
	}

	/**
	 * Switches this comparator to the given segment: loads its doc values, starts a new reader
	 * generation, and re-resolves the top value and the bottom slot against the new segment.
	 */
	@Override
	public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
		termsIndex = getSortedDocValues(context, field);
		currentReaderGen++;

		if (topValue != null) {
			// Recompute topOrd/SameReader
			int ord = termsIndex.lookupTerm(new BytesRef(topValue));
			if (ord >= 0) {
				topSameReader = true;
				topOrd = ord;
			} else {
				// Term not present in this segment: derive the lower-bound ordinal from the
				// negative lookup result.
				topSameReader = false;
				topOrd = -ord - 2;
			}
		} else {
			topOrd = missingOrd;
			topSameReader = true;
		}

		if (bottomSlot != -1) {
			// Recompute bottomOrd/SameReader
			setBottom(bottomSlot);
		}

		return this;
	}

	/**
	 * Marks {@code bottom} as the bottom slot and caches its value and its ordinal relative to the
	 * current segment. When the slot was filled from another segment the value is looked up in the
	 * current one; on an exact match the slot is re-tagged with the current reader generation.
	 */
	@Override
	public void setBottom(final int bottom) throws IOException {
		bottomSlot = bottom;

		bottomValue = values.get(bottomSlot);
		if (currentReaderGen == readerGen.getOrDefault(bottomSlot, 0)) {
			bottomOrd = ords.getOrDefault(bottomSlot, 0);
			bottomSameReader = true;
		} else {
			if (bottomValue == null) {
				// a missing value maps to missingOrd in every segment
				assert ords.getOrDefault(bottomSlot, 0) == missingOrd;
				bottomOrd = missingOrd;
				bottomSameReader = true;
				readerGen.set(bottomSlot, currentReaderGen);
			} else {
				final int ord = termsIndex.lookupTerm(new BytesRef(bottomValue));
				if (ord < 0) {
					// not present in this segment: keep only the lower-bound ordinal
					bottomOrd = -ord - 2;
					bottomSameReader = false;
				} else {
					bottomOrd = ord;
					// exact value match
					bottomSameReader = true;
					readerGen.set(bottomSlot, currentReaderGen);
					ords.set(bottomSlot, bottomOrd);
				}
			}
		}
	}

	/** Stores a copy of the top value; it is resolved to an ordinal in getLeafComparator. */
	@Override
	public void setTopValue(BytesRef value) {
		// null is fine: it means the last doc of the prior
		// search was missing this value
		topValue = copyBytes(value);
	}

	/** Returns the term stored in {@code slot}, or {@code null} when the slot holds no value. */
	@Override
	public BytesRef value(int slot) {
		return getBytesRef(values.get(slot));
	}

	/** Wraps {@code bytes} in a {@link BytesRef}, passing {@code null} through unchanged. */
	private BytesRef getBytesRef(byte[] bytes) {
		if (bytes == null) return null;
		return new BytesRef(bytes);
	}

	/** Compares the top value with {@code doc} of the current segment. */
	@Override
	public int compareTop(int doc) throws IOException {

		int ord = getOrdForDoc(doc);
		if (ord == -1) {
			ord = missingOrd;
		}

		if (topSameReader) {
			// ord is precisely comparable, even in the equal
			// case
			return Integer.compare(topOrd, ord);
		} else if (ord <= topOrd) {
			// the equals case always means doc is < value
			// (because we set lastOrd to the lower bound)
			return 1;
		} else {
			return -1;
		}
	}

	/**
	 * Compares two values, ordering {@code null} (missing) values according to
	 * {@link #missingSortCmp}.
	 */
	@Override
	public int compareValues(BytesRef val1, BytesRef val2) {
		if (val1 == null) {
			if (val2 == null) {
				return 0;
			}
			return missingSortCmp;
		} else if (val2 == null) {
			return -missingSortCmp;
		}
		return val1.compareTo(val2);
	}

	/** No-op: this comparator does not use scores. */
	@Override
	public void setScorer(Scorable scorer) {}
}
Implement some sort codecs 2021-10-15 22:03:53 +02:00			`package it.cavallium.dbengine.lucene.comparators;`

			`import it.cavallium.dbengine.database.disk.LLTempLMDBEnv;`
			`import it.cavallium.dbengine.lucene.ByteArrayCodec;`
			`import it.cavallium.dbengine.lucene.BytesRefCodec;`
			`import it.cavallium.dbengine.lucene.FloatCodec;`
			`import it.cavallium.dbengine.lucene.IArray;`
			`import it.cavallium.dbengine.lucene.IntCodec;`
			`import it.cavallium.dbengine.lucene.LMDBArray;`
			`import java.io.IOException;`
			`import java.util.Arrays;`
			`import org.apache.lucene.index.DocValues;`
			`import org.apache.lucene.index.LeafReaderContext;`
			`import org.apache.lucene.index.SortedDocValues;`
			`import org.apache.lucene.search.FieldComparator;`
			`import org.apache.lucene.search.LeafFieldComparator;`
			`import org.apache.lucene.search.Scorable;`
			`import org.apache.lucene.util.BytesRef;`
			`import org.apache.lucene.util.BytesRefBuilder;`

			`/**`
			`* Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to`
			`* {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the`
			`* string to their relative ordinal positions (using the index returned by {@link`
			`* org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons`
			`* using the ordinals. For medium to large results, this comparator will be much faster than`
			`* {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets`
			`* it may be slower.`
			`*/`
			`public class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {`
			`/* Ords for each slot.`
			`@lucene.internal */`
			`final IArray<Integer> ords;`

			`/* Values for each slot.`
			`@lucene.internal */`
			`final IArray<byte[]> values;`

			`/* Which reader last copied a value into the slot. When`
			`we compare two slots, we just compare-by-ord if the`
			`readerGen is the same; else we must compare the`
			`values (slower).`
			`@lucene.internal */`
			`final IArray<Integer> readerGen;`

			`/* Gen of current reader we are on.`
			`@lucene.internal */`
			`int currentReaderGen = -1;`

			`/* Current reader's doc ord/values.`
			`@lucene.internal */`
			`SortedDocValues termsIndex;`

			`private final String field;`

			`/* Bottom slot, or -1 if queue isn't full yet`
			`@lucene.internal */`
			`int bottomSlot = -1;`

			`/* Bottom ord (same as ords[bottomSlot] once bottomSlot`
			`is set). Cached for faster compares.`
			`@lucene.internal */`
			`int bottomOrd;`

			`/* True if current bottom slot matches the current`
			`reader.`
			`@lucene.internal */`
			`boolean bottomSameReader;`

			`/* Bottom value (same as values[bottomSlot] once`
			`bottomSlot is set). Cached for faster compares.`
			`@lucene.internal */`
			`byte[] bottomValue;`

			`/** Set by setTopValue. */`
			`byte[] topValue;`

			`boolean topSameReader;`
			`int topOrd;`

			`/** -1 if missing values are sorted first, 1 if they are sorted last */`
			`final int missingSortCmp;`

			`/** Which ordinal to use for a missing value. */`
			`final int missingOrd;`

			`/** Creates this, sorting missing values first. */`
			`public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field) {`
			`this(env, numHits, field, false);`
			`}`

			`/**`
			`* Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to`
			`* put missing values at the end.`
			`*/`
			`public TermOrdValComparator(LLTempLMDBEnv env, int numHits, String field, boolean sortMissingLast) {`
			`ords = new LMDBArray<>(env, new IntCodec(), numHits, 0);`
			`values = new LMDBArray<>(env, new ByteArrayCodec(), numHits, null);`
			`readerGen = new LMDBArray<>(env, new IntCodec(), numHits, 0);`
			`this.field = field;`
			`if (sortMissingLast) {`
			`missingSortCmp = 1;`
			`missingOrd = Integer.MAX_VALUE;`
			`} else {`
			`missingSortCmp = -1;`
			`missingOrd = -1;`
			`}`
			`}`

			`private int getOrdForDoc(int doc) throws IOException {`
			`if (termsIndex.advanceExact(doc)) {`
			`return termsIndex.ordValue();`
			`} else {`
			`return -1;`
			`}`
			`}`

			`@Override`
			`public int compare(int slot1, int slot2) {`
			`if ((int) readerGen.getOrDefault(slot2, 0) == readerGen.getOrDefault(slot1, 0)) {`
			`return ords.getOrDefault(slot1, 0) - ords.getOrDefault(slot2, 0);`
			`}`

			`final var val1 = values.get(slot1);`
			`final var val2 = values.get(slot2);`
			`if (val1 == null) {`
			`if (val2 == null) {`
			`return 0;`
			`}`
			`return missingSortCmp;`
			`} else if (val2 == null) {`
			`return -missingSortCmp;`
			`}`
			`return Arrays.compare(val1, val2);`
			`}`

			`@Override`
			`public int compareBottom(int doc) throws IOException {`
			`assert bottomSlot != -1;`
			`int docOrd = getOrdForDoc(doc);`
			`if (docOrd == -1) {`
			`docOrd = missingOrd;`
			`}`
			`if (bottomSameReader) {`
			`// ord is precisely comparable, even in the equal case`
			`return bottomOrd - docOrd;`
			`} else if (bottomOrd >= docOrd) {`
			`// the equals case always means bottom is > doc`
			`// (because we set bottomOrd to the lower bound in`
			`// setBottom):`
			`return 1;`
			`} else {`
			`return -1;`
			`}`
			`}`

			`@Override`
			`public void copy(int slot, int doc) throws IOException {`
			`int ord = getOrdForDoc(doc);`
			`if (ord == -1) {`
			`ord = missingOrd;`
			`values.reset(slot);`
			`} else {`
			`assert ord >= 0;`
			`values.set(slot, copyBytes(termsIndex.lookupOrd(ord)));`
			`}`
			`ords.set(slot, ord);`
			`readerGen.set(slot, currentReaderGen);`
			`}`

			`private byte[] copyBytes(BytesRef lookupOrd) {`
			`if (lookupOrd == null) return null;`
			`return Arrays.copyOfRange(lookupOrd.bytes, lookupOrd.offset, lookupOrd.length);`
			`}`

			`/** Retrieves the SortedDocValues for the field in this segment */`
			`protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)`
			`throws IOException {`
			`return DocValues.getSorted(context.reader(), field);`
			`}`

			`@Override`
			`public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {`
			`termsIndex = getSortedDocValues(context, field);`
			`currentReaderGen++;`

			`if (topValue != null) {`
			`// Recompute topOrd/SameReader`
			`int ord = termsIndex.lookupTerm(new BytesRef(topValue));`
			`if (ord >= 0) {`
			`topSameReader = true;`
			`topOrd = ord;`
			`} else {`
			`topSameReader = false;`
			`topOrd = -ord - 2;`
			`}`
			`} else {`
			`topOrd = missingOrd;`
			`topSameReader = true;`
			`}`
			`// System.out.println(" getLeafComparator topOrd=" + topOrd + " topSameReader=" +`
			`// topSameReader);`

			`if (bottomSlot != -1) {`
			`// Recompute bottomOrd/SameReader`
			`setBottom(bottomSlot);`
			`}`

			`return this;`
			`}`

			`@Override`
			`public void setBottom(final int bottom) throws IOException {`
			`bottomSlot = bottom;`

			`bottomValue = values.get(bottomSlot);`
			`if (currentReaderGen == readerGen.getOrDefault(bottomSlot, 0)) {`
			`bottomOrd = ords.getOrDefault(bottomSlot, 0);`
			`bottomSameReader = true;`
			`} else {`
			`if (bottomValue == null) {`
			`// missingOrd is null for all segments`
			`assert ords.getOrDefault(bottomSlot, 0) == missingOrd;`
			`bottomOrd = missingOrd;`
			`bottomSameReader = true;`
			`readerGen.set(bottomSlot, currentReaderGen);`
			`} else {`
			`final int ord = termsIndex.lookupTerm(new BytesRef(bottomValue));`
			`if (ord < 0) {`
			`bottomOrd = -ord - 2;`
			`bottomSameReader = false;`
			`} else {`
			`bottomOrd = ord;`
			`// exact value match`
			`bottomSameReader = true;`
			`readerGen.set(bottomSlot, currentReaderGen);`
			`ords.set(bottomSlot, bottomOrd);`
			`}`
			`}`
			`}`
			`}`

			`@Override`
			`public void setTopValue(BytesRef value) {`
			`// null is fine: it means the last doc of the prior`
			`// search was missing this value`
			`topValue = copyBytes(value);`
			`// System.out.println("setTopValue " + topValue);`
			`}`

			`@Override`
			`public BytesRef value(int slot) {`
			`return getBytesRef(values.get(slot));`
			`}`

			`private BytesRef getBytesRef(byte[] bytes) {`
			`if (bytes == null) return null;`
			`return new BytesRef(bytes);`
			`}`

			`@Override`
			`public int compareTop(int doc) throws IOException {`

			`int ord = getOrdForDoc(doc);`
			`if (ord == -1) {`
			`ord = missingOrd;`
			`}`

			`if (topSameReader) {`
			`// ord is precisely comparable, even in the equal`
			`// case`
			`// System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));`
			`return topOrd - ord;`
			`} else if (ord <= topOrd) {`
			`// the equals case always means doc is < value`
			`// (because we set lastOrd to the lower bound)`
			`return 1;`
			`} else {`
			`return -1;`
			`}`
			`}`

			`@Override`
			`public int compareValues(BytesRef val1, BytesRef val2) {`
			`if (val1 == null) {`
			`if (val2 == null) {`
			`return 0;`
			`}`
			`return missingSortCmp;`
			`} else if (val2 == null) {`
			`return -missingSortCmp;`
			`}`
			`return val1.compareTo(val2);`
			`}`

			`@Override`
			`public void setScorer(Scorable scorer) {}`
			`}`