CavalliumDBEngine/src/main/java/it/cavallium/dbengine/lucene/comparators/NumericComparator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package it.cavallium.dbengine.lucene.comparators;

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
import org.apache.lucene.util.DocIdSetBuilder;

/**
 * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping
 * functionality – an iterator that can skip over non-competitive documents.
 *
 * <p>Parameter {@code field} provided in the constructor is used as a field name in the default
 * implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
 * doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
 * but in this case you must override both of these methods.
 *
 * <p>Based on {@link org.apache.lucene.search.comparators.NumericComparator}
 */
public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
  /** Value handed to the constructor for documents that have no value for {@link #field}. */
  protected final T missingValue;

  /** Name of the field whose doc values (and points) are read by the default accessors. */
  protected final String field;

  /** Whether the sort order is reversed. */
  protected final boolean reverse;

  private final int bytesCount; // how many bytes are used to encode this number
  private final ByteArrayComparator bytesComparator; // unsigned comparator over bytesCount bytes

  /** Becomes {@code true} once {@link #setTopValue} has been called. */
  protected boolean topValueSet;

  /** {@code true} if the sort is based on a single sort field. */
  protected boolean singleSort;

  /** Becomes {@code true} once a leaf comparator is told the hits threshold was reached. */
  protected boolean hitsThresholdReached;

  /** Becomes {@code true} the first time a leaf comparator has its bottom set. */
  protected boolean queueFull;

  // false when this comparator is not the primary sort or skipping was disabled explicitly
  private boolean canSkipDocuments;

  /**
   * Creates a comparator over {@code field}.
   *
   * @param field name of the field to read doc values and points from
   * @param missingValue value stored in {@link #missingValue}
   * @param reverse whether the sort order is reversed
   * @param sortPos position of this comparator among the sort fields; skipping is only considered
   *     when it is {@code 0}
   * @param bytesCount number of bytes used to encode one value; it selects the unsigned byte
   *     comparator and is checked against the bytes-per-dimension of the indexed points
   */
  protected NumericComparator(
      String field, T missingValue, boolean reverse, int sortPos, int bytesCount) {
    this.field = field;
    this.missingValue = missingValue;
    this.reverse = reverse;
    // skipping functionality is only relevant for primary sort
    this.canSkipDocuments = (sortPos == 0);
    this.bytesCount = bytesCount;
    this.bytesComparator = ArrayUtil.getUnsignedComparator(bytesCount);
  }

  /** Records that a top value was provided; the value itself is not stored by this class. */
  @Override
  public void setTopValue(T value) {
    topValueSet = true;
  }

  /** Marks this comparator as the only sort field. */
  @Override
  public void setSingleSort() {
    singleSort = true;
  }

  /** Turns off point-based skipping for leaf comparators created after this call. */
  @Override
  public void disableSkipping() {
    canSkipDocuments = false;
  }

  /** Leaf comparator for {@link NumericComparator} that provides skipping functionality */
  public abstract class NumericLeafComparator implements LeafFieldComparator {
    /** Doc values of the field for this segment. */
    protected final NumericDocValues docValues;

    // points of the field for this segment, or null when skipping is not possible
    private final PointValues pointValues;
    // if skipping functionality should be enabled on this segment
    private final boolean enableSkipping;
    private final int maxDoc;
    // scratch buffers for the encoded lower/upper bound; null when that side is unbounded
    private final byte[] minValueAsBytes;
    private final byte[] maxValueAsBytes;

    private DocIdSetIterator competitiveIterator;
    private long iteratorCost;
    private int maxDocVisited = -1;
    private int updateCounter = 0;

    /**
     * Creates a leaf comparator for the given segment and decides whether skipping can be used.
     *
     * @param context reader context of the segment
     * @throws IOException If there is a low-level I/O error
     * @throws IllegalStateException if points are returned for a field that, according to the
     *     field infos, does not index points
     * @throws IllegalArgumentException if the field is indexed with more than one dimension or
     *     with a different number of bytes per dimension than this comparator expects
     */
    public NumericLeafComparator(LeafReaderContext context) throws IOException {
      this.docValues = getNumericDocValues(context, field);
      this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
      if (pointValues != null) {
        FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
        if (info == null || info.getPointDimensionCount() == 0) {
          throw new IllegalStateException(
              "Field "
                  + field
                  + " doesn't index points according to FieldInfos yet returns non-null PointValues");
        } else if (info.getPointDimensionCount() > 1) {
          throw new IllegalArgumentException(
              "Field " + field + " is indexed with multiple dimensions, sorting is not supported");
        } else if (info.getPointNumBytes() != bytesCount) {
          throw new IllegalArgumentException(
              "Field "
                  + field
                  + " is indexed with "
                  + info.getPointNumBytes()
                  + " bytes per dimension, but "
                  + NumericComparator.this
                  + " expected "
                  + bytesCount);
        }
        this.enableSkipping = true; // skipping is enabled when points are available
        this.maxDoc = context.reader().maxDoc();
        // The bottom goes into the max buffer for a regular sort and into the min buffer for a
        // reversed one; the opposite buffer is only needed when a top value has been set.
        this.maxValueAsBytes = (reverse == false || topValueSet) ? new byte[bytesCount] : null;
        this.minValueAsBytes = (reverse || topValueSet) ? new byte[bytesCount] : null;
        this.competitiveIterator = DocIdSetIterator.all(maxDoc);
        this.iteratorCost = maxDoc;
      } else {
        this.enableSkipping = false;
        this.maxDoc = 0;
        this.maxValueAsBytes = null;
        this.minValueAsBytes = null;
      }
    }

    /**
     * Retrieves the NumericDocValues for the field in this segment.
     *
     * <p>If you override this method, you must also override {@link
     * #getPointValues(LeafReaderContext, String)}. This class uses sort optimization that leverages
     * points to filter out non-competitive matches, which relies on the assumption that points and
     * doc values record the same information.
     *
     * @param context reader context
     * @param field field name
     * @return numeric doc values for the field in this segment.
     * @throws IOException If there is a low-level I/O error
     */
    protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field)
        throws IOException {
      return DocValues.getNumeric(context.reader(), field);
    }

    /**
     * Retrieves point values for the field in this segment.
     *
     * <p>If you override this method, you must also override {@link
     * #getNumericDocValues(LeafReaderContext, String)}. This class uses sort optimization that
     * leverages points to filter out non-competitive matches, which relies on the assumption that
     * points and doc values record the same information. Return {@code null} if no points
     * implementation is available; in this case sort optimization with points will be disabled.
     *
     * @param context reader context
     * @param field field name
     * @return point values for the field in this segment if they are available or {@code null} if
     *     sort optimization with points should be disabled.
     * @throws IOException If there is a low-level I/O error
     */
    protected PointValues getPointValues(LeafReaderContext context, String field)
        throws IOException {
      return context.reader().getPointValues(field);
    }

    /** Marks the queue as full and refreshes the competitive iterator for the new bottom. */
    @Override
    public void setBottom(int slot) throws IOException {
      queueFull = true; // if we are setting bottom, it means that we have collected enough hits
      updateCompetitiveIterator(); // update an iterator if we set a new bottom
    }

    /** Remembers {@code doc} so that later iterator updates ignore doc IDs up to and including it. */
    @Override
    public void copy(int slot, int doc) throws IOException {
      maxDocVisited = doc;
    }

    /**
     * Uses the cost of the scorer's iterator as the starting cost against which a new competitive
     * iterator is judged. Scorables that are not a {@link Scorer} are ignored.
     */
    @Override
    public void setScorer(Scorable scorer) throws IOException {
      if (scorer instanceof Scorer) {
        iteratorCost =
            ((Scorer) scorer).iterator().cost(); // starting iterator cost is the scorer's cost
        updateCompetitiveIterator(); // update an iterator when we have a new segment
      }
    }

    /** Records that the hits threshold was reached and refreshes the competitive iterator. */
    @Override
    public void setHitsThresholdReached() throws IOException {
      hitsThresholdReached = true;
      updateCompetitiveIterator();
    }

    /**
     * Rebuilds the competitive iterator so that it possibly contains only docs that are "stronger"
     * than the current bottom entry.
     *
     * <p>Nothing happens unless skipping is enabled for this segment, the hits threshold was
     * reached and the queue is full. The update is also skipped when documents without points
     * exist and the missing value is competitive, when the call is sampled out, or when the
     * estimated number of matching points is not at least 8x smaller than the current cost.
     */
    private void updateCompetitiveIterator() throws IOException {
      if (enableSkipping == false || hitsThresholdReached == false || queueFull == false) {
        return;
      }
      // if some documents have missing points, check that missing values prohibits optimization
      if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) {
        return; // we can't filter out documents, as documents with missing values are competitive
      }

      updateCounter++;
      // Start sampling if we get called too much: after 256 calls only every 32nd call proceeds
      if (updateCounter > 256 && (updateCounter & 0x1f) != 0x1f) {
        return;
      }
      if (reverse == false) {
        encodeBottom(maxValueAsBytes);
        if (topValueSet) {
          encodeTop(minValueAsBytes);
        }
      } else {
        encodeBottom(minValueAsBytes);
        if (topValueSet) {
          encodeTop(maxValueAsBytes);
        }
      }

      DocIdSetBuilder result = new DocIdSetBuilder(maxDoc);
      PointValues.IntersectVisitor visitor =
          new PointValues.IntersectVisitor() {
            DocIdSetBuilder.BulkAdder adder;

            @Override
            public void grow(int count) {
              adder = result.grow(count);
            }

            @Override
            public void visit(int docID) {
              if (docID <= maxDocVisited) {
                return; // Already visited or skipped
              }
              adder.add(docID);
            }

            @Override
            public void visit(int docID, byte[] packedValue) {
              if (docID <= maxDocVisited) {
                return; // already visited or skipped
              }
              if (maxValueAsBytes != null) {
                int cmp = bytesComparator.compare(packedValue, 0, maxValueAsBytes, 0);
                // if doc's value is too high or for single sort even equal, it is not competitive
                // and the doc can be skipped
                if (cmp > 0 || (singleSort && cmp == 0)) {
                  return;
                }
              }
              if (minValueAsBytes != null) {
                int cmp = bytesComparator.compare(packedValue, 0, minValueAsBytes, 0);
                // if doc's value is too low or for single sort even equal, it is not competitive
                // and the doc can be skipped
                if (cmp < 0 || (singleSort && cmp == 0)) {
                  return;
                }
              }
              adder.add(docID); // doc is competitive
            }

            @Override
            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
              if (maxValueAsBytes != null) {
                int cmp = bytesComparator.compare(minPackedValue, 0, maxValueAsBytes, 0);
                if (cmp > 0 || (singleSort && cmp == 0)) {
                  return PointValues.Relation.CELL_OUTSIDE_QUERY;
                }
              }
              if (minValueAsBytes != null) {
                int cmp = bytesComparator.compare(maxPackedValue, 0, minValueAsBytes, 0);
                if (cmp < 0 || (singleSort && cmp == 0)) {
                  return PointValues.Relation.CELL_OUTSIDE_QUERY;
                }
              }
              // For single sort a value equal to a bound is rejected by visit(docID, packedValue),
              // so a cell that merely touches a bound must be reported as crossing; reporting it
              // as inside would bulk-add its equal-valued docs without looking at their values.
              if (maxValueAsBytes != null) {
                int cmp = bytesComparator.compare(maxPackedValue, 0, maxValueAsBytes, 0);
                if (cmp > 0 || (singleSort && cmp == 0)) {
                  return PointValues.Relation.CELL_CROSSES_QUERY;
                }
              }
              if (minValueAsBytes != null) {
                int cmp = bytesComparator.compare(minPackedValue, 0, minValueAsBytes, 0);
                if (cmp < 0 || (singleSort && cmp == 0)) {
                  return PointValues.Relation.CELL_CROSSES_QUERY;
                }
              }
              return PointValues.Relation.CELL_INSIDE_QUERY;
            }
          };
      final long threshold = iteratorCost >>> 3;
      long estimatedNumberOfMatches =
          pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
      if (estimatedNumberOfMatches >= threshold) {
        // the new range is not selective enough to be worth materializing, it doesn't reduce number
        // of docs at least 8x
        return;
      }
      pointValues.intersect(visitor);
      competitiveIterator = result.build().iterator();
      iteratorCost = competitiveIterator.cost();
    }

    /**
     * Returns an iterator over the documents that may still be competitive, or {@code null} when
     * skipping is not enabled for this segment.
     *
     * <p>The returned iterator forwards every call to whatever iterator is current at that moment,
     * so replacements made by later updates are picked up by an iterator handed out earlier.
     */
    @Override
    public DocIdSetIterator competitiveIterator() {
      if (enableSkipping == false) {
        return null;
      }
      return new DocIdSetIterator() {
        private int docID = competitiveIterator.docID();

        @Override
        public int nextDoc() throws IOException {
          return advance(docID + 1);
        }

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public long cost() {
          return competitiveIterator.cost();
        }

        @Override
        public int advance(int target) throws IOException {
          return docID = competitiveIterator.advance(target);
        }
      };
    }

    /**
     * Tells whether documents without a value could still be competitive; when this returns
     * {@code true} and some documents have no points, the competitive iterator is left unchanged.
     */
    protected abstract boolean isMissingValueCompetitive();

    /** Writes the current bottom value into {@code packedValue} in the points' byte encoding. */
    protected abstract void encodeBottom(byte[] packedValue);

    /** Writes the current top value into {@code packedValue} in the points' byte encoding. */
    protected abstract void encodeTop(byte[] packedValue);
  }
}