Facets optimizations

This commit is contained in:
Andrea Cavalli 2022-01-18 14:16:32 +01:00
parent b2216c1b2c
commit 6baa05de51
7 changed files with 291 additions and 45 deletions

View File

@ -0,0 +1,29 @@
package it.cavallium.dbengine.lucene;
import it.unimi.dsi.fastutil.ints.IntHash;
/**
 * {@link IntHash.Strategy} for primitive {@code int} keys: the hash of a key is the key itself
 * passed through {@link #smear(int)}, and two keys are equal when their values are equal.
 */
public class IntSmear implements IntHash.Strategy {

	/*
	 * This method was written by Doug Lea with assistance from members of JCP
	 * JSR-166 Expert Group and released to the public domain, as explained at
	 * http://creativecommons.org/licenses/publicdomain
	 *
	 * As of 2010/06/11, this method is identical to the (package private) hash
	 * method in OpenJDK 7's java.util.HashMap class.
	 */
	static int smear(int hashCode) {
		// First pass: fold the bits shifted by 20 and by 12 into the value.
		int firstPass = hashCode ^ (hashCode >>> 20) ^ (hashCode >>> 12);
		// Second pass: fold the bits of the first pass shifted by 7 and by 4.
		return firstPass ^ (firstPass >>> 7) ^ (firstPass >>> 4);
	}

	/**
	 * @param e the key to hash
	 * @return the smeared value of {@code e}
	 */
	@Override
	public int hashCode(int e) {
		return smear(e);
	}

	/**
	 * @param a first key
	 * @param b second key
	 * @return {@code true} when both keys hold the same value
	 */
	@Override
	public boolean equals(int a, int b) {
		return a == b;
	}
}

View File

@ -1,18 +1,20 @@
package it.cavallium.dbengine.lucene.collector; package it.cavallium.dbengine.lucene.collector;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList; import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.lang3.NotImplementedException;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.RandomSamplingFacetsCollector; import org.apache.lucene.facet.RandomSamplingFacetsCollector;
import org.apache.lucene.facet.range.DoubleRange; import org.apache.lucene.facet.range.DoubleRange;
@ -20,29 +22,30 @@ import org.apache.lucene.facet.range.DoubleRangeFacetCounts;
import org.apache.lucene.facet.range.LongRange; import org.apache.lucene.facet.range.LongRange;
import org.apache.lucene.facet.range.LongRangeFacetCounts; import org.apache.lucene.facet.range.LongRangeFacetCounts;
import org.apache.lucene.facet.range.Range; import org.apache.lucene.facet.range.Range;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector; import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.LongValuesSource; import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
public class DecimalBucketMultiCollectorManager implements CollectorMultiManager<Buckets, Buckets> { public class DecimalBucketMultiCollectorManager implements CollectorMultiManager<Buckets, Buckets> {
private final FacetsCollectorManager facetsCollectorManager; private static final boolean USE_SINGLE_FACET_COLLECTOR = false;
private static final boolean AMORTIZE = true;
private final boolean randomSamplingEnabled;
private final FastFacetsCollectorManager facetsCollectorManager;
private final FastRandomSamplingFacetsCollector randomSamplingFacetsCollector;
private final Range[] bucketRanges; private final Range[] bucketRanges;
private final List<Query> queries; private final List<Query> queries;
private final @Nullable Query normalizationQuery; private final @Nullable Query normalizationQuery;
private final @Nullable Integer collectionRate;
private final @Nullable Integer sampleSize; private final @Nullable Integer sampleSize;
private final String bucketField; private final String bucketField;
@ -64,6 +67,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
BucketValueSource bucketValueSource, BucketValueSource bucketValueSource,
List<Query> queries, List<Query> queries,
@Nullable Query normalizationQuery, @Nullable Query normalizationQuery,
@Nullable Integer collectionRate,
@Nullable Integer sampleSize) { @Nullable Integer sampleSize) {
this.queries = queries; this.queries = queries;
this.normalizationQuery = normalizationQuery; this.normalizationQuery = normalizationQuery;
@ -75,6 +79,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
this.totalLength = bucketLength * bucketsInt; this.totalLength = bucketLength * bucketsInt;
this.bucketField = bucketField; this.bucketField = bucketField;
this.bucketValueSource = bucketValueSource; this.bucketValueSource = bucketValueSource;
this.collectionRate = collectionRate;
this.sampleSize = sampleSize; this.sampleSize = sampleSize;
if (USE_LONGS) { if (USE_LONGS) {
@ -102,26 +107,15 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
} }
this.facetsCollectorManager = new FacetsCollectorManager() { this.randomSamplingEnabled = sampleSize != null;
@Override int intCollectionRate = this.collectionRate == null ? 1 : this.collectionRate;
public FacetsCollector newCollector() { if (randomSamplingEnabled) {
if (sampleSize != null) { randomSamplingFacetsCollector = new FastRandomSamplingFacetsCollector(intCollectionRate, sampleSize, 0);
return new RandomSamplingFacetsCollector(sampleSize) { this.facetsCollectorManager = null;
@Override } else {
public ScoreMode scoreMode() { this.randomSamplingFacetsCollector = null;
return ScoreMode.COMPLETE_NO_SCORES; this.facetsCollectorManager = new FastFacetsCollectorManager(intCollectionRate);
} }
};
} else {
return new FacetsCollector(false) {
@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}
};
}
}
};
} }
public double[] newBuckets() { public double[] newBuckets() {
@ -129,18 +123,28 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
public Buckets search(IndexSearcher indexSearcher) throws IOException { public Buckets search(IndexSearcher indexSearcher) throws IOException {
Query globalQuery; Query query;
if (normalizationQuery != null) { if (USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null) {
globalQuery = normalizationQuery; query = normalizationQuery;
} else if (queries.size() == 0) {
query = new MatchNoDocsQuery();
} else if (queries.size() == 1) {
query = queries.get(0);
} else { } else {
var booleanQueryBuilder = new BooleanQuery.Builder(); var booleanQueryBuilder = new BooleanQuery.Builder();
for (Query query : queries) { for (Query queryEntry : queries) {
booleanQueryBuilder.add(query, Occur.SHOULD); booleanQueryBuilder.add(queryEntry, Occur.SHOULD);
} }
booleanQueryBuilder.setMinimumNumberShouldMatch(1); booleanQueryBuilder.setMinimumNumberShouldMatch(1);
globalQuery = booleanQueryBuilder.build(); query = booleanQueryBuilder.build();
}
it.cavallium.dbengine.lucene.collector.FacetsCollector queryFacetsCollector;
if (randomSamplingEnabled) {
indexSearcher.search(query, randomSamplingFacetsCollector);
queryFacetsCollector = randomSamplingFacetsCollector;
} else {
queryFacetsCollector = indexSearcher.search(query, facetsCollectorManager);
} }
var facetsCollector = indexSearcher.search(globalQuery, facetsCollectorManager);
double[] reducedNormalizationBuckets = newBuckets(); double[] reducedNormalizationBuckets = newBuckets();
List<DoubleArrayList> seriesReducedBuckets = new ArrayList<>(queries.size()); List<DoubleArrayList> seriesReducedBuckets = new ArrayList<>(queries.size());
for (int i = 0; i < queries.size(); i++) { for (int i = 0; i < queries.size(); i++) {
@ -148,7 +152,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
seriesReducedBuckets.add(DoubleArrayList.wrap(buckets)); seriesReducedBuckets.add(DoubleArrayList.wrap(buckets));
} }
int serieIndex = 0; int serieIndex = 0;
for (Query query : queries) { for (Query queryEntry : queries) {
var reducedBuckets = seriesReducedBuckets.get(serieIndex); var reducedBuckets = seriesReducedBuckets.get(serieIndex);
Facets facets; Facets facets;
if (USE_LONGS) { if (USE_LONGS) {
@ -165,8 +169,8 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
facets = new LongRangeFacetCounts(bucketField, facets = new LongRangeFacetCounts(bucketField,
valuesSource, valuesSource,
facetsCollector, queryFacetsCollector.getLuceneFacetsCollector(),
query, USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null || queries.size() > 1 ? queryEntry : null,
(LongRange[]) bucketRanges (LongRange[]) bucketRanges
); );
} else { } else {
@ -182,12 +186,19 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
facets = new DoubleRangeFacetCounts(bucketField, facets = new DoubleRangeFacetCounts(bucketField,
valuesSource, valuesSource,
facetsCollector, queryFacetsCollector.getLuceneFacetsCollector(),
query, USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null || queries.size() > 1 ? queryEntry : null,
(DoubleRange[]) bucketRanges (DoubleRange[]) bucketRanges
); );
} }
FacetResult children = facets.getTopChildren(0, bucketField); FacetResult children = facets.getTopChildren(0, bucketField);
if (AMORTIZE && randomSamplingEnabled) {
var cfg = new FacetsConfig();
for (Range bucketRange : bucketRanges) {
cfg.setIndexFieldName(bucketRange.label, bucketField);
}
((RandomSamplingFacetsCollector) queryFacetsCollector.getLuceneFacetsCollector()).amortizeFacetCounts(children, cfg, indexSearcher);
}
for (LabelAndValue labelAndValue : children.labelValues) { for (LabelAndValue labelAndValue : children.labelValues) {
var index = Integer.parseInt(labelAndValue.label); var index = Integer.parseInt(labelAndValue.label);
reducedBuckets.set(index, reducedBuckets.getDouble(index) + labelAndValue.value.doubleValue()); reducedBuckets.set(index, reducedBuckets.getDouble(index) + labelAndValue.value.doubleValue());
@ -195,8 +206,17 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
serieIndex++; serieIndex++;
} }
it.cavallium.dbengine.lucene.collector.FacetsCollector normalizationFacetsCollector;
Facets normalizationFacets; Facets normalizationFacets;
if (normalizationQuery != null) { if (normalizationQuery != null) {
if (USE_SINGLE_FACET_COLLECTOR) {
normalizationFacetsCollector = queryFacetsCollector;
} else if (randomSamplingEnabled) {
indexSearcher.search(normalizationQuery, randomSamplingFacetsCollector);
normalizationFacetsCollector = randomSamplingFacetsCollector;
} else {
normalizationFacetsCollector = indexSearcher.search(normalizationQuery, facetsCollectorManager);
}
if (USE_LONGS) { if (USE_LONGS) {
LongValuesSource valuesSource; LongValuesSource valuesSource;
if (bucketValueSource instanceof NullValueSource) { if (bucketValueSource instanceof NullValueSource) {
@ -210,7 +230,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
normalizationFacets = new LongRangeFacetCounts(bucketField, normalizationFacets = new LongRangeFacetCounts(bucketField,
valuesSource, valuesSource,
facetsCollector, normalizationFacetsCollector.getLuceneFacetsCollector(),
null, null,
(LongRange[]) bucketRanges (LongRange[]) bucketRanges
); );
@ -227,12 +247,19 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
} }
normalizationFacets = new DoubleRangeFacetCounts(bucketField, normalizationFacets = new DoubleRangeFacetCounts(bucketField,
valuesSource, valuesSource,
facetsCollector, normalizationFacetsCollector.getLuceneFacetsCollector(),
null, null,
(DoubleRange[]) bucketRanges (DoubleRange[]) bucketRanges
); );
} }
var normalizationChildren = normalizationFacets.getTopChildren(0, bucketField); var normalizationChildren = normalizationFacets.getTopChildren(0, bucketField);
if (AMORTIZE && randomSamplingEnabled) {
var cfg = new FacetsConfig();
for (Range bucketRange : bucketRanges) {
cfg.setIndexFieldName(bucketRange.label, bucketField);
}
((RandomSamplingFacetsCollector) normalizationFacetsCollector.getLuceneFacetsCollector()).amortizeFacetCounts(normalizationChildren, cfg, indexSearcher);
}
for (LabelAndValue labelAndValue : normalizationChildren.labelValues) { for (LabelAndValue labelAndValue : normalizationChildren.labelValues) {
var index = Integer.parseInt(labelAndValue.label); var index = Integer.parseInt(labelAndValue.label);
reducedNormalizationBuckets[index] += labelAndValue.value.doubleValue(); reducedNormalizationBuckets[index] += labelAndValue.value.doubleValue();

View File

@ -0,0 +1,32 @@
package it.cavallium.dbengine.lucene.collector;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.ScoreMode;
/**
 * A Lucene {@link Collector} that additionally exposes the
 * {@link org.apache.lucene.facet.FacetsCollector} it is backed by.
 */
public interface FacetsCollector extends Collector {

	/**
	 * @return the Lucene facets collector backing this collector
	 */
	org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector();

	/**
	 * Adapts a Lucene facets collector to this interface. Leaf collectors and the score mode are
	 * taken straight from the given collector, and {@link #getLuceneFacetsCollector()} returns it.
	 *
	 * @param facetsCollector the Lucene facets collector to expose
	 * @return an adapter delegating every call to {@code facetsCollector}
	 */
	static FacetsCollector wrap(org.apache.lucene.facet.FacetsCollector facetsCollector) {
		return new FacetsCollector() {

			@Override
			public ScoreMode scoreMode() {
				return facetsCollector.scoreMode();
			}

			@Override
			public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
				return facetsCollector.getLeafCollector(context);
			}

			@Override
			public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
				return facetsCollector;
			}
		};
	}
}

View File

@ -0,0 +1,92 @@
package it.cavallium.dbengine.lucene.collector;
import it.cavallium.dbengine.lucene.IntSmear;
import it.unimi.dsi.fastutil.ints.IntHash;
import java.io.IOException;
import java.util.Collection;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
/**
 * {@link CollectorManager} producing facets collectors that forward only a subset of the matching
 * documents to a Lucene facets collector: a document is kept when its smeared id
 * (see {@link IntSmear}) is a multiple of the collection rate.
 */
public class FastFacetsCollectorManager implements CollectorManager<FacetsCollector, FacetsCollector> {

	private final int collectionRate;
	private final IntHash.Strategy hash;
	private final FacetsCollectorManager facetsCollectorManager;

	/**
	 * @param collectionRate divisor applied to the smeared document id; a document is collected when
	 *                       the remainder is zero. Must be at least 1 (1 keeps every document)
	 * @throws IllegalArgumentException if {@code collectionRate} is lower than 1
	 */
	public FastFacetsCollectorManager(int collectionRate) {
		// The rate is used as a divisor: zero would only fail later, in the middle of a search,
		// with an ArithmeticException.
		if (collectionRate < 1) {
			throw new IllegalArgumentException("collectionRate must be at least 1, got " + collectionRate);
		}
		this.collectionRate = collectionRate;
		this.hash = new IntSmear();
		this.facetsCollectorManager = new FacetsCollectorManager();
	}

	/**
	 * @return a new filtering collector using this manager's collection rate and hash strategy
	 */
	@Override
	public FacetsCollector newCollector() {
		return new FastFacetsCollector(collectionRate, hash);
	}

	/**
	 * Unwraps the Lucene facets collector of every given collector, reduces them with Lucene's
	 * {@link FacetsCollectorManager} and wraps the result again.
	 *
	 * @param collectors the collectors created by {@link #newCollector()}
	 * @return the reduced collector
	 * @throws IOException if the underlying reduction fails
	 */
	@Override
	public FacetsCollector reduce(Collection<FacetsCollector> collectors) throws IOException {
		return FacetsCollector.wrap(facetsCollectorManager.reduce(collectors
				.stream()
				.map(FacetsCollector::getLuceneFacetsCollector)
				.toList()));
	}

	/**
	 * Collector that owns a Lucene facets collector and filters the documents handed to it.
	 */
	private static class FastFacetsCollector implements FacetsCollector {

		private final org.apache.lucene.facet.FacetsCollector collector;
		private final int collectionRate;
		private final IntHash.Strategy hash;

		public FastFacetsCollector(int collectionRate, IntHash.Strategy hash) {
			this.collectionRate = collectionRate;
			this.hash = hash;
			// Facets collector created with keepScores = false and reporting that scores are not needed.
			this.collector = new org.apache.lucene.facet.FacetsCollector(false) {
				@Override
				public ScoreMode scoreMode() {
					return ScoreMode.COMPLETE_NO_SCORES;
				}
			};
		}

		@Override
		public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
			return collector;
		}

		@Override
		public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
			var leafCollector = collector.getLeafCollector(context);
			if (collectionRate == 1) {
				// Every value is a multiple of 1, so nothing would be filtered out:
				// skip the wrapper and the per-document hash.
				return leafCollector;
			}
			return new LeafCollector() {
				@Override
				public void setScorer(Scorable scorer) throws IOException {
					leafCollector.setScorer(scorer);
				}

				@Override
				public void collect(int doc) throws IOException {
					// Forward only the documents whose smeared id is a multiple of the rate.
					if (hash.hashCode(doc) % collectionRate == 0) {
						leafCollector.collect(doc);
					}
				}

				@Override
				public DocIdSetIterator competitiveIterator() throws IOException {
					return leafCollector.competitiveIterator();
				}
			};
		}

		@Override
		public ScoreMode scoreMode() {
			return collector.scoreMode();
		}
	}
}

View File

@ -0,0 +1,64 @@
package it.cavallium.dbengine.lucene.collector;
import it.cavallium.dbengine.lucene.IntSmear;
import it.unimi.dsi.fastutil.ints.IntHash;
import java.io.IOException;
import org.apache.lucene.facet.RandomSamplingFacetsCollector;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
/**
 * Collector that forwards only a subset of the matching documents to a Lucene
 * {@link RandomSamplingFacetsCollector}: a document is kept when its smeared id
 * (see {@link IntSmear}) is a multiple of the collection rate.
 */
public class FastRandomSamplingFacetsCollector extends SimpleCollector implements FacetsCollector {

	private final RandomSamplingFacetsCollector collector;
	private final int collectionRate;
	private final IntHash.Strategy hash;

	/**
	 * Same as {@link #FastRandomSamplingFacetsCollector(int, int, long)} with a seed of 0.
	 *
	 * @param collectionRate collect 1 document every n collectable documents; must be at least 1
	 * @param sampleSize sample size handed to the wrapped {@link RandomSamplingFacetsCollector}
	 * @throws IllegalArgumentException if {@code collectionRate} is lower than 1
	 */
	public FastRandomSamplingFacetsCollector(int collectionRate, int sampleSize) {
		this(collectionRate, sampleSize, 0);
	}

	/**
	 * @param collectionRate collect 1 document every n collectable documents; must be at least 1
	 * @param sampleSize sample size handed to the wrapped {@link RandomSamplingFacetsCollector}
	 * @param seed seed handed to the wrapped {@link RandomSamplingFacetsCollector}
	 * @throws IllegalArgumentException if {@code collectionRate} is lower than 1
	 */
	public FastRandomSamplingFacetsCollector(int collectionRate, int sampleSize, long seed) {
		// The rate is used as a divisor: zero would only fail later, in the middle of a search,
		// with an ArithmeticException.
		if (collectionRate < 1) {
			throw new IllegalArgumentException("collectionRate must be at least 1, got " + collectionRate);
		}
		this.collectionRate = collectionRate;
		this.hash = new IntSmear();
		this.collector = new RandomSamplingFacetsCollector(sampleSize, seed) {
			@Override
			public ScoreMode scoreMode() {
				return ScoreMode.COMPLETE_NO_SCORES;
			}
		};
	}

	@Override
	protected void doSetNextReader(LeafReaderContext context) throws IOException {
		// Called only for its effect on the wrapped collector; the returned leaf collector is
		// discarded. NOTE(review): presumably this is what moves the wrapped collector to the
		// new segment - confirm against the Lucene version in use.
		collector.getLeafCollector(context);
	}

	@Override
	public void setScorer(Scorable scorer) throws IOException {
		collector.setScorer(scorer);
	}

	@Override
	public void collect(int doc) throws IOException {
		// A rate of 1 keeps every document, so the hash is skipped in that case; otherwise only
		// the documents whose smeared id is a multiple of the rate are forwarded.
		if (collectionRate == 1 || hash.hashCode(doc) % collectionRate == 0) {
			collector.collect(doc);
		}
	}

	@Override
	public ScoreMode scoreMode() {
		return collector.scoreMode();
	}

	@Override
	public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
		return collector;
	}
}

View File

@ -5,4 +5,5 @@ import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
public record BucketParams(double min, double max, int buckets, String bucketFieldName, public record BucketParams(double min, double max, int buckets, String bucketFieldName,
@NotNull BucketValueSource valueSource, @Nullable Integer sampleSize) {} @NotNull BucketValueSource valueSource, @Nullable Integer collectionRate,
@Nullable Integer sampleSize) {}

View File

@ -46,6 +46,7 @@ public class DecimalBucketMultiSearcher {
bucketParams.valueSource(), bucketParams.valueSource(),
queries, queries,
normalizationQuery, normalizationQuery,
bucketParams.collectionRate(),
bucketParams.sampleSize() bucketParams.sampleSize()
); );
}) })