diff --git a/src/main/java/it/cavallium/dbengine/lucene/IntSmear.java b/src/main/java/it/cavallium/dbengine/lucene/IntSmear.java
new file mode 100644
index 0000000..b3b770e
--- /dev/null
+++ b/src/main/java/it/cavallium/dbengine/lucene/IntSmear.java
@@ -0,0 +1,32 @@
+package it.cavallium.dbengine.lucene;
+
+import it.unimi.dsi.fastutil.ints.IntHash;
+
+/**
+ * Hash strategy for int keys that smears the key bits with {@link #smear(int)} and compares keys by value.
+ */
+public class IntSmear implements IntHash.Strategy {
+
+	@Override
+	public int hashCode(int e) {
+		return smear(e);
+	}
+
+	/*
+	 * This method was written by Doug Lea with assistance from members of JCP
+	 * JSR-166 Expert Group and released to the public domain, as explained at
+	 * http://creativecommons.org/licenses/publicdomain
+	 *
+	 * As of 2010/06/11, this method is identical to the (package private) hash
+	 * method in OpenJDK 7's java.util.HashMap class.
+	 */
+	static int smear(int hashCode) {
+		hashCode ^= (hashCode >>> 20) ^ (hashCode >>> 12);
+		return hashCode ^ (hashCode >>> 7) ^ (hashCode >>> 4);
+	}
+
+	@Override
+	public boolean equals(int a, int b) {
+		return a == b;
+	}
+}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/DecimalBucketMultiCollectorManager.java b/src/main/java/it/cavallium/dbengine/lucene/collector/DecimalBucketMultiCollectorManager.java
index 391c8af..a97e90f 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/collector/DecimalBucketMultiCollectorManager.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/collector/DecimalBucketMultiCollectorManager.java
@@ -1,18 +1,20 @@
 package it.cavallium.dbengine.lucene.collector;
 
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
 import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collection;
 import java.util.List;
-import java.util.Set;
 import org.apache.commons.lang3.NotImplementedException;
-import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.facet.FacetResult;
 import org.apache.lucene.facet.Facets;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsCollectorManager;
+import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.LabelAndValue;
 import org.apache.lucene.facet.RandomSamplingFacetsCollector;
 import org.apache.lucene.facet.range.DoubleRange;
@@ -20,29 +22,30 @@ import org.apache.lucene.facet.range.DoubleRangeFacetCounts;
 import org.apache.lucene.facet.range.LongRange;
 import org.apache.lucene.facet.range.LongRangeFacetCounts;
 import org.apache.lucene.facet.range.Range;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.CachingCollector;
 import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.CollectorManager;
 import org.apache.lucene.search.DoubleValuesSource;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.LeafCollector;
 import org.apache.lucene.search.LongValuesSource;
-import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorable;
 import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.SimpleCollector;
 import org.jetbrains.annotations.Nullable;
 
 public class DecimalBucketMultiCollectorManager implements CollectorMultiManager {
 
-	private final FacetsCollectorManager facetsCollectorManager;
+	private static final boolean USE_SINGLE_FACET_COLLECTOR = false;
+	private static final boolean AMORTIZE = true;
+	private final boolean randomSamplingEnabled;
+	private final FastFacetsCollectorManager facetsCollectorManager;
+	private final int effectiveCollectionRate; // collectionRate, or 1 when collectionRate is null
 	private final Range[] bucketRanges;
 
 	private final List<Query> queries;
 	private final @Nullable Query normalizationQuery;
+	private final @Nullable Integer collectionRate;
 	private final @Nullable Integer sampleSize;
 
 	private final String bucketField;
@@ -64,6 +67,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 			BucketValueSource bucketValueSource,
 			List<Query> queries,
 			@Nullable Query normalizationQuery,
+			@Nullable Integer collectionRate,
 			@Nullable Integer sampleSize) {
 		this.queries = queries;
 		this.normalizationQuery = normalizationQuery;
@@ -75,6 +79,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 		this.totalLength = bucketLength * bucketsInt;
 		this.bucketField = bucketField;
 		this.bucketValueSource = bucketValueSource;
+		this.collectionRate = collectionRate;
 		this.sampleSize = sampleSize;
 
 		if (USE_LONGS) {
@@ -102,26 +107,15 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 			}
 		}
 
-		this.facetsCollectorManager = new FacetsCollectorManager() {
-			@Override
-			public FacetsCollector newCollector() {
-				if (sampleSize != null) {
-					return new RandomSamplingFacetsCollector(sampleSize) {
-						@Override
-						public ScoreMode scoreMode() {
-							return ScoreMode.COMPLETE_NO_SCORES;
-						}
-					};
-				} else {
-					return new FacetsCollector(false) {
-						@Override
-						public ScoreMode scoreMode() {
-							return ScoreMode.COMPLETE_NO_SCORES;
-						}
-					};
-				}
-			}
-		};
+		this.randomSamplingEnabled = sampleSize != null;
+		this.effectiveCollectionRate = collectionRate == null ? 1 : collectionRate;
+		if (randomSamplingEnabled) {
+			// Sampling collectors keep the hits of every search they are passed to, so search()
+			// creates a fresh one per query instead of sharing a single instance held here.
+			this.facetsCollectorManager = null;
+		} else {
+			this.facetsCollectorManager = new FastFacetsCollectorManager(effectiveCollectionRate);
+		}
 	}
 
 	public double[] newBuckets() {
@@ -129,18 +123,28 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 	}
 
 	public Buckets search(IndexSearcher indexSearcher) throws IOException {
-		Query globalQuery;
-		if (normalizationQuery != null) {
-			globalQuery = normalizationQuery;
+		Query query;
+		if (USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null) {
+			query = normalizationQuery;
+		} else if (queries.size() == 0) {
+			query = new MatchNoDocsQuery();
+		} else if (queries.size() == 1) {
+			query = queries.get(0);
 		} else {
 			var booleanQueryBuilder = new BooleanQuery.Builder();
-			for (Query query : queries) {
-				booleanQueryBuilder.add(query, Occur.SHOULD);
+			for (Query queryEntry : queries) {
+				booleanQueryBuilder.add(queryEntry, Occur.SHOULD);
 			}
 			booleanQueryBuilder.setMinimumNumberShouldMatch(1);
-			globalQuery = booleanQueryBuilder.build();
+			query = booleanQueryBuilder.build();
+		}
+		it.cavallium.dbengine.lucene.collector.FacetsCollector queryFacetsCollector;
+		if (randomSamplingEnabled) {
+			queryFacetsCollector = new FastRandomSamplingFacetsCollector(effectiveCollectionRate, sampleSize, 0);
+			indexSearcher.search(query, queryFacetsCollector);
+		} else {
+			queryFacetsCollector = indexSearcher.search(query, facetsCollectorManager);
 		}
-		var facetsCollector = indexSearcher.search(globalQuery, facetsCollectorManager);
 		double[] reducedNormalizationBuckets = newBuckets();
 		List<DoubleArrayList> seriesReducedBuckets = new ArrayList<>(queries.size());
 		for (int i = 0; i < queries.size(); i++) {
@@ -148,7 +152,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 			seriesReducedBuckets.add(DoubleArrayList.wrap(buckets));
 		}
 		int serieIndex = 0;
-		for (Query query : queries) {
+		for (Query queryEntry : queries) {
 			var reducedBuckets = seriesReducedBuckets.get(serieIndex);
 			Facets facets;
 			if (USE_LONGS) {
@@ -165,8 +169,8 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 				}
 				facets = new LongRangeFacetCounts(bucketField,
 						valuesSource,
-						facetsCollector,
-						query,
+						queryFacetsCollector.getLuceneFacetsCollector(),
+						USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null || queries.size() > 1 ? queryEntry : null,
 						(LongRange[]) bucketRanges
 				);
 			} else {
@@ -182,12 +186,19 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 				}
 				facets = new DoubleRangeFacetCounts(bucketField,
 						valuesSource,
-						facetsCollector,
-						query,
+						queryFacetsCollector.getLuceneFacetsCollector(),
+						USE_SINGLE_FACET_COLLECTOR && normalizationQuery != null || queries.size() > 1 ? queryEntry : null,
 						(DoubleRange[]) bucketRanges
 				);
 			}
 			FacetResult children = facets.getTopChildren(0, bucketField);
+			if (AMORTIZE && randomSamplingEnabled) { // NOTE(review): the value returned by amortizeFacetCounts is not used below - confirm this is intended
+				var cfg = new FacetsConfig();
+				for (Range bucketRange : bucketRanges) {
+					cfg.setIndexFieldName(bucketRange.label, bucketField);
+				}
+				((RandomSamplingFacetsCollector) queryFacetsCollector.getLuceneFacetsCollector()).amortizeFacetCounts(children, cfg, indexSearcher);
+			}
 			for (LabelAndValue labelAndValue : children.labelValues) {
 				var index = Integer.parseInt(labelAndValue.label);
 				reducedBuckets.set(index, reducedBuckets.getDouble(index) + labelAndValue.value.doubleValue());
@@ -195,8 +206,17 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 			serieIndex++;
 		}
 
+		it.cavallium.dbengine.lucene.collector.FacetsCollector normalizationFacetsCollector;
 		Facets normalizationFacets;
 		if (normalizationQuery != null) {
+			if (USE_SINGLE_FACET_COLLECTOR) {
+				normalizationFacetsCollector = queryFacetsCollector;
+			} else if (randomSamplingEnabled) {
+				normalizationFacetsCollector = new FastRandomSamplingFacetsCollector(effectiveCollectionRate, sampleSize, 0);
+				indexSearcher.search(normalizationQuery, normalizationFacetsCollector);
+			} else {
+				normalizationFacetsCollector = indexSearcher.search(normalizationQuery, facetsCollectorManager);
+			}
 			if (USE_LONGS) {
 				LongValuesSource valuesSource;
 				if (bucketValueSource instanceof NullValueSource) {
@@ -210,7 +230,7 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 				}
 				normalizationFacets = new LongRangeFacetCounts(bucketField,
 						valuesSource,
-						facetsCollector,
+						normalizationFacetsCollector.getLuceneFacetsCollector(),
 						null,
 						(LongRange[]) bucketRanges
 				);
@@ -227,12 +247,19 @@ public class DecimalBucketMultiCollectorManager implements CollectorMultiManager
 				}
 				normalizationFacets = new DoubleRangeFacetCounts(bucketField,
 						valuesSource,
-						facetsCollector,
+						normalizationFacetsCollector.getLuceneFacetsCollector(),
 						null,
 						(DoubleRange[]) bucketRanges
 				);
 			}
 			var normalizationChildren = normalizationFacets.getTopChildren(0, bucketField);
+			if (AMORTIZE && randomSamplingEnabled) { // NOTE(review): the value returned by amortizeFacetCounts is not used below - confirm this is intended
+				var cfg = new FacetsConfig();
+				for (Range bucketRange : bucketRanges) {
+					cfg.setIndexFieldName(bucketRange.label, bucketField);
+				}
+				((RandomSamplingFacetsCollector) normalizationFacetsCollector.getLuceneFacetsCollector()).amortizeFacetCounts(normalizationChildren, cfg, indexSearcher);
+			}
 			for (LabelAndValue labelAndValue : normalizationChildren.labelValues) {
 				var index = Integer.parseInt(labelAndValue.label);
 				reducedNormalizationBuckets[index] += labelAndValue.value.doubleValue();
diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/FacetsCollector.java b/src/main/java/it/cavallium/dbengine/lucene/collector/FacetsCollector.java
new file mode 100644
index 0000000..2888e22
--- /dev/null
+++ b/src/main/java/it/cavallium/dbengine/lucene/collector/FacetsCollector.java
@@ -0,0 +1,44 @@
+package it.cavallium.dbengine.lucene.collector;
+
+import java.io.IOException;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.ScoreMode;
+
+/**
+ * A {@link Collector} that also exposes the Lucene facets collector it is backed by.
+ */
+public interface FacetsCollector extends Collector {
+
+	/**
+	 * Adapts a Lucene facets collector to this interface by delegating every call to it.
+	 *
+	 * @param facetsCollector the collector to delegate to
+	 * @return a view whose {@link #getLuceneFacetsCollector()} returns {@code facetsCollector}
+	 */
+	static FacetsCollector wrap(org.apache.lucene.facet.FacetsCollector facetsCollector) {
+		return new FacetsCollector() {
+
+			@Override
+			public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
+				return facetsCollector;
+			}
+
+			@Override
+			public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+				return facetsCollector.getLeafCollector(context);
+			}
+
+			@Override
+			public ScoreMode scoreMode() {
+				return facetsCollector.scoreMode();
+			}
+		};
+	}
+
+	/**
+	 * @return the underlying Lucene facets collector
+	 */
+	org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector();
+}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/FastFacetsCollectorManager.java b/src/main/java/it/cavallium/dbengine/lucene/collector/FastFacetsCollectorManager.java
new file mode 100644
index 0000000..ac92d6c
--- /dev/null
+++ b/src/main/java/it/cavallium/dbengine/lucene/collector/FastFacetsCollectorManager.java
@@ -0,0 +1,104 @@
+package it.cavallium.dbengine.lucene.collector;
+
+import it.cavallium.dbengine.lucene.IntSmear;
+import it.unimi.dsi.fastutil.ints.IntHash;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.lucene.facet.FacetsCollectorManager;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectorManager;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.search.ScoreMode;
+
+/**
+ * Collector manager whose collectors forward to a Lucene facets collector only the documents whose
+ * smeared id is a multiple of the collection rate.
+ */
+public class FastFacetsCollectorManager implements CollectorManager<FacetsCollector, FacetsCollector> {
+
+	private final int collectionRate;
+	private final IntHash.Strategy hash;
+	private final FacetsCollectorManager facetsCollectorManager;
+
+	/**
+	 * @param collectionRate collect 1 document every n collectable documents, must be at least 1
+	 * @throws IllegalArgumentException if {@code collectionRate} is lower than 1
+	 */
+	public FastFacetsCollectorManager(int collectionRate) {
+		if (collectionRate < 1) {
+			// A rate of 0 would make every collect() call fail with a division by zero
+			throw new IllegalArgumentException("collectionRate must be >= 1, got " + collectionRate);
+		}
+		this.collectionRate = collectionRate;
+		this.hash = new IntSmear();
+		this.facetsCollectorManager = new FacetsCollectorManager();
+	}
+
+	@Override
+	public FacetsCollector newCollector() {
+		return new FastFacetsCollector(collectionRate, hash);
+	}
+
+	@Override
+	public FacetsCollector reduce(Collection<FacetsCollector> collectors) throws IOException {
+		return FacetsCollector.wrap(facetsCollectorManager.reduce(collectors
+				.stream()
+				.map(FacetsCollector::getLuceneFacetsCollector)
+				.toList()));
+	}
+
+	private static class FastFacetsCollector implements FacetsCollector {
+
+		private final org.apache.lucene.facet.FacetsCollector collector;
+		private final int collectionRate;
+		private final IntHash.Strategy hash;
+
+		public FastFacetsCollector(int collectionRate, IntHash.Strategy hash) {
+			this.collectionRate = collectionRate;
+			this.hash = hash;
+			this.collector = new org.apache.lucene.facet.FacetsCollector(false) {
+				@Override
+				public ScoreMode scoreMode() {
+					return ScoreMode.COMPLETE_NO_SCORES;
+				}
+			};
+		}
+
+		@Override
+		public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
+			return collector;
+		}
+
+		@Override
+		public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+			var leafCollector = collector.getLeafCollector(context);
+			return new LeafCollector() {
+				@Override
+				public void setScorer(Scorable scorer) throws IOException {
+					leafCollector.setScorer(scorer);
+				}
+
+				@Override
+				public void collect(int doc) throws IOException {
+					// Only documents whose smeared id is a multiple of the rate reach the facets collector
+					if (hash.hashCode(doc) % collectionRate == 0) {
+						leafCollector.collect(doc);
+					}
+				}
+
+				@Override
+				public DocIdSetIterator competitiveIterator() throws IOException {
+					return leafCollector.competitiveIterator();
+				}
+			};
+		}
+
+		@Override
+		public ScoreMode scoreMode() {
+			return collector.scoreMode();
+		}
+	}
+}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/collector/FastRandomSamplingFacetsCollector.java b/src/main/java/it/cavallium/dbengine/lucene/collector/FastRandomSamplingFacetsCollector.java
new file mode 100644
index 0000000..45b4c53
--- /dev/null
+++ b/src/main/java/it/cavallium/dbengine/lucene/collector/FastRandomSamplingFacetsCollector.java
@@ -0,0 +1,82 @@
+package it.cavallium.dbengine.lucene.collector;
+
+import it.cavallium.dbengine.lucene.IntSmear;
+import it.unimi.dsi.fastutil.ints.IntHash;
+import java.io.IOException;
+import org.apache.lucene.facet.RandomSamplingFacetsCollector;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.SimpleCollector;
+
+/**
+ * Wraps a {@link RandomSamplingFacetsCollector} and forwards to it only the documents whose smeared id
+ * is a multiple of the collection rate.
+ * <p>NOTE(review): the wrapped collector appears to keep the hits of every search it is used in, so use a new instance per search.
+ */
+public class FastRandomSamplingFacetsCollector extends SimpleCollector implements FacetsCollector {
+
+	private final RandomSamplingFacetsCollector collector;
+	private final int collectionRate;
+	private final IntHash.Strategy hash;
+
+	/**
+	 * Creates a collector that uses 0 as sampling seed.
+	 *
+	 * @param collectionRate collect 1 document every n collectable documents
+	 * @param sampleSize sample size given to the wrapped {@link RandomSamplingFacetsCollector}
+	 */
+	public FastRandomSamplingFacetsCollector(int collectionRate, int sampleSize) {
+		this(collectionRate, sampleSize, 0);
+	}
+
+	/**
+	 * @param collectionRate collect 1 document every n collectable documents, must be at least 1
+	 * @param sampleSize sample size given to the wrapped {@link RandomSamplingFacetsCollector}
+	 * @param seed seed given to the wrapped {@link RandomSamplingFacetsCollector}
+	 * @throws IllegalArgumentException if {@code collectionRate} is lower than 1
+	 */
+	public FastRandomSamplingFacetsCollector(int collectionRate, int sampleSize, long seed) {
+		if (collectionRate < 1) {
+			throw new IllegalArgumentException("collectionRate must be >= 1, got " + collectionRate);
+		}
+		this.collectionRate = collectionRate;
+		this.hash = new IntSmear();
+		this.collector = new RandomSamplingFacetsCollector(sampleSize, seed) {
+			@Override
+			public ScoreMode scoreMode() {
+				return ScoreMode.COMPLETE_NO_SCORES;
+			}
+		};
+	}
+
+	@Override
+	protected void doSetNextReader(LeafReaderContext context) throws IOException {
+		// The returned leaf collector is ignored; presumably the call only moves the wrapped collector to this segment
+		collector.getLeafCollector(context);
+	}
+
+	@Override
+	public void setScorer(Scorable scorer) throws IOException {
+		collector.setScorer(scorer);
+	}
+
+	@Override
+	public void collect(int doc) throws IOException {
+		if (hash.hashCode(doc) % collectionRate == 0) {
+			collector.collect(doc);
+		}
+	}
+
+	@Override
+	public ScoreMode scoreMode() {
+		return collector.scoreMode();
+	}
+
+	@Override
+	public org.apache.lucene.facet.FacetsCollector getLuceneFacetsCollector() {
+		return collector;
+	}
+}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/BucketParams.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/BucketParams.java
index bc3d217..717410e 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/BucketParams.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/BucketParams.java
@@ -5,4 +5,5 @@ import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 
 public record BucketParams(double min, double max, int buckets, String bucketFieldName,
-		@NotNull BucketValueSource valueSource, @Nullable Integer sampleSize) {}
+		@NotNull BucketValueSource valueSource, @Nullable Integer collectionRate,
+		@Nullable Integer sampleSize) {}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/DecimalBucketMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/DecimalBucketMultiSearcher.java
index e516290..5309fd3 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/DecimalBucketMultiSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/DecimalBucketMultiSearcher.java
@@ -46,6 +46,7 @@ public class DecimalBucketMultiSearcher {
 								bucketParams.valueSource(),
 								queries,
 								normalizationQuery,
+								bucketParams.collectionRate(),
 								bucketParams.sampleSize()
 						);
 					})