From 4d5f8b5b37d1fbccbf38439a1c6d52896de84f05 Mon Sep 17 00:00:00 2001 From: Andrea Cavalli Date: Fri, 8 Oct 2021 02:13:33 +0200 Subject: [PATCH] Implement MoreLikeThis transformer --- .../dbengine/lucene/LuceneUtils.java | 3 +- .../lucene/mlt/MultiMoreLikeThis.java | 12 +-- .../searcher/AdaptiveLuceneLocalSearcher.java | 1 + .../searcher/AdaptiveLuceneMultiSearcher.java | 3 +- .../searcher/CountLuceneLocalSearcher.java | 21 ++-- .../lucene/searcher/LLSearchTransformer.java | 3 +- .../searcher/SimpleLuceneLocalSearcher.java | 31 +++--- ...leUnsortedUnscoredLuceneMultiSearcher.java | 102 ++++++++++-------- ...UnscoredContinuousLuceneMultiSearcher.java | 4 + 9 files changed, 106 insertions(+), 74 deletions(-) diff --git a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java index c447de5..ad9f299 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java +++ b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java @@ -486,11 +486,12 @@ public class LuceneUtils { } public static Mono getMoreLikeThisQuery( - List indexSearchers, + LLIndexSearchers inputIndexSearchers, LocalQueryParams localQueryParams, Analyzer analyzer, Similarity similarity, Flux>> mltDocumentFieldsFlux) { + var indexSearchers = inputIndexSearchers.shards(); Query luceneAdditionalQuery; try { luceneAdditionalQuery = localQueryParams.query(); diff --git a/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java b/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java index 88504c7..4446ee7 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java +++ b/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java @@ -50,7 +50,7 @@ import org.apache.lucene.util.PriorityQueue; * Generate "more like this" similarity queries. Based on this mail: * *

- * Lucene does let you access the document frequency of terms, with BigCompositeReader.docFreq().
+ * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
  * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
  * is usually fast enough.  But looking up the docFreq() of every term in the document is
  * probably too slow.
@@ -84,7 +84,7 @@ import org.apache.lucene.util.PriorityQueue;
  * usage is as follows. The bold fragment is specific to this class. 
* *
- * BigCompositeReader ir = ...
+ * IndexReader ir = ...
  * IndexSearcher is = ...
  *
  * MoreLikeThis mlt = new MoreLikeThis(ir);
@@ -264,7 +264,7 @@ public final class MultiMoreLikeThis {
 	/** For idf() calculations. */
 	private TFIDFSimilarity similarity; // = new DefaultSimilarity();
 
-	/** BigCompositeReader to use */
+	/** IndexReader to use */
 	private final BigCompositeReader ir;
 
 	/** Boost factor to use when boosting the terms */
@@ -289,7 +289,7 @@ public final class MultiMoreLikeThis {
 		this.boostFactor = boostFactor;
 	}
 
-	/** Constructor requiring a BigCompositeReader. */
+	/** Constructor requiring an IndexReader. */
 	public MultiMoreLikeThis(BigCompositeReader ir) {
 		this(ir, new ClassicSimilarity());
 	}
@@ -401,7 +401,7 @@ public final class MultiMoreLikeThis {
 	 *     be still considered relevant.
 	 */
 	public void setMaxDocFreqPct(long maxPercentage) {
-		setMaxDocFreq((maxPercentage) * ir.maxDoc() / 100L);
+		setMaxDocFreq(maxPercentage * ir.maxDoc() / 100L);
 	}
 
 	/**
@@ -437,7 +437,7 @@ public final class MultiMoreLikeThis {
 
 	/**
 	 * Sets the field names that will be used when generating the 'More Like This' query. Set this to
-	 * null for the field names to be determined at runtime from the BigCompositeReader provided in the
+	 * null for the field names to be determined at runtime from the IndexReader provided in the
 	 * constructor.
 	 *
 	 * @param fieldNames the field names that will be used when generating the 'More Like This' query.
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneLocalSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneLocalSearcher.java
index b4560d3..016638f 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneLocalSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneLocalSearcher.java
@@ -3,6 +3,7 @@ package it.cavallium.dbengine.lucene.searcher;
 import io.net5.buffer.api.Send;
 import it.cavallium.dbengine.database.disk.LLIndexSearcher;
 import it.cavallium.dbengine.database.disk.LLIndexSearchers;
+import org.apache.lucene.search.IndexSearcher;
 import reactor.core.publisher.Mono;
 
 public class AdaptiveLuceneLocalSearcher implements LuceneLocalSearcher {
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneMultiSearcher.java
index 3b6a835..c594eb6 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneMultiSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/AdaptiveLuceneMultiSearcher.java
@@ -28,7 +28,8 @@ public class AdaptiveLuceneMultiSearcher implements LuceneMultiSearcher {
 		} else if (queryParams.isSorted() || queryParams.isScored()) {
 			return scoredSimpleLuceneShardSearcher.collectMulti(indexSearchersMono, queryParams, keyFieldName, transformer);
 		} else {
-			if (((long) queryParams.offset() + (long) queryParams.limit()) <= (long) queryParams.pageLimits().getPageLimit(0)) {
+			if (((long) queryParams.offset() + (long) queryParams.limit()) <= (long) queryParams.pageLimits().getPageLimit(0)
+					|| transformer != null) {
 				// Run single-page searches using the paged multi searcher
 				return unsortedUnscoredPagedLuceneMultiSearcher.collectMulti(indexSearchersMono, queryParams, keyFieldName, transformer);
 			} else {
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/CountLuceneLocalSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/CountLuceneLocalSearcher.java
index 845500b..7229db3 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/CountLuceneLocalSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/CountLuceneLocalSearcher.java
@@ -4,6 +4,9 @@ import io.net5.buffer.api.Send;
 import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
 import it.cavallium.dbengine.database.LLUtils;
 import it.cavallium.dbengine.database.disk.LLIndexSearcher;
+import it.cavallium.dbengine.database.disk.LLIndexSearchers;
+import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
+import org.apache.lucene.search.IndexSearcher;
 import reactor.core.publisher.Flux;
 import reactor.core.publisher.Mono;
 import reactor.core.scheduler.Schedulers;
@@ -18,12 +21,18 @@ public class CountLuceneLocalSearcher implements LuceneLocalSearcher {
 		return Mono
 				.usingWhen(
 						indexSearcherMono,
-						indexSearcher -> Mono.fromCallable(() -> {
-							try (var is = indexSearcher.receive()) {
-								LLUtils.ensureBlocking();
-								return is.getIndexSearcher().count(queryParams.query());
-							}
-						}).subscribeOn(Schedulers.boundedElastic()),
+						indexSearcher -> {
+							var queryParamsMono = transformer
+									.transform(Mono.fromSupplier(() -> new TransformerInput(LLIndexSearchers.unsharded(indexSearcher),
+											queryParams)));
+
+							return queryParamsMono.flatMap(queryParams2 -> Mono.fromCallable(() -> {
+								try (var is = indexSearcher.receive()) {
+									LLUtils.ensureBlocking();
+									return is.getIndexSearcher().count(queryParams2.query());
+								}
+							}).subscribeOn(Schedulers.boundedElastic()));
+						},
 						is -> Mono.empty()
 				)
 				.map(count -> new LuceneSearchResult(TotalHitsCount.of(count, true), Flux.empty(), null).send())
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/LLSearchTransformer.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/LLSearchTransformer.java
index b098a90..65982db 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/LLSearchTransformer.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/LLSearchTransformer.java
@@ -1,6 +1,7 @@
 package it.cavallium.dbengine.lucene.searcher;
 
 import it.cavallium.dbengine.database.disk.LLIndexSearcher;
+import it.cavallium.dbengine.database.disk.LLIndexSearchers;
 import java.util.List;
 import org.apache.lucene.index.IndexReader;
 import reactor.core.publisher.Mono;
@@ -10,7 +11,7 @@ public interface LLSearchTransformer {
 	LLSearchTransformer NO_TRANSFORMATION = queryParamsMono -> queryParamsMono
 			.map(TransformerInput::queryParams);
 
-	record TransformerInput(List indexSearchers,
+	record TransformerInput(LLIndexSearchers indexSearchers,
 													LocalQueryParams queryParams) {}
 
 	Mono transform(Mono inputMono);
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java
index e69b01b..2aacdf7 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java
@@ -11,6 +11,7 @@ import it.cavallium.dbengine.database.disk.LLIndexSearcher;
 import it.cavallium.dbengine.database.disk.LLIndexSearchers;
 import it.cavallium.dbengine.database.disk.LLIndexSearchers.UnshardedIndexSearchers;
 import it.cavallium.dbengine.lucene.LuceneUtils;
+import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
@@ -37,18 +38,24 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher {
 
 		var indexSearchersMono = indexSearcherMono.map(LLIndexSearchers::unsharded);
 
-		return LLUtils.usingResource(indexSearchersMono, indexSearchers -> this
-				// Search first page results
-				.searchFirstPage(indexSearchers.shards(), queryParams, paginationInfo)
-				// Compute the results of the first page
-				.transform(firstPageTopDocsMono -> this.computeFirstPageResults(firstPageTopDocsMono, indexSearchers.shards(),
-						keyFieldName, queryParams))
-				// Compute other results
-				.transform(firstResult -> this.computeOtherResults(firstResult, indexSearchers.shards(), queryParams,
-						keyFieldName, indexSearchers::close))
-				// Ensure that one LuceneSearchResult is always returned
-				.single(),
-				false);
+		return LLUtils.usingResource(indexSearchersMono, indexSearchers -> {
+			var queryParamsMono = transformer
+							.transform(Mono.fromSupplier(() -> new TransformerInput(indexSearchers, queryParams)));
+
+			return queryParamsMono.flatMap(queryParams2 -> this
+					// Search first page results
+					.searchFirstPage(indexSearchers.shards(), queryParams2, paginationInfo)
+					// Compute the results of the first page
+					.transform(firstPageTopDocsMono -> this.computeFirstPageResults(firstPageTopDocsMono, indexSearchers.shards(),
+							keyFieldName, queryParams2))
+					// Compute other results
+					.transform(firstResult -> this.computeOtherResults(firstResult, indexSearchers.shards(), queryParams2,
+							keyFieldName, indexSearchers::close))
+					// Ensure that one LuceneSearchResult is always returned
+					.single()
+			);
+		},
+		false);
 	}
 
 	/**
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleUnsortedUnscoredLuceneMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleUnsortedUnscoredLuceneMultiSearcher.java
index fcc8ade..f1be4e4 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleUnsortedUnscoredLuceneMultiSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleUnsortedUnscoredLuceneMultiSearcher.java
@@ -7,6 +7,7 @@ import it.cavallium.dbengine.database.LLUtils;
 import it.cavallium.dbengine.database.disk.LLIndexSearcher;
 import it.cavallium.dbengine.database.disk.LLIndexSearchers;
 import it.cavallium.dbengine.lucene.LuceneUtils;
+import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
 import java.util.ArrayList;
 import java.util.List;
 import reactor.core.publisher.Flux;
@@ -25,55 +26,62 @@ public class SimpleUnsortedUnscoredLuceneMultiSearcher implements LuceneMultiSea
 			LocalQueryParams queryParams,
 			String keyFieldName,
 			LLSearchTransformer transformer) {
-		var indexSearchersSendResource = Mono
-				.fromRunnable(() -> {
-					LLUtils.ensureBlocking();
-					if (queryParams.isSorted() && queryParams.limit() > 0) {
-						throw new UnsupportedOperationException("Sorted queries are not supported"
-								+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
-					}
-					if (queryParams.isScored() && queryParams.limit() > 0) {
-						throw new UnsupportedOperationException("Scored queries are not supported"
-								+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
-					}
-				})
-				.then(indexSearchersMono);
-		var localQueryParams = getLocalQueryParams(queryParams);
 
-		return LLUtils.usingSendResource(indexSearchersSendResource,
-				indexSearchers -> Flux
-						.fromIterable(indexSearchers.shards())
-						.flatMap(searcher -> {
-							var llSearcher = Mono.fromCallable(() -> new LLIndexSearcher(searcher, false, null).send());
-							return localSearcher.collect(llSearcher, localQueryParams, keyFieldName, transformer);
-						})
-						.collectList()
-						.map(results -> {
-							List resultsToDrop = new ArrayList<>(results.size());
-							List> resultsFluxes = new ArrayList<>(results.size());
-							boolean exactTotalHitsCount = true;
-							long totalHitsCountValue = 0;
-							for (Send resultToReceive : results) {
-								LuceneSearchResult result = resultToReceive.receive();
-								resultsToDrop.add(result);
-								resultsFluxes.add(result.results());
-								exactTotalHitsCount &= result.totalHitsCount().exact();
-								totalHitsCountValue += result.totalHitsCount().value();
+		return LLUtils.usingSendResource(indexSearchersMono,
+				indexSearchers -> {
+					var queryParamsMono = transformer
+							.transform(Mono.fromSupplier(() -> new TransformerInput(indexSearchers,
+									queryParams)));
+
+					return queryParamsMono.flatMap(queryParams2 -> {
+						var localQueryParams = getLocalQueryParams(queryParams2);
+						return Mono
+								.fromRunnable(() -> {
+									LLUtils.ensureBlocking();
+									if (queryParams2.isSorted() && queryParams2.limit() > 0) {
+										throw new UnsupportedOperationException("Sorted queries are not supported"
+												+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
+									}
+									if (queryParams2.isScored() && queryParams2.limit() > 0) {
+										throw new UnsupportedOperationException("Scored queries are not supported"
+												+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
+									}
+								})
+								.thenMany(Flux.fromIterable(indexSearchers.shards()))
+								.flatMap(searcher -> {
+									var llSearcher = Mono.fromCallable(() -> new LLIndexSearcher(searcher, false, null).send());
+									return localSearcher.collect(llSearcher, localQueryParams, keyFieldName, transformer);
+								})
+								.collectList()
+								.map(results -> {
+									List resultsToDrop = new ArrayList<>(results.size());
+									List> resultsFluxes = new ArrayList<>(results.size());
+									boolean exactTotalHitsCount = true;
+									long totalHitsCountValue = 0;
+									for (Send resultToReceive : results) {
+										LuceneSearchResult result = resultToReceive.receive();
+										resultsToDrop.add(result);
+										resultsFluxes.add(result.results());
+										exactTotalHitsCount &= result.totalHitsCount().exact();
+										totalHitsCountValue += result.totalHitsCount().value();
+									}
+
+									var totalHitsCount = new TotalHitsCount(totalHitsCountValue, exactTotalHitsCount);
+									Flux mergedFluxes = Flux
+											.merge(resultsFluxes)
+											.skip(queryParams2.offset())
+											.take(queryParams2.limit(), true);
+
+									return new LuceneSearchResult(totalHitsCount, mergedFluxes, () -> {
+										for (LuceneSearchResult luceneSearchResult : resultsToDrop) {
+											luceneSearchResult.close();
+										}
+										indexSearchers.close();
+									}).send();
+								});
 							}
-
-							var totalHitsCount = new TotalHitsCount(totalHitsCountValue, exactTotalHitsCount);
-							Flux mergedFluxes = Flux
-									.merge(resultsFluxes)
-									.skip(queryParams.offset())
-									.take(queryParams.limit(), true);
-
-							return new LuceneSearchResult(totalHitsCount, mergedFluxes, () -> {
-								for (LuceneSearchResult luceneSearchResult : resultsToDrop) {
-									luceneSearchResult.close();
-								}
-								indexSearchers.close();
-							}).send();
-						}),
+					);
+				},
 				false
 		);
 	}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnsortedUnscoredContinuousLuceneMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnsortedUnscoredContinuousLuceneMultiSearcher.java
index f1f3eb7..2a98fd8 100644
--- a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnsortedUnscoredContinuousLuceneMultiSearcher.java
+++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnsortedUnscoredContinuousLuceneMultiSearcher.java
@@ -39,6 +39,10 @@ public class UnsortedUnscoredContinuousLuceneMultiSearcher implements LuceneMult
 		var indexSearchersSendResource = Mono
 				.fromRunnable(() -> {
 					LLUtils.ensureBlocking();
+					if (transformer != null) {
+						throw new UnsupportedOperationException("Transformers are not supported"
+								+ " by UnsortedUnscoredContinuousLuceneMultiSearcher");
+					}
 					if (queryParams.isSorted() && queryParams.limit() > 0) {
 						throw new UnsupportedOperationException("Sorted queries are not supported"
 								+ " by UnsortedUnscoredContinuousLuceneMultiSearcher");