Implement MoreLikeThis transformer

This commit is contained in:
Andrea Cavalli 2021-10-08 02:13:33 +02:00
parent 81d8abe72c
commit 4d5f8b5b37
9 changed files with 106 additions and 74 deletions

View File

@ -486,11 +486,12 @@ public class LuceneUtils {
}
public static Mono<LocalQueryParams> getMoreLikeThisQuery(
List<LLIndexSearcher> indexSearchers,
LLIndexSearchers inputIndexSearchers,
LocalQueryParams localQueryParams,
Analyzer analyzer,
Similarity similarity,
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
var indexSearchers = inputIndexSearchers.shards();
Query luceneAdditionalQuery;
try {
luceneAdditionalQuery = localQueryParams.query();

View File

@ -50,7 +50,7 @@ import org.apache.lucene.util.PriorityQueue;
* Generate "more like this" similarity queries. Based on this mail:
*
* <pre><code>
* Lucene does let you access the document frequency of terms, with BigCompositeReader.docFreq().
* Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
* Term frequencies can be computed by re-tokenizing the text, which, for a single document,
* is usually fast enough. But looking up the docFreq() of every term in the document is
* probably too slow.
@ -84,7 +84,7 @@ import org.apache.lucene.util.PriorityQueue;
* usage is as follows. The bold fragment is specific to this class. <br>
*
* <pre class="prettyprint">
* BigCompositeReader ir = ...
* IndexReader ir = ...
* IndexSearcher is = ...
*
* MoreLikeThis mlt = new MoreLikeThis(ir);
@ -264,7 +264,7 @@ public final class MultiMoreLikeThis {
/** For idf() calculations. */
private TFIDFSimilarity similarity; // = new DefaultSimilarity();
/** BigCompositeReader to use */
/** IndexReader to use */
private final BigCompositeReader<?> ir;
/** Boost factor to use when boosting the terms */
@ -289,7 +289,7 @@ public final class MultiMoreLikeThis {
this.boostFactor = boostFactor;
}
/** Constructor requiring a BigCompositeReader. */
/** Constructor requiring an IndexReader. */
public MultiMoreLikeThis(BigCompositeReader<?> ir) {
this(ir, new ClassicSimilarity());
}
@ -401,7 +401,7 @@ public final class MultiMoreLikeThis {
* be still considered relevant.
*/
public void setMaxDocFreqPct(long maxPercentage) {
setMaxDocFreq((maxPercentage) * ir.maxDoc() / 100L);
setMaxDocFreq(maxPercentage * ir.maxDoc() / 100L);
}
/**
@ -437,7 +437,7 @@ public final class MultiMoreLikeThis {
/**
* Sets the field names that will be used when generating the 'More Like This' query. Set this to
* null for the field names to be determined at runtime from the BigCompositeReader provided in the
* null for the field names to be determined at runtime from the IndexReader provided in the
* constructor.
*
* @param fieldNames the field names that will be used when generating the 'More Like This' query.

View File

@ -3,6 +3,7 @@ package it.cavallium.dbengine.lucene.searcher;
import io.net5.buffer.api.Send;
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLIndexSearchers;
import org.apache.lucene.search.IndexSearcher;
import reactor.core.publisher.Mono;
public class AdaptiveLuceneLocalSearcher implements LuceneLocalSearcher {

View File

@ -28,7 +28,8 @@ public class AdaptiveLuceneMultiSearcher implements LuceneMultiSearcher {
} else if (queryParams.isSorted() || queryParams.isScored()) {
return scoredSimpleLuceneShardSearcher.collectMulti(indexSearchersMono, queryParams, keyFieldName, transformer);
} else {
if (((long) queryParams.offset() + (long) queryParams.limit()) <= (long) queryParams.pageLimits().getPageLimit(0)) {
if (((long) queryParams.offset() + (long) queryParams.limit()) <= (long) queryParams.pageLimits().getPageLimit(0)
|| transformer != null) {
// Run single-page searches using the paged multi searcher
return unsortedUnscoredPagedLuceneMultiSearcher.collectMulti(indexSearchersMono, queryParams, keyFieldName, transformer);
} else {

View File

@ -4,6 +4,9 @@ import io.net5.buffer.api.Send;
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLIndexSearchers;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
import org.apache.lucene.search.IndexSearcher;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Schedulers;
@ -18,12 +21,18 @@ public class CountLuceneLocalSearcher implements LuceneLocalSearcher {
return Mono
.usingWhen(
indexSearcherMono,
indexSearcher -> Mono.fromCallable(() -> {
try (var is = indexSearcher.receive()) {
LLUtils.ensureBlocking();
return is.getIndexSearcher().count(queryParams.query());
}
}).subscribeOn(Schedulers.boundedElastic()),
indexSearcher -> {
var queryParamsMono = transformer
.transform(Mono.fromSupplier(() -> new TransformerInput(LLIndexSearchers.unsharded(indexSearcher),
queryParams)));
return queryParamsMono.flatMap(queryParams2 -> Mono.fromCallable(() -> {
try (var is = indexSearcher.receive()) {
LLUtils.ensureBlocking();
return is.getIndexSearcher().count(queryParams2.query());
}
}).subscribeOn(Schedulers.boundedElastic()));
},
is -> Mono.empty()
)
.map(count -> new LuceneSearchResult(TotalHitsCount.of(count, true), Flux.empty(), null).send())

View File

@ -1,6 +1,7 @@
package it.cavallium.dbengine.lucene.searcher;
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLIndexSearchers;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import reactor.core.publisher.Mono;
@ -10,7 +11,7 @@ public interface LLSearchTransformer {
LLSearchTransformer NO_TRANSFORMATION = queryParamsMono -> queryParamsMono
.map(TransformerInput::queryParams);
record TransformerInput(List<LLIndexSearcher> indexSearchers,
record TransformerInput(LLIndexSearchers indexSearchers,
LocalQueryParams queryParams) {}
Mono<LocalQueryParams> transform(Mono<TransformerInput> inputMono);

View File

@ -11,6 +11,7 @@ import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLIndexSearchers;
import it.cavallium.dbengine.database.disk.LLIndexSearchers.UnshardedIndexSearchers;
import it.cavallium.dbengine.lucene.LuceneUtils;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
@ -37,18 +38,24 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher {
var indexSearchersMono = indexSearcherMono.map(LLIndexSearchers::unsharded);
return LLUtils.usingResource(indexSearchersMono, indexSearchers -> this
// Search first page results
.searchFirstPage(indexSearchers.shards(), queryParams, paginationInfo)
// Compute the results of the first page
.transform(firstPageTopDocsMono -> this.computeFirstPageResults(firstPageTopDocsMono, indexSearchers.shards(),
keyFieldName, queryParams))
// Compute other results
.transform(firstResult -> this.computeOtherResults(firstResult, indexSearchers.shards(), queryParams,
keyFieldName, indexSearchers::close))
// Ensure that one LuceneSearchResult is always returned
.single(),
false);
return LLUtils.usingResource(indexSearchersMono, indexSearchers -> {
var queryParamsMono = transformer
.transform(Mono.fromSupplier(() -> new TransformerInput(indexSearchers, queryParams)));
return queryParamsMono.flatMap(queryParams2 -> this
// Search first page results
.searchFirstPage(indexSearchers.shards(), queryParams2, paginationInfo)
// Compute the results of the first page
.transform(firstPageTopDocsMono -> this.computeFirstPageResults(firstPageTopDocsMono, indexSearchers.shards(),
keyFieldName, queryParams2))
// Compute other results
.transform(firstResult -> this.computeOtherResults(firstResult, indexSearchers.shards(), queryParams2,
keyFieldName, indexSearchers::close))
// Ensure that one LuceneSearchResult is always returned
.single()
);
},
false);
}
/**

View File

@ -7,6 +7,7 @@ import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLIndexSearchers;
import it.cavallium.dbengine.lucene.LuceneUtils;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.TransformerInput;
import java.util.ArrayList;
import java.util.List;
import reactor.core.publisher.Flux;
@ -25,55 +26,62 @@ public class SimpleUnsortedUnscoredLuceneMultiSearcher implements LuceneMultiSea
LocalQueryParams queryParams,
String keyFieldName,
LLSearchTransformer transformer) {
var indexSearchersSendResource = Mono
.fromRunnable(() -> {
LLUtils.ensureBlocking();
if (queryParams.isSorted() && queryParams.limit() > 0) {
throw new UnsupportedOperationException("Sorted queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
if (queryParams.isScored() && queryParams.limit() > 0) {
throw new UnsupportedOperationException("Scored queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
})
.then(indexSearchersMono);
var localQueryParams = getLocalQueryParams(queryParams);
return LLUtils.usingSendResource(indexSearchersSendResource,
indexSearchers -> Flux
.fromIterable(indexSearchers.shards())
.flatMap(searcher -> {
var llSearcher = Mono.fromCallable(() -> new LLIndexSearcher(searcher, false, null).send());
return localSearcher.collect(llSearcher, localQueryParams, keyFieldName, transformer);
})
.collectList()
.map(results -> {
List<LuceneSearchResult> resultsToDrop = new ArrayList<>(results.size());
List<Flux<LLKeyScore>> resultsFluxes = new ArrayList<>(results.size());
boolean exactTotalHitsCount = true;
long totalHitsCountValue = 0;
for (Send<LuceneSearchResult> resultToReceive : results) {
LuceneSearchResult result = resultToReceive.receive();
resultsToDrop.add(result);
resultsFluxes.add(result.results());
exactTotalHitsCount &= result.totalHitsCount().exact();
totalHitsCountValue += result.totalHitsCount().value();
return LLUtils.usingSendResource(indexSearchersMono,
indexSearchers -> {
var queryParamsMono = transformer
.transform(Mono.fromSupplier(() -> new TransformerInput(indexSearchers,
queryParams)));
return queryParamsMono.flatMap(queryParams2 -> {
var localQueryParams = getLocalQueryParams(queryParams2);
return Mono
.fromRunnable(() -> {
LLUtils.ensureBlocking();
if (queryParams2.isSorted() && queryParams2.limit() > 0) {
throw new UnsupportedOperationException("Sorted queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
if (queryParams2.isScored() && queryParams2.limit() > 0) {
throw new UnsupportedOperationException("Scored queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
})
.thenMany(Flux.fromIterable(indexSearchers.shards()))
.flatMap(searcher -> {
var llSearcher = Mono.fromCallable(() -> new LLIndexSearcher(searcher, false, null).send());
return localSearcher.collect(llSearcher, localQueryParams, keyFieldName, transformer);
})
.collectList()
.map(results -> {
List<LuceneSearchResult> resultsToDrop = new ArrayList<>(results.size());
List<Flux<LLKeyScore>> resultsFluxes = new ArrayList<>(results.size());
boolean exactTotalHitsCount = true;
long totalHitsCountValue = 0;
for (Send<LuceneSearchResult> resultToReceive : results) {
LuceneSearchResult result = resultToReceive.receive();
resultsToDrop.add(result);
resultsFluxes.add(result.results());
exactTotalHitsCount &= result.totalHitsCount().exact();
totalHitsCountValue += result.totalHitsCount().value();
}
var totalHitsCount = new TotalHitsCount(totalHitsCountValue, exactTotalHitsCount);
Flux<LLKeyScore> mergedFluxes = Flux
.merge(resultsFluxes)
.skip(queryParams2.offset())
.take(queryParams2.limit(), true);
return new LuceneSearchResult(totalHitsCount, mergedFluxes, () -> {
for (LuceneSearchResult luceneSearchResult : resultsToDrop) {
luceneSearchResult.close();
}
indexSearchers.close();
}).send();
});
}
var totalHitsCount = new TotalHitsCount(totalHitsCountValue, exactTotalHitsCount);
Flux<LLKeyScore> mergedFluxes = Flux
.merge(resultsFluxes)
.skip(queryParams.offset())
.take(queryParams.limit(), true);
return new LuceneSearchResult(totalHitsCount, mergedFluxes, () -> {
for (LuceneSearchResult luceneSearchResult : resultsToDrop) {
luceneSearchResult.close();
}
indexSearchers.close();
}).send();
}),
);
},
false
);
}

View File

@ -39,6 +39,10 @@ public class UnsortedUnscoredContinuousLuceneMultiSearcher implements LuceneMult
var indexSearchersSendResource = Mono
.fromRunnable(() -> {
LLUtils.ensureBlocking();
if (transformer != null) {
throw new UnsupportedOperationException("Transformers are not supported"
+ " by UnsortedUnscoredContinuousLuceneMultiSearcher");
}
if (queryParams.isSorted() && queryParams.limit() > 0) {
throw new UnsupportedOperationException("Sorted queries are not supported"
+ " by UnsortedUnscoredContinuousLuceneMultiSearcher");