Simplify query transformations

This commit is contained in:
Andrea Cavalli 2022-01-28 19:31:25 +01:00
parent 5c0434c73f
commit 58943b5e08
9 changed files with 138 additions and 134 deletions

View File

@ -1,11 +1,14 @@
package it.cavallium.dbengine.client;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import it.cavallium.dbengine.database.LLIndexRequest;
import it.cavallium.dbengine.database.LLSoftUpdateDocument;
import it.cavallium.dbengine.database.LLUpdateDocument;
import it.cavallium.dbengine.database.LLTerm;
import it.cavallium.dbengine.database.LLUpdateFields;
import it.cavallium.dbengine.database.LLUtils;
import java.util.Map;
import java.util.Set;
import org.jetbrains.annotations.NotNull;
import reactor.core.publisher.Flux;
@ -43,7 +46,7 @@ public abstract class Indicizer<T, U> {
public abstract IndicizerSimilarities getPerFieldSimilarity();
public Flux<Tuple2<String, Set<String>>> getMoreLikeThisDocumentFields(T key, U value) {
return Flux.empty();
/**
 * Supplies the document fields that are handed to a "more like this" search for the given entry.
 *
 * <p>This base implementation supplies no fields at all: it returns a multimap view over an
 * empty map.
 *
 * @param key   key of the reference entry (unused here)
 * @param value value of the reference entry (unused here)
 * @return an empty multimap
 */
public Multimap<String, String> getMoreLikeThisDocumentFields(T key, U value) {
  Map<String, String> noFields = Map.of();
  return Multimaps.forMap(noFields);
}
}

View File

@ -90,7 +90,7 @@ public class LuceneIndexImpl<T, U> implements LuceneIndex<T, U> {
public Mono<Hits<HitKey<T>>> moreLikeThis(ClientQueryParams queryParams,
T key,
U mltDocumentValue) {
Flux<Tuple2<String, Set<String>>> mltDocumentFields
var mltDocumentFields
= indicizer.getMoreLikeThisDocumentFields(key, mltDocumentValue);
return luceneIndex

View File

@ -1,5 +1,6 @@
package it.cavallium.dbengine.database;
import com.google.common.collect.Multimap;
import io.net5.buffer.api.Resource;
import io.net5.buffer.api.Send;
import it.cavallium.data.generator.nativedata.Nullablefloat;
@ -51,7 +52,7 @@ public interface LLLuceneIndex extends LLSnapshottable {
Mono<LLSearchResultShard> moreLikeThis(@Nullable LLSnapshot snapshot,
QueryParams queryParams,
String keyFieldName,
Flux<Tuple2<String, Set<String>>> mltDocumentFields);
Multimap<String, String> mltDocumentFields);
/**
* @param queryParams the limit is valid for each lucene instance. If you have 15 instances, the number of elements

View File

@ -6,10 +6,10 @@ import static it.cavallium.dbengine.database.LLUtils.toDocument;
import static it.cavallium.dbengine.database.LLUtils.toFields;
import static it.cavallium.dbengine.lucene.searcher.LLSearchTransformer.NO_TRANSFORMATION;
import com.google.common.collect.Multimap;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import io.net5.buffer.api.Resource;
import io.net5.buffer.api.Send;
import it.cavallium.dbengine.client.DirectIOOptions;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
@ -32,10 +32,10 @@ import it.cavallium.dbengine.lucene.AlwaysDirectIOFSDirectory;
import it.cavallium.dbengine.lucene.LuceneHacks;
import it.cavallium.dbengine.lucene.LuceneUtils;
import it.cavallium.dbengine.lucene.collector.Buckets;
import it.cavallium.dbengine.lucene.mlt.MoreLikeThisTransformer;
import it.cavallium.dbengine.lucene.searcher.AdaptiveLocalSearcher;
import it.cavallium.dbengine.lucene.searcher.BucketParams;
import it.cavallium.dbengine.lucene.searcher.DecimalBucketMultiSearcher;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer;
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
import it.cavallium.dbengine.lucene.searcher.LocalSearcher;
import java.io.IOException;
@ -45,7 +45,6 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.Phaser;
import java.util.concurrent.TimeUnit;
@ -80,7 +79,6 @@ import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Scheduler;
import reactor.core.scheduler.Schedulers;
import reactor.util.function.Tuple2;
public class LLLocalLuceneIndex implements LLLuceneIndex {
@ -442,10 +440,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
public Mono<LLSearchResultShard> moreLikeThis(@Nullable LLSnapshot snapshot,
QueryParams queryParams,
String keyFieldName,
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
Multimap<String, String> mltDocumentFieldsFlux) {
LocalQueryParams localQueryParams = LuceneUtils.toLocalQueryParams(queryParams, luceneAnalyzer);
var searcher = this.searcherManager.retrieveSearcher(snapshot);
var transformer = new MoreLikeThisTransformer(mltDocumentFieldsFlux);
var transformer = new MoreLikeThisTransformer(mltDocumentFieldsFlux, luceneAnalyzer, luceneSimilarity);
return localSearcher
.collect(searcher, localQueryParams, keyFieldName, transformer)
@ -601,18 +599,4 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
return lowMemory;
}
/**
 * Search transformer that replaces the incoming query parameters with the ones produced by
 * {@link LuceneUtils#getMoreLikeThisQuery}, using the analyzer and similarity of the enclosing index.
 */
private class MoreLikeThisTransformer implements LLSearchTransformer {

  /** Document fields forwarded unchanged to {@link LuceneUtils#getMoreLikeThisQuery}. */
  private final Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux;

  public MoreLikeThisTransformer(Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
    this.mltDocumentFieldsFlux = mltDocumentFieldsFlux;
  }

  /**
   * Maps every emitted transformer input to the parameters of the corresponding
   * "more like this" query.
   */
  @Override
  public Mono<LocalQueryParams> transform(Mono<TransformerInput> inputMono) {
    return inputMono.flatMap(transformerInput -> {
      var shardSearchers = transformerInput.indexSearchers();
      var originalParams = transformerInput.queryParams();
      return LuceneUtils.getMoreLikeThisQuery(shardSearchers,
          originalParams,
          luceneAnalyzer,
          luceneSimilarity,
          mltDocumentFieldsFlux
      );
    });
  }
}
}

View File

@ -2,8 +2,8 @@ package it.cavallium.dbengine.database.disk;
import static it.cavallium.dbengine.client.UninterruptibleScheduler.uninterruptibleScheduler;
import com.google.common.collect.Multimap;
import io.micrometer.core.instrument.MeterRegistry;
import io.net5.buffer.api.Resource;
import io.net5.buffer.api.Send;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
@ -12,23 +12,22 @@ import it.cavallium.dbengine.client.query.QueryParser;
import it.cavallium.dbengine.client.query.current.data.Query;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.database.LLIndexRequest;
import it.cavallium.dbengine.database.LLUpdateDocument;
import it.cavallium.dbengine.database.LLItem;
import it.cavallium.dbengine.database.LLLuceneIndex;
import it.cavallium.dbengine.database.LLSearchResultShard;
import it.cavallium.dbengine.database.LLSnapshot;
import it.cavallium.dbengine.database.LLTerm;
import it.cavallium.dbengine.database.LLUpdateDocument;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.lucene.LuceneHacks;
import it.cavallium.dbengine.lucene.LuceneUtils;
import it.cavallium.dbengine.lucene.collector.Buckets;
import it.cavallium.dbengine.lucene.mlt.MoreLikeThisTransformer;
import it.cavallium.dbengine.lucene.searcher.AdaptiveMultiSearcher;
import it.cavallium.dbengine.lucene.searcher.BucketParams;
import it.cavallium.dbengine.lucene.searcher.DecimalBucketMultiSearcher;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer;
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
import it.cavallium.dbengine.lucene.searcher.MultiSearcher;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
@ -39,7 +38,6 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
@ -49,7 +47,6 @@ import org.jetbrains.annotations.Nullable;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Schedulers;
import reactor.util.function.Tuple2;
public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
@ -233,10 +230,10 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
public Mono<LLSearchResultShard> moreLikeThis(@Nullable LLSnapshot snapshot,
QueryParams queryParams,
String keyFieldName,
Flux<Tuple2<String, Set<String>>> mltDocumentFields) {
Multimap<String, String> mltDocumentFields) {
LocalQueryParams localQueryParams = LuceneUtils.toLocalQueryParams(queryParams, luceneAnalyzer);
var searchers = this.getIndexSearchers(snapshot);
var transformer = new MultiMoreLikeThisTransformer(mltDocumentFields);
var transformer = new MoreLikeThisTransformer(mltDocumentFields, luceneAnalyzer, luceneSimilarity);
// Collect all the shards results into a single global result
return multiSearcher
@ -340,19 +337,4 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
public boolean isLowMemoryMode() {
  // The answer is taken from the first entry of luceneIndices only; the other entries are not consulted.
  var firstIndex = luceneIndices[0];
  return firstIndex.isLowMemoryMode();
}
/**
 * Search transformer that replaces the incoming query parameters with the ones produced by
 * {@link LuceneUtils#getMoreLikeThisQuery}, using the analyzer and similarity of the enclosing index.
 */
private class MultiMoreLikeThisTransformer implements LLSearchTransformer {

  /** Document fields forwarded unchanged to {@link LuceneUtils#getMoreLikeThisQuery}. */
  private final Flux<Tuple2<String, Set<String>>> mltDocumentFields;

  public MultiMoreLikeThisTransformer(Flux<Tuple2<String, Set<String>>> mltDocumentFields) {
    this.mltDocumentFields = mltDocumentFields;
  }

  /**
   * Maps every emitted transformer input to the parameters of the corresponding
   * "more like this" query.
   */
  @Override
  public Mono<LocalQueryParams> transform(Mono<TransformerInput> inputMono) {
    return inputMono.flatMap(transformerInput -> {
      var shardSearchers = transformerInput.indexSearchers();
      var originalParams = transformerInput.queryParams();
      return LuceneUtils.getMoreLikeThisQuery(shardSearchers,
          originalParams,
          luceneAnalyzer,
          luceneSimilarity,
          mltDocumentFields
      );
    });
  }
}
}

View File

@ -2,6 +2,8 @@ package it.cavallium.dbengine.lucene;
import static it.cavallium.dbengine.client.UninterruptibleScheduler.uninterruptibleScheduler;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import it.cavallium.dbengine.client.CompositeSnapshot;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
@ -31,12 +33,12 @@ import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -49,6 +51,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.Collector;
@ -79,7 +82,6 @@ import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Schedulers;
import reactor.util.concurrent.Queues;
import reactor.util.function.Tuple2;
public class LuceneUtils {
@ -470,80 +472,57 @@ public class LuceneUtils {
}
}
public static Mono<LocalQueryParams> getMoreLikeThisQuery(
LLIndexSearchers inputIndexSearchers,
public static Query getMoreLikeThisQuery(LLIndexSearchers inputIndexSearchers,
LocalQueryParams localQueryParams,
Analyzer analyzer,
Similarity similarity,
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
var indexSearchers = inputIndexSearchers.shards();
Query luceneAdditionalQuery;
try {
luceneAdditionalQuery = localQueryParams.query();
} catch (Exception e) {
return Mono.error(e);
Multimap<String, String> mltDocumentFieldsMultimap) throws IOException {
List<IndexSearcher> indexSearchers = inputIndexSearchers.shards();
Query luceneAdditionalQuery = localQueryParams.query();
// Create the mutable version of the input
Map<String, Collection<String>> mltDocumentFields = HashMultimap.create(mltDocumentFieldsMultimap).asMap();
mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
if (mltDocumentFields.isEmpty()) {
return new MatchNoDocsQuery();
}
MultiMoreLikeThis mlt;
if (indexSearchers.size() == 1) {
mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexSearchers.get(0).getIndexReader(), IndexReader[]::new),
null
);
} else {
IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
for (int i = 0, size = indexSearchers.size(); i < size; i++) {
indexReaders[i] = indexSearchers.get(i).getIndexReader();
}
mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexReaders, new ArrayIndexComparator(indexReaders)), null);
}
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
mlt.setMinTermFreq(1);
mlt.setMinDocFreq(3);
mlt.setMaxDocFreqPct(20);
mlt.setBoost(localQueryParams.needsScores());
mlt.setStopWords(ENGLISH_AND_ITALIAN_STOP_WORDS);
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
mlt.setSimilarity(tfidfSimilarity);
} else {
mlt.setSimilarity(new ClassicSimilarity());
}
return mltDocumentFieldsFlux
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
.flatMap(mltDocumentFields -> Mono.fromCallable(() -> {
mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
if (mltDocumentFields.isEmpty()) {
return new LocalQueryParams(new MatchNoDocsQuery(),
localQueryParams.offsetLong(),
localQueryParams.limitLong(),
DEFAULT_PAGE_LIMITS,
localQueryParams.minCompetitiveScore(),
localQueryParams.sort(),
localQueryParams.computePreciseHitsCount(),
localQueryParams.timeout()
);
}
MultiMoreLikeThis mlt;
if (indexSearchers.size() == 1) {
mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexSearchers.get(0).getIndexReader(), IndexReader[]::new), null);
} else {
IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
for (int i = 0, size = indexSearchers.size(); i < size; i++) {
indexReaders[i] = indexSearchers.get(i).getIndexReader();
}
mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexReaders, new ArrayIndexComparator(indexReaders)), null);
}
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
mlt.setMinTermFreq(1);
mlt.setMinDocFreq(3);
mlt.setMaxDocFreqPct(20);
mlt.setBoost(localQueryParams.needsScores());
mlt.setStopWords(ENGLISH_AND_ITALIAN_STOP_WORDS);
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
mlt.setSimilarity(tfidfSimilarity);
} else {
mlt.setSimilarity(new ClassicSimilarity());
}
// Get the reference docId and apply it to MoreLikeThis, to generate the query
@SuppressWarnings({"unchecked", "rawtypes"})
var mltQuery = mlt.like((Map) mltDocumentFields);
Query luceneQuery;
if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
luceneQuery = new Builder()
.add(mltQuery, Occur.MUST)
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
.build();
} else {
luceneQuery = mltQuery;
}
return new LocalQueryParams(luceneQuery,
localQueryParams.offsetLong(),
localQueryParams.limitLong(),
DEFAULT_PAGE_LIMITS,
localQueryParams.minCompetitiveScore(),
localQueryParams.sort(),
localQueryParams.computePreciseHitsCount(),
localQueryParams.timeout());
}).subscribeOn(uninterruptibleScheduler(Schedulers.boundedElastic())))
.publishOn(Schedulers.parallel());
// Get the reference docId and apply it to MoreLikeThis, to generate the query
Query mltQuery = mlt.like(mltDocumentFields);
Query luceneQuery;
if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
luceneQuery = new Builder()
.add(mltQuery, Occur.MUST)
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
.build();
} else {
luceneQuery = mltQuery;
}
return luceneQuery;
}
public static Collector withTimeout(Collector collector, Duration timeout) {

View File

@ -0,0 +1,55 @@
package it.cavallium.dbengine.lucene.mlt;
import static it.cavallium.dbengine.client.UninterruptibleScheduler.uninterruptibleScheduler;
import com.google.common.collect.Multimap;
import it.cavallium.dbengine.lucene.LuceneUtils;
import it.cavallium.dbengine.lucene.searcher.LLSearchTransformer;
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Schedulers;
/**
 * {@link LLSearchTransformer} that swaps the query of the incoming search parameters for the one
 * built by {@link LuceneUtils#getMoreLikeThisQuery}, copying every other parameter unchanged.
 */
public class MoreLikeThisTransformer implements LLSearchTransformer {

  private final Multimap<String, String> mltDocumentFields;
  private final PerFieldAnalyzerWrapper luceneAnalyzer;
  private final Similarity luceneSimilarity;

  /**
   * @param mltDocumentFields document fields passed to {@link LuceneUtils#getMoreLikeThisQuery}
   * @param luceneAnalyzer    analyzer passed to {@link LuceneUtils#getMoreLikeThisQuery}
   * @param luceneSimilarity  similarity passed to {@link LuceneUtils#getMoreLikeThisQuery}
   */
  public MoreLikeThisTransformer(Multimap<String, String> mltDocumentFields,
      PerFieldAnalyzerWrapper luceneAnalyzer,
      Similarity luceneSimilarity) {
    this.mltDocumentFields = mltDocumentFields;
    this.luceneAnalyzer = luceneAnalyzer;
    this.luceneSimilarity = luceneSimilarity;
  }

  /**
   * Rewrites each emitted input on the uninterruptible bounded-elastic scheduler.
   * An {@link IOException} raised while building the query is delivered as an error signal.
   */
  @Override
  public Mono<LocalQueryParams> transform(Mono<TransformerInput> inputMono) {
    var rewriteScheduler = uninterruptibleScheduler(Schedulers.boundedElastic());
    return inputMono
        .publishOn(rewriteScheduler)
        .handle((transformerInput, resultSink) -> {
          try {
            resultSink.next(rewrite(transformerInput));
          } catch (IOException ioFailure) {
            resultSink.error(ioFailure);
          }
        });
  }

  /**
   * Builds the replacement parameters: the "more like this" query plus the offset, limit,
   * page limits, minimum competitive score, sort, hit-count precision and timeout of the input.
   */
  private LocalQueryParams rewrite(TransformerInput transformerInput) throws IOException {
    var mltQuery = LuceneUtils.getMoreLikeThisQuery(transformerInput.indexSearchers(),
        transformerInput.queryParams(),
        luceneAnalyzer,
        luceneSimilarity,
        mltDocumentFields
    );
    var originalParams = transformerInput.queryParams();
    return new LocalQueryParams(mltQuery,
        originalParams.offsetLong(),
        originalParams.limitLong(),
        originalParams.pageLimits(),
        originalParams.minCompetitiveScore(),
        originalParams.sort(),
        originalParams.computePreciseHitsCount(),
        originalParams.timeout()
    );
  }
}

View File

@ -564,7 +564,7 @@ public final class MultiMoreLikeThis {
* @param filteredDocument Document with field values extracted for selected fields.
* @return More Like This query for the passed document.
*/
public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException {
public Query like(Map<String, ? extends Collection<?>> filteredDocument) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection<String> fields = BigCompositeReader.getIndexedFields(ir);
@ -743,11 +743,11 @@ public final class MultiMoreLikeThis {
}
}
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues)
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, ? extends Collection<?>> field2fieldValues)
throws IOException {
Map<String, Map<String, Long>> field2termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
Collection<Object> fieldValues = field2fieldValues.get(fieldName);
Collection<?> fieldValues = field2fieldValues.get(fieldName);
if (fieldValues == null) {
continue;
}

View File

@ -43,19 +43,8 @@ public class UnsortedUnscoredSimpleMultiSearcher implements MultiSearcher {
return queryParamsMono.flatMap(queryParams2 -> {
var localQueryParams = getLocalQueryParams(queryParams2);
return Mono
.fromRunnable(() -> {
LLUtils.ensureBlocking();
if (queryParams2.isSorted() && queryParams2.limitLong() > 0) {
throw new UnsupportedOperationException("Sorted queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
if (queryParams2.needsScores() && queryParams2.limitLong() > 0) {
throw new UnsupportedOperationException("Scored queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
})
.thenMany(Flux.fromIterable(indexSearchers.shards()))
return Flux
.fromIterable(indexSearchers.shards())
.flatMap(searcher -> {
var llSearcher = Mono.fromCallable(() -> new LLIndexSearcher(searcher, false, null).send());
return localSearcher.collect(llSearcher, localQueryParams, keyFieldName, transformer);
@ -85,6 +74,17 @@ public class UnsortedUnscoredSimpleMultiSearcher implements MultiSearcher {
}
indexSearchers.close();
});
})
.doFirst(() -> {
LLUtils.ensureBlocking();
if (queryParams2.isSorted() && queryParams2.limitLong() > 0) {
throw new UnsupportedOperationException("Sorted queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
if (queryParams2.needsScores() && queryParams2.limitLong() > 0) {
throw new UnsupportedOperationException("Scored queries are not supported"
+ " by SimpleUnsortedUnscoredLuceneMultiSearcher");
}
});
}
);