Add custom MoreLikeThis with sharding support

This commit is contained in:
Andrea Cavalli 2021-09-20 00:22:22 +02:00
parent 65db1711b5
commit 5cfb5f49cd
2 changed files with 1093 additions and 41 deletions

View File

@ -10,13 +10,13 @@ import io.net5.buffer.api.Send;
import io.net5.util.IllegalReferenceCountException; import io.net5.util.IllegalReferenceCountException;
import io.net5.util.internal.PlatformDependent; import io.net5.util.internal.PlatformDependent;
import it.cavallium.dbengine.database.collections.DatabaseStage; import it.cavallium.dbengine.database.collections.DatabaseStage;
import it.cavallium.dbengine.database.disk.LLIndexContext;
import it.cavallium.dbengine.database.disk.LLIndexSearcher; import it.cavallium.dbengine.database.disk.LLIndexSearcher;
import it.cavallium.dbengine.database.disk.LLLocalLuceneIndex; import it.cavallium.dbengine.database.disk.LLLocalLuceneIndex;
import it.cavallium.dbengine.database.disk.MemorySegmentUtils; import it.cavallium.dbengine.database.disk.MemorySegmentUtils;
import it.cavallium.dbengine.database.serialization.SerializationException; import it.cavallium.dbengine.database.serialization.SerializationException;
import it.cavallium.dbengine.database.serialization.SerializationFunction; import it.cavallium.dbengine.database.serialization.SerializationFunction;
import it.cavallium.dbengine.lucene.RandomSortField; import it.cavallium.dbengine.lucene.RandomSortField;
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams; import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -33,6 +33,7 @@ import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.ToIntFunction; import java.util.function.ToIntFunction;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint; import org.apache.lucene.document.FloatPoint;
@ -41,9 +42,10 @@ import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queries.mlt.MoreLikeThis; import it.cavallium.dbengine.lucene.mlt.MultiMoreLikeThis;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.ConstantScoreQuery;
@ -51,10 +53,11 @@ import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
@ -397,9 +400,11 @@ public class LLUtils {
} }
public static Mono<LocalQueryParams> getMoreLikeThisQuery( public static Mono<LocalQueryParams> getMoreLikeThisQuery(
LLIndexSearcher indexSearcher, List<LLIndexSearcher> indexSearchers,
@Nullable LLSnapshot snapshot, @Nullable LLSnapshot snapshot,
LocalQueryParams localQueryParams, LocalQueryParams localQueryParams,
Analyzer analyzer,
Similarity similarity,
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) { Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
Query luceneAdditionalQuery; Query luceneAdditionalQuery;
try { try {
@ -409,59 +414,61 @@ public class LLUtils {
} }
return mltDocumentFieldsFlux return mltDocumentFieldsFlux
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new) .collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
.flatMap(mltDocumentFields -> { .flatMap(mltDocumentFields -> Mono.fromCallable(() -> {
mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty()); mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
if (mltDocumentFields.isEmpty()) { if (mltDocumentFields.isEmpty()) {
return Mono.just(new LocalQueryParams(new MatchNoDocsQuery(), return new LocalQueryParams(new MatchNoDocsQuery(),
localQueryParams.offset(), localQueryParams.offset(),
localQueryParams.limit(), localQueryParams.limit(),
localQueryParams.minCompetitiveScore(), localQueryParams.minCompetitiveScore(),
localQueryParams.sort(), localQueryParams.sort(),
localQueryParams.scoreMode() localQueryParams.scoreMode()
)); );
} }
new IndexSearcher MultiMoreLikeThis mlt;
return indexSearcher.getIndexSearcher().search(snapshot, indexSearcher -> Mono.fromCallable(() -> { if (indexSearchers.size() == 1) {
var mlt = new MoreLikeThis(indexSearcher.getIndexReader()); mlt = new MultiMoreLikeThis(indexSearchers.get(0).getIndexReader(), null);
mlt.setAnalyzer(llLocalLuceneIndex.indexWriter.getAnalyzer()); } else {
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new)); IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
mlt.setMinTermFreq(1); for (int i = 0, size = indexSearchers.size(); i < size; i++) {
mlt.setMinDocFreq(3); indexReaders[i] = indexSearchers.get(i).getIndexReader();
mlt.setMaxDocFreqPct(20);
mlt.setBoost(localQueryParams.scoreMode().needsScores());
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
var similarity = llLocalLuceneIndex.getSimilarity();
if (similarity instanceof TFIDFSimilarity) {
mlt.setSimilarity((TFIDFSimilarity) similarity);
} else {
LLLocalLuceneIndex.logger.trace(MARKER_ROCKSDB, "Using an unsupported similarity algorithm for MoreLikeThis:"
+ " {}. You must use a similarity instance based on TFIDFSimilarity!", similarity);
} }
mlt = new MultiMoreLikeThis(indexReaders, null);
}
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
mlt.setMinTermFreq(1);
mlt.setMinDocFreq(3);
mlt.setMaxDocFreqPct(20);
mlt.setBoost(localQueryParams.scoreMode().needsScores());
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
mlt.setSimilarity(tfidfSimilarity);
} else {
mlt.setSimilarity(new ClassicSimilarity());
}
// Get the reference docId and apply it to MoreLikeThis, to generate the query // Get the reference docId and apply it to MoreLikeThis, to generate the query
@SuppressWarnings({"unchecked", "rawtypes"}) @SuppressWarnings({"unchecked", "rawtypes"})
var mltQuery = mlt.like((Map) mltDocumentFields); var mltQuery = mlt.like((Map) mltDocumentFields);
Query luceneQuery; Query luceneQuery;
if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) { if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
luceneQuery = new BooleanQuery.Builder() luceneQuery = new BooleanQuery.Builder()
.add(mltQuery, Occur.MUST) .add(mltQuery, Occur.MUST)
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST) .add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
.build(); .build();
} else { } else {
luceneQuery = mltQuery; luceneQuery = mltQuery;
} }
return luceneQuery; return new LocalQueryParams(luceneQuery,
})
.subscribeOn(Schedulers.boundedElastic())
.map(luceneQuery -> new LocalQueryParams(luceneQuery,
localQueryParams.offset(), localQueryParams.offset(),
localQueryParams.limit(), localQueryParams.limit(),
localQueryParams.minCompetitiveScore(), localQueryParams.minCompetitiveScore(),
localQueryParams.sort(), localQueryParams.sort(),
localQueryParams.scoreMode() localQueryParams.scoreMode()
))); );
}); }).subscribeOn(Schedulers.boundedElastic()));
} }
public static record DirectBuffer(@NotNull Send<Buffer> buffer, @NotNull ByteBuffer byteBuffer) {} public static record DirectBuffer(@NotNull Send<Buffer> buffer, @NotNull ByteBuffer byteBuffer) {}

File diff suppressed because it is too large Load Diff