Add custom MoreLikeThis with sharding support
This commit is contained in:
parent
65db1711b5
commit
5cfb5f49cd
@ -10,13 +10,13 @@ import io.net5.buffer.api.Send;
|
|||||||
import io.net5.util.IllegalReferenceCountException;
|
import io.net5.util.IllegalReferenceCountException;
|
||||||
import io.net5.util.internal.PlatformDependent;
|
import io.net5.util.internal.PlatformDependent;
|
||||||
import it.cavallium.dbengine.database.collections.DatabaseStage;
|
import it.cavallium.dbengine.database.collections.DatabaseStage;
|
||||||
import it.cavallium.dbengine.database.disk.LLIndexContext;
|
|
||||||
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
|
import it.cavallium.dbengine.database.disk.LLIndexSearcher;
|
||||||
import it.cavallium.dbengine.database.disk.LLLocalLuceneIndex;
|
import it.cavallium.dbengine.database.disk.LLLocalLuceneIndex;
|
||||||
import it.cavallium.dbengine.database.disk.MemorySegmentUtils;
|
import it.cavallium.dbengine.database.disk.MemorySegmentUtils;
|
||||||
import it.cavallium.dbengine.database.serialization.SerializationException;
|
import it.cavallium.dbengine.database.serialization.SerializationException;
|
||||||
import it.cavallium.dbengine.database.serialization.SerializationFunction;
|
import it.cavallium.dbengine.database.serialization.SerializationFunction;
|
||||||
import it.cavallium.dbengine.lucene.RandomSortField;
|
import it.cavallium.dbengine.lucene.RandomSortField;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
|
||||||
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
|
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
@ -33,6 +33,7 @@ import java.util.concurrent.Callable;
|
|||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.function.ToIntFunction;
|
import java.util.function.ToIntFunction;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.FloatPoint;
|
import org.apache.lucene.document.FloatPoint;
|
||||||
@ -41,9 +42,10 @@ import org.apache.lucene.document.LongPoint;
|
|||||||
import org.apache.lucene.document.SortedNumericDocValuesField;
|
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexableField;
|
import org.apache.lucene.index.IndexableField;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.queries.mlt.MoreLikeThis;
|
import it.cavallium.dbengine.lucene.mlt.MultiMoreLikeThis;
|
||||||
import org.apache.lucene.search.BooleanClause.Occur;
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.ConstantScoreQuery;
|
import org.apache.lucene.search.ConstantScoreQuery;
|
||||||
@ -51,10 +53,11 @@ import org.apache.lucene.search.MatchAllDocsQuery;
|
|||||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.ScoreMode;
|
import org.apache.lucene.search.ScoreMode;
|
||||||
import org.apache.lucene.search.SearcherManager;
|
|
||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
import org.apache.lucene.search.SortField;
|
import org.apache.lucene.search.SortField;
|
||||||
import org.apache.lucene.search.SortedNumericSortField;
|
import org.apache.lucene.search.SortedNumericSortField;
|
||||||
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
@ -397,9 +400,11 @@ public class LLUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static Mono<LocalQueryParams> getMoreLikeThisQuery(
|
public static Mono<LocalQueryParams> getMoreLikeThisQuery(
|
||||||
LLIndexSearcher indexSearcher,
|
List<LLIndexSearcher> indexSearchers,
|
||||||
@Nullable LLSnapshot snapshot,
|
@Nullable LLSnapshot snapshot,
|
||||||
LocalQueryParams localQueryParams,
|
LocalQueryParams localQueryParams,
|
||||||
|
Analyzer analyzer,
|
||||||
|
Similarity similarity,
|
||||||
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
|
||||||
Query luceneAdditionalQuery;
|
Query luceneAdditionalQuery;
|
||||||
try {
|
try {
|
||||||
@ -409,59 +414,61 @@ public class LLUtils {
|
|||||||
}
|
}
|
||||||
return mltDocumentFieldsFlux
|
return mltDocumentFieldsFlux
|
||||||
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
|
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
|
||||||
.flatMap(mltDocumentFields -> {
|
.flatMap(mltDocumentFields -> Mono.fromCallable(() -> {
|
||||||
mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
|
mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
|
||||||
if (mltDocumentFields.isEmpty()) {
|
if (mltDocumentFields.isEmpty()) {
|
||||||
return Mono.just(new LocalQueryParams(new MatchNoDocsQuery(),
|
return new LocalQueryParams(new MatchNoDocsQuery(),
|
||||||
localQueryParams.offset(),
|
localQueryParams.offset(),
|
||||||
localQueryParams.limit(),
|
localQueryParams.limit(),
|
||||||
localQueryParams.minCompetitiveScore(),
|
localQueryParams.minCompetitiveScore(),
|
||||||
localQueryParams.sort(),
|
localQueryParams.sort(),
|
||||||
localQueryParams.scoreMode()
|
localQueryParams.scoreMode()
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
new IndexSearcher
|
MultiMoreLikeThis mlt;
|
||||||
return indexSearcher.getIndexSearcher().search(snapshot, indexSearcher -> Mono.fromCallable(() -> {
|
if (indexSearchers.size() == 1) {
|
||||||
var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
|
mlt = new MultiMoreLikeThis(indexSearchers.get(0).getIndexReader(), null);
|
||||||
mlt.setAnalyzer(llLocalLuceneIndex.indexWriter.getAnalyzer());
|
} else {
|
||||||
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
|
IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
|
||||||
mlt.setMinTermFreq(1);
|
for (int i = 0, size = indexSearchers.size(); i < size; i++) {
|
||||||
mlt.setMinDocFreq(3);
|
indexReaders[i] = indexSearchers.get(i).getIndexReader();
|
||||||
mlt.setMaxDocFreqPct(20);
|
|
||||||
mlt.setBoost(localQueryParams.scoreMode().needsScores());
|
|
||||||
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
|
|
||||||
var similarity = llLocalLuceneIndex.getSimilarity();
|
|
||||||
if (similarity instanceof TFIDFSimilarity) {
|
|
||||||
mlt.setSimilarity((TFIDFSimilarity) similarity);
|
|
||||||
} else {
|
|
||||||
LLLocalLuceneIndex.logger.trace(MARKER_ROCKSDB, "Using an unsupported similarity algorithm for MoreLikeThis:"
|
|
||||||
+ " {}. You must use a similarity instance based on TFIDFSimilarity!", similarity);
|
|
||||||
}
|
}
|
||||||
|
mlt = new MultiMoreLikeThis(indexReaders, null);
|
||||||
|
}
|
||||||
|
mlt.setAnalyzer(analyzer);
|
||||||
|
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
|
||||||
|
mlt.setMinTermFreq(1);
|
||||||
|
mlt.setMinDocFreq(3);
|
||||||
|
mlt.setMaxDocFreqPct(20);
|
||||||
|
mlt.setBoost(localQueryParams.scoreMode().needsScores());
|
||||||
|
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
|
||||||
|
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
|
||||||
|
mlt.setSimilarity(tfidfSimilarity);
|
||||||
|
} else {
|
||||||
|
mlt.setSimilarity(new ClassicSimilarity());
|
||||||
|
}
|
||||||
|
|
||||||
// Get the reference docId and apply it to MoreLikeThis, to generate the query
|
// Get the reference docId and apply it to MoreLikeThis, to generate the query
|
||||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||||
var mltQuery = mlt.like((Map) mltDocumentFields);
|
var mltQuery = mlt.like((Map) mltDocumentFields);
|
||||||
Query luceneQuery;
|
Query luceneQuery;
|
||||||
if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
|
if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
|
||||||
luceneQuery = new BooleanQuery.Builder()
|
luceneQuery = new BooleanQuery.Builder()
|
||||||
.add(mltQuery, Occur.MUST)
|
.add(mltQuery, Occur.MUST)
|
||||||
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
|
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
|
||||||
.build();
|
.build();
|
||||||
} else {
|
} else {
|
||||||
luceneQuery = mltQuery;
|
luceneQuery = mltQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
return luceneQuery;
|
return new LocalQueryParams(luceneQuery,
|
||||||
})
|
|
||||||
.subscribeOn(Schedulers.boundedElastic())
|
|
||||||
.map(luceneQuery -> new LocalQueryParams(luceneQuery,
|
|
||||||
localQueryParams.offset(),
|
localQueryParams.offset(),
|
||||||
localQueryParams.limit(),
|
localQueryParams.limit(),
|
||||||
localQueryParams.minCompetitiveScore(),
|
localQueryParams.minCompetitiveScore(),
|
||||||
localQueryParams.sort(),
|
localQueryParams.sort(),
|
||||||
localQueryParams.scoreMode()
|
localQueryParams.scoreMode()
|
||||||
)));
|
);
|
||||||
});
|
}).subscribeOn(Schedulers.boundedElastic()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static record DirectBuffer(@NotNull Send<Buffer> buffer, @NotNull ByteBuffer byteBuffer) {}
|
public static record DirectBuffer(@NotNull Send<Buffer> buffer, @NotNull ByteBuffer byteBuffer) {}
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user