Enhance morelikethis
This commit is contained in:
parent
a20dadfb39
commit
96a908b833
|
@ -1,5 +1,7 @@
|
|||
package it.cavallium.dbengine.database;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
@ -11,6 +13,8 @@ public class EnglishItalianStopFilter extends StopFilter {
|
|||
|
||||
private static final CharArraySet stopWords;
|
||||
|
||||
private static final Set<String> stopWordsString;
|
||||
|
||||
/**
|
||||
* Constructs a filter which removes words from the input TokenStream that are named in the Set.
|
||||
*
|
||||
|
@ -997,9 +1001,20 @@ public class EnglishItalianStopFilter extends StopFilter {
|
|||
"vostro",
|
||||
"è"
|
||||
);
|
||||
var stopWordsString2 = new HashSet<>(englishStopWords);
|
||||
stopWordsString2.addAll(italianStopWords);
|
||||
stopWordsString = Collections.unmodifiableSet(stopWordsString2);
|
||||
stopWords = CharArraySet.copy(Stream
|
||||
.concat(englishStopWords.stream(), oldItalianStopWords.stream())
|
||||
.map(String::toCharArray)
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
public static CharArraySet getStopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
public static Set<String> getStopWordsString() {
|
||||
return stopWordsString;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package it.cavallium.dbengine.database.disk;
|
|||
import static it.cavallium.dbengine.lucene.LuceneUtils.checkScoringArgumentsValidity;
|
||||
|
||||
import com.google.common.base.Suppliers;
|
||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
||||
import it.cavallium.dbengine.database.LLDocument;
|
||||
import it.cavallium.dbengine.database.LLKeyScore;
|
||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||
|
@ -50,6 +51,7 @@ import org.apache.lucene.search.SearcherManager;
|
|||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
@ -146,9 +148,12 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
if (lowMemory) {
|
||||
this.luceneQueryScheduler = lowMemorySupplier.get();
|
||||
} else {
|
||||
this.luceneQueryScheduler = Schedulers.newBoundedElastic(Runtime
|
||||
.getRuntime()
|
||||
.availableProcessors(), Schedulers.DEFAULT_BOUNDED_ELASTIC_QUEUESIZE, "lucene-query", 60, true);
|
||||
this.luceneQueryScheduler = Schedulers.newBoundedElastic(Schedulers.DEFAULT_BOUNDED_ELASTIC_SIZE,
|
||||
Schedulers.DEFAULT_BOUNDED_ELASTIC_QUEUESIZE,
|
||||
"lucene-query",
|
||||
60,
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
// Create scheduled tasks lifecycle manager
|
||||
|
@ -409,55 +414,56 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
return Mono.just(LLSearchResult.empty());
|
||||
}
|
||||
|
||||
return acquireSearcherWrapper(snapshot, doDistributedPre, actionId)
|
||||
.flatMap(indexSearcher -> Mono
|
||||
.fromCallable(() -> {
|
||||
var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
|
||||
mlt.setAnalyzer(indexWriter.getAnalyzer());
|
||||
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
|
||||
mlt.setMinTermFreq(1);
|
||||
//mlt.setMinDocFreq(1);
|
||||
mlt.setBoost(true);
|
||||
return acquireSearcherWrapper(snapshot, doDistributedPre, actionId).flatMap(indexSearcher -> Mono
|
||||
.fromCallable(() -> {
|
||||
var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
|
||||
mlt.setAnalyzer(indexWriter.getAnalyzer());
|
||||
mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
|
||||
mlt.setMinTermFreq(1);
|
||||
mlt.setMinDocFreq(3);
|
||||
mlt.setMaxDocFreqPct(20);
|
||||
mlt.setBoost(true);
|
||||
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
|
||||
var similarity = getSimilarity();
|
||||
if (similarity instanceof TFIDFSimilarity) {
|
||||
mlt.setSimilarity((TFIDFSimilarity) similarity);
|
||||
} else {
|
||||
logger.trace("Using an unsupported similarity algorithm for MoreLikeThis: {}. You must use a similarity instance based on TFIDFSimilarity!", similarity);
|
||||
}
|
||||
|
||||
// Get the reference doc and apply it to MoreLikeThis, to generate the query
|
||||
//noinspection BlockingMethodInNonBlockingContext
|
||||
return mlt.like((Map) mltDocumentFields);
|
||||
})
|
||||
.subscribeOn(luceneQueryScheduler)
|
||||
.flatMap(mltQuery -> Mono
|
||||
.fromCallable(() -> {
|
||||
Query luceneQuery;
|
||||
if (luceneAdditionalQuery != null) {
|
||||
luceneQuery = new BooleanQuery.Builder()
|
||||
.add(mltQuery, Occur.MUST)
|
||||
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
|
||||
.build();
|
||||
} else {
|
||||
luceneQuery = mltQuery;
|
||||
}
|
||||
// Get the reference doc and apply it to MoreLikeThis, to generate the query
|
||||
//noinspection BlockingMethodInNonBlockingContext
|
||||
var mltQuery = mlt.like((Map) mltDocumentFields);
|
||||
Query luceneQuery;
|
||||
if (luceneAdditionalQuery != null) {
|
||||
luceneQuery = new BooleanQuery.Builder()
|
||||
.add(mltQuery, Occur.MUST)
|
||||
.add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
|
||||
.build();
|
||||
} else {
|
||||
luceneQuery = mltQuery;
|
||||
}
|
||||
|
||||
return luceneSearch(doDistributedPre,
|
||||
indexSearcher,
|
||||
limit,
|
||||
minCompetitiveScore,
|
||||
keyFieldName,
|
||||
scoreDivisor,
|
||||
luceneQuery,
|
||||
new Sort(SortField.FIELD_SCORE),
|
||||
ScoreMode.TOP_SCORES
|
||||
);
|
||||
}).subscribeOn(luceneQueryScheduler)
|
||||
)
|
||||
.materialize()
|
||||
.flatMap(signal -> {
|
||||
if (signal.isOnComplete() || signal.isOnError()) {
|
||||
return releaseSearcherWrapper(snapshot, indexSearcher).thenReturn(signal);
|
||||
} else {
|
||||
return Mono.just(signal);
|
||||
}
|
||||
})
|
||||
.<LLSearchResult>dematerialize()
|
||||
);
|
||||
return luceneSearch(doDistributedPre,
|
||||
indexSearcher,
|
||||
limit,
|
||||
minCompetitiveScore,
|
||||
keyFieldName,
|
||||
scoreDivisor,
|
||||
luceneQuery,
|
||||
new Sort(SortField.FIELD_SCORE),
|
||||
ScoreMode.TOP_SCORES
|
||||
);
|
||||
})
|
||||
.subscribeOn(luceneQueryScheduler)
|
||||
.materialize()
|
||||
.flatMap(signal -> {
|
||||
if (signal.isOnComplete() || signal.isOnError()) {
|
||||
return releaseSearcherWrapper(snapshot, indexSearcher).thenReturn(signal);
|
||||
} else {
|
||||
return Mono.just(signal);
|
||||
}
|
||||
}).<LLSearchResult>dematerialize());
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
/**
|
||||
|
@ -77,4 +78,14 @@ public class AdaptiveStreamSearcher implements LuceneStreamSearcher {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isScoreSort(Sort luceneSort) {
|
||||
if (luceneSort == null) return false;
|
||||
for (SortField sortField : luceneSort.getSort()) {
|
||||
if (sortField != SortField.FIELD_SCORE) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user