Improve performance

This commit is contained in:
Andrea Cavalli 2021-07-17 23:06:26 +02:00
parent 43439c6f10
commit ff9ee54857
12 changed files with 81 additions and 116 deletions

View File

@ -1,5 +1,6 @@
package it.cavallium.dbengine.client;
import it.cavallium.dbengine.client.query.current.data.NoSort;
import it.cavallium.dbengine.client.query.current.data.NumericSort;
import it.cavallium.dbengine.client.query.current.data.RandomSort;
import it.cavallium.dbengine.client.query.current.data.ScoreSort;
@ -66,6 +67,10 @@ public class MultiSort<T> {
return new MultiSort<>(ScoreSort.of());
}
/**
 * Creates a {@code MultiSort} for {@code SearchResultItem<T, U>} elements, built from {@code NoSort.of()}.
 *
 * @param <T> first type parameter of the {@code SearchResultItem} elements
 * @param <U> second type parameter of the {@code SearchResultItem} elements
 * @return a new {@code MultiSort} constructed with {@code NoSort.of()}
 */
public static <T, U> MultiSort<SearchResultItem<T, U>> noScoreNoSortWithValues() {
return new MultiSort<>(NoSort.of());
}
/**
 * Returns the value of the {@code querySort} field.
 *
 * @return the {@code Sort} held in {@code querySort}
 */
public Sort getQuerySort() {
return querySort;
}

View File

@ -38,6 +38,7 @@ public final record ClientQueryParams<T>(@Nullable CompositeSnapshot snapshot,
case COMPLETE -> ScoreMode.of(false, true);
case COMPLETE_NO_SCORES -> ScoreMode.of(false, false);
case TOP_SCORES -> ScoreMode.of(true, true);
case NO_SCORES -> ScoreMode.of(true, false);
//noinspection UnnecessaryDefault
default -> throw new IllegalArgumentException();
};

View File

@ -158,7 +158,7 @@ public class QueryParser {
} else if (scoreMode.computeScores() && !scoreMode.onlyTopScores()) {
return ScoreMode.COMPLETE;
} else if (!scoreMode.computeScores() && scoreMode.onlyTopScores()) {
throw new IllegalStateException("Conflicting score mode options: [computeScores = false, onlyTopScore = true]");
return ScoreMode.TOP_DOCS;
} else if (!scoreMode.computeScores() && !scoreMode.onlyTopScores()) {
return ScoreMode.COMPLETE_NO_SCORES;
} else {

View File

@ -18,5 +18,11 @@ public enum LLScoreMode {
* hits using the {@link Scorer#setMinCompetitiveScore(float)} API.
* This can reduce time if using setMinCompetitiveScore.
*/
TOP_SCORES
TOP_SCORES,
/**
* Produced scorers will allow visiting some matches but scores won't be
* available.
* Much faster than complete on multi-lucene indices, because it does not need a global scores calculation.
*/
NO_SCORES
}

View File

@ -27,7 +27,9 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
@ -48,6 +50,7 @@ import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.novasearch.lucene.search.similarities.BM25Similarity;
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model;
@ -55,11 +58,16 @@ import org.novasearch.lucene.search.similarities.LdpSimilarity;
import org.novasearch.lucene.search.similarities.LtcSimilarity;
import org.novasearch.lucene.search.similarities.RobertsonSimilarity;
import org.reactivestreams.Publisher;
import org.warp.commonutils.log.Logger;
import org.warp.commonutils.log.LoggerFactory;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Scheduler;
public class LuceneUtils {
private static final Logger logger = LoggerFactory.getLogger(LuceneUtils.class);
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4);
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4);
@ -168,35 +176,29 @@ public class LuceneUtils {
return minCompetitiveScore == null || score >= minCompetitiveScore;
}
@Nullable
/**
* @throws NoSuchElementException when the key is not found
* @throws IOException when an error occurs when reading the document
*/
@NotNull
public static String keyOfTopDoc(int docId, IndexReader indexReader,
String keyFieldName) throws IOException {
String keyFieldName) throws IOException, NoSuchElementException {
if (docId > indexReader.maxDoc()) {
throw new IOException("Document " + docId + " > maxDoc (" +indexReader.maxDoc() + ")");
}
Document d = indexReader.document(docId, Set.of(keyFieldName));
if (d.getFields().isEmpty()) {
StringBuilder sb = new StringBuilder();
sb.append("The document docId: ").append(docId).append(" is empty.");
var realFields = indexReader.document(docId).getFields();
if (!realFields.isEmpty()) {
sb.append("\n");
sb.append("Present fields:\n");
boolean first = true;
for (IndexableField field : realFields) {
if (first) {
first = false;
} else {
sb.append("\n");
}
sb.append(" - ").append(field.name());
}
}
throw new IOException(sb.toString());
throw new NoSuchElementException(
"Can't get key (field \"" + keyFieldName + "\") of document docId: " + docId + ". Available fields: []");
} else {
var field = d.getField(keyFieldName);
if (field == null) {
throw new IOException("Can't get key of document docId: " + docId);
throw new NoSuchElementException(
"Can't get key (field \"" + keyFieldName + "\") of document docId: " + docId + ". Available fields: " + d
.getFields()
.stream()
.map(IndexableField::name)
.collect(Collectors.joining(",", "[", "]")));
} else {
return field.stringValue();
}
@ -353,8 +355,11 @@ public class LuceneUtils {
float score = hit.score;
var indexSearcher = indexSearchers.shard(shardIndex);
try {
@Nullable String collectedDoc = keyOfTopDoc(shardDocId, indexSearcher.getIndexReader(), keyFieldName);
return new LLKeyScore(shardDocId, score, Mono.justOrEmpty(collectedDoc));
String collectedDoc = keyOfTopDoc(shardDocId, indexSearcher.getIndexReader(), keyFieldName);
return new LLKeyScore(shardDocId, score, Mono.just(collectedDoc));
} catch (NoSuchElementException ex) {
logger.debug("Error: document " + shardDocId + " key is not present!");
return null;
} catch (Exception ex) {
return new LLKeyScore(shardDocId, score, Mono.error(ex));
}
@ -425,4 +430,8 @@ public class LuceneUtils {
}
return result;
}
/**
 * Returns the total-hits threshold that the searchers pass when building their
 * top-docs collectors (for example as the last argument of
 * {@code TopDocsSearcher.getTopDocsCollector}).
 *
 * @return the threshold, currently always {@code 0}
 */
public static int totalHitsThreshold() {
return 0;
}
}

View File

@ -5,6 +5,6 @@ import org.apache.lucene.search.ScoreDoc;
public record PaginationInfo(long totalLimit, long firstPageOffset, long firstPageLimit, boolean forceSinglePage) {
public static final int MAX_SINGLE_SEARCH_LIMIT = 1000;
public static final int MAX_SINGLE_SEARCH_LIMIT = 256;
public static final int FIRST_PAGE_LIMIT = 10;
}

View File

@ -29,7 +29,7 @@ public class ScoredLuceneMultiSearcher implements LuceneMultiSearcher {
}
CollectorManager<TopFieldCollector, TopDocs> sharedManager = new ScoringShardsCollectorManager(luceneSort,
LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()),
null, 1000, LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()),
null, LuceneUtils.totalHitsThreshold(), LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()),
LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit()));
return new ScoredSimpleLuceneShardSearcher(sharedManager, queryParams.query(), paginationInfo);
});

View File

@ -100,7 +100,7 @@ class ScoredSimpleLuceneShardSearcher implements LuceneShardSearcher {
}
CollectorManager<TopFieldCollector, TopDocs> sharedManager
= new ScoringShardsCollectorManager(luceneSort, s.currentPageLimit(),
(FieldDoc) s.last(), 1000, 0, s.currentPageLimit());
(FieldDoc) s.last(), LuceneUtils.totalHitsThreshold(), 0, s.currentPageLimit());
//noinspection BlockingMethodInNonBlockingContext
TopDocs pageTopDocs = Flux
.fromIterable(indexSearchersArray)

View File

@ -9,7 +9,9 @@ import it.cavallium.dbengine.lucene.LuceneUtils;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
import reactor.core.scheduler.Scheduler;
@ -31,15 +33,19 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher {
} else {
paginationInfo = new PaginationInfo(queryParams.limit(), queryParams.offset(), FIRST_PAGE_LIMIT, false);
}
//noinspection BlockingMethodInNonBlockingContext
TopDocs firstPageTopDocs = TopDocsSearcher.getTopDocs(indexSearcher,
queryParams.query(),
queryParams.sort(),
LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()),
null,
queryParams.scoreMode().needsScores(),
1000,
LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()), LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit()));
TopDocs firstPageTopDocs;
{
TopDocsCollector<ScoreDoc> firstPageCollector = TopDocsSearcher.getTopDocsCollector(
queryParams.sort(),
LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()),
null,
LuceneUtils.totalHitsThreshold());
//noinspection BlockingMethodInNonBlockingContext
indexSearcher.search(queryParams.query(), firstPageCollector);
firstPageTopDocs = firstPageCollector.topDocs(LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()),
LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit())
);
}
Flux<LLKeyScore> firstPageMono = LuceneUtils
.convertHits(
firstPageTopDocs.scoreDocs,
@ -61,9 +67,14 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher {
if (s.last() != null && s.remainingLimit() > 0) {
TopDocs pageTopDocs;
try {
TopDocsCollector<ScoreDoc> collector = TopDocsSearcher.getTopDocsCollector(queryParams.sort(),
s.currentPageLimit(),
s.last(),
LuceneUtils.totalHitsThreshold()
);
//noinspection BlockingMethodInNonBlockingContext
pageTopDocs = TopDocsSearcher.getTopDocs(indexSearcher, queryParams.query(),
queryParams.sort(), s.currentPageLimit(), s.last(), queryParams.scoreMode().needsScores(), 1000);
indexSearcher.search(queryParams.query(), collector);
pageTopDocs = collector.topDocs();
} catch (IOException e) {
sink.error(e);
return EMPTY_STATUS;

View File

@ -1,96 +1,29 @@
package it.cavallium.dbengine.lucene.searcher;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.misc.search.DiversifiedTopDocsCollector;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.HitQueue;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.MultiCollectorManager.Collectors;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits.Relation;
class TopDocsSearcher {
private final boolean doDocScores;
private final IndexSearcher indexSearcher;
private final Query query;
private final Sort luceneSort;
private final int limit;
private final FieldDoc after;
private final int totalHitsThreshold;
@Deprecated
public TopDocsSearcher(IndexSearcher indexSearcher,
Query query,
Sort luceneSort,
int limit,
FieldDoc after,
boolean doDocScores,
int totalHitsThreshold) {
this.indexSearcher = indexSearcher;
this.query = query;
this.luceneSort = luceneSort;
this.limit = limit;
this.after = after;
this.doDocScores = doDocScores;
this.totalHitsThreshold = totalHitsThreshold;
}
/**
* This method must not be called more than once!
*/
@Deprecated
public TopDocs getTopDocs(int offset, int limit) throws IOException {
return getTopDocs(indexSearcher, query, luceneSort, limit, after, doDocScores, totalHitsThreshold, offset, limit);
}
/**
* This method must not be called more than once!
*/
@Deprecated
public TopDocs getTopDocs() throws IOException {
return getTopDocs(indexSearcher, query, luceneSort, limit, after, doDocScores, totalHitsThreshold);
}
public static TopDocs getTopDocs(IndexSearcher indexSearcher,
Query query,
Sort luceneSort,
int limit,
ScoreDoc after,
boolean doDocScores,
int totalHitsThreshold,
int topDocsStartOffset,
int topDocsHowMany) throws IOException {
TopDocsCollector<?> collector = getTopDocsCollector(luceneSort, limit, after, totalHitsThreshold);
indexSearcher.search(query, collector);
TopDocs topDocs = collector.topDocs(topDocsStartOffset, topDocsHowMany);
if (doDocScores) {
TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, query);
}
return topDocs;
}
public static TopDocs getTopDocs(IndexSearcher indexSearcher,
Query query,
Sort luceneSort,
int limit,
ScoreDoc after,
boolean doDocScores,
int totalHitsThreshold) throws IOException {
TopDocsCollector<?> collector = getTopDocsCollector(luceneSort, limit, after, totalHitsThreshold);
indexSearcher.search(query, collector);
TopDocs topDocs = collector.topDocs();
if (doDocScores) {
TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, query);
}
return topDocs;
}
@SuppressWarnings({"unchecked", "rawtypes"})
public static TopDocsCollector<ScoreDoc> getTopDocsCollector(Sort luceneSort,
int limit,

View File

@ -24,7 +24,7 @@ public class UnscoredLuceneMultiSearcher implements LuceneMultiSearcher {
UnscoredCollectorManager unsortedCollectorManager = new UnscoredCollectorManager(() -> TopDocsSearcher.getTopDocsCollector(queryParams.sort(),
LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()),
null,
1000
LuceneUtils.totalHitsThreshold()
), queryParams.offset(), queryParams.limit(), queryParams.sort());
return new UnscoredLuceneShardSearcher(unsortedCollectorManager, queryParams.query(), paginationInfo);
});

View File

@ -92,7 +92,7 @@ class UnscoredLuceneShardSearcher implements LuceneShardSearcher {
Query luceneQuery = queryParams.query();
UnscoredCollectorManager currentPageUnsortedCollectorManager = new UnscoredCollectorManager(
() -> TopDocsSearcher.getTopDocsCollector(queryParams.sort(), s.currentPageLimit(),
s.last(), 1000), 0, s.currentPageLimit(), queryParams.sort());
s.last(), LuceneUtils.totalHitsThreshold()), 0, s.currentPageLimit(), queryParams.sort());
//noinspection BlockingMethodInNonBlockingContext
TopDocs pageTopDocs = Flux
.fromIterable(indexSearchersArray)