From ff9ee548579a81cd6edbb56790d7cfa0e5b7c452 Mon Sep 17 00:00:00 2001 From: Andrea Cavalli Date: Sat, 17 Jul 2021 23:06:26 +0200 Subject: [PATCH] Improve performance --- .../cavallium/dbengine/client/MultiSort.java | 5 ++ .../client/query/ClientQueryParams.java | 1 + .../dbengine/client/query/QueryParser.java | 2 +- .../dbengine/database/LLScoreMode.java | 8 +- .../dbengine/lucene/LuceneUtils.java | 53 +++++++----- .../lucene/searcher/PaginationInfo.java | 2 +- .../searcher/ScoredLuceneMultiSearcher.java | 2 +- .../ScoredSimpleLuceneShardSearcher.java | 2 +- .../searcher/SimpleLuceneLocalSearcher.java | 33 ++++--- .../lucene/searcher/TopDocsSearcher.java | 85 ++----------------- .../searcher/UnscoredLuceneMultiSearcher.java | 2 +- .../searcher/UnscoredLuceneShardSearcher.java | 2 +- 12 files changed, 81 insertions(+), 116 deletions(-) diff --git a/src/main/java/it/cavallium/dbengine/client/MultiSort.java b/src/main/java/it/cavallium/dbengine/client/MultiSort.java index 6053972..8acfadf 100644 --- a/src/main/java/it/cavallium/dbengine/client/MultiSort.java +++ b/src/main/java/it/cavallium/dbengine/client/MultiSort.java @@ -1,5 +1,6 @@ package it.cavallium.dbengine.client; +import it.cavallium.dbengine.client.query.current.data.NoSort; import it.cavallium.dbengine.client.query.current.data.NumericSort; import it.cavallium.dbengine.client.query.current.data.RandomSort; import it.cavallium.dbengine.client.query.current.data.ScoreSort; @@ -66,6 +67,10 @@ public class MultiSort { return new MultiSort<>(ScoreSort.of()); } + public static MultiSort> noScoreNoSortWithValues() { + return new MultiSort<>(NoSort.of()); + } + public Sort getQuerySort() { return querySort; } diff --git a/src/main/java/it/cavallium/dbengine/client/query/ClientQueryParams.java b/src/main/java/it/cavallium/dbengine/client/query/ClientQueryParams.java index da662cf..6db636e 100644 --- a/src/main/java/it/cavallium/dbengine/client/query/ClientQueryParams.java +++ b/src/main/java/it/cavallium/dbengine/client/query/ClientQueryParams.java @@ -38,6 +38,7 @@ public final record ClientQueryParams(@Nullable CompositeSnapshot snapshot, case COMPLETE -> ScoreMode.of(false, true); case COMPLETE_NO_SCORES -> ScoreMode.of(false, false); case TOP_SCORES -> ScoreMode.of(true, true); + case NO_SCORES -> ScoreMode.of(true, false); //noinspection UnnecessaryDefault default -> throw new IllegalArgumentException(); }; diff --git a/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java b/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java index 176bef6..7b60568 100644 --- a/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java +++ b/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java @@ -158,7 +158,7 @@ public class QueryParser { } else if (scoreMode.computeScores() && !scoreMode.onlyTopScores()) { return ScoreMode.COMPLETE; } else if (!scoreMode.computeScores() && scoreMode.onlyTopScores()) { - throw new IllegalStateException("Conflicting score mode options: [computeScores = false, onlyTopScore = true]"); + return ScoreMode.TOP_DOCS; } else if (!scoreMode.computeScores() && !scoreMode.onlyTopScores()) { return ScoreMode.COMPLETE_NO_SCORES; } else { diff --git a/src/main/java/it/cavallium/dbengine/database/LLScoreMode.java b/src/main/java/it/cavallium/dbengine/database/LLScoreMode.java index d7cf517..7674e0b 100644 --- a/src/main/java/it/cavallium/dbengine/database/LLScoreMode.java +++ b/src/main/java/it/cavallium/dbengine/database/LLScoreMode.java @@ -18,5 +18,11 @@ public enum LLScoreMode { * hits using the {@link Scorer#setMinCompetitiveScore(float)} API. * This can reduce time if using setMinCompetitiveScore. */ - TOP_SCORES + TOP_SCORES, + /** + * Produced scorers will allow visiting some matches but scores won't be + * available. + * Much faster in multi-lucene indices than complete, because it will not need global scores calculation. + */ + NO_SCORES } diff --git a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java index 8c0c9cc..b716253 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java +++ b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java @@ -27,7 +27,9 @@ import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; +import java.util.NoSuchElementException; import java.util.Set; +import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; @@ -48,6 +50,7 @@ import org.apache.lucene.search.similarities.BooleanSimilarity; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; import org.apache.lucene.search.similarities.Similarity; +import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.novasearch.lucene.search.similarities.BM25Similarity; import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model; @@ -55,11 +58,16 @@ import org.novasearch.lucene.search.similarities.LdpSimilarity; import org.novasearch.lucene.search.similarities.LtcSimilarity; import org.novasearch.lucene.search.similarities.RobertsonSimilarity; import org.reactivestreams.Publisher; +import org.warp.commonutils.log.Logger; +import org.warp.commonutils.log.LoggerFactory; import reactor.core.publisher.Flux; import reactor.core.publisher.Mono; import reactor.core.scheduler.Scheduler; public class LuceneUtils { + + private static final Logger logger = LoggerFactory.getLogger(LuceneUtils.class); + private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4); private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4); private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4); @@ -168,35 +176,29 @@ public class LuceneUtils { return minCompetitiveScore == null || score >= minCompetitiveScore; } - @Nullable + /** + * @throws NoSuchElementException when the key is not found + * @throws IOException when an error occurs when reading the document + */ + @NotNull public static String keyOfTopDoc(int docId, IndexReader indexReader, - String keyFieldName) throws IOException { + String keyFieldName) throws IOException, NoSuchElementException { if (docId > indexReader.maxDoc()) { throw new IOException("Document " + docId + " > maxDoc (" +indexReader.maxDoc() + ")"); } Document d = indexReader.document(docId, Set.of(keyFieldName)); if (d.getFields().isEmpty()) { - StringBuilder sb = new StringBuilder(); - sb.append("The document docId: ").append(docId).append(" is empty."); - var realFields = indexReader.document(docId).getFields(); - if (!realFields.isEmpty()) { - sb.append("\n"); - sb.append("Present fields:\n"); - boolean first = true; - for (IndexableField field : realFields) { - if (first) { - first = false; - } else { - sb.append("\n"); - } - sb.append(" - ").append(field.name()); - } - } - throw new IOException(sb.toString()); + throw new NoSuchElementException( + "Can't get key (field \"" + keyFieldName + "\") of document docId: " + docId + ". Available fields: []"); } else { var field = d.getField(keyFieldName); if (field == null) { - throw new IOException("Can't get key of document docId: " + docId); + throw new NoSuchElementException( + "Can't get key (field \"" + keyFieldName + "\") of document docId: " + docId + ". Available fields: " + d + .getFields() + .stream() + .map(IndexableField::name) + .collect(Collectors.joining(",", "[", "]"))); } else { return field.stringValue(); } @@ -353,8 +355,11 @@ public class LuceneUtils { float score = hit.score; var indexSearcher = indexSearchers.shard(shardIndex); try { - @Nullable String collectedDoc = keyOfTopDoc(shardDocId, indexSearcher.getIndexReader(), keyFieldName); - return new LLKeyScore(shardDocId, score, Mono.justOrEmpty(collectedDoc)); + String collectedDoc = keyOfTopDoc(shardDocId, indexSearcher.getIndexReader(), keyFieldName); + return new LLKeyScore(shardDocId, score, Mono.just(collectedDoc)); + } catch (NoSuchElementException ex) { + logger.debug("Error: document " + shardDocId + " key is not present!"); + return null; } catch (Exception ex) { return new LLKeyScore(shardDocId, score, Mono.error(ex)); } @@ -425,4 +430,8 @@ public class LuceneUtils { } return result; } + + public static int totalHitsThreshold() { + return 0; + } } diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/PaginationInfo.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/PaginationInfo.java index b8af906..41138b8 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/PaginationInfo.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/PaginationInfo.java @@ -5,6 +5,6 @@ import org.apache.lucene.search.ScoreDoc; public record PaginationInfo(long totalLimit, long firstPageOffset, long firstPageLimit, boolean forceSinglePage) { - public static final int MAX_SINGLE_SEARCH_LIMIT = 1000; + public static final int MAX_SINGLE_SEARCH_LIMIT = 256; public static final int FIRST_PAGE_LIMIT = 10; } diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredLuceneMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredLuceneMultiSearcher.java index 3e14402..b044055 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredLuceneMultiSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredLuceneMultiSearcher.java @@ -29,7 +29,7 @@ public class ScoredLuceneMultiSearcher implements LuceneMultiSearcher { } CollectorManager sharedManager = new ScoringShardsCollectorManager(luceneSort, LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()), - null, 1000, LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()), + null, LuceneUtils.totalHitsThreshold(), LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()), LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit())); return new ScoredSimpleLuceneShardSearcher(sharedManager, queryParams.query(), paginationInfo); }); diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredSimpleLuceneShardSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredSimpleLuceneShardSearcher.java index 5f3f455..64b0a82 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredSimpleLuceneShardSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/ScoredSimpleLuceneShardSearcher.java @@ -100,7 +100,7 @@ class ScoredSimpleLuceneShardSearcher implements LuceneShardSearcher { } CollectorManager sharedManager = new ScoringShardsCollectorManager(luceneSort, s.currentPageLimit(), - (FieldDoc) s.last(), 1000, 0, s.currentPageLimit()); + (FieldDoc) s.last(), LuceneUtils.totalHitsThreshold(), 0, s.currentPageLimit()); //noinspection BlockingMethodInNonBlockingContext TopDocs pageTopDocs = Flux .fromIterable(indexSearchersArray) diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java index 1fc355c..6b622cb 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/SimpleLuceneLocalSearcher.java @@ -9,7 +9,9 @@ import it.cavallium.dbengine.lucene.LuceneUtils; import java.io.IOException; import java.util.Objects; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; import reactor.core.publisher.Flux; import reactor.core.publisher.Mono; import reactor.core.scheduler.Scheduler; @@ -31,15 +33,19 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher { } else { paginationInfo = new PaginationInfo(queryParams.limit(), queryParams.offset(), FIRST_PAGE_LIMIT, false); } - //noinspection BlockingMethodInNonBlockingContext - TopDocs firstPageTopDocs = TopDocsSearcher.getTopDocs(indexSearcher, - queryParams.query(), - queryParams.sort(), - LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()), - null, - queryParams.scoreMode().needsScores(), - 1000, - LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()), LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit())); + TopDocs firstPageTopDocs; + { + TopDocsCollector firstPageCollector = TopDocsSearcher.getTopDocsCollector( + queryParams.sort(), + LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()), + null, + LuceneUtils.totalHitsThreshold()); + //noinspection BlockingMethodInNonBlockingContext + indexSearcher.search(queryParams.query(), firstPageCollector); + firstPageTopDocs = firstPageCollector.topDocs(LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset()), + LuceneUtils.safeLongToInt(paginationInfo.firstPageLimit()) + ); + } Flux firstPageMono = LuceneUtils .convertHits( firstPageTopDocs.scoreDocs, @@ -61,9 +67,14 @@ public class SimpleLuceneLocalSearcher implements LuceneLocalSearcher { if (s.last() != null && s.remainingLimit() > 0) { TopDocs pageTopDocs; try { + TopDocsCollector collector = TopDocsSearcher.getTopDocsCollector(queryParams.sort(), + s.currentPageLimit(), + s.last(), + LuceneUtils.totalHitsThreshold() + ); //noinspection BlockingMethodInNonBlockingContext - pageTopDocs = TopDocsSearcher.getTopDocs(indexSearcher, queryParams.query(), - queryParams.sort(), s.currentPageLimit(), s.last(), queryParams.scoreMode().needsScores(), 1000); + indexSearcher.search(queryParams.query(), collector); + pageTopDocs = collector.topDocs(); } catch (IOException e) { sink.error(e); return EMPTY_STATUS; diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/TopDocsSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/TopDocsSearcher.java index 4906294..f7206a7 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/TopDocsSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/TopDocsSearcher.java @@ -1,96 +1,29 @@ package it.cavallium.dbengine.lucene.searcher; import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.misc.search.DiversifiedTopDocsCollector; +import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.Collector; import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.HitQueue; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.MultiCollectorManager.Collectors; import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorable; import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocsCollector; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.TotalHits.Relation; class TopDocsSearcher { - private final boolean doDocScores; - private final IndexSearcher indexSearcher; - private final Query query; - private final Sort luceneSort; - private final int limit; - private final FieldDoc after; - private final int totalHitsThreshold; - - @Deprecated - public TopDocsSearcher(IndexSearcher indexSearcher, - Query query, - Sort luceneSort, - int limit, - FieldDoc after, - boolean doDocScores, - int totalHitsThreshold) { - this.indexSearcher = indexSearcher; - this.query = query; - this.luceneSort = luceneSort; - this.limit = limit; - this.after = after; - this.doDocScores = doDocScores; - this.totalHitsThreshold = totalHitsThreshold; - } - - /** - * This method must not be called more than once! - */ - @Deprecated - public TopDocs getTopDocs(int offset, int limit) throws IOException { - return getTopDocs(indexSearcher, query, luceneSort, limit, after, doDocScores, totalHitsThreshold, offset, limit); - } - - /** - * This method must not be called more than once! - */ - @Deprecated - public TopDocs getTopDocs() throws IOException { - return getTopDocs(indexSearcher, query, luceneSort, limit, after, doDocScores, totalHitsThreshold); - } - - public static TopDocs getTopDocs(IndexSearcher indexSearcher, - Query query, - Sort luceneSort, - int limit, - ScoreDoc after, - boolean doDocScores, - int totalHitsThreshold, - - int topDocsStartOffset, - int topDocsHowMany) throws IOException { - TopDocsCollector collector = getTopDocsCollector(luceneSort, limit, after, totalHitsThreshold); - indexSearcher.search(query, collector); - TopDocs topDocs = collector.topDocs(topDocsStartOffset, topDocsHowMany); - if (doDocScores) { - TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, query); - } - return topDocs; - } - - public static TopDocs getTopDocs(IndexSearcher indexSearcher, - Query query, - Sort luceneSort, - int limit, - ScoreDoc after, - boolean doDocScores, - int totalHitsThreshold) throws IOException { - TopDocsCollector collector = getTopDocsCollector(luceneSort, limit, after, totalHitsThreshold); - indexSearcher.search(query, collector); - TopDocs topDocs = collector.topDocs(); - if (doDocScores) { - TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, query); - } - return topDocs; - } - @SuppressWarnings({"unchecked", "rawtypes"}) public static TopDocsCollector getTopDocsCollector(Sort luceneSort, int limit, diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneMultiSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneMultiSearcher.java index 5baeb3f..ebf4092 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneMultiSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneMultiSearcher.java @@ -24,7 +24,7 @@ public class UnscoredLuceneMultiSearcher implements LuceneMultiSearcher { UnscoredCollectorManager unsortedCollectorManager = new UnscoredCollectorManager(() -> TopDocsSearcher.getTopDocsCollector(queryParams.sort(), LuceneUtils.safeLongToInt(paginationInfo.firstPageOffset() + paginationInfo.firstPageLimit()), null, - 1000 + LuceneUtils.totalHitsThreshold() ), queryParams.offset(), queryParams.limit(), queryParams.sort()); return new UnscoredLuceneShardSearcher(unsortedCollectorManager, queryParams.query(), paginationInfo); }); diff --git a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneShardSearcher.java b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneShardSearcher.java index f1cb7ac..c97e07b 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneShardSearcher.java +++ b/src/main/java/it/cavallium/dbengine/lucene/searcher/UnscoredLuceneShardSearcher.java @@ -92,7 +92,7 @@ class UnscoredLuceneShardSearcher implements LuceneShardSearcher { Query luceneQuery = queryParams.query(); UnscoredCollectorManager currentPageUnsortedCollectorManager = new UnscoredCollectorManager( () -> TopDocsSearcher.getTopDocsCollector(queryParams.sort(), s.currentPageLimit(), - s.last(), 1000), 0, s.currentPageLimit(), queryParams.sort()); + s.last(), LuceneUtils.totalHitsThreshold()), 0, s.currentPageLimit(), queryParams.sort()); //noinspection BlockingMethodInNonBlockingContext TopDocs pageTopDocs = Flux .fromIterable(indexSearchersArray)