Global scores between multiple lucene shards
This commit is contained in:
parent
059da90ef4
commit
151884b772
6
pom.xml
6
pom.xml
@ -168,7 +168,11 @@
|
|||||||
<artifactId>reactor-tools</artifactId>
|
<artifactId>reactor-tools</artifactId>
|
||||||
<version>3.4.2</version>
|
<version>3.4.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.novasearch</groupId>
|
||||||
|
<artifactId>lucene-relevance</artifactId>
|
||||||
|
<version>8.0.0.0.5</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
<testSourceDirectory>src/test/java</testSourceDirectory>
|
<testSourceDirectory>src/test/java</testSourceDirectory>
|
||||||
|
@ -5,9 +5,11 @@ import it.cavallium.dbengine.database.LLItem;
|
|||||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||||
import it.cavallium.dbengine.database.LLScoreMode;
|
import it.cavallium.dbengine.database.LLScoreMode;
|
||||||
import it.cavallium.dbengine.database.LLTerm;
|
import it.cavallium.dbengine.database.LLTerm;
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
import it.cavallium.dbengine.database.disk.LLLocalDatabaseConnection;
|
import it.cavallium.dbengine.database.disk.LLLocalDatabaseConnection;
|
||||||
import it.cavallium.dbengine.lucene.serializer.TermQuery;
|
import it.cavallium.dbengine.lucene.serializer.Query;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -27,12 +29,12 @@ public class IndicizationExample {
|
|||||||
.addDocument(new LLTerm("id", "123"),
|
.addDocument(new LLTerm("id", "123"),
|
||||||
new LLDocument(new LLItem[]{
|
new LLDocument(new LLItem[]{
|
||||||
LLItem.newStringField("id", "123", Store.YES),
|
LLItem.newStringField("id", "123", Store.YES),
|
||||||
LLItem.newStringField("name", "Mario", Store.NO),
|
LLItem.newTextField("name", "Mario", Store.NO),
|
||||||
LLItem.newStringField("surname", "Rossi", Store.NO)
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
.then(index.refresh())
|
.then(index.refresh())
|
||||||
.then(index.search(null, new TermQuery("name", "Mario"), 1, null, LLScoreMode.COMPLETE_NO_SCORES, "id"))
|
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.PartialString,"name", "Mario"), 1, null, LLScoreMode.COMPLETE, "id"))
|
||||||
.flatMap(results -> results
|
.flatMap(results -> results
|
||||||
.results()
|
.results()
|
||||||
.flatMap(r -> r)
|
.flatMap(r -> r)
|
||||||
@ -45,6 +47,70 @@ public class IndicizationExample {
|
|||||||
)
|
)
|
||||||
.subscribeOn(Schedulers.parallel())
|
.subscribeOn(Schedulers.parallel())
|
||||||
.block();
|
.block();
|
||||||
|
tempIndex(true)
|
||||||
|
.flatMap(index ->
|
||||||
|
index
|
||||||
|
.addDocument(new LLTerm("id", "126"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "126", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Marioxq", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
)
|
||||||
|
.then(index
|
||||||
|
.addDocument(new LLTerm("id", "123"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "123", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Mario", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
))
|
||||||
|
.then(index
|
||||||
|
.addDocument(new LLTerm("id", "124"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "124", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Mariossi", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
))
|
||||||
|
.then(index
|
||||||
|
.addDocument(new LLTerm("id", "125"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "125", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Mario marios", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
))
|
||||||
|
.then(index
|
||||||
|
.addDocument(new LLTerm("id", "128"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "128", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Marion", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
))
|
||||||
|
.then(index
|
||||||
|
.addDocument(new LLTerm("id", "127"),
|
||||||
|
new LLDocument(new LLItem[]{
|
||||||
|
LLItem.newStringField("id", "127", Store.YES),
|
||||||
|
LLItem.newTextField("name", "Mariotto", Store.NO),
|
||||||
|
LLItem.newStringField("surname", "Rossi", Store.NO)
|
||||||
|
})
|
||||||
|
))
|
||||||
|
.then(index.refresh())
|
||||||
|
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.PartialString,"name", "Mario"), 10, MultiSort.topScore()
|
||||||
|
.getQuerySort(), LLScoreMode.COMPLETE, "id"))
|
||||||
|
.flatMap(results -> LuceneUtils.mergeStream(results
|
||||||
|
.results(), MultiSort.topScoreRaw(), 10)
|
||||||
|
.doOnNext(value -> System.out.println("Value: " + value))
|
||||||
|
.then(results.totalHitsCount())
|
||||||
|
)
|
||||||
|
.doOnNext(count -> System.out.println("Total hits: " + count))
|
||||||
|
.doOnTerminate(() -> System.out.println("Completed"))
|
||||||
|
.then(index.close())
|
||||||
|
)
|
||||||
|
.subscribeOn(Schedulers.parallel())
|
||||||
|
.block();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final class CurrentCustomType {
|
public static final class CurrentCustomType {
|
||||||
@ -86,8 +152,9 @@ public class IndicizationExample {
|
|||||||
.subscribeOn(Schedulers.boundedElastic())
|
.subscribeOn(Schedulers.boundedElastic())
|
||||||
.then(new LLLocalDatabaseConnection(wrkspcPath, true).connect())
|
.then(new LLLocalDatabaseConnection(wrkspcPath, true).connect())
|
||||||
.flatMap(conn -> conn.getLuceneIndex("testindices",
|
.flatMap(conn -> conn.getLuceneIndex("testindices",
|
||||||
3,
|
10,
|
||||||
TextFieldsAnalyzer.PartialWords,
|
TextFieldsAnalyzer.PartialString,
|
||||||
|
TextFieldsSimilarity.NGramBM25Plus,
|
||||||
Duration.ofSeconds(5),
|
Duration.ofSeconds(5),
|
||||||
Duration.ofSeconds(5),
|
Duration.ofSeconds(5),
|
||||||
false
|
false
|
||||||
|
@ -7,7 +7,7 @@ import it.cavallium.dbengine.database.LLSnapshot;
|
|||||||
import it.cavallium.dbengine.database.LLSnapshottable;
|
import it.cavallium.dbengine.database.LLSnapshottable;
|
||||||
import it.cavallium.dbengine.database.LLSort;
|
import it.cavallium.dbengine.database.LLSort;
|
||||||
import it.cavallium.dbengine.database.LLTerm;
|
import it.cavallium.dbengine.database.LLTerm;
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
|
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
|
||||||
import it.cavallium.dbengine.lucene.serializer.Query;
|
import it.cavallium.dbengine.lucene.serializer.Query;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package it.cavallium.dbengine.client;
|
package it.cavallium.dbengine.client;
|
||||||
|
|
||||||
|
import it.cavallium.dbengine.database.LLKeyScore;
|
||||||
import it.cavallium.dbengine.database.LLSort;
|
import it.cavallium.dbengine.database.LLSort;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.function.ToIntFunction;
|
import java.util.function.ToIntFunction;
|
||||||
@ -63,12 +64,16 @@ public class MultiSort<T> {
|
|||||||
return new MultiSort<>(LLSort.newRandomSortField(), (a, b) -> 0);
|
return new MultiSort<>(LLSort.newRandomSortField(), (a, b) -> 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static MultiSort<LLKeyScore> topScoreRaw() {
|
||||||
|
return new MultiSort<>(LLSort.newSortScore(), Comparator.comparingDouble(LLKeyScore::getScore).reversed());
|
||||||
|
}
|
||||||
|
|
||||||
public static <T> MultiSort<SearchResultKey<T>> topScore() {
|
public static <T> MultiSort<SearchResultKey<T>> topScore() {
|
||||||
return new MultiSort<>(null, Comparator.<SearchResultKey<T>>comparingDouble(SearchResultKey::getScore).reversed());
|
return new MultiSort<>(LLSort.newSortScore(), Comparator.<SearchResultKey<T>>comparingDouble(SearchResultKey::getScore).reversed());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T, U> MultiSort<SearchResultItem<T, U>> topScoreWithValues() {
|
public static <T, U> MultiSort<SearchResultItem<T, U>> topScoreWithValues() {
|
||||||
return new MultiSort<>(null, Comparator.<SearchResultItem<T, U>>comparingDouble(SearchResultItem::getScore).reversed());
|
return new MultiSort<>(LLSort.newSortScore(), Comparator.<SearchResultItem<T, U>>comparingDouble(SearchResultItem::getScore).reversed());
|
||||||
}
|
}
|
||||||
|
|
||||||
public LLSort getQuerySort() {
|
public LLSort getQuerySort() {
|
||||||
|
@ -0,0 +1,9 @@
|
|||||||
|
package it.cavallium.dbengine.database;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
|
||||||
|
public interface LLCollectionStatisticsGetter {
|
||||||
|
|
||||||
|
CollectionStatistics collectionStatistics(String field) throws IOException;
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
package it.cavallium.dbengine.database;
|
package it.cavallium.dbengine.database;
|
||||||
|
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import reactor.core.publisher.Mono;
|
import reactor.core.publisher.Mono;
|
||||||
@ -15,6 +16,7 @@ public interface LLDatabaseConnection {
|
|||||||
Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
|
Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
|
||||||
int instancesCount,
|
int instancesCount,
|
||||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||||
|
TextFieldsSimilarity scorer,
|
||||||
Duration queryRefreshDebounceTime,
|
Duration queryRefreshDebounceTime,
|
||||||
Duration commitDebounceTime,
|
Duration commitDebounceTime,
|
||||||
boolean lowMemory);
|
boolean lowMemory);
|
||||||
|
@ -1,7 +1,22 @@
|
|||||||
package it.cavallium.dbengine.database;
|
package it.cavallium.dbengine.database;
|
||||||
|
|
||||||
public enum LLScoreMode {
|
import org.apache.lucene.search.Scorer;
|
||||||
|
|
||||||
|
public enum LLScoreMode {
|
||||||
|
/**
|
||||||
|
* Produced scorers will allow visiting all matches and get their score.
|
||||||
|
*/
|
||||||
COMPLETE,
|
COMPLETE,
|
||||||
TOP_SCORES,
|
/**
|
||||||
COMPLETE_NO_SCORES
|
* Produced scorers will allow visiting all matches but scores won't be
|
||||||
|
* available.
|
||||||
|
* Much faster in multi-lucene indices than complete, because it will not need global scores calculation.
|
||||||
|
*/
|
||||||
|
COMPLETE_NO_SCORES,
|
||||||
|
/**
|
||||||
|
* Produced scorers will optionally allow skipping over non-competitive
|
||||||
|
* hits using the {@link Scorer#setMinCompetitiveScore(float)} API.
|
||||||
|
* This can reduce time if using setMinCompetitiveScore.
|
||||||
|
*/
|
||||||
|
TOP_SCORES
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,13 @@
|
|||||||
|
package it.cavallium.dbengine.database;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
|
||||||
|
public interface LLSearchCollectionStatisticsGetter {
|
||||||
|
|
||||||
|
CollectionStatistics collectionStatistics(IndexSearcher indexSearcher,
|
||||||
|
String field,
|
||||||
|
boolean distributedPre,
|
||||||
|
long actionId) throws IOException;
|
||||||
|
}
|
@ -25,6 +25,7 @@ public class LLSearchResult {
|
|||||||
return (a, b) -> {
|
return (a, b) -> {
|
||||||
var mergedTotals = a.totalHitsCount.flatMap(aL -> b.totalHitsCount.map(bL -> aL + bL));
|
var mergedTotals = a.totalHitsCount.flatMap(aL -> b.totalHitsCount.map(bL -> aL + bL));
|
||||||
var mergedResults = Flux.merge(a.results, b.results);
|
var mergedResults = Flux.merge(a.results, b.results);
|
||||||
|
|
||||||
return new LLSearchResult(mergedTotals, mergedResults);
|
return new LLSearchResult(mergedTotals, mergedResults);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -37,6 +38,10 @@ public class LLSearchResult {
|
|||||||
return this.results;
|
return this.results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Mono<Void> completion() {
|
||||||
|
return results.flatMap(r -> r).then();
|
||||||
|
}
|
||||||
|
|
||||||
public boolean equals(final Object o) {
|
public boolean equals(final Object o) {
|
||||||
if (o == this) {
|
if (o == this) {
|
||||||
return true;
|
return true;
|
||||||
|
@ -22,6 +22,14 @@ public class LLSort {
|
|||||||
return new LLSort(null, LLSortType.RANDOM, false);
|
return new LLSort(null, LLSortType.RANDOM, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static LLSort newSortScore() {
|
||||||
|
return new LLSort(null, LLSortType.SCORE, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static LLSort newSortDoc() {
|
||||||
|
return new LLSort(null, LLSortType.DOC, false);
|
||||||
|
}
|
||||||
|
|
||||||
public String getFieldName() {
|
public String getFieldName() {
|
||||||
return fieldName;
|
return fieldName;
|
||||||
}
|
}
|
||||||
|
@ -2,5 +2,7 @@ package it.cavallium.dbengine.database;
|
|||||||
|
|
||||||
public enum LLSortType {
|
public enum LLSortType {
|
||||||
LONG,
|
LONG,
|
||||||
RANDOM
|
RANDOM,
|
||||||
|
SCORE,
|
||||||
|
DOC
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,10 @@ public class LLUtils {
|
|||||||
return new Sort(new SortedNumericSortField(sort.getFieldName(), SortField.Type.LONG, sort.isReverse()));
|
return new Sort(new SortedNumericSortField(sort.getFieldName(), SortField.Type.LONG, sort.isReverse()));
|
||||||
} else if (sort.getType() == LLSortType.RANDOM) {
|
} else if (sort.getType() == LLSortType.RANDOM) {
|
||||||
return new Sort(new RandomSortField());
|
return new Sort(new RandomSortField());
|
||||||
|
} else if (sort.getType() == LLSortType.SCORE) {
|
||||||
|
return new Sort(SortField.FIELD_SCORE);
|
||||||
|
} else if (sort.getType() == LLSortType.DOC) {
|
||||||
|
return new Sort(SortField.FIELD_DOC);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -1,88 +0,0 @@
|
|||||||
package it.cavallium.dbengine.database;
|
|
||||||
|
|
||||||
import it.cavallium.dbengine.client.MultiSort;
|
|
||||||
import it.cavallium.dbengine.database.analyzer.N4CharGramAnalyzer;
|
|
||||||
import it.cavallium.dbengine.database.analyzer.N4CharGramEdgeAnalyzer;
|
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
|
||||||
import it.cavallium.dbengine.database.analyzer.WordAnalyzer;
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
|
||||||
import org.apache.lucene.analysis.en.KStemFilter;
|
|
||||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
|
||||||
import org.jetbrains.annotations.Nullable;
|
|
||||||
import reactor.core.publisher.Flux;
|
|
||||||
|
|
||||||
public class LuceneUtils {
|
|
||||||
private static final Analyzer lucene4CharGramAnalyzerEdgeInstance = new N4CharGramEdgeAnalyzer();
|
|
||||||
private static final Analyzer lucene4CharGramAnalyzerInstance = new N4CharGramAnalyzer();
|
|
||||||
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true);
|
|
||||||
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false);
|
|
||||||
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true);
|
|
||||||
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
|
|
||||||
|
|
||||||
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
|
|
||||||
switch (analyzer) {
|
|
||||||
case PartialWordsEdge:
|
|
||||||
return lucene4CharGramAnalyzerEdgeInstance;
|
|
||||||
case PartialWords:
|
|
||||||
return lucene4CharGramAnalyzerInstance;
|
|
||||||
case FullText:
|
|
||||||
return luceneWordAnalyzerStopWordsAndStemInstance;
|
|
||||||
case WordWithStopwordsStripping:
|
|
||||||
return luceneWordAnalyzerStopWordsInstance;
|
|
||||||
case WordWithStemming:
|
|
||||||
return luceneWordAnalyzerStemInstance;
|
|
||||||
case WordSimple:
|
|
||||||
return luceneWordAnalyzerSimpleInstance;
|
|
||||||
default:
|
|
||||||
throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param stem Enable stem filters on words.
|
|
||||||
* Pass false if it will be used with a n-gram filter
|
|
||||||
*/
|
|
||||||
public static TokenStream newCommonFilter(TokenStream tokenStream, boolean stem) {
|
|
||||||
tokenStream = newCommonNormalizer(tokenStream);
|
|
||||||
if (stem) {
|
|
||||||
tokenStream = new KStemFilter(tokenStream);
|
|
||||||
tokenStream = new EnglishPossessiveFilter(tokenStream);
|
|
||||||
}
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
|
|
||||||
tokenStream = new ASCIIFoldingFilter(tokenStream);
|
|
||||||
tokenStream = new LowerCaseFilter(tokenStream);
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Merge streams together maintaining absolute order
|
|
||||||
*/
|
|
||||||
public static <T> Flux<T> mergeStream(Flux<Flux<T>> mappedMultiResults,
|
|
||||||
@Nullable MultiSort<T> sort,
|
|
||||||
@Nullable Integer limit) {
|
|
||||||
if (limit != null && limit == 0) {
|
|
||||||
return mappedMultiResults.flatMap(f -> f).ignoreElements().flux();
|
|
||||||
}
|
|
||||||
return mappedMultiResults.collectList().flatMapMany(mappedMultiResultsList -> {
|
|
||||||
Flux<T> mergedFlux;
|
|
||||||
if (sort == null) {
|
|
||||||
mergedFlux = Flux.merge(mappedMultiResultsList);
|
|
||||||
} else {
|
|
||||||
//noinspection unchecked
|
|
||||||
mergedFlux = Flux.mergeOrdered(32, sort.getResultSort(), mappedMultiResultsList.toArray(Flux[]::new));
|
|
||||||
}
|
|
||||||
if (limit == null) {
|
|
||||||
return mergedFlux;
|
|
||||||
} else {
|
|
||||||
return mergedFlux.take(limit);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
@ -0,0 +1,33 @@
|
|||||||
|
package it.cavallium.dbengine.database.disk;
|
||||||
|
|
||||||
|
import it.cavallium.dbengine.database.LLSearchCollectionStatisticsGetter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
|
||||||
|
public class LLIndexSearcherWithCustomCollectionStatistics extends IndexSearcher {
|
||||||
|
|
||||||
|
private final IndexSearcher indexSearcher;
|
||||||
|
private final LLSearchCollectionStatisticsGetter customCollectionStatisticsGetter;
|
||||||
|
private final boolean distributedPre;
|
||||||
|
private final long actionId;
|
||||||
|
|
||||||
|
public LLIndexSearcherWithCustomCollectionStatistics(IndexSearcher indexSearcher,
|
||||||
|
LLSearchCollectionStatisticsGetter customCollectionStatisticsGetter,
|
||||||
|
boolean distributedPre,
|
||||||
|
long actionId) {
|
||||||
|
super(indexSearcher.getIndexReader());
|
||||||
|
this.indexSearcher = indexSearcher;
|
||||||
|
this.setSimilarity(indexSearcher.getSimilarity());
|
||||||
|
this.setQueryCache(indexSearcher.getQueryCache());
|
||||||
|
this.setQueryCachingPolicy(indexSearcher.getQueryCachingPolicy());
|
||||||
|
this.customCollectionStatisticsGetter = customCollectionStatisticsGetter;
|
||||||
|
this.distributedPre = distributedPre;
|
||||||
|
this.actionId = actionId;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CollectionStatistics collectionStatistics(String field) throws IOException {
|
||||||
|
return customCollectionStatisticsGetter.collectionStatistics(indexSearcher, field, distributedPre, actionId);
|
||||||
|
}
|
||||||
|
}
|
@ -3,7 +3,8 @@ package it.cavallium.dbengine.database.disk;
|
|||||||
import it.cavallium.dbengine.database.Column;
|
import it.cavallium.dbengine.database.Column;
|
||||||
import it.cavallium.dbengine.database.LLDatabaseConnection;
|
import it.cavallium.dbengine.database.LLDatabaseConnection;
|
||||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
@ -51,6 +52,7 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||||||
public Mono<LLLuceneIndex> getLuceneIndex(String name,
|
public Mono<LLLuceneIndex> getLuceneIndex(String name,
|
||||||
int instancesCount,
|
int instancesCount,
|
||||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||||
|
TextFieldsSimilarity textFieldsSimilarity,
|
||||||
Duration queryRefreshDebounceTime,
|
Duration queryRefreshDebounceTime,
|
||||||
Duration commitDebounceTime,
|
Duration commitDebounceTime,
|
||||||
boolean lowMemory) {
|
boolean lowMemory) {
|
||||||
@ -61,6 +63,7 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||||||
name,
|
name,
|
||||||
instancesCount,
|
instancesCount,
|
||||||
textFieldsAnalyzer,
|
textFieldsAnalyzer,
|
||||||
|
textFieldsSimilarity,
|
||||||
queryRefreshDebounceTime,
|
queryRefreshDebounceTime,
|
||||||
commitDebounceTime,
|
commitDebounceTime,
|
||||||
lowMemory
|
lowMemory
|
||||||
@ -69,9 +72,11 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||||||
return new LLLocalLuceneIndex(basePath.resolve("lucene"),
|
return new LLLocalLuceneIndex(basePath.resolve("lucene"),
|
||||||
name,
|
name,
|
||||||
textFieldsAnalyzer,
|
textFieldsAnalyzer,
|
||||||
|
textFieldsSimilarity,
|
||||||
queryRefreshDebounceTime,
|
queryRefreshDebounceTime,
|
||||||
commitDebounceTime,
|
commitDebounceTime,
|
||||||
lowMemory
|
lowMemory,
|
||||||
|
null
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
@ -93,30 +93,6 @@ public class LLLocalKeyValueDatabase implements LLKeyValueDatabase {
|
|||||||
this.handles.put(columns.get(i), handles.get(i));
|
this.handles.put(columns.get(i), handles.get(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println("----Data----");
|
|
||||||
this.handles.forEach((Column column, ColumnFamilyHandle hnd) -> {
|
|
||||||
System.out.println("Column: " + column.getName());
|
|
||||||
if (!column.getName().contains("hash")) {
|
|
||||||
var val = new ArrayList<String>();
|
|
||||||
var iter = db.newIterator(hnd);
|
|
||||||
iter.seekToFirst();
|
|
||||||
while (iter.isValid()) {
|
|
||||||
val.add(Column.toString(iter.key()));
|
|
||||||
System.out.println(" " + Column.toString(iter.key()));
|
|
||||||
iter.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println("----Columns----");
|
|
||||||
this.handles.forEach((Column column, ColumnFamilyHandle hnd) -> {
|
|
||||||
System.out.println("Column: " + column.getName());
|
|
||||||
});
|
|
||||||
*/
|
|
||||||
|
|
||||||
flushDb(db, handles);
|
flushDb(db, handles);
|
||||||
} catch (RocksDBException ex) {
|
} catch (RocksDBException ex) {
|
||||||
throw new IOException(ex);
|
throw new IOException(ex);
|
||||||
|
@ -4,15 +4,18 @@ import it.cavallium.dbengine.database.LLDocument;
|
|||||||
import it.cavallium.dbengine.database.LLKeyScore;
|
import it.cavallium.dbengine.database.LLKeyScore;
|
||||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||||
import it.cavallium.dbengine.database.LLScoreMode;
|
import it.cavallium.dbengine.database.LLScoreMode;
|
||||||
|
import it.cavallium.dbengine.database.LLSearchCollectionStatisticsGetter;
|
||||||
import it.cavallium.dbengine.database.LLSearchResult;
|
import it.cavallium.dbengine.database.LLSearchResult;
|
||||||
import it.cavallium.dbengine.database.LLSnapshot;
|
import it.cavallium.dbengine.database.LLSnapshot;
|
||||||
import it.cavallium.dbengine.database.LLSort;
|
import it.cavallium.dbengine.database.LLSort;
|
||||||
import it.cavallium.dbengine.database.LLTerm;
|
import it.cavallium.dbengine.database.LLTerm;
|
||||||
import it.cavallium.dbengine.database.LLUtils;
|
import it.cavallium.dbengine.database.LLUtils;
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
|
||||||
import it.cavallium.dbengine.lucene.ScheduledTaskLifecycle;
|
import it.cavallium.dbengine.lucene.ScheduledTaskLifecycle;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
import it.cavallium.dbengine.lucene.searcher.AdaptiveStreamSearcher;
|
import it.cavallium.dbengine.lucene.searcher.AdaptiveStreamSearcher;
|
||||||
|
import it.cavallium.dbengine.lucene.searcher.AllowOnlyQueryParsingCollectorStreamSearcher;
|
||||||
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher;
|
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher;
|
||||||
import it.cavallium.dbengine.lucene.searcher.PagedStreamSearcher;
|
import it.cavallium.dbengine.lucene.searcher.PagedStreamSearcher;
|
||||||
import it.cavallium.dbengine.lucene.serializer.QueryParser;
|
import it.cavallium.dbengine.lucene.serializer.QueryParser;
|
||||||
@ -39,6 +42,7 @@ import org.apache.lucene.search.Query;
|
|||||||
import org.apache.lucene.search.ScoreMode;
|
import org.apache.lucene.search.ScoreMode;
|
||||||
import org.apache.lucene.search.SearcherManager;
|
import org.apache.lucene.search.SearcherManager;
|
||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
@ -48,6 +52,7 @@ import reactor.core.publisher.Mono;
|
|||||||
import reactor.core.publisher.Sinks;
|
import reactor.core.publisher.Sinks;
|
||||||
import reactor.core.publisher.Sinks.EmissionException;
|
import reactor.core.publisher.Sinks.EmissionException;
|
||||||
import reactor.core.publisher.Sinks.EmitResult;
|
import reactor.core.publisher.Sinks.EmitResult;
|
||||||
|
import reactor.core.publisher.Sinks.Empty;
|
||||||
import reactor.core.publisher.Sinks.Many;
|
import reactor.core.publisher.Sinks.Many;
|
||||||
import reactor.core.publisher.Sinks.One;
|
import reactor.core.publisher.Sinks.One;
|
||||||
import reactor.core.scheduler.Scheduler;
|
import reactor.core.scheduler.Scheduler;
|
||||||
@ -58,6 +63,7 @@ import reactor.util.function.Tuples;
|
|||||||
public class LLLocalLuceneIndex implements LLLuceneIndex {
|
public class LLLocalLuceneIndex implements LLLuceneIndex {
|
||||||
|
|
||||||
private static final LuceneStreamSearcher streamSearcher = new AdaptiveStreamSearcher();
|
private static final LuceneStreamSearcher streamSearcher = new AdaptiveStreamSearcher();
|
||||||
|
private static final AllowOnlyQueryParsingCollectorStreamSearcher allowOnlyQueryParsingCollectorStreamSearcher = new AllowOnlyQueryParsingCollectorStreamSearcher();
|
||||||
/**
|
/**
|
||||||
* Global lucene index scheduler.
|
* Global lucene index scheduler.
|
||||||
* There is only a single thread globally to not overwhelm the disk with
|
* There is only a single thread globally to not overwhelm the disk with
|
||||||
@ -84,15 +90,19 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
*/
|
*/
|
||||||
private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>();
|
private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>();
|
||||||
private final boolean lowMemory;
|
private final boolean lowMemory;
|
||||||
|
private final TextFieldsSimilarity similarity;
|
||||||
|
|
||||||
private final ScheduledTaskLifecycle scheduledTasksLifecycle;
|
private final ScheduledTaskLifecycle scheduledTasksLifecycle;
|
||||||
|
private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter;
|
||||||
|
|
||||||
public LLLocalLuceneIndex(Path luceneBasePath,
|
public LLLocalLuceneIndex(Path luceneBasePath,
|
||||||
String name,
|
String name,
|
||||||
TextFieldsAnalyzer analyzer,
|
TextFieldsAnalyzer analyzer,
|
||||||
|
TextFieldsSimilarity similarity,
|
||||||
Duration queryRefreshDebounceTime,
|
Duration queryRefreshDebounceTime,
|
||||||
Duration commitDebounceTime,
|
Duration commitDebounceTime,
|
||||||
boolean lowMemory) throws IOException {
|
boolean lowMemory,
|
||||||
|
@Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException {
|
||||||
if (name.length() == 0) {
|
if (name.length() == 0) {
|
||||||
throw new IOException("Empty lucene database name");
|
throw new IOException("Empty lucene database name");
|
||||||
}
|
}
|
||||||
@ -101,6 +111,8 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
this.luceneIndexName = name;
|
this.luceneIndexName = name;
|
||||||
this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
|
this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
|
||||||
this.lowMemory = lowMemory;
|
this.lowMemory = lowMemory;
|
||||||
|
this.similarity = similarity;
|
||||||
|
this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter;
|
||||||
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer));
|
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer));
|
||||||
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
|
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
|
||||||
indexWriterConfig.setIndexDeletionPolicy(snapshotter);
|
indexWriterConfig.setIndexDeletionPolicy(snapshotter);
|
||||||
@ -112,6 +124,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
indexWriterConfig.setRAMBufferSizeMB(128);
|
indexWriterConfig.setRAMBufferSizeMB(128);
|
||||||
indexWriterConfig.setRAMPerThreadHardLimitMB(512);
|
indexWriterConfig.setRAMPerThreadHardLimitMB(512);
|
||||||
}
|
}
|
||||||
|
indexWriterConfig.setSimilarity(getSimilarity());
|
||||||
this.indexWriter = new IndexWriter(directory, indexWriterConfig);
|
this.indexWriter = new IndexWriter(directory, indexWriterConfig);
|
||||||
this.searcherManager = new SearcherManager(indexWriter, false, false, null);
|
this.searcherManager = new SearcherManager(indexWriter, false, false, null);
|
||||||
|
|
||||||
@ -123,6 +136,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
registerScheduledFixedTask(this::scheduledQueryRefresh, queryRefreshDebounceTime);
|
registerScheduledFixedTask(this::scheduledQueryRefresh, queryRefreshDebounceTime);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Similarity getSimilarity() {
|
||||||
|
return LuceneUtils.getSimilarity(similarity);
|
||||||
|
}
|
||||||
|
|
||||||
private void registerScheduledFixedTask(Runnable task, Duration duration) {
|
private void registerScheduledFixedTask(Runnable task, Duration duration) {
|
||||||
scheduledTasksLifecycle.registerScheduledTask(luceneBlockingScheduler.schedulePeriodically(() -> {
|
scheduledTasksLifecycle.registerScheduledTask(luceneBlockingScheduler.schedulePeriodically(() -> {
|
||||||
scheduledTasksLifecycle.startScheduledTask();
|
scheduledTasksLifecycle.startScheduledTask();
|
||||||
@ -280,13 +297,24 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
}).subscribeOn(luceneBlockingScheduler);
|
}).subscribeOn(luceneBlockingScheduler);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Mono<IndexSearcher> acquireSearcherWrapper(LLSnapshot snapshot) {
|
private Mono<IndexSearcher> acquireSearcherWrapper(LLSnapshot snapshot, boolean distributedPre, long actionId) {
|
||||||
return Mono.fromCallable(() -> {
|
return Mono.fromCallable(() -> {
|
||||||
|
IndexSearcher indexSearcher;
|
||||||
if (snapshot == null) {
|
if (snapshot == null) {
|
||||||
//noinspection BlockingMethodInNonBlockingContext
|
//noinspection BlockingMethodInNonBlockingContext
|
||||||
return searcherManager.acquire();
|
indexSearcher = searcherManager.acquire();
|
||||||
|
indexSearcher.setSimilarity(getSimilarity());
|
||||||
} else {
|
} else {
|
||||||
return resolveSnapshot(snapshot).getIndexSearcher();
|
indexSearcher = resolveSnapshot(snapshot).getIndexSearcher();
|
||||||
|
}
|
||||||
|
if (distributedCollectionStatisticsGetter != null && actionId != -1) {
|
||||||
|
return new LLIndexSearcherWithCustomCollectionStatistics(indexSearcher,
|
||||||
|
distributedCollectionStatisticsGetter,
|
||||||
|
distributedPre,
|
||||||
|
actionId
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return indexSearcher;
|
||||||
}
|
}
|
||||||
}).subscribeOn(luceneBlockingScheduler);
|
}).subscribeOn(luceneBlockingScheduler);
|
||||||
}
|
}
|
||||||
@ -304,12 +332,38 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
}).subscribeOn(luceneBlockingScheduler);
|
}).subscribeOn(luceneBlockingScheduler);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings({"Convert2MethodRef", "unchecked", "rawtypes"})
|
|
||||||
@Override
|
@Override
|
||||||
public Mono<LLSearchResult> moreLikeThis(@Nullable LLSnapshot snapshot,
|
public Mono<LLSearchResult> moreLikeThis(@Nullable LLSnapshot snapshot,
|
||||||
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux,
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux,
|
||||||
int limit,
|
int limit,
|
||||||
String keyFieldName) {
|
String keyFieldName) {
|
||||||
|
return moreLikeThis(snapshot, mltDocumentFieldsFlux, limit, keyFieldName, false, 0, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Mono<LLSearchResult> distributedMoreLikeThis(@Nullable LLSnapshot snapshot,
|
||||||
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux,
|
||||||
|
int limit,
|
||||||
|
String keyFieldName,
|
||||||
|
long actionId,
|
||||||
|
int scoreDivisor) {
|
||||||
|
return moreLikeThis(snapshot, mltDocumentFieldsFlux, limit, keyFieldName, false, actionId, scoreDivisor);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Mono<Void> distributedPreMoreLikeThis(@Nullable LLSnapshot snapshot,
|
||||||
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux,
|
||||||
|
String keyFieldName, long actionId) {
|
||||||
|
return moreLikeThis(snapshot, mltDocumentFieldsFlux, -1, keyFieldName, true, actionId, 1)
|
||||||
|
.flatMap(LLSearchResult::completion);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings({"Convert2MethodRef", "unchecked", "rawtypes"})
|
||||||
|
private Mono<LLSearchResult> moreLikeThis(@Nullable LLSnapshot snapshot,
|
||||||
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux,
|
||||||
|
int limit,
|
||||||
|
String keyFieldName,
|
||||||
|
boolean doDistributedPre,
|
||||||
|
long actionId,
|
||||||
|
int scoreDivisor) {
|
||||||
return mltDocumentFieldsFlux
|
return mltDocumentFieldsFlux
|
||||||
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
|
.collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
|
||||||
.flatMap(mltDocumentFields -> {
|
.flatMap(mltDocumentFields -> {
|
||||||
@ -317,7 +371,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
return Mono.just(LLSearchResult.empty());
|
return Mono.just(LLSearchResult.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
return acquireSearcherWrapper(snapshot)
|
return acquireSearcherWrapper(snapshot, doDistributedPre, actionId)
|
||||||
.flatMap(indexSearcher -> Mono
|
.flatMap(indexSearcher -> Mono
|
||||||
.fromCallable(() -> {
|
.fromCallable(() -> {
|
||||||
var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
|
var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
|
||||||
@ -339,31 +393,39 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
.many()
|
.many()
|
||||||
.unicast()
|
.unicast()
|
||||||
.onBackpressureBuffer(new ArrayBlockingQueue<>(1000));
|
.onBackpressureBuffer(new ArrayBlockingQueue<>(1000));
|
||||||
|
Empty<Void> completeSink = Sinks.empty();
|
||||||
|
|
||||||
luceneBlockingScheduler.schedule(() -> {
|
Schedulers.boundedElastic().schedule(() -> {
|
||||||
try {
|
try {
|
||||||
streamSearcher.search(indexSearcher,
|
if (doDistributedPre) {
|
||||||
query,
|
allowOnlyQueryParsingCollectorStreamSearcher.search(indexSearcher, query);
|
||||||
limit,
|
totalHitsCountSink.tryEmitValue(0L);
|
||||||
null,
|
} else {
|
||||||
ScoreMode.TOP_SCORES,
|
streamSearcher.search(indexSearcher,
|
||||||
keyFieldName,
|
query,
|
||||||
keyScore -> {
|
limit,
|
||||||
EmitResult result = topKeysSink.tryEmitNext(keyScore);
|
null,
|
||||||
if (result.isFailure()) {
|
ScoreMode.TOP_SCORES,
|
||||||
throw new EmissionException(result);
|
keyFieldName,
|
||||||
}
|
keyScore -> {
|
||||||
},
|
EmitResult result = topKeysSink.tryEmitNext(fixKeyScore(keyScore, scoreDivisor));
|
||||||
totalHitsCount -> {
|
if (result.isFailure()) {
|
||||||
EmitResult result = totalHitsCountSink.tryEmitValue(totalHitsCount);
|
throw new EmissionException(result);
|
||||||
if (result.isFailure()) {
|
}
|
||||||
throw new EmissionException(result);
|
},
|
||||||
}
|
totalHitsCount -> {
|
||||||
});
|
EmitResult result = totalHitsCountSink.tryEmitValue(totalHitsCount);
|
||||||
|
if (result.isFailure()) {
|
||||||
|
throw new EmissionException(result);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
topKeysSink.tryEmitComplete();
|
topKeysSink.tryEmitComplete();
|
||||||
|
completeSink.tryEmitEmpty();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
topKeysSink.tryEmitError(e);
|
topKeysSink.tryEmitError(e);
|
||||||
totalHitsCountSink.tryEmitError(e);
|
totalHitsCountSink.tryEmitError(e);
|
||||||
|
completeSink.tryEmitError(e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -377,12 +439,33 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("Convert2MethodRef")
|
private LLKeyScore fixKeyScore(LLKeyScore keyScore, int scoreDivisor) {
|
||||||
|
return scoreDivisor == 1 ? keyScore : new LLKeyScore(keyScore.getKey(), keyScore.getScore() / (float) scoreDivisor);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Mono<LLSearchResult> search(@Nullable LLSnapshot snapshot, it.cavallium.dbengine.lucene.serializer.Query query, int limit,
|
public Mono<LLSearchResult> search(@Nullable LLSnapshot snapshot, it.cavallium.dbengine.lucene.serializer.Query query, int limit,
|
||||||
@Nullable LLSort sort, LLScoreMode scoreMode, String keyFieldName) {
|
@Nullable LLSort sort, LLScoreMode scoreMode, String keyFieldName) {
|
||||||
|
return search(snapshot, query, limit, sort, scoreMode, keyFieldName, false, 0, 1);
|
||||||
|
}
|
||||||
|
|
||||||
return acquireSearcherWrapper(snapshot)
|
public Mono<LLSearchResult> distributedSearch(@Nullable LLSnapshot snapshot, it.cavallium.dbengine.lucene.serializer.Query query, int limit,
|
||||||
|
@Nullable LLSort sort, LLScoreMode scoreMode, String keyFieldName, long actionId, int scoreDivisor) {
|
||||||
|
return search(snapshot, query, limit, sort, scoreMode, keyFieldName, false, actionId, scoreDivisor);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Mono<Void> distributedPreSearch(@Nullable LLSnapshot snapshot, it.cavallium.dbengine.lucene.serializer.Query query,
|
||||||
|
@Nullable LLSort sort, LLScoreMode scoreMode, String keyFieldName, long actionId) {
|
||||||
|
return search(snapshot, query, -1, sort, scoreMode, keyFieldName, true, actionId, 1)
|
||||||
|
.flatMap(LLSearchResult::completion);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("Convert2MethodRef")
|
||||||
|
private Mono<LLSearchResult> search(@Nullable LLSnapshot snapshot,
|
||||||
|
it.cavallium.dbengine.lucene.serializer.Query query, int limit,
|
||||||
|
@Nullable LLSort sort, LLScoreMode scoreMode, String keyFieldName,
|
||||||
|
boolean doDistributedPre, long actionId, int scoreDivisor) {
|
||||||
|
return acquireSearcherWrapper(snapshot, doDistributedPre, actionId)
|
||||||
.flatMap(indexSearcher -> Mono
|
.flatMap(indexSearcher -> Mono
|
||||||
.fromCallable(() -> {
|
.fromCallable(() -> {
|
||||||
Query luceneQuery = QueryParser.parse(query);
|
Query luceneQuery = QueryParser.parse(query);
|
||||||
@ -402,27 +485,32 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||||||
.many()
|
.many()
|
||||||
.unicast()
|
.unicast()
|
||||||
.onBackpressureBuffer(new ArrayBlockingQueue<>(PagedStreamSearcher.MAX_ITEMS_PER_PAGE));
|
.onBackpressureBuffer(new ArrayBlockingQueue<>(PagedStreamSearcher.MAX_ITEMS_PER_PAGE));
|
||||||
|
Schedulers.boundedElastic().schedule(() -> {
|
||||||
luceneBlockingScheduler.schedule(() -> {
|
|
||||||
try {
|
try {
|
||||||
streamSearcher.search(indexSearcher,
|
if (doDistributedPre) {
|
||||||
luceneQuery,
|
allowOnlyQueryParsingCollectorStreamSearcher.search(indexSearcher, luceneQuery);
|
||||||
limit,
|
totalHitsCountSink.tryEmitValue(0L);
|
||||||
luceneSort,
|
} else {
|
||||||
luceneScoreMode,
|
streamSearcher.search(indexSearcher,
|
||||||
keyFieldName,
|
luceneQuery,
|
||||||
keyScore -> {
|
limit,
|
||||||
EmitResult result = topKeysSink.tryEmitNext(keyScore);
|
luceneSort,
|
||||||
if (result.isFailure()) {
|
luceneScoreMode,
|
||||||
throw new EmissionException(result);
|
keyFieldName,
|
||||||
|
keyScore -> {
|
||||||
|
EmitResult result = topKeysSink.tryEmitNext(fixKeyScore(keyScore, scoreDivisor));
|
||||||
|
if (result.isFailure()) {
|
||||||
|
throw new EmissionException(result);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
totalHitsCount -> {
|
||||||
|
EmitResult result = totalHitsCountSink.tryEmitValue(totalHitsCount);
|
||||||
|
if (result.isFailure()) {
|
||||||
|
throw new EmissionException(result);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
);
|
||||||
totalHitsCount -> {
|
}
|
||||||
EmitResult result = totalHitsCountSink.tryEmitValue(totalHitsCount);
|
|
||||||
if (result.isFailure()) {
|
|
||||||
throw new EmissionException(result);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
topKeysSink.tryEmitComplete();
|
topKeysSink.tryEmitComplete();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
topKeysSink.tryEmitError(e);
|
topKeysSink.tryEmitError(e);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package it.cavallium.dbengine.database.disk;
|
package it.cavallium.dbengine.database.disk;
|
||||||
|
|
||||||
|
import com.google.common.cache.Cache;
|
||||||
|
import com.google.common.cache.CacheBuilder;
|
||||||
import it.cavallium.dbengine.database.LLDocument;
|
import it.cavallium.dbengine.database.LLDocument;
|
||||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||||
import it.cavallium.dbengine.database.LLScoreMode;
|
import it.cavallium.dbengine.database.LLScoreMode;
|
||||||
@ -7,7 +9,8 @@ import it.cavallium.dbengine.database.LLSearchResult;
|
|||||||
import it.cavallium.dbengine.database.LLSnapshot;
|
import it.cavallium.dbengine.database.LLSnapshot;
|
||||||
import it.cavallium.dbengine.database.LLSort;
|
import it.cavallium.dbengine.database.LLSort;
|
||||||
import it.cavallium.dbengine.database.LLTerm;
|
import it.cavallium.dbengine.database.LLTerm;
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
import it.cavallium.dbengine.lucene.serializer.Query;
|
import it.cavallium.dbengine.lucene.serializer.Query;
|
||||||
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
|
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
|
||||||
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
||||||
@ -16,8 +19,13 @@ import java.nio.file.Path;
|
|||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.CopyOnWriteArrayList;
|
import java.util.concurrent.CopyOnWriteArrayList;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
import org.warp.commonutils.batch.ParallelUtils;
|
import org.warp.commonutils.batch.ParallelUtils;
|
||||||
import org.warp.commonutils.functional.IOBiConsumer;
|
import org.warp.commonutils.functional.IOBiConsumer;
|
||||||
@ -34,12 +42,17 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||||||
private final AtomicLong nextSnapshotNumber = new AtomicLong(1);
|
private final AtomicLong nextSnapshotNumber = new AtomicLong(1);
|
||||||
private final LLLocalLuceneIndex[] luceneIndices;
|
private final LLLocalLuceneIndex[] luceneIndices;
|
||||||
|
|
||||||
|
private final AtomicLong nextActionId = new AtomicLong(0);
|
||||||
|
private final ConcurrentHashMap<Long, Cache<String, CollectionStatistics>[]> statistics = new ConcurrentHashMap<>();
|
||||||
|
private final ConcurrentHashMap<Long, AtomicInteger> completedStreams = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private final int maxQueueSize = 1000;
|
private final int maxQueueSize = 1000;
|
||||||
|
|
||||||
public LLLocalMultiLuceneIndex(Path lucene,
|
public LLLocalMultiLuceneIndex(Path lucene,
|
||||||
String name,
|
String name,
|
||||||
int instancesCount,
|
int instancesCount,
|
||||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||||
|
TextFieldsSimilarity textFieldsSimilarity,
|
||||||
Duration queryRefreshDebounceTime,
|
Duration queryRefreshDebounceTime,
|
||||||
Duration commitDebounceTime,
|
Duration commitDebounceTime,
|
||||||
boolean lowMemory) throws IOException {
|
boolean lowMemory) throws IOException {
|
||||||
@ -50,6 +63,7 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||||||
|
|
||||||
LLLocalLuceneIndex[] luceneIndices = new LLLocalLuceneIndex[instancesCount];
|
LLLocalLuceneIndex[] luceneIndices = new LLLocalLuceneIndex[instancesCount];
|
||||||
for (int i = 0; i < instancesCount; i++) {
|
for (int i = 0; i < instancesCount; i++) {
|
||||||
|
int finalI = i;
|
||||||
String instanceName;
|
String instanceName;
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
instanceName = name;
|
instanceName = name;
|
||||||
@ -59,14 +73,75 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||||||
luceneIndices[i] = new LLLocalLuceneIndex(lucene,
|
luceneIndices[i] = new LLLocalLuceneIndex(lucene,
|
||||||
instanceName,
|
instanceName,
|
||||||
textFieldsAnalyzer,
|
textFieldsAnalyzer,
|
||||||
|
textFieldsSimilarity,
|
||||||
queryRefreshDebounceTime,
|
queryRefreshDebounceTime,
|
||||||
commitDebounceTime,
|
commitDebounceTime,
|
||||||
lowMemory
|
lowMemory,
|
||||||
|
(indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI,
|
||||||
|
indexSearcher,
|
||||||
|
field,
|
||||||
|
distributedPre,
|
||||||
|
actionId
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
this.luceneIndices = luceneIndices;
|
this.luceneIndices = luceneIndices;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long newAction() {
|
||||||
|
var statistics = new Cache[luceneIndices.length];
|
||||||
|
for (int i = 0; i < luceneIndices.length; i++) {
|
||||||
|
statistics[i] = CacheBuilder.newBuilder().build();
|
||||||
|
}
|
||||||
|
long actionId = nextActionId.getAndIncrement();
|
||||||
|
//noinspection unchecked
|
||||||
|
this.statistics.put(actionId, statistics);
|
||||||
|
this.completedStreams.put(actionId, new AtomicInteger(0));
|
||||||
|
return actionId;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void completedAction(long actionId) {
|
||||||
|
var completedStreamsCount = completedStreams.get(actionId);
|
||||||
|
if (completedStreamsCount != null) {
|
||||||
|
if (completedStreamsCount.incrementAndGet() >= luceneIndices.length) {
|
||||||
|
this.statistics.remove(actionId);
|
||||||
|
this.completedStreams.remove(actionId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private CollectionStatistics distributedCustomCollectionStatistics(int luceneIndex,
|
||||||
|
IndexSearcher indexSearcher, String field, boolean distributedPre, long actionId) throws IOException {
|
||||||
|
if (distributedPre) {
|
||||||
|
try {
|
||||||
|
return statistics.get(actionId)[luceneIndex].get(field, () -> indexSearcher.collectionStatistics(field));
|
||||||
|
} catch (ExecutionException e) {
|
||||||
|
throw new IOException();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
long maxDoc = 0;
|
||||||
|
long docCount = 0;
|
||||||
|
long sumTotalTermFreq = 0;
|
||||||
|
long sumDocFreq = 0;
|
||||||
|
for (int i = 0; i < luceneIndices.length; i++) {
|
||||||
|
CollectionStatistics iCollStats = statistics.get(actionId)[i].getIfPresent(field);
|
||||||
|
if (iCollStats != null) {
|
||||||
|
maxDoc += iCollStats.maxDoc();
|
||||||
|
docCount += iCollStats.docCount();
|
||||||
|
sumTotalTermFreq += iCollStats.sumTotalTermFreq();
|
||||||
|
sumDocFreq += iCollStats.sumDocFreq();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new CollectionStatistics(field,
|
||||||
|
(int) Math.max(1, Math.min(maxDoc, Integer.MAX_VALUE)),
|
||||||
|
Math.max(1, docCount),
|
||||||
|
Math.max(1, sumTotalTermFreq),
|
||||||
|
Math.max(1, sumDocFreq)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private LLLocalLuceneIndex getLuceneIndex(LLTerm id) {
|
private LLLocalLuceneIndex getLuceneIndex(LLTerm id) {
|
||||||
return luceneIndices[getLuceneIndexId(id)];
|
return luceneIndices[getLuceneIndexId(id)];
|
||||||
}
|
}
|
||||||
@ -130,14 +205,61 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||||||
Flux<Tuple2<String, Set<String>>> mltDocumentFields,
|
Flux<Tuple2<String, Set<String>>> mltDocumentFields,
|
||||||
int limit,
|
int limit,
|
||||||
String keyFieldName) {
|
String keyFieldName) {
|
||||||
return Flux
|
long actionId;
|
||||||
.fromArray(luceneIndices)
|
int scoreDivisor;
|
||||||
.index()
|
Flux<Tuple2<String, Set<String>>> mltDocumentFieldsShared;
|
||||||
.flatMap(tuple -> Mono
|
Mono<Void> distributedPre;
|
||||||
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
if (luceneIndices.length > 1) {
|
||||||
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot)))
|
actionId = newAction();
|
||||||
.flatMap(tuple -> tuple.getT1().moreLikeThis(tuple.getT2().orElse(null), mltDocumentFields, limit, keyFieldName))
|
scoreDivisor = 20;
|
||||||
.reduce(LLSearchResult.accumulator());
|
mltDocumentFieldsShared = mltDocumentFields.publish().refCount();
|
||||||
|
distributedPre = Flux
|
||||||
|
.fromArray(luceneIndices)
|
||||||
|
.index()
|
||||||
|
.flatMap(tuple -> Mono
|
||||||
|
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
||||||
|
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot))
|
||||||
|
)
|
||||||
|
.flatMap(tuple -> tuple
|
||||||
|
.getT1()
|
||||||
|
.distributedPreMoreLikeThis(tuple.getT2().orElse(null), mltDocumentFieldsShared, keyFieldName, actionId)
|
||||||
|
)
|
||||||
|
.then();
|
||||||
|
} else {
|
||||||
|
actionId = -1;
|
||||||
|
scoreDivisor = 1;
|
||||||
|
mltDocumentFieldsShared = mltDocumentFields;
|
||||||
|
distributedPre = Mono.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return distributedPre.then(Flux
|
||||||
|
.fromArray(luceneIndices)
|
||||||
|
.index()
|
||||||
|
.flatMap(tuple -> Mono
|
||||||
|
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
||||||
|
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot)))
|
||||||
|
.flatMap(tuple -> tuple
|
||||||
|
.getT1()
|
||||||
|
.distributedMoreLikeThis(tuple.getT2().orElse(null),
|
||||||
|
mltDocumentFieldsShared,
|
||||||
|
limit,
|
||||||
|
keyFieldName,
|
||||||
|
actionId,
|
||||||
|
scoreDivisor
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.reduce(LLSearchResult.accumulator())
|
||||||
|
.map(result -> {
|
||||||
|
if (actionId != -1) {
|
||||||
|
var resultsWithTermination = result
|
||||||
|
.results()
|
||||||
|
.map(flux -> flux.doOnTerminate(() -> completedAction(actionId)));
|
||||||
|
return new LLSearchResult(result.totalHitsCount(), resultsWithTermination);
|
||||||
|
} else {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -147,14 +269,60 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||||||
@Nullable LLSort sort,
|
@Nullable LLSort sort,
|
||||||
LLScoreMode scoreMode,
|
LLScoreMode scoreMode,
|
||||||
String keyFieldName) {
|
String keyFieldName) {
|
||||||
return Flux
|
long actionId;
|
||||||
.fromArray(luceneIndices)
|
int scoreDivisor;
|
||||||
.index()
|
Mono<Void> distributedPre;
|
||||||
.flatMap(tuple -> Mono
|
if (luceneIndices.length <= 1 || scoreMode == LLScoreMode.COMPLETE_NO_SCORES) {
|
||||||
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
actionId = -1;
|
||||||
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot)))
|
scoreDivisor = 1;
|
||||||
.flatMap(tuple -> tuple.getT1().search(tuple.getT2().orElse(null), query, limit, sort, scoreMode, keyFieldName))
|
distributedPre = Mono.empty();
|
||||||
.reduce(LLSearchResult.accumulator());
|
} else {
|
||||||
|
actionId = newAction();
|
||||||
|
scoreDivisor = 20;
|
||||||
|
distributedPre = Flux
|
||||||
|
.fromArray(luceneIndices)
|
||||||
|
.index()
|
||||||
|
.flatMap(tuple -> Mono
|
||||||
|
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
||||||
|
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot))
|
||||||
|
)
|
||||||
|
.flatMap(tuple -> tuple
|
||||||
|
.getT1()
|
||||||
|
.distributedPreSearch(tuple.getT2().orElse(null), query, sort, scoreMode, keyFieldName, actionId)
|
||||||
|
)
|
||||||
|
.then();
|
||||||
|
}
|
||||||
|
return distributedPre
|
||||||
|
.then(Flux
|
||||||
|
.fromArray(luceneIndices)
|
||||||
|
.index()
|
||||||
|
.flatMap(tuple -> Mono
|
||||||
|
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
|
||||||
|
.map(luceneSnapshot -> Tuples.of(tuple.getT2(), luceneSnapshot))
|
||||||
|
)
|
||||||
|
.flatMap(tuple -> tuple
|
||||||
|
.getT1()
|
||||||
|
.distributedSearch(tuple.getT2().orElse(null),
|
||||||
|
query,
|
||||||
|
limit,
|
||||||
|
sort,
|
||||||
|
scoreMode,
|
||||||
|
keyFieldName,
|
||||||
|
actionId,
|
||||||
|
scoreDivisor
|
||||||
|
))
|
||||||
|
.reduce(LLSearchResult.accumulator())
|
||||||
|
.map(result -> {
|
||||||
|
if (actionId != -1) {
|
||||||
|
var resultsWithTermination = result
|
||||||
|
.results()
|
||||||
|
.map(flux -> flux.doOnTerminate(() -> completedAction(actionId)));
|
||||||
|
return new LLSearchResult(result.totalHitsCount(), resultsWithTermination);
|
||||||
|
} else {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -3,7 +3,6 @@ package it.cavallium.dbengine.lucene;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.locks.ReentrantLock;
|
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.search.Collector;
|
import org.apache.lucene.search.Collector;
|
||||||
import org.apache.lucene.search.LeafCollector;
|
import org.apache.lucene.search.LeafCollector;
|
||||||
@ -17,22 +16,28 @@ public class LuceneParallelStreamCollector implements Collector, LeafCollector {
|
|||||||
private final LuceneParallelStreamConsumer streamConsumer;
|
private final LuceneParallelStreamConsumer streamConsumer;
|
||||||
private final AtomicBoolean stopped;
|
private final AtomicBoolean stopped;
|
||||||
private final AtomicLong totalHitsCounter;
|
private final AtomicLong totalHitsCounter;
|
||||||
private final ReentrantLock lock;
|
|
||||||
private Scorable scorer;
|
private Scorable scorer;
|
||||||
|
|
||||||
public LuceneParallelStreamCollector(int base, ScoreMode scoreMode, LuceneParallelStreamConsumer streamConsumer,
|
public LuceneParallelStreamCollector(int base,
|
||||||
AtomicBoolean stopped, AtomicLong totalHitsCounter, ReentrantLock lock) {
|
ScoreMode scoreMode,
|
||||||
|
LuceneParallelStreamConsumer streamConsumer,
|
||||||
|
AtomicBoolean stopped,
|
||||||
|
AtomicLong totalHitsCounter) {
|
||||||
this.base = base;
|
this.base = base;
|
||||||
this.scoreMode = scoreMode;
|
this.scoreMode = scoreMode;
|
||||||
this.streamConsumer = streamConsumer;
|
this.streamConsumer = streamConsumer;
|
||||||
this.stopped = stopped;
|
this.stopped = stopped;
|
||||||
this.totalHitsCounter = totalHitsCounter;
|
this.totalHitsCounter = totalHitsCounter;
|
||||||
this.lock = lock;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final LeafCollector getLeafCollector(LeafReaderContext context) {
|
public final LeafCollector getLeafCollector(LeafReaderContext context) {
|
||||||
return new LuceneParallelStreamCollector(context.docBase, scoreMode, streamConsumer, stopped, totalHitsCounter, lock);
|
return new LuceneParallelStreamCollector(context.docBase,
|
||||||
|
scoreMode,
|
||||||
|
streamConsumer,
|
||||||
|
stopped,
|
||||||
|
totalHitsCounter
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -44,16 +49,11 @@ public class LuceneParallelStreamCollector implements Collector, LeafCollector {
|
|||||||
public void collect(int doc) throws IOException {
|
public void collect(int doc) throws IOException {
|
||||||
doc += base;
|
doc += base;
|
||||||
totalHitsCounter.incrementAndGet();
|
totalHitsCounter.incrementAndGet();
|
||||||
lock.lock();
|
if (!stopped.get()) {
|
||||||
try {
|
assert (scorer == null) || scorer.docID() == doc;
|
||||||
if (!stopped.get()) {
|
if (!streamConsumer.consume(doc, scorer == null ? 0 : scorer.score())) {
|
||||||
assert (scorer == null) || scorer.docID() == doc;
|
stopped.set(true);
|
||||||
if (!streamConsumer.consume(doc, scorer == null ? 0 : scorer.score())) {
|
|
||||||
stopped.set(true);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
lock.unlock();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@ package it.cavallium.dbengine.lucene;
|
|||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.locks.ReentrantLock;
|
|
||||||
import org.apache.lucene.search.CollectorManager;
|
import org.apache.lucene.search.CollectorManager;
|
||||||
import org.apache.lucene.search.ScoreMode;
|
import org.apache.lucene.search.ScoreMode;
|
||||||
|
|
||||||
@ -14,7 +13,6 @@ public class LuceneParallelStreamCollectorManager implements
|
|||||||
private final LuceneParallelStreamConsumer streamConsumer;
|
private final LuceneParallelStreamConsumer streamConsumer;
|
||||||
private final AtomicBoolean stopped;
|
private final AtomicBoolean stopped;
|
||||||
private final AtomicLong totalHitsCounter;
|
private final AtomicLong totalHitsCounter;
|
||||||
private final ReentrantLock lock;
|
|
||||||
|
|
||||||
public static LuceneParallelStreamCollectorManager fromConsumer(
|
public static LuceneParallelStreamCollectorManager fromConsumer(
|
||||||
ScoreMode scoreMode,
|
ScoreMode scoreMode,
|
||||||
@ -22,17 +20,17 @@ public class LuceneParallelStreamCollectorManager implements
|
|||||||
return new LuceneParallelStreamCollectorManager(scoreMode, streamConsumer);
|
return new LuceneParallelStreamCollectorManager(scoreMode, streamConsumer);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LuceneParallelStreamCollectorManager(ScoreMode scoreMode, LuceneParallelStreamConsumer streamConsumer) {
|
public LuceneParallelStreamCollectorManager(ScoreMode scoreMode,
|
||||||
|
LuceneParallelStreamConsumer streamConsumer) {
|
||||||
this.scoreMode = scoreMode;
|
this.scoreMode = scoreMode;
|
||||||
this.streamConsumer = streamConsumer;
|
this.streamConsumer = streamConsumer;
|
||||||
this.stopped = new AtomicBoolean();
|
this.stopped = new AtomicBoolean();
|
||||||
this.totalHitsCounter = new AtomicLong();
|
this.totalHitsCounter = new AtomicLong();
|
||||||
this.lock = new ReentrantLock();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LuceneParallelStreamCollector newCollector() {
|
public LuceneParallelStreamCollector newCollector() {
|
||||||
return new LuceneParallelStreamCollector(0, scoreMode, streamConsumer, stopped, totalHitsCounter, lock);
|
return new LuceneParallelStreamCollector(0, scoreMode, streamConsumer, stopped, totalHitsCounter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -11,4 +11,5 @@ public class LuceneParallelStreamCollectorResult {
|
|||||||
public long getTotalHitsCount() {
|
public long getTotalHitsCount() {
|
||||||
return totalHitsCount;
|
return totalHitsCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package it.cavallium.dbengine.lucene;
|
package it.cavallium.dbengine.lucene;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
public interface LuceneParallelStreamConsumer {
|
public interface LuceneParallelStreamConsumer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -7,5 +9,5 @@ public interface LuceneParallelStreamConsumer {
|
|||||||
* @param score score of document
|
* @param score score of document
|
||||||
* @return true to continue, false to stop the execution
|
* @return true to continue, false to stop the execution
|
||||||
*/
|
*/
|
||||||
boolean consume(int docId, float score);
|
boolean consume(int docId, float score) throws IOException;
|
||||||
}
|
}
|
||||||
|
170
src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java
Normal file
170
src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
package it.cavallium.dbengine.lucene;
|
||||||
|
|
||||||
|
import it.cavallium.dbengine.client.MultiSort;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.N4CharGramAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.N4CharGramEdgeAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||||
|
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
|
||||||
|
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
||||||
|
import org.apache.lucene.analysis.en.KStemFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.misc.SweetSpotSimilarity;
|
||||||
|
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
||||||
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
import org.novasearch.lucene.search.similarities.BM25Similarity;
|
||||||
|
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model;
|
||||||
|
import org.novasearch.lucene.search.similarities.LdpSimilarity;
|
||||||
|
import org.novasearch.lucene.search.similarities.LtcSimilarity;
|
||||||
|
import org.novasearch.lucene.search.similarities.RobertsonSimilarity;
|
||||||
|
import reactor.core.publisher.Flux;
|
||||||
|
|
||||||
|
public class LuceneUtils {
|
||||||
|
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new N4CharGramEdgeAnalyzer(true);
|
||||||
|
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new N4CharGramEdgeAnalyzer(false);
|
||||||
|
private static final Analyzer lucene4GramWordsAnalyzerInstance = new N4CharGramAnalyzer(true);
|
||||||
|
private static final Analyzer lucene4GramStringAnalyzerInstance = new N4CharGramAnalyzer(false);
|
||||||
|
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
|
||||||
|
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true);
|
||||||
|
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false);
|
||||||
|
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true);
|
||||||
|
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
|
||||||
|
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
|
||||||
|
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
|
||||||
|
private static final Similarity luceneBM15PlusSimilarityInstance = new BM25Similarity(1.2f, 0.0f, 0.5f, BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneBM11PlusSimilarityInstance = new BM25Similarity(1.2f, 1.0f, 0.5f, BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneBM25ClassicNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.CLASSIC);
|
||||||
|
private static final Similarity luceneBM25PlusNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneBM25LNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.L);
|
||||||
|
private static final Similarity luceneBM15PlusNGramSimilarityInstance = NGramSimilarity.bm15(BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneBM11PlusNGramSimilarityInstance = NGramSimilarity.bm11(BM25Model.PLUS);
|
||||||
|
private static final Similarity luceneClassicSimilarityInstance = new ClassicSimilarity();
|
||||||
|
private static final Similarity luceneClassicNGramSimilarityInstance = NGramSimilarity.classic();
|
||||||
|
private static final Similarity luceneSweetSpotSimilarityInstance = new SweetSpotSimilarity();
|
||||||
|
private static final Similarity luceneLTCSimilarityInstance = new LtcSimilarity();
|
||||||
|
private static final Similarity luceneLDPSimilarityInstance = new LdpSimilarity();
|
||||||
|
private static final Similarity luceneLDPNoLengthSimilarityInstance = new LdpSimilarity(0, 0.5f);
|
||||||
|
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity();
|
||||||
|
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
|
||||||
|
|
||||||
|
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
|
||||||
|
switch (analyzer) {
|
||||||
|
case PartialWords:
|
||||||
|
return lucene4GramWordsAnalyzerInstance;
|
||||||
|
case PartialString:
|
||||||
|
return lucene4GramStringAnalyzerInstance;
|
||||||
|
case PartialWordsEdge:
|
||||||
|
return lucene4GramWordsAnalyzerEdgeInstance;
|
||||||
|
case PartialStringEdge:
|
||||||
|
return lucene4GramStringAnalyzerEdgeInstance;
|
||||||
|
case Standard:
|
||||||
|
return luceneStandardAnalyzerInstance;
|
||||||
|
case FullText:
|
||||||
|
return luceneWordAnalyzerStopWordsAndStemInstance;
|
||||||
|
case WordWithStopwordsStripping:
|
||||||
|
return luceneWordAnalyzerStopWordsInstance;
|
||||||
|
case WordWithStemming:
|
||||||
|
return luceneWordAnalyzerStemInstance;
|
||||||
|
case WordSimple:
|
||||||
|
return luceneWordAnalyzerSimpleInstance;
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
|
||||||
|
switch (similarity) {
|
||||||
|
case BM25Classic:
|
||||||
|
return luceneBM25ClassicSimilarityInstance;
|
||||||
|
case NGramBM25Classic:
|
||||||
|
return luceneBM25ClassicNGramSimilarityInstance;
|
||||||
|
case BM25L:
|
||||||
|
return luceneBM25LSimilarityInstance;
|
||||||
|
case NGramBM25L:
|
||||||
|
return luceneBM25LNGramSimilarityInstance;
|
||||||
|
case Classic:
|
||||||
|
return luceneClassicSimilarityInstance;
|
||||||
|
case NGramClassic:
|
||||||
|
return luceneClassicNGramSimilarityInstance;
|
||||||
|
case BM25Plus:
|
||||||
|
return luceneBM25PlusSimilarityInstance;
|
||||||
|
case NGramBM25Plus:
|
||||||
|
return luceneBM25PlusNGramSimilarityInstance;
|
||||||
|
case BM15Plus:
|
||||||
|
return luceneBM15PlusSimilarityInstance;
|
||||||
|
case NGramBM15Plus:
|
||||||
|
return luceneBM15PlusNGramSimilarityInstance;
|
||||||
|
case BM11Plus:
|
||||||
|
return luceneBM11PlusSimilarityInstance;
|
||||||
|
case NGramBM11Plus:
|
||||||
|
return luceneBM11PlusNGramSimilarityInstance;
|
||||||
|
case SweetSpot:
|
||||||
|
return luceneSweetSpotSimilarityInstance;
|
||||||
|
case LTC:
|
||||||
|
return luceneLTCSimilarityInstance;
|
||||||
|
case LDP:
|
||||||
|
return luceneLDPSimilarityInstance;
|
||||||
|
case LDPNoLength:
|
||||||
|
return luceneLDPNoLengthSimilarityInstance;
|
||||||
|
case Robertson:
|
||||||
|
return luceneRobertsonSimilarityInstance;
|
||||||
|
case Boolean:
|
||||||
|
return luceneBooleanSimilarityInstance;
|
||||||
|
default:
|
||||||
|
throw new IllegalStateException("Unknown similarity: " + similarity);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param stem Enable stem filters on words.
|
||||||
|
* Pass false if it will be used with a n-gram filter
|
||||||
|
*/
|
||||||
|
public static TokenStream newCommonFilter(TokenStream tokenStream, boolean stem) {
|
||||||
|
tokenStream = newCommonNormalizer(tokenStream);
|
||||||
|
if (stem) {
|
||||||
|
tokenStream = new KStemFilter(tokenStream);
|
||||||
|
tokenStream = new EnglishPossessiveFilter(tokenStream);
|
||||||
|
}
|
||||||
|
return tokenStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
|
||||||
|
tokenStream = new ASCIIFoldingFilter(tokenStream);
|
||||||
|
tokenStream = new LowerCaseFilter(tokenStream);
|
||||||
|
return tokenStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merge streams together maintaining absolute order
|
||||||
|
*/
|
||||||
|
public static <T> Flux<T> mergeStream(Flux<Flux<T>> mappedMultiResults,
|
||||||
|
@Nullable MultiSort<T> sort,
|
||||||
|
@Nullable Integer limit) {
|
||||||
|
if (limit != null && limit == 0) {
|
||||||
|
return mappedMultiResults.flatMap(f -> f).ignoreElements().flux();
|
||||||
|
}
|
||||||
|
return mappedMultiResults.collectList().flatMapMany(mappedMultiResultsList -> {
|
||||||
|
Flux<T> mergedFlux;
|
||||||
|
if (sort == null) {
|
||||||
|
mergedFlux = Flux.merge(mappedMultiResultsList);
|
||||||
|
} else {
|
||||||
|
//noinspection unchecked
|
||||||
|
mergedFlux = Flux.mergeOrdered(32, sort.getResultSort(), mappedMultiResultsList.toArray(Flux[]::new));
|
||||||
|
}
|
||||||
|
if (limit == null) {
|
||||||
|
return mergedFlux;
|
||||||
|
} else {
|
||||||
|
return mergedFlux.take(limit);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
@ -1,24 +1,34 @@
|
|||||||
package it.cavallium.dbengine.database.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
public class N4CharGramAnalyzer extends Analyzer {
|
public class N4CharGramAnalyzer extends Analyzer {
|
||||||
|
|
||||||
public N4CharGramAnalyzer() {
|
private final boolean words;
|
||||||
|
|
||||||
|
public N4CharGramAnalyzer(boolean words) {
|
||||||
|
this.words = words;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer();
|
Tokenizer tokenizer;
|
||||||
TokenStream tokenStream = tokenizer;
|
TokenStream tokenStream;
|
||||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, false);
|
if (words) {
|
||||||
tokenStream = new NGramTokenFilter(tokenStream, 4, 4, false);
|
tokenizer = new StandardTokenizer();
|
||||||
|
tokenStream = tokenizer;
|
||||||
|
} else {
|
||||||
|
tokenizer = new KeywordTokenizer();
|
||||||
|
tokenStream = tokenizer;
|
||||||
|
}
|
||||||
|
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
|
||||||
|
tokenStream = new NGramTokenFilter(tokenStream, 3, 5, false);
|
||||||
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||||
}
|
}
|
@ -1,24 +1,34 @@
|
|||||||
package it.cavallium.dbengine.database.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
public class N4CharGramEdgeAnalyzer extends Analyzer {
|
public class N4CharGramEdgeAnalyzer extends Analyzer {
|
||||||
|
|
||||||
public N4CharGramEdgeAnalyzer() {
|
private final boolean words;
|
||||||
|
|
||||||
|
public N4CharGramEdgeAnalyzer(boolean words) {
|
||||||
|
this.words = words;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer();
|
Tokenizer tokenizer;
|
||||||
TokenStream tokenStream = tokenizer;
|
TokenStream tokenStream;
|
||||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, false);
|
if (words) {
|
||||||
tokenStream = new EdgeNGramTokenFilter(tokenStream, 4, 4, false);
|
tokenizer = new StandardTokenizer();
|
||||||
|
tokenStream = tokenizer;
|
||||||
|
} else {
|
||||||
|
tokenizer = new KeywordTokenizer();
|
||||||
|
tokenStream = tokenizer;
|
||||||
|
}
|
||||||
|
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
|
||||||
|
tokenStream = new EdgeNGramTokenFilter(tokenStream, 3, 5, false);
|
||||||
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||||
}
|
}
|
@ -1,8 +1,11 @@
|
|||||||
package it.cavallium.dbengine.database.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
public enum TextFieldsAnalyzer {
|
public enum TextFieldsAnalyzer {
|
||||||
PartialWordsEdge,
|
|
||||||
PartialWords,
|
PartialWords,
|
||||||
|
PartialWordsEdge,
|
||||||
|
PartialString,
|
||||||
|
PartialStringEdge,
|
||||||
|
Standard,
|
||||||
WordSimple,
|
WordSimple,
|
||||||
WordWithStopwordsStripping,
|
WordWithStopwordsStripping,
|
||||||
WordWithStemming,
|
WordWithStemming,
|
@ -0,0 +1,22 @@
|
|||||||
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
|
public enum TextFieldsSimilarity {
|
||||||
|
BM25Classic,
|
||||||
|
NGramBM25Classic,
|
||||||
|
BM25L,
|
||||||
|
NGramBM25L,
|
||||||
|
BM25Plus,
|
||||||
|
NGramBM25Plus,
|
||||||
|
BM15Plus,
|
||||||
|
NGramBM15Plus,
|
||||||
|
BM11Plus,
|
||||||
|
NGramBM11Plus,
|
||||||
|
Classic,
|
||||||
|
NGramClassic,
|
||||||
|
SweetSpot,
|
||||||
|
LTC,
|
||||||
|
LDP,
|
||||||
|
LDPNoLength,
|
||||||
|
Robertson,
|
||||||
|
Boolean
|
||||||
|
}
|
@ -1,7 +1,7 @@
|
|||||||
package it.cavallium.dbengine.database.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
@ -15,6 +15,7 @@ import org.jetbrains.annotations.Nullable;
|
|||||||
*/
|
*/
|
||||||
public class AdaptiveStreamSearcher implements LuceneStreamSearcher {
|
public class AdaptiveStreamSearcher implements LuceneStreamSearcher {
|
||||||
|
|
||||||
|
private static final boolean ENABLE_PARALLEL_COLLECTOR = true;
|
||||||
private final SimpleStreamSearcher simpleStreamSearcher;
|
private final SimpleStreamSearcher simpleStreamSearcher;
|
||||||
private final ParallelCollectorStreamSearcher parallelCollectorStreamSearcher;
|
private final ParallelCollectorStreamSearcher parallelCollectorStreamSearcher;
|
||||||
private final PagedStreamSearcher pagedStreamSearcher;
|
private final PagedStreamSearcher pagedStreamSearcher;
|
||||||
@ -38,10 +39,10 @@ public class AdaptiveStreamSearcher implements LuceneStreamSearcher {
|
|||||||
LongConsumer totalHitsConsumer) throws IOException {
|
LongConsumer totalHitsConsumer) throws IOException {
|
||||||
if (limit == 0) {
|
if (limit == 0) {
|
||||||
totalHitsConsumer.accept(countStreamSearcher.count(indexSearcher, query));
|
totalHitsConsumer.accept(countStreamSearcher.count(indexSearcher, query));
|
||||||
} else if (luceneSort == null) {
|
} else if (luceneSort == null && ENABLE_PARALLEL_COLLECTOR) {
|
||||||
parallelCollectorStreamSearcher.search(indexSearcher, query, limit, null, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
parallelCollectorStreamSearcher.search(indexSearcher, query, limit, null, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
||||||
} else {
|
} else {
|
||||||
if (limit > PagedStreamSearcher.MAX_ITEMS_PER_PAGE) {
|
if (luceneSort != null && limit > PagedStreamSearcher.MAX_ITEMS_PER_PAGE) {
|
||||||
pagedStreamSearcher.search(indexSearcher, query, limit, luceneSort, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
pagedStreamSearcher.search(indexSearcher, query, limit, luceneSort, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
||||||
} else {
|
} else {
|
||||||
simpleStreamSearcher.search(indexSearcher, query, limit, luceneSort, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
simpleStreamSearcher.search(indexSearcher, query, limit, luceneSort, scoreMode, keyFieldName, consumer, totalHitsConsumer);
|
||||||
|
@ -0,0 +1,93 @@
|
|||||||
|
package it.cavallium.dbengine.lucene.searcher;
|
||||||
|
|
||||||
|
import it.cavallium.dbengine.database.LLKeyScore;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.concurrent.CompletionException;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
import java.util.function.LongConsumer;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.search.Collector;
|
||||||
|
import org.apache.lucene.search.CollectorManager;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.LeafCollector;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.Scorable;
|
||||||
|
import org.apache.lucene.search.ScoreMode;
|
||||||
|
import org.apache.lucene.search.Sort;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search that only parse query without doing any effective search
|
||||||
|
*/
|
||||||
|
public class AllowOnlyQueryParsingCollectorStreamSearcher implements LuceneStreamSearcher {
|
||||||
|
|
||||||
|
public void search(IndexSearcher indexSearcher,
|
||||||
|
Query query) throws IOException {
|
||||||
|
search(indexSearcher, query, 0, null, null, null, null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void search(IndexSearcher indexSearcher,
|
||||||
|
Query query,
|
||||||
|
int limit,
|
||||||
|
@Nullable Sort luceneSort,
|
||||||
|
ScoreMode scoreMode,
|
||||||
|
String keyFieldName,
|
||||||
|
Consumer<LLKeyScore> resultsConsumer,
|
||||||
|
LongConsumer totalHitsConsumer) throws IOException {
|
||||||
|
if (limit > 0) {
|
||||||
|
throw new IllegalArgumentException("Limit > 0 not allowed");
|
||||||
|
}
|
||||||
|
if (luceneSort != null) {
|
||||||
|
throw new IllegalArgumentException("Lucene sort not allowed");
|
||||||
|
}
|
||||||
|
if (scoreMode != null) {
|
||||||
|
throw new IllegalArgumentException("Score mode not allowed");
|
||||||
|
}
|
||||||
|
if (keyFieldName != null) {
|
||||||
|
throw new IllegalArgumentException("Key field name not allowed");
|
||||||
|
}
|
||||||
|
if (resultsConsumer != null) {
|
||||||
|
throw new IllegalArgumentException("Results consumer not allowed");
|
||||||
|
}
|
||||||
|
if (totalHitsConsumer != null) {
|
||||||
|
throw new IllegalArgumentException("Total hits consumer not allowed");
|
||||||
|
}
|
||||||
|
indexSearcher.search(query, new CollectorManager<>() {
|
||||||
|
@Override
|
||||||
|
public Collector newCollector() {
|
||||||
|
return new Collector() {
|
||||||
|
@Override
|
||||||
|
public LeafCollector getLeafCollector(LeafReaderContext context) {
|
||||||
|
return new LeafCollector() {
|
||||||
|
@Override
|
||||||
|
public void setScorer(Scorable scorer) {
|
||||||
|
try {
|
||||||
|
scorer.setMinCompetitiveScore(Float.MAX_VALUE);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new CompletionException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void collect(int doc) {
|
||||||
|
System.out.println(doc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ScoreMode scoreMode() {
|
||||||
|
return ScoreMode.TOP_SCORES;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object reduce(Collection<Collector> collectors) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
@ -2,9 +2,9 @@ package it.cavallium.dbengine.lucene.searcher;
|
|||||||
|
|
||||||
import it.cavallium.dbengine.database.LLKeyScore;
|
import it.cavallium.dbengine.database.LLKeyScore;
|
||||||
import it.cavallium.dbengine.lucene.LuceneParallelStreamCollectorManager;
|
import it.cavallium.dbengine.lucene.LuceneParallelStreamCollectorManager;
|
||||||
|
import it.cavallium.dbengine.lucene.LuceneParallelStreamCollectorResult;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.CompletionException;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.LongConsumer;
|
import java.util.function.LongConsumer;
|
||||||
@ -36,32 +36,27 @@ public class ParallelCollectorStreamSearcher implements LuceneStreamSearcher {
|
|||||||
|
|
||||||
AtomicInteger currentCount = new AtomicInteger();
|
AtomicInteger currentCount = new AtomicInteger();
|
||||||
|
|
||||||
var result = indexSearcher.search(query, LuceneParallelStreamCollectorManager.fromConsumer(scoreMode, (docId, score) -> {
|
LuceneParallelStreamCollectorResult result = indexSearcher.search(query, LuceneParallelStreamCollectorManager.fromConsumer(scoreMode, (docId, score) -> {
|
||||||
if (currentCount.getAndIncrement() >= limit) {
|
if (currentCount.getAndIncrement() >= limit) {
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
try {
|
Document d = indexSearcher.doc(docId, Set.of(keyFieldName));
|
||||||
Document d = indexSearcher.doc(docId, Set.of(keyFieldName));
|
if (d.getFields().isEmpty()) {
|
||||||
if (d.getFields().isEmpty()) {
|
logger.error("The document docId: {} is empty.", docId);
|
||||||
logger.error("The document docId: {} is empty.", docId);
|
var realFields = indexSearcher.doc(docId).getFields();
|
||||||
var realFields = indexSearcher.doc(docId).getFields();
|
if (!realFields.isEmpty()) {
|
||||||
if (!realFields.isEmpty()) {
|
logger.error("Present fields:");
|
||||||
logger.error("Present fields:");
|
for (IndexableField field : realFields) {
|
||||||
for (IndexableField field : realFields) {
|
logger.error(" - {}", field.name());
|
||||||
logger.error(" - {}", field.name());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
var field = d.getField(keyFieldName);
|
|
||||||
if (field == null) {
|
|
||||||
logger.error("Can't get key of document docId: {}", docId);
|
|
||||||
} else {
|
|
||||||
resultsConsumer.accept(new LLKeyScore(field.stringValue(), score));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} else {
|
||||||
e.printStackTrace();
|
var field = d.getField(keyFieldName);
|
||||||
throw new CompletionException(e);
|
if (field == null) {
|
||||||
|
logger.error("Can't get key of document docId: {}", docId);
|
||||||
|
} else {
|
||||||
|
resultsConsumer.accept(new LLKeyScore(field.stringValue(), score));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,12 @@ public class SimpleStreamSearcher implements LuceneStreamSearcher {
|
|||||||
String keyFieldName,
|
String keyFieldName,
|
||||||
Consumer<LLKeyScore> resultsConsumer,
|
Consumer<LLKeyScore> resultsConsumer,
|
||||||
LongConsumer totalHitsConsumer) throws IOException {
|
LongConsumer totalHitsConsumer) throws IOException {
|
||||||
TopDocs topDocs = indexSearcher.search(query, limit, luceneSort, scoreMode != ScoreMode.COMPLETE_NO_SCORES);
|
TopDocs topDocs;
|
||||||
|
if (luceneSort != null) {
|
||||||
|
topDocs = indexSearcher.search(query, limit, luceneSort, scoreMode != ScoreMode.COMPLETE_NO_SCORES);
|
||||||
|
} else {
|
||||||
|
topDocs = indexSearcher.search(query, limit);
|
||||||
|
}
|
||||||
totalHitsConsumer.accept(topDocs.totalHits.value);
|
totalHitsConsumer.accept(topDocs.totalHits.value);
|
||||||
var hits = ObjectArrayList.wrap(topDocs.scoreDocs);
|
var hits = ObjectArrayList.wrap(topDocs.scoreDocs);
|
||||||
for (ScoreDoc hit : hits) {
|
for (ScoreDoc hit : hits) {
|
||||||
|
@ -2,8 +2,8 @@ package it.cavallium.dbengine.lucene.serializer;
|
|||||||
|
|
||||||
import static it.cavallium.dbengine.lucene.serializer.QueryParser.USE_PHRASE_QUERY;
|
import static it.cavallium.dbengine.lucene.serializer.QueryParser.USE_PHRASE_QUERY;
|
||||||
|
|
||||||
import it.cavallium.dbengine.database.LuceneUtils;
|
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||||
import it.cavallium.dbengine.database.analyzer.TextFieldsAnalyzer;
|
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package it.cavallium.dbengine.lucene.similarity;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.novasearch.lucene.search.similarities.BM25Similarity;
|
||||||
|
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model;
|
||||||
|
|
||||||
|
public class NGramSimilarity {
|
||||||
|
|
||||||
|
private NGramSimilarity() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Similarity classic() {
|
||||||
|
var instance = new ClassicSimilarity();
|
||||||
|
instance.setDiscountOverlaps(false);
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Similarity bm25(BM25Model model) {
|
||||||
|
var instance = new BM25Similarity(model);
|
||||||
|
instance.setDiscountOverlaps(false);
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Similarity bm15(BM25Model model) {
|
||||||
|
var instance = new BM25Similarity(1.2f, 0.0f, 0.5f, model);
|
||||||
|
instance.setDiscountOverlaps(false);
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Similarity bm11(BM25Model model) {
|
||||||
|
var instance = new BM25Similarity(1.2f, 1.0f, 0.5f, model);
|
||||||
|
instance.setDiscountOverlaps(false);
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user