This commit is contained in:
Andrea Cavalli 2021-02-05 20:34:58 +01:00
parent 527f8afea5
commit 14c2464577
7 changed files with 51 additions and 26 deletions

View File

@ -34,7 +34,7 @@ public class IndicizationExample {
})
)
.then(index.refresh())
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.PartialString,"name", "Mario"), 1, null, LLScoreMode.COMPLETE, "id"))
// PartialString was renamed to N4GramPartialString; the enum declares no NGramPartialString constant.
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.N4GramPartialString,"name", "Mario"), 1, null, LLScoreMode.COMPLETE, "id"))
.flatMap(results -> results
.results()
.flatMap(r -> r)
@ -98,7 +98,7 @@ public class IndicizationExample {
})
))
.then(index.refresh())
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.PartialString,"name", "Mario"), 10, MultiSort.topScore()
// Same analyzer constant as the one the index is opened with further down in this example.
.then(index.search(null, Query.exactSearch(TextFieldsAnalyzer.N4GramPartialString,"name", "Mario"), 10, MultiSort.topScore()
.getQuerySort(), LLScoreMode.COMPLETE, "id"))
.flatMap(results -> LuceneUtils.mergeStream(results
.results(), MultiSort.topScoreRaw(), 10)
@ -153,7 +153,7 @@ public class IndicizationExample {
.then(new LLLocalDatabaseConnection(wrkspcPath, true).connect())
.flatMap(conn -> conn.getLuceneIndex("testindices",
10,
TextFieldsAnalyzer.PartialString,
// Index-time analyzer; must be a constant that TextFieldsAnalyzer actually declares.
TextFieldsAnalyzer.N4GramPartialString,
TextFieldsSimilarity.NGramBM25Plus,
Duration.ofSeconds(5),
Duration.ofSeconds(5),

View File

@ -16,7 +16,7 @@ public interface LLDatabaseConnection {
/**
 * Provides, as a {@link Mono}, the {@link LLLuceneIndex} identified by {@code name}.
 *
 * <p>Diff note: the two consecutive {@code TextFieldsSimilarity} lines below are the old
 * ({@code scorer}) and the new ({@code textFieldsSimilarity}) spelling of the same fourth
 * parameter; only the name changes, not the type or position.
 *
 * @param name index name
 * @param instancesCount presumably the number of index instances to use; confirm against the implementation
 * @param textFieldsAnalyzer analyzer selection for text fields
 * @param textFieldsSimilarity similarity selection for text fields (formerly {@code scorer})
 * @param queryRefreshDebounceTime presumably the debounce interval for query refreshes; confirm against the implementation
 * @param commitDebounceTime presumably the debounce interval for commits; confirm against the implementation
 * @param lowMemory low-memory flag; its exact effect is implementation-defined
 */
Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer,
TextFieldsSimilarity scorer,
TextFieldsSimilarity textFieldsSimilarity,
Duration queryRefreshDebounceTime,
Duration commitDebounceTime,
boolean lowMemory);

View File

@ -3,6 +3,7 @@ package it.cavallium.dbengine.database;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import it.cavallium.dbengine.database.collections.DatabaseInt;
import it.cavallium.dbengine.database.collections.DatabaseLong;
import java.nio.charset.StandardCharsets;
import reactor.core.publisher.Mono;
@ -30,13 +31,13 @@ public interface LLKeyValueDatabase extends LLSnapshottable, LLKeyValueDatabaseS
.map(DatabaseInt::new);
}
/**
 * Fetches the singleton addressed by {@code Column.special(singletonListName)} and {@code name}
 * (both encoded as US-ASCII bytes) and wraps it in a {@link DatabaseLong}.
 *
 * <p>Diff note: the first signature line and the first {@code .map(...)} line are the removed
 * version, which wrapped the long-backed singleton in {@code DatabaseInt}; the line following
 * each of them is the replacement.
 *
 * @param singletonListName name passed to {@code Column.special} to build the column name
 * @param name singleton name
 * @param defaultValue value encoded with {@code Longs.toByteArray} and passed as the third
 *     argument of {@code getSingleton} (presumably the initial value when none is stored)
 * @return a {@link Mono} emitting the wrapper around the singleton
 */
default Mono<DatabaseInt> getLong(String singletonListName, String name, long defaultValue) {
default Mono<DatabaseLong> getLong(String singletonListName, String name, long defaultValue) {
return this
.getSingleton(Column.special(singletonListName).getName().getBytes(StandardCharsets.US_ASCII),
name.getBytes(StandardCharsets.US_ASCII),
Longs.toByteArray(defaultValue)
)
.map(DatabaseInt::new);
.map(DatabaseLong::new);
}
Mono<Long> getProperty(String propertyName);

View File

@ -1,8 +1,8 @@
package it.cavallium.dbengine.lucene;
import it.cavallium.dbengine.client.MultiSort;
import it.cavallium.dbengine.lucene.analyzer.N4CharGramAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.N4CharGramEdgeAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.NCharGramAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.NCharGramEdgeAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
@ -27,10 +27,14 @@ import org.novasearch.lucene.search.similarities.RobertsonSimilarity;
import reactor.core.publisher.Flux;
public class LuceneUtils {
// Shared n-gram analyzer instances returned by getAnalyzer(TextFieldsAnalyzer).
// Diff note: the first four declarations are the removed lines (N4CharGram*Analyzer);
// the four declarations after them are their replacements with explicit gram bounds.
// NOTE(review): the removed N4CharGram*Analyzer classes built their token filters with a
// hard-coded 3..5 range, so the "4Gram" instances now emit strictly 4-character grams, while
// the "3To5Gram" instances are the ones that keep the former range.
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new N4CharGramEdgeAnalyzer(true);
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new N4CharGramEdgeAnalyzer(false);
private static final Analyzer lucene4GramWordsAnalyzerInstance = new N4CharGramAnalyzer(true);
private static final Analyzer lucene4GramStringAnalyzerInstance = new N4CharGramAnalyzer(false);
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4);
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 4, 4);
// Variable-length variants: grams from 3 to 5 characters.
private static final Analyzer lucene3To5GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 3, 5);
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false);
@ -57,14 +61,22 @@ public class LuceneUtils {
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
switch (analyzer) {
case PartialWords:
case N4GramPartialWords:
return lucene4GramWordsAnalyzerInstance;
case PartialString:
case N4GramPartialString:
return lucene4GramStringAnalyzerInstance;
case PartialWordsEdge:
case N4GramPartialWordsEdge:
return lucene4GramWordsAnalyzerEdgeInstance;
case PartialStringEdge:
case N4GramPartialStringEdge:
return lucene4GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords:
return lucene3To5GramWordsAnalyzerInstance;
case N3To5GramPartialString:
return lucene3To5GramStringAnalyzerInstance;
case N3To5GramPartialWordsEdge:
return lucene3To5GramWordsAnalyzerEdgeInstance;
case N3To5GramPartialStringEdge:
return lucene3To5GramStringAnalyzerEdgeInstance;
case Standard:
return luceneStandardAnalyzerInstance;
case FullText:

View File

@ -8,12 +8,16 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class N4CharGramAnalyzer extends Analyzer {
public class NCharGramAnalyzer extends Analyzer {
private final boolean words;
private final int minGram;
private final int maxGram;
/**
 * Creates an n-gram analyzer with configurable gram bounds.
 *
 * <p>Diff note: the first signature line is the removed single-argument constructor of
 * {@code N4CharGramAnalyzer}; the second is its replacement. The arguments are only stored
 * here; no range validation happens in the constructor.
 *
 * @param words flag later passed to {@code LuceneUtils.newCommonFilter}
 * @param minGram minimum gram size forwarded to {@code NGramTokenFilter}
 * @param maxGram maximum gram size forwarded to {@code NGramTokenFilter}
 */
public N4CharGramAnalyzer(boolean words) {
public NCharGramAnalyzer(boolean words, int minGram, int maxGram) {
this.words = words;
this.minGram = minGram;
this.maxGram = maxGram;
}
@Override
@ -28,7 +32,7 @@ public class N4CharGramAnalyzer extends Analyzer {
tokenStream = tokenizer;
}
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
tokenStream = new NGramTokenFilter(tokenStream, 3, 5, false);
tokenStream = new NGramTokenFilter(tokenStream, minGram, maxGram, false);
return new TokenStreamComponents(tokenizer, tokenStream);
}

View File

@ -8,12 +8,16 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class N4CharGramEdgeAnalyzer extends Analyzer {
public class NCharGramEdgeAnalyzer extends Analyzer {
private final boolean words;
private final int minGram;
private final int maxGram;
/**
 * Creates an edge n-gram analyzer with configurable gram bounds.
 *
 * <p>Diff note: the first signature line is the removed single-argument constructor of
 * {@code N4CharGramEdgeAnalyzer}; the second is its replacement. The arguments are only stored
 * here; no range validation happens in the constructor.
 *
 * @param words flag later passed to {@code LuceneUtils.newCommonFilter}
 * @param minGram minimum gram size forwarded to {@code EdgeNGramTokenFilter}
 * @param maxGram maximum gram size forwarded to {@code EdgeNGramTokenFilter}
 */
public N4CharGramEdgeAnalyzer(boolean words) {
public NCharGramEdgeAnalyzer(boolean words, int minGram, int maxGram) {
this.words = words;
this.minGram = minGram;
this.maxGram = maxGram;
}
@Override
@ -28,7 +32,7 @@ public class N4CharGramEdgeAnalyzer extends Analyzer {
tokenStream = tokenizer;
}
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
tokenStream = new EdgeNGramTokenFilter(tokenStream, 3, 5, false);
tokenStream = new EdgeNGramTokenFilter(tokenStream, minGram, maxGram, false);
return new TokenStreamComponents(tokenizer, tokenStream);
}

View File

@ -1,10 +1,14 @@
package it.cavallium.dbengine.lucene.analyzer;
public enum TextFieldsAnalyzer {
PartialWords,
PartialWordsEdge,
PartialString,
PartialStringEdge,
N4GramPartialWords,
N4GramPartialWordsEdge,
N4GramPartialString,
N4GramPartialStringEdge,
N3To5GramPartialWords,
N3To5GramPartialWordsEdge,
N3To5GramPartialString,
N3To5GramPartialStringEdge,
Standard,
WordSimple,
WordWithStopwordsStripping,