Rewrite the custom analyzers

This commit is contained in:
Andrea Cavalli 2022-01-11 22:23:07 +01:00
parent ecde6724e5
commit 4e5e4423ff
12 changed files with 92 additions and 1161 deletions

View File

@@ -6,7 +6,7 @@ import java.util.Map;
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
public static IndicizerAnalyzers of() {
return of(TextFieldsAnalyzer.FullText);
return of(TextFieldsAnalyzer.ICUCollationKey);
}
public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {

View File

@@ -7,7 +7,7 @@ import java.util.Map;
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
public static IndicizerSimilarities of() {
return of(TextFieldsSimilarity.BM25Plus);
return of(TextFieldsSimilarity.BM25Standard);
}
public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {

View File

@@ -8,7 +8,6 @@ import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.query.QueryParser;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
import it.cavallium.dbengine.database.LLKeyScore;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
@@ -41,11 +40,9 @@ import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -63,7 +60,6 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TimeLimitingCollector;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.similarities.BooleanSimilarity;
@@ -88,20 +84,15 @@ public class LuceneUtils {
private static final Logger logger = LogManager.getLogger(LuceneUtils.class);
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4);
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 4, 4);
private static final Analyzer lucene3To5GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 3, 5);
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
private static final Analyzer luceneEdge4GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(4, 4);
private static final Analyzer lucene4GramAnalyzerInstance = new NCharGramAnalyzer(4, 4);
private static final Analyzer luceneEdge3To5GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(3, 5);
private static final Analyzer lucene3To5GramAnalyzerInstance = new NCharGramAnalyzer(3, 5);
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false,true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(true, true);
private static final Similarity luceneBM25StandardSimilarityInstance = new org.apache.lucene.search.similarities.BM25Similarity();
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
@@ -121,23 +112,26 @@ public class LuceneUtils {
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
// TODO: remove this default page limits and make the limits configurable into QueryParams
private static final PageLimits DEFAULT_PAGE_LIMITS = new ExponentialPageLimits();
private static final CharArraySet ENGLISH_AND_ITALIAN_STOP_WORDS;
static {
var cas = new CharArraySet(
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.size() + ItalianAnalyzer.getDefaultStopSet().size(), true);
cas.addAll(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
cas.addAll(ItalianAnalyzer.getDefaultStopSet());
ENGLISH_AND_ITALIAN_STOP_WORDS = CharArraySet.unmodifiableSet(cas);
}
@SuppressWarnings("DuplicatedCode")
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
return switch (analyzer) {
case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
case N4Gram -> lucene4GramAnalyzerInstance;
case N4GramEdge -> luceneEdge4GramAnalyzerEdgeInstance;
case N3To5Gram -> lucene3To5GramAnalyzerInstance;
case N3To5GramEdge -> luceneEdge3To5GramAnalyzerEdgeInstance;
case Standard -> luceneStandardAnalyzerInstance;
case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
case WordWithStemming -> luceneWordAnalyzerStemInstance;
case WordSimple -> luceneWordAnalyzerSimpleInstance;
case StandardMultilanguage -> luceneWordAnalyzerStemInstance;
case StandardSimple -> luceneWordAnalyzerSimpleInstance;
case ICUCollationKey -> luceneICUCollationKeyInstance;
//noinspection UnnecessaryDefault
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
@@ -147,6 +141,7 @@ public class LuceneUtils {
@SuppressWarnings("DuplicatedCode")
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
return switch (similarity) {
case BM25Standard -> luceneBM25StandardSimilarityInstance;
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
case BM25L -> luceneBM25LSimilarityInstance;
@@ -169,26 +164,6 @@ public class LuceneUtils {
};
}
/**
*
* @param stem Enable stem filters on words.
* Pass false if it will be used with a n-gram filter
*/
public static TokenStream newCommonFilter(TokenStream tokenStream, boolean stem) {
tokenStream = newCommonNormalizer(tokenStream);
if (stem) {
tokenStream = new KStemFilter(tokenStream);
tokenStream = new EnglishPossessiveFilter(tokenStream);
}
return tokenStream;
}
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
tokenStream = new ASCIIFoldingFilter(tokenStream);
tokenStream = new LowerCaseFilter(tokenStream);
return tokenStream;
}
/**
* @throws NoSuchElementException when the key is not found
* @throws IOException when an error occurs when reading the document
@@ -532,7 +507,7 @@ public class LuceneUtils {
mlt.setMinDocFreq(3);
mlt.setMaxDocFreqPct(20);
mlt.setBoost(localQueryParams.needsScores());
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
mlt.setStopWords(ENGLISH_AND_ITALIAN_STOP_WORDS);
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
mlt.setSimilarity(tfidfSimilarity);
} else {

View File

@@ -6,40 +6,22 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class NCharGramAnalyzer extends Analyzer {
private final boolean words;
private final int minGram;
private final int maxGram;
public NCharGramAnalyzer(boolean words, int minGram, int maxGram) {
this.words = words;
public NCharGramAnalyzer(int minGram, int maxGram) {
this.minGram = minGram;
this.maxGram = maxGram;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer tokenizer;
TokenStream tokenStream;
if (words) {
tokenizer = new StandardTokenizer();
} else {
tokenizer = new KeywordTokenizer();
}
tokenStream = tokenizer;
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
tokenStream = new NGramTokenFilter(tokenStream, minGram, maxGram, false);
return new TokenStreamComponents(tokenizer, tokenStream);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream tokenStream = in;
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
return tokenStream;
Tokenizer tokenizer = new NGramTokenizer(minGram, maxGram);
return new TokenStreamComponents(tokenizer);
}
}

View File

@@ -6,40 +6,24 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class NCharGramEdgeAnalyzer extends Analyzer {
private final boolean words;
private final int minGram;
private final int maxGram;
public NCharGramEdgeAnalyzer(boolean words, int minGram, int maxGram) {
this.words = words;
public NCharGramEdgeAnalyzer(int minGram, int maxGram) {
this.minGram = minGram;
this.maxGram = maxGram;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer tokenizer;
TokenStream tokenStream;
if (words) {
tokenizer = new StandardTokenizer();
} else {
tokenizer = new KeywordTokenizer();
}
tokenStream = tokenizer;
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
tokenStream = new EdgeNGramTokenFilter(tokenStream, minGram, maxGram, false);
return new TokenStreamComponents(tokenizer, tokenStream);
Tokenizer tokenizer = new EdgeNGramTokenizer(minGram, maxGram);
return new TokenStreamComponents(tokenizer);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream tokenStream = in;
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
return tokenStream;
}
}

View File

@@ -1,18 +1,12 @@
package it.cavallium.dbengine.lucene.analyzer;
public enum TextFieldsAnalyzer {
N4GramPartialWords,
N4GramPartialWordsEdge,
N4GramPartialString,
N4GramPartialStringEdge,
N3To5GramPartialWords,
N3To5GramPartialWordsEdge,
N3To5GramPartialString,
N3To5GramPartialStringEdge,
N4Gram,
N4GramEdge,
N3To5Gram,
N3To5GramEdge,
Standard,
WordSimple,
StandardSimple,
ICUCollationKey,
WordWithStopwordsStripping,
WordWithStemming,
FullText,
StandardMultilanguage
}

View File

@@ -1,6 +1,7 @@
package it.cavallium.dbengine.lucene.analyzer;
public enum TextFieldsSimilarity {
BM25Standard,
BM25Classic,
NGramBM25Classic,
BM25L,

View File

@@ -2,54 +2,70 @@ package it.cavallium.dbengine.lucene.analyzer;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
import it.cavallium.dbengine.lucene.LuceneUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory;
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.it.ItalianLightStemFilterFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class WordAnalyzer extends Analyzer {
private static final EnglishMinimalStemFilterFactory ENGLISH_MINIMAL_STEM_FILTER_FACTORY = new EnglishMinimalStemFilterFactory();
private static final ItalianLightStemFilterFactory ITALIAN_LIGHT_STEM_FILTER_FACTORY = new ItalianLightStemFilterFactory();
private static final Collator ROOT_COLLATOR = Collator.getInstance(ULocale.ROOT);
private static final ICUCollationAttributeFactory ROOT_ICU_ATTRIBUTE_FACTORY = new ICUCollationAttributeFactory(ROOT_COLLATOR);
private final boolean icu;
private final boolean removeStopWords;
private final boolean stem;
public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) {
public WordAnalyzer(boolean icu, boolean stem) {
this.icu = icu;
this.removeStopWords = removeStopWords;
this.stem = stem;
if (icu) {
if (!stem) {
throw new IllegalArgumentException("stem must be true if icu is true");
}
}
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer tokenizer;
if (icu) {
tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
} else {
tokenizer = new StandardTokenizer();
}
var tokenizer = new KeywordTokenizer(ROOT_ICU_ATTRIBUTE_FACTORY, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
TokenStream tokenStream = tokenizer;
if (stem) {
tokenStream = new LengthFilter(tokenStream, 1, 120);
}
if (!icu) {
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
}
if (removeStopWords) {
tokenStream = new EnglishItalianStopFilter(tokenStream);
}
tokenStream = new ICUFoldingFilter(tokenStream);
return new TokenStreamComponents(tokenizer, tokenStream);
} else {
var maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
var standardTokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
standardTokenizer.setMaxTokenLength(maxTokenLength);
TokenStream tokenStream = standardTokenizer;
tokenStream = new LowerCaseFilter(tokenStream);
if (stem) {
tokenStream = ITALIAN_LIGHT_STEM_FILTER_FACTORY.create(ENGLISH_MINIMAL_STEM_FILTER_FACTORY.create(tokenStream));
}
return new TokenStreamComponents(r -> {
standardTokenizer.setMaxTokenLength(maxTokenLength);
standardTokenizer.setReader(r);
}, tokenStream);
}
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream tokenStream = in;
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
return tokenStream;
if (icu) {
return new ICUFoldingFilter(in);
} else {
return new LowerCaseFilter(in);
}
}
}

View File

@@ -83,7 +83,7 @@ public class LocalTemporaryDbGenerator implements TemporaryDbGenerator {
conn.getLuceneIndex(null,
"testluceneindex1",
1,
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
LUCENE_OPTS,
luceneHacks
@@ -91,7 +91,7 @@ public class LocalTemporaryDbGenerator implements TemporaryDbGenerator {
conn.getLuceneIndex("testluceneindex16",
null,
3,
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
LUCENE_OPTS,
luceneHacks

View File

@@ -54,7 +54,7 @@ public class MemoryTemporaryDbGenerator implements TemporaryDbGenerator {
conn.getLuceneIndex(null,
"testluceneindex1",
1,
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
LUCENE_OPTS,
luceneHacks
@@ -62,7 +62,7 @@ public class MemoryTemporaryDbGenerator implements TemporaryDbGenerator {
conn.getLuceneIndex("testluceneindex16",
null,
3,
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
LUCENE_OPTS,
luceneHacks

View File

@@ -57,11 +57,11 @@ public class StringIndicizer extends Indicizer<String, String> {
@Override
public IndicizerAnalyzers getPerFieldAnalyzer() {
return IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple);
return IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey);
}
@Override
public IndicizerSimilarities getPerFieldSimilarity() {
return IndicizerSimilarities.of(TextFieldsSimilarity.Boolean);
return IndicizerSimilarities.of(TextFieldsSimilarity.BM25Standard);
}
}