Rewrite the custom analyzers
This commit is contained in:
parent
ecde6724e5
commit
4e5e4423ff
@ -6,7 +6,7 @@ import java.util.Map;
|
|||||||
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
|
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
|
||||||
|
|
||||||
public static IndicizerAnalyzers of() {
|
public static IndicizerAnalyzers of() {
|
||||||
return of(TextFieldsAnalyzer.FullText);
|
return of(TextFieldsAnalyzer.ICUCollationKey);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {
|
public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {
|
||||||
|
@ -7,7 +7,7 @@ import java.util.Map;
|
|||||||
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
|
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
|
||||||
|
|
||||||
public static IndicizerSimilarities of() {
|
public static IndicizerSimilarities of() {
|
||||||
return of(TextFieldsSimilarity.BM25Plus);
|
return of(TextFieldsSimilarity.BM25Standard);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {
|
public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,6 @@ import it.cavallium.dbengine.client.IndicizerSimilarities;
|
|||||||
import it.cavallium.dbengine.client.query.QueryParser;
|
import it.cavallium.dbengine.client.query.QueryParser;
|
||||||
import it.cavallium.dbengine.client.query.current.data.QueryParams;
|
import it.cavallium.dbengine.client.query.current.data.QueryParams;
|
||||||
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
|
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
|
||||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
|
||||||
import it.cavallium.dbengine.database.LLKeyScore;
|
import it.cavallium.dbengine.database.LLKeyScore;
|
||||||
import it.cavallium.dbengine.database.LLUtils;
|
import it.cavallium.dbengine.database.LLUtils;
|
||||||
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
|
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
|
||||||
@ -41,11 +40,9 @@ import java.util.stream.Collectors;
|
|||||||
import org.apache.logging.log4j.LogManager;
|
import org.apache.logging.log4j.LogManager;
|
||||||
import org.apache.logging.log4j.Logger;
|
import org.apache.logging.log4j.Logger;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.en.EnglishAnalyzer;
|
||||||
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
import org.apache.lucene.analysis.it.ItalianAnalyzer;
|
||||||
import org.apache.lucene.analysis.en.KStemFilter;
|
|
||||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
|
||||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
@ -63,7 +60,6 @@ import org.apache.lucene.search.ScoreDoc;
|
|||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
import org.apache.lucene.search.TimeLimitingCollector;
|
import org.apache.lucene.search.TimeLimitingCollector;
|
||||||
import org.apache.lucene.search.TopDocs;
|
import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.search.TopDocsCollector;
|
|
||||||
import org.apache.lucene.search.TopFieldDocs;
|
import org.apache.lucene.search.TopFieldDocs;
|
||||||
import org.apache.lucene.search.TotalHits;
|
import org.apache.lucene.search.TotalHits;
|
||||||
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
||||||
@ -88,20 +84,15 @@ public class LuceneUtils {
|
|||||||
|
|
||||||
private static final Logger logger = LogManager.getLogger(LuceneUtils.class);
|
private static final Logger logger = LogManager.getLogger(LuceneUtils.class);
|
||||||
|
|
||||||
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4);
|
private static final Analyzer luceneEdge4GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(4, 4);
|
||||||
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4);
|
private static final Analyzer lucene4GramAnalyzerInstance = new NCharGramAnalyzer(4, 4);
|
||||||
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4);
|
private static final Analyzer luceneEdge3To5GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(3, 5);
|
||||||
private static final Analyzer lucene4GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 4, 4);
|
private static final Analyzer lucene3To5GramAnalyzerInstance = new NCharGramAnalyzer(3, 5);
|
||||||
private static final Analyzer lucene3To5GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 3, 5);
|
|
||||||
private static final Analyzer lucene3To5GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 3, 5);
|
|
||||||
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
|
|
||||||
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
|
|
||||||
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
|
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
|
||||||
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
|
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false,true);
|
||||||
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
|
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
|
||||||
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
|
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(true, true);
|
||||||
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
|
private static final Similarity luceneBM25StandardSimilarityInstance = new org.apache.lucene.search.similarities.BM25Similarity();
|
||||||
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
|
|
||||||
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
|
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
|
||||||
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
|
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
|
||||||
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
|
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
|
||||||
@ -121,23 +112,26 @@ public class LuceneUtils {
|
|||||||
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
|
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
|
||||||
// TODO: remove this default page limits and make the limits configurable into QueryParams
|
// TODO: remove this default page limits and make the limits configurable into QueryParams
|
||||||
private static final PageLimits DEFAULT_PAGE_LIMITS = new ExponentialPageLimits();
|
private static final PageLimits DEFAULT_PAGE_LIMITS = new ExponentialPageLimits();
|
||||||
|
private static final CharArraySet ENGLISH_AND_ITALIAN_STOP_WORDS;
|
||||||
|
|
||||||
|
static {
|
||||||
|
var cas = new CharArraySet(
|
||||||
|
EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.size() + ItalianAnalyzer.getDefaultStopSet().size(), true);
|
||||||
|
cas.addAll(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||||
|
cas.addAll(ItalianAnalyzer.getDefaultStopSet());
|
||||||
|
ENGLISH_AND_ITALIAN_STOP_WORDS = CharArraySet.unmodifiableSet(cas);
|
||||||
|
}
|
||||||
|
|
||||||
@SuppressWarnings("DuplicatedCode")
|
@SuppressWarnings("DuplicatedCode")
|
||||||
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
|
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
|
||||||
return switch (analyzer) {
|
return switch (analyzer) {
|
||||||
case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
|
case N4Gram -> lucene4GramAnalyzerInstance;
|
||||||
case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
|
case N4GramEdge -> luceneEdge4GramAnalyzerEdgeInstance;
|
||||||
case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
|
case N3To5Gram -> lucene3To5GramAnalyzerInstance;
|
||||||
case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
|
case N3To5GramEdge -> luceneEdge3To5GramAnalyzerEdgeInstance;
|
||||||
case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
|
|
||||||
case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
|
|
||||||
case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
|
|
||||||
case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
|
|
||||||
case Standard -> luceneStandardAnalyzerInstance;
|
case Standard -> luceneStandardAnalyzerInstance;
|
||||||
case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
|
case StandardMultilanguage -> luceneWordAnalyzerStemInstance;
|
||||||
case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
|
case StandardSimple -> luceneWordAnalyzerSimpleInstance;
|
||||||
case WordWithStemming -> luceneWordAnalyzerStemInstance;
|
|
||||||
case WordSimple -> luceneWordAnalyzerSimpleInstance;
|
|
||||||
case ICUCollationKey -> luceneICUCollationKeyInstance;
|
case ICUCollationKey -> luceneICUCollationKeyInstance;
|
||||||
//noinspection UnnecessaryDefault
|
//noinspection UnnecessaryDefault
|
||||||
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
||||||
@ -147,6 +141,7 @@ public class LuceneUtils {
|
|||||||
@SuppressWarnings("DuplicatedCode")
|
@SuppressWarnings("DuplicatedCode")
|
||||||
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
|
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
|
||||||
return switch (similarity) {
|
return switch (similarity) {
|
||||||
|
case BM25Standard -> luceneBM25StandardSimilarityInstance;
|
||||||
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
|
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
|
||||||
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
|
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
|
||||||
case BM25L -> luceneBM25LSimilarityInstance;
|
case BM25L -> luceneBM25LSimilarityInstance;
|
||||||
@ -169,26 +164,6 @@ public class LuceneUtils {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param stem Enable stem filters on words.
|
|
||||||
* Pass false if it will be used with a n-gram filter
|
|
||||||
*/
|
|
||||||
public static TokenStream newCommonFilter(TokenStream tokenStream, boolean stem) {
|
|
||||||
tokenStream = newCommonNormalizer(tokenStream);
|
|
||||||
if (stem) {
|
|
||||||
tokenStream = new KStemFilter(tokenStream);
|
|
||||||
tokenStream = new EnglishPossessiveFilter(tokenStream);
|
|
||||||
}
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
|
|
||||||
tokenStream = new ASCIIFoldingFilter(tokenStream);
|
|
||||||
tokenStream = new LowerCaseFilter(tokenStream);
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @throws NoSuchElementException when the key is not found
|
* @throws NoSuchElementException when the key is not found
|
||||||
* @throws IOException when an error occurs when reading the document
|
* @throws IOException when an error occurs when reading the document
|
||||||
@ -532,7 +507,7 @@ public class LuceneUtils {
|
|||||||
mlt.setMinDocFreq(3);
|
mlt.setMinDocFreq(3);
|
||||||
mlt.setMaxDocFreqPct(20);
|
mlt.setMaxDocFreqPct(20);
|
||||||
mlt.setBoost(localQueryParams.needsScores());
|
mlt.setBoost(localQueryParams.needsScores());
|
||||||
mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
|
mlt.setStopWords(ENGLISH_AND_ITALIAN_STOP_WORDS);
|
||||||
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
|
if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
|
||||||
mlt.setSimilarity(tfidfSimilarity);
|
mlt.setSimilarity(tfidfSimilarity);
|
||||||
} else {
|
} else {
|
||||||
|
@ -6,40 +6,22 @@ import org.apache.lucene.analysis.TokenStream;
|
|||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
public class NCharGramAnalyzer extends Analyzer {
|
public class NCharGramAnalyzer extends Analyzer {
|
||||||
|
|
||||||
private final boolean words;
|
|
||||||
private final int minGram;
|
private final int minGram;
|
||||||
private final int maxGram;
|
private final int maxGram;
|
||||||
|
|
||||||
public NCharGramAnalyzer(boolean words, int minGram, int maxGram) {
|
public NCharGramAnalyzer(int minGram, int maxGram) {
|
||||||
this.words = words;
|
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||||
Tokenizer tokenizer;
|
Tokenizer tokenizer = new NGramTokenizer(minGram, maxGram);
|
||||||
TokenStream tokenStream;
|
return new TokenStreamComponents(tokenizer);
|
||||||
if (words) {
|
|
||||||
tokenizer = new StandardTokenizer();
|
|
||||||
} else {
|
|
||||||
tokenizer = new KeywordTokenizer();
|
|
||||||
}
|
|
||||||
tokenStream = tokenizer;
|
|
||||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
|
|
||||||
tokenStream = new NGramTokenFilter(tokenStream, minGram, maxGram, false);
|
|
||||||
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
|
||||||
TokenStream tokenStream = in;
|
|
||||||
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
|
|
||||||
return tokenStream;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,40 +6,24 @@ import org.apache.lucene.analysis.TokenStream;
|
|||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||||
|
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
public class NCharGramEdgeAnalyzer extends Analyzer {
|
public class NCharGramEdgeAnalyzer extends Analyzer {
|
||||||
|
|
||||||
private final boolean words;
|
|
||||||
private final int minGram;
|
private final int minGram;
|
||||||
private final int maxGram;
|
private final int maxGram;
|
||||||
|
|
||||||
public NCharGramEdgeAnalyzer(boolean words, int minGram, int maxGram) {
|
public NCharGramEdgeAnalyzer(int minGram, int maxGram) {
|
||||||
this.words = words;
|
|
||||||
this.minGram = minGram;
|
this.minGram = minGram;
|
||||||
this.maxGram = maxGram;
|
this.maxGram = maxGram;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||||
Tokenizer tokenizer;
|
Tokenizer tokenizer = new EdgeNGramTokenizer(minGram, maxGram);
|
||||||
TokenStream tokenStream;
|
return new TokenStreamComponents(tokenizer);
|
||||||
if (words) {
|
|
||||||
tokenizer = new StandardTokenizer();
|
|
||||||
} else {
|
|
||||||
tokenizer = new KeywordTokenizer();
|
|
||||||
}
|
|
||||||
tokenStream = tokenizer;
|
|
||||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, words);
|
|
||||||
tokenStream = new EdgeNGramTokenFilter(tokenStream, minGram, maxGram, false);
|
|
||||||
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
|
||||||
TokenStream tokenStream = in;
|
|
||||||
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,18 +1,12 @@
|
|||||||
package it.cavallium.dbengine.lucene.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
public enum TextFieldsAnalyzer {
|
public enum TextFieldsAnalyzer {
|
||||||
N4GramPartialWords,
|
N4Gram,
|
||||||
N4GramPartialWordsEdge,
|
N4GramEdge,
|
||||||
N4GramPartialString,
|
N3To5Gram,
|
||||||
N4GramPartialStringEdge,
|
N3To5GramEdge,
|
||||||
N3To5GramPartialWords,
|
|
||||||
N3To5GramPartialWordsEdge,
|
|
||||||
N3To5GramPartialString,
|
|
||||||
N3To5GramPartialStringEdge,
|
|
||||||
Standard,
|
Standard,
|
||||||
WordSimple,
|
StandardSimple,
|
||||||
ICUCollationKey,
|
ICUCollationKey,
|
||||||
WordWithStopwordsStripping,
|
StandardMultilanguage
|
||||||
WordWithStemming,
|
|
||||||
FullText,
|
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package it.cavallium.dbengine.lucene.analyzer;
|
package it.cavallium.dbengine.lucene.analyzer;
|
||||||
|
|
||||||
public enum TextFieldsSimilarity {
|
public enum TextFieldsSimilarity {
|
||||||
|
BM25Standard,
|
||||||
BM25Classic,
|
BM25Classic,
|
||||||
NGramBM25Classic,
|
NGramBM25Classic,
|
||||||
BM25L,
|
BM25L,
|
||||||
|
@ -2,54 +2,70 @@ package it.cavallium.dbengine.lucene.analyzer;
|
|||||||
|
|
||||||
import com.ibm.icu.text.Collator;
|
import com.ibm.icu.text.Collator;
|
||||||
import com.ibm.icu.util.ULocale;
|
import com.ibm.icu.util.ULocale;
|
||||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
|
||||||
import it.cavallium.dbengine.lucene.LuceneUtils;
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory;
|
||||||
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
|
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
|
||||||
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
|
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
|
||||||
|
import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
|
||||||
|
import org.apache.lucene.analysis.it.ItalianLightStemFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
public class WordAnalyzer extends Analyzer {
|
public class WordAnalyzer extends Analyzer {
|
||||||
|
|
||||||
|
private static final EnglishMinimalStemFilterFactory ENGLISH_MINIMAL_STEM_FILTER_FACTORY = new EnglishMinimalStemFilterFactory();
|
||||||
|
private static final ItalianLightStemFilterFactory ITALIAN_LIGHT_STEM_FILTER_FACTORY = new ItalianLightStemFilterFactory();
|
||||||
|
private static final Collator ROOT_COLLATOR = Collator.getInstance(ULocale.ROOT);
|
||||||
|
private static final ICUCollationAttributeFactory ROOT_ICU_ATTRIBUTE_FACTORY = new ICUCollationAttributeFactory(ROOT_COLLATOR);
|
||||||
|
|
||||||
private final boolean icu;
|
private final boolean icu;
|
||||||
private final boolean removeStopWords;
|
|
||||||
private final boolean stem;
|
private final boolean stem;
|
||||||
|
|
||||||
public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) {
|
public WordAnalyzer(boolean icu, boolean stem) {
|
||||||
this.icu = icu;
|
this.icu = icu;
|
||||||
this.removeStopWords = removeStopWords;
|
|
||||||
this.stem = stem;
|
this.stem = stem;
|
||||||
|
if (icu) {
|
||||||
|
if (!stem) {
|
||||||
|
throw new IllegalArgumentException("stem must be true if icu is true");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||||
Tokenizer tokenizer;
|
|
||||||
if (icu) {
|
if (icu) {
|
||||||
tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
|
var tokenizer = new KeywordTokenizer(ROOT_ICU_ATTRIBUTE_FACTORY, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
|
||||||
|
TokenStream tokenStream = tokenizer;
|
||||||
|
tokenStream = new ICUFoldingFilter(tokenStream);
|
||||||
|
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||||
} else {
|
} else {
|
||||||
tokenizer = new StandardTokenizer();
|
var maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||||
|
var standardTokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
|
||||||
|
standardTokenizer.setMaxTokenLength(maxTokenLength);
|
||||||
|
TokenStream tokenStream = standardTokenizer;
|
||||||
|
tokenStream = new LowerCaseFilter(tokenStream);
|
||||||
|
if (stem) {
|
||||||
|
tokenStream = ITALIAN_LIGHT_STEM_FILTER_FACTORY.create(ENGLISH_MINIMAL_STEM_FILTER_FACTORY.create(tokenStream));
|
||||||
|
}
|
||||||
|
return new TokenStreamComponents(r -> {
|
||||||
|
standardTokenizer.setMaxTokenLength(maxTokenLength);
|
||||||
|
standardTokenizer.setReader(r);
|
||||||
|
}, tokenStream);
|
||||||
}
|
}
|
||||||
TokenStream tokenStream = tokenizer;
|
|
||||||
if (stem) {
|
|
||||||
tokenStream = new LengthFilter(tokenStream, 1, 120);
|
|
||||||
}
|
|
||||||
if (!icu) {
|
|
||||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
|
|
||||||
}
|
|
||||||
if (removeStopWords) {
|
|
||||||
tokenStream = new EnglishItalianStopFilter(tokenStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||||
TokenStream tokenStream = in;
|
if (icu) {
|
||||||
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
|
return new ICUFoldingFilter(in);
|
||||||
return tokenStream;
|
} else {
|
||||||
|
return new LowerCaseFilter(in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -83,7 +83,7 @@ public class LocalTemporaryDbGenerator implements TemporaryDbGenerator {
|
|||||||
conn.getLuceneIndex(null,
|
conn.getLuceneIndex(null,
|
||||||
"testluceneindex1",
|
"testluceneindex1",
|
||||||
1,
|
1,
|
||||||
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
|
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
|
||||||
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
||||||
LUCENE_OPTS,
|
LUCENE_OPTS,
|
||||||
luceneHacks
|
luceneHacks
|
||||||
@ -91,7 +91,7 @@ public class LocalTemporaryDbGenerator implements TemporaryDbGenerator {
|
|||||||
conn.getLuceneIndex("testluceneindex16",
|
conn.getLuceneIndex("testluceneindex16",
|
||||||
null,
|
null,
|
||||||
3,
|
3,
|
||||||
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
|
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
|
||||||
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
||||||
LUCENE_OPTS,
|
LUCENE_OPTS,
|
||||||
luceneHacks
|
luceneHacks
|
||||||
|
@ -54,7 +54,7 @@ public class MemoryTemporaryDbGenerator implements TemporaryDbGenerator {
|
|||||||
conn.getLuceneIndex(null,
|
conn.getLuceneIndex(null,
|
||||||
"testluceneindex1",
|
"testluceneindex1",
|
||||||
1,
|
1,
|
||||||
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
|
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
|
||||||
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
||||||
LUCENE_OPTS,
|
LUCENE_OPTS,
|
||||||
luceneHacks
|
luceneHacks
|
||||||
@ -62,7 +62,7 @@ public class MemoryTemporaryDbGenerator implements TemporaryDbGenerator {
|
|||||||
conn.getLuceneIndex("testluceneindex16",
|
conn.getLuceneIndex("testluceneindex16",
|
||||||
null,
|
null,
|
||||||
3,
|
3,
|
||||||
IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple),
|
IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey),
|
||||||
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
IndicizerSimilarities.of(TextFieldsSimilarity.Boolean),
|
||||||
LUCENE_OPTS,
|
LUCENE_OPTS,
|
||||||
luceneHacks
|
luceneHacks
|
||||||
|
@ -57,11 +57,11 @@ public class StringIndicizer extends Indicizer<String, String> {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public IndicizerAnalyzers getPerFieldAnalyzer() {
|
public IndicizerAnalyzers getPerFieldAnalyzer() {
|
||||||
return IndicizerAnalyzers.of(TextFieldsAnalyzer.WordSimple);
|
return IndicizerAnalyzers.of(TextFieldsAnalyzer.ICUCollationKey);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public IndicizerSimilarities getPerFieldSimilarity() {
|
public IndicizerSimilarities getPerFieldSimilarity() {
|
||||||
return IndicizerSimilarities.of(TextFieldsSimilarity.Boolean);
|
return IndicizerSimilarities.of(TextFieldsSimilarity.BM25Standard);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user