// CavalliumDBEngine/src/main/java/it/cavallium/dbengine/lucene/analyzer/WordAnalyzer.java

package it.cavallium.dbengine.lucene.analyzer;

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory;
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.it.ItalianLightStemFilter;
import org.apache.lucene.analysis.it.ItalianLightStemFilterFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class WordAnalyzer extends Analyzer {
2022-01-11 22:23:07 +01:00
private static final EnglishMinimalStemFilterFactory ENGLISH_MINIMAL_STEM_FILTER_FACTORY = new EnglishMinimalStemFilterFactory();
private static final ItalianLightStemFilterFactory ITALIAN_LIGHT_STEM_FILTER_FACTORY = new ItalianLightStemFilterFactory();
private static final Collator ROOT_COLLATOR = Collator.getInstance(ULocale.ROOT);
private static final ICUCollationAttributeFactory ROOT_ICU_ATTRIBUTE_FACTORY = new ICUCollationAttributeFactory(ROOT_COLLATOR);
2021-05-28 16:04:59 +02:00
private final boolean icu;
2020-12-07 22:15:18 +01:00
private final boolean stem;
2022-01-11 22:23:07 +01:00
public WordAnalyzer(boolean icu, boolean stem) {
2021-05-28 16:04:59 +02:00
this.icu = icu;
2020-12-07 22:15:18 +01:00
this.stem = stem;
2022-01-11 22:23:07 +01:00
if (icu) {
if (!stem) {
throw new IllegalArgumentException("stem must be true if icu is true");
}
}
2020-12-07 22:15:18 +01:00
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
2021-05-28 16:04:59 +02:00
if (icu) {
2022-01-11 22:23:07 +01:00
var tokenizer = new KeywordTokenizer(ROOT_ICU_ATTRIBUTE_FACTORY, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
TokenStream tokenStream = tokenizer;
tokenStream = new ICUFoldingFilter(tokenStream);
return new TokenStreamComponents(tokenizer, tokenStream);
2021-05-28 16:04:59 +02:00
} else {
2022-01-11 22:23:07 +01:00
var maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
var standardTokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
standardTokenizer.setMaxTokenLength(maxTokenLength);
TokenStream tokenStream = standardTokenizer;
tokenStream = new LowerCaseFilter(tokenStream);
if (stem) {
tokenStream = ITALIAN_LIGHT_STEM_FILTER_FACTORY.create(ENGLISH_MINIMAL_STEM_FILTER_FACTORY.create(tokenStream));
}
return new TokenStreamComponents(r -> {
standardTokenizer.setMaxTokenLength(maxTokenLength);
standardTokenizer.setReader(r);
}, tokenStream);
2021-05-28 16:04:59 +02:00
}
2020-12-07 22:15:18 +01:00
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
2022-01-11 22:23:07 +01:00
if (icu) {
return new ICUFoldingFilter(in);
} else {
return new LowerCaseFilter(in);
}
2020-12-07 22:15:18 +01:00
}
}