40 lines
1.2 KiB
Java
40 lines
1.2 KiB
Java
package it.cavallium.dbengine.database.analyzer;
|
|
|
|
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
|
import it.cavallium.dbengine.database.LuceneUtils;
|
|
import org.apache.lucene.analysis.Analyzer;
|
|
import org.apache.lucene.analysis.TokenStream;
|
|
import org.apache.lucene.analysis.Tokenizer;
|
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|
|
|
public class WordAnalyzer extends Analyzer {
|
|
|
|
private final boolean removeStopWords;
|
|
private final boolean stem;
|
|
|
|
public WordAnalyzer(boolean removeStopWords, boolean stem) {
|
|
this.removeStopWords = removeStopWords;
|
|
this.stem = stem;
|
|
}
|
|
|
|
@Override
|
|
protected TokenStreamComponents createComponents(final String fieldName) {
|
|
Tokenizer tokenizer = new StandardTokenizer();
|
|
TokenStream tokenStream = tokenizer;
|
|
//tokenStream = new LengthFilter(tokenStream, 1, 100);
|
|
if (removeStopWords) {
|
|
tokenStream = new EnglishItalianStopFilter(tokenStream);
|
|
}
|
|
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
|
|
|
|
return new TokenStreamComponents(tokenizer, tokenStream);
|
|
}
|
|
|
|
@Override
|
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
|
TokenStream tokenStream = in;
|
|
tokenStream = LuceneUtils.newCommonNormalizer(tokenStream);
|
|
return tokenStream;
|
|
}
|
|
}
|