CavalliumDBEngine/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java

333 lines
14 KiB
Java
Raw Normal View History

package it.cavallium.dbengine.lucene;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.client.CompositeSnapshot;
2021-05-28 16:04:59 +02:00
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.client.query.QueryParser;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
2021-02-05 20:34:58 +01:00
import it.cavallium.dbengine.lucene.analyzer.NCharGramAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.NCharGramEdgeAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity;
2021-07-01 21:19:52 +02:00
import java.io.EOFException;
2021-02-14 13:46:11 +01:00
import java.io.IOException;
2021-07-01 21:19:52 +02:00
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
2021-05-28 16:04:59 +02:00
import java.util.HashMap;
2021-03-11 14:45:45 +01:00
import java.util.Map;
import java.util.Map.Entry;
2021-02-14 13:46:11 +01:00
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
2021-02-14 13:46:11 +01:00
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
2021-02-14 13:46:11 +01:00
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.jetbrains.annotations.Nullable;
import org.novasearch.lucene.search.similarities.BM25Similarity;
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model;
import org.novasearch.lucene.search.similarities.LdpSimilarity;
import org.novasearch.lucene.search.similarities.LtcSimilarity;
import org.novasearch.lucene.search.similarities.RobertsonSimilarity;
2021-02-20 21:35:09 +01:00
import org.warp.commonutils.log.Logger;
public class LuceneUtils {
2021-02-05 20:34:58 +01:00
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 4, 4);
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 4, 4);
private static final Analyzer lucene4GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 4, 4);
private static final Analyzer lucene3To5GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer(false, 3, 5);
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
2021-05-28 16:04:59 +02:00
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
private static final Similarity luceneBM15PlusSimilarityInstance = new BM25Similarity(1.2f, 0.0f, 0.5f, BM25Model.PLUS);
private static final Similarity luceneBM11PlusSimilarityInstance = new BM25Similarity(1.2f, 1.0f, 0.5f, BM25Model.PLUS);
private static final Similarity luceneBM25ClassicNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.CLASSIC);
private static final Similarity luceneBM25PlusNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.PLUS);
private static final Similarity luceneBM25LNGramSimilarityInstance = NGramSimilarity.bm25(BM25Model.L);
private static final Similarity luceneBM15PlusNGramSimilarityInstance = NGramSimilarity.bm15(BM25Model.PLUS);
private static final Similarity luceneBM11PlusNGramSimilarityInstance = NGramSimilarity.bm11(BM25Model.PLUS);
private static final Similarity luceneClassicSimilarityInstance = new ClassicSimilarity();
private static final Similarity luceneClassicNGramSimilarityInstance = NGramSimilarity.classic();
private static final Similarity luceneLTCSimilarityInstance = new LtcSimilarity();
private static final Similarity luceneLDPSimilarityInstance = new LdpSimilarity();
private static final Similarity luceneLDPNoLengthSimilarityInstance = new LdpSimilarity(0, 0.5f);
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity();
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
2021-05-28 16:04:59 +02:00
@SuppressWarnings("DuplicatedCode")
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
2021-05-28 16:04:59 +02:00
return switch (analyzer) {
case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
case Standard -> luceneStandardAnalyzerInstance;
case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
case WordWithStemming -> luceneWordAnalyzerStemInstance;
case WordSimple -> luceneWordAnalyzerSimpleInstance;
case ICUCollationKey -> luceneICUCollationKeyInstance;
//noinspection UnnecessaryDefault
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
};
}
2021-05-28 16:04:59 +02:00
@SuppressWarnings("DuplicatedCode")
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
2021-05-28 16:04:59 +02:00
return switch (similarity) {
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
case BM25L -> luceneBM25LSimilarityInstance;
case NGramBM25L -> luceneBM25LNGramSimilarityInstance;
case Classic -> luceneClassicSimilarityInstance;
case NGramClassic -> luceneClassicNGramSimilarityInstance;
case BM25Plus -> luceneBM25PlusSimilarityInstance;
case NGramBM25Plus -> luceneBM25PlusNGramSimilarityInstance;
case BM15Plus -> luceneBM15PlusSimilarityInstance;
case NGramBM15Plus -> luceneBM15PlusNGramSimilarityInstance;
case BM11Plus -> luceneBM11PlusSimilarityInstance;
case NGramBM11Plus -> luceneBM11PlusNGramSimilarityInstance;
case LTC -> luceneLTCSimilarityInstance;
case LDP -> luceneLDPSimilarityInstance;
case LDPNoLength -> luceneLDPNoLengthSimilarityInstance;
case Robertson -> luceneRobertsonSimilarityInstance;
case Boolean -> luceneBooleanSimilarityInstance;
//noinspection UnnecessaryDefault
default -> throw new IllegalStateException("Unknown similarity: " + similarity);
};
}
/**
*
* @param stem Enable stem filters on words.
* Pass false if it will be used with a n-gram filter
*/
public static TokenStream newCommonFilter(TokenStream tokenStream, boolean stem) {
tokenStream = newCommonNormalizer(tokenStream);
if (stem) {
tokenStream = new KStemFilter(tokenStream);
tokenStream = new EnglishPossessiveFilter(tokenStream);
}
return tokenStream;
}
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
tokenStream = new ASCIIFoldingFilter(tokenStream);
tokenStream = new LowerCaseFilter(tokenStream);
return tokenStream;
}
2021-07-04 01:34:17 +02:00
/**
*
* @return false if the result is not relevant
2021-07-04 01:34:17 +02:00
*/
@Nullable
public static boolean filterTopDoc(float score, Float minCompetitiveScore) {
return minCompetitiveScore == null || score >= minCompetitiveScore;
}
@Nullable
public static String keyOfTopDoc(Logger logger, int docId, IndexReader indexReader,
String keyFieldName) throws IOException {
if (docId > indexReader.maxDoc()) {
logger.warn("Document " + docId + " > maxDoc (" +indexReader.maxDoc() + ")");
return null;
}
Document d = indexReader.document(docId, Set.of(keyFieldName));
if (d.getFields().isEmpty()) {
StringBuilder sb = new StringBuilder();
sb.append("The document docId: ").append(docId).append(" is empty.");
var realFields = indexReader.document(docId).getFields();
if (!realFields.isEmpty()) {
sb.append("\n");
logger.error("Present fields:\n");
boolean first = true;
for (IndexableField field : realFields) {
if (first) {
first = false;
} else {
sb.append("\n");
2021-07-04 01:34:17 +02:00
}
sb.append(" - ").append(field.name());
2021-07-04 01:34:17 +02:00
}
}
throw new IOException(sb.toString());
2021-07-04 01:34:17 +02:00
} else {
var field = d.getField(keyFieldName);
if (field == null) {
throw new IOException("Can't get key of document docId: " + docId);
} else {
return field.stringValue();
}
2021-07-04 01:34:17 +02:00
}
}
2021-03-11 14:45:45 +01:00
public static <T, U, V> ValueGetter<Entry<T, U>, V> getAsyncDbValueGetterDeep(
CompositeSnapshot snapshot,
DatabaseMapDictionaryDeep<T, Map<U, V>, DatabaseMapDictionary<U, V>> dictionaryDeep) {
return entry -> dictionaryDeep
.at(snapshot, entry.getKey())
.flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release));
2021-03-11 14:45:45 +01:00
}
2021-05-28 16:04:59 +02:00
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper(IndicizerAnalyzers indicizerAnalyzers) {
HashMap<String, Analyzer> perFieldAnalyzer = new HashMap<>();
indicizerAnalyzers
.fieldAnalyzer()
.forEach((key, value) -> perFieldAnalyzer.put(key, LuceneUtils.getAnalyzer(value)));
return new PerFieldAnalyzerWrapper(LuceneUtils.getAnalyzer(indicizerAnalyzers.defaultAnalyzer()), perFieldAnalyzer);
}
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper(IndicizerSimilarities indicizerSimilarities) {
HashMap<String, Similarity> perFieldSimilarity = new HashMap<>();
indicizerSimilarities
.fieldSimilarity()
.forEach((key, value) -> perFieldSimilarity.put(key, LuceneUtils.getSimilarity(value)));
var defaultSimilarity = LuceneUtils.getSimilarity(indicizerSimilarities.defaultSimilarity());
return new PerFieldSimilarityWrapper() {
@Override
public Similarity get(String name) {
return perFieldSimilarity.getOrDefault(name, defaultSimilarity);
}
};
}
2021-07-01 21:19:52 +02:00
public static int alignUnsigned(int number, boolean expand) {
if (number % 4096 != 0) {
if (expand) {
return number + (4096 - (number % 4096));
} else {
return number - (number % 4096);
}
} else {
return number;
}
}
public static long alignUnsigned(long number, boolean expand) {
if (number % 4096L != 0) {
if (expand) {
return number + (4096L - (number % 4096L));
} else {
return number - (number % 4096L);
}
} else {
return number;
}
}
public static void readInternalAligned(Object ref, FileChannel channel, long pos, ByteBuffer b, int readLength, int usefulLength, long end) throws IOException {
int startBufPosition = b.position();
int readData = 0;
int i;
for(; readLength > 0; readLength -= i) {
int toRead = readLength;
b.limit(b.position() + toRead);
assert b.remaining() == toRead;
var beforeReadBufPosition = b.position();
channel.read(b, pos);
b.limit(Math.min(startBufPosition + usefulLength, b.position() + toRead));
var afterReadBufPosition = b.position();
i = (afterReadBufPosition - beforeReadBufPosition);
readData += i;
if (i < toRead && i > 0) {
if (readData < usefulLength) {
throw new EOFException("read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end);
}
if (readData == usefulLength) {
b.limit(b.position());
// File end reached
return;
}
}
if (i < 0) {
throw new EOFException("read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end);
}
assert i > 0 : "FileChannel.read with non zero-length bb.remaining() must always read at least one byte (FileChannel is in blocking mode, see spec of ReadableByteChannel)";
pos += (long)i;
}
assert readLength == 0;
}
public static int safeLongToInt(long l) {
if (l > 2147483630) {
return 2147483630;
} else if (l < -2147483630) {
return -2147483630;
} else {
return (int) l;
}
}
@Nullable
public static FieldDoc getLastFieldDoc(ScoreDoc[] scoreDocs) {
if (scoreDocs == null) {
return null;
}
if (scoreDocs.length == 0) {
return null;
}
return (FieldDoc) scoreDocs[scoreDocs.length - 1];
}
@Nullable
public static ScoreDoc getLastScoreDoc(ScoreDoc[] scoreDocs) {
if (scoreDocs == null) {
return null;
}
if (scoreDocs.length == 0) {
return null;
}
return scoreDocs[scoreDocs.length - 1];
}
2021-07-06 01:30:37 +02:00
public static LocalQueryParams toLocalQueryParams(QueryParams queryParams) {
return new LocalQueryParams(QueryParser.toQuery(queryParams.query()),
safeLongToInt(queryParams.offset()),
safeLongToInt(queryParams.limit()),
queryParams.minCompetitiveScore().getNullable(),
QueryParser.toSort(queryParams.sort()),
QueryParser.toScoreMode(queryParams.scoreMode())
);
}
}