2021-02-04 22:42:57 +01:00
package it.cavallium.dbengine.lucene ;
2021-05-28 16:04:59 +02:00
import com.ibm.icu.text.Collator ;
import com.ibm.icu.util.ULocale ;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.client.CompositeSnapshot ;
2021-05-28 16:04:59 +02:00
import it.cavallium.dbengine.client.IndicizerAnalyzers ;
import it.cavallium.dbengine.client.IndicizerSimilarities ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.client.MultiSort ;
2021-03-27 03:35:27 +01:00
import it.cavallium.dbengine.client.SearchResult ;
import it.cavallium.dbengine.client.SearchResultItem ;
import it.cavallium.dbengine.client.SearchResultKey ;
import it.cavallium.dbengine.client.SearchResultKeys ;
2021-02-14 13:46:11 +01:00
import it.cavallium.dbengine.database.LLKeyScore ;
2021-03-27 03:35:27 +01:00
import it.cavallium.dbengine.database.LLSearchResultShard ;
2021-05-28 16:04:59 +02:00
import it.cavallium.dbengine.database.LLUtils ;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary ;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep ;
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter ;
2021-02-05 20:34:58 +01:00
import it.cavallium.dbengine.lucene.analyzer.NCharGramAnalyzer ;
import it.cavallium.dbengine.lucene.analyzer.NCharGramEdgeAnalyzer ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer ;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity ;
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer ;
2021-03-03 00:13:57 +01:00
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult ;
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity ;
2021-07-01 21:19:52 +02:00
import java.io.EOFException ;
2021-02-14 13:46:11 +01:00
import java.io.IOException ;
2021-07-01 21:19:52 +02:00
import java.nio.ByteBuffer ;
import java.nio.channels.FileChannel ;
2021-05-28 16:04:59 +02:00
import java.util.HashMap ;
2021-03-11 14:45:45 +01:00
import java.util.Map ;
import java.util.Map.Entry ;
2021-02-14 13:46:11 +01:00
import java.util.Set ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.analysis.Analyzer ;
import org.apache.lucene.analysis.LowerCaseFilter ;
import org.apache.lucene.analysis.TokenStream ;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter ;
import org.apache.lucene.analysis.en.KStemFilter ;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter ;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.analysis.standard.StandardAnalyzer ;
2021-02-14 13:46:11 +01:00
import org.apache.lucene.document.Document ;
import org.apache.lucene.index.IndexableField ;
import org.apache.lucene.search.IndexSearcher ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.search.similarities.BooleanSimilarity ;
import org.apache.lucene.search.similarities.ClassicSimilarity ;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.search.similarities.Similarity ;
import org.jetbrains.annotations.Nullable ;
import org.novasearch.lucene.search.similarities.BM25Similarity ;
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model ;
import org.novasearch.lucene.search.similarities.LdpSimilarity ;
import org.novasearch.lucene.search.similarities.LtcSimilarity ;
import org.novasearch.lucene.search.similarities.RobertsonSimilarity ;
2021-02-20 21:35:09 +01:00
import org.warp.commonutils.log.Logger ;
2021-02-04 22:42:57 +01:00
import reactor.core.publisher.Flux ;
2021-03-03 17:29:14 +01:00
import reactor.core.publisher.Mono ;
2021-02-04 22:42:57 +01:00
public class LuceneUtils {
2021-02-05 20:34:58 +01:00
// Shared n-gram analyzer instances, created once at class-load time and handed out by getAnalyzer.
// NOTE(review): judging by the field names, the boolean presumably selects word-based (true) vs.
// whole-string (false) tokenization and the two ints the minimum/maximum gram length — confirm
// against NCharGramAnalyzer / NCharGramEdgeAnalyzer.
private static final Analyzer lucene4GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( true , 4 , 4 ) ;
private static final Analyzer lucene4GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( false , 4 , 4 ) ;
private static final Analyzer lucene4GramWordsAnalyzerInstance = new NCharGramAnalyzer ( true , 4 , 4 ) ;
private static final Analyzer lucene4GramStringAnalyzerInstance = new NCharGramAnalyzer ( false , 4 , 4 ) ;
private static final Analyzer lucene3To5GramWordsAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( true , 3 , 5 ) ;
private static final Analyzer lucene3To5GramStringAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( false , 3 , 5 ) ;
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer ( true , 3 , 5 ) ;
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer ( false , 3 , 5 ) ;
2021-02-04 22:42:57 +01:00
// Shared StandardAnalyzer instance; getAnalyzer returns it for TextFieldsAnalyzer.Standard.
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer ( ) ;
2021-05-28 16:04:59 +02:00
// Shared WordAnalyzer instances, one per flag combination exposed through getAnalyzer.
// NOTE(review): judging by the field names, the second and third flags presumably enable
// stop-word removal and stemming respectively; the meaning of the first flag is not visible
// here — confirm against WordAnalyzer.
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer ( false , true , true ) ;
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer ( false , true , false ) ;
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer ( false , false , true ) ;
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer ( false , false , false ) ;
// NOTE(review): built with exactly the same arguments as luceneWordAnalyzerStopWordsAndStemInstance,
// so the ICUCollationKey analyzer type currently gets no distinct configuration — confirm whether
// an ICU-specific setup was intended.
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer ( false , true , true ) ;
2021-02-04 22:42:57 +01:00
// Shared Similarity instances, created once at class-load time and handed out by getSimilarity.
// Each plain variant has an "NGram" counterpart obtained from the NGramSimilarity factory methods.
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity ( BM25Model . CLASSIC ) ;
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity ( BM25Model . PLUS ) ;
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity ( BM25Model . L ) ;
// NOTE(review): the BM15/BM11 variants differ only in the second float (0.0 vs 1.0), presumably the
// BM25 length-normalization parameter "b" — confirm against the BM25Similarity constructor.
private static final Similarity luceneBM15PlusSimilarityInstance = new BM25Similarity ( 1 . 2f , 0 . 0f , 0 . 5f , BM25Model . PLUS ) ;
private static final Similarity luceneBM11PlusSimilarityInstance = new BM25Similarity ( 1 . 2f , 1 . 0f , 0 . 5f , BM25Model . PLUS ) ;
private static final Similarity luceneBM25ClassicNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . CLASSIC ) ;
private static final Similarity luceneBM25PlusNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . PLUS ) ;
private static final Similarity luceneBM25LNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . L ) ;
private static final Similarity luceneBM15PlusNGramSimilarityInstance = NGramSimilarity . bm15 ( BM25Model . PLUS ) ;
private static final Similarity luceneBM11PlusNGramSimilarityInstance = NGramSimilarity . bm11 ( BM25Model . PLUS ) ;
private static final Similarity luceneClassicSimilarityInstance = new ClassicSimilarity ( ) ;
private static final Similarity luceneClassicNGramSimilarityInstance = NGramSimilarity . classic ( ) ;
private static final Similarity luceneLTCSimilarityInstance = new LtcSimilarity ( ) ;
private static final Similarity luceneLDPSimilarityInstance = new LdpSimilarity ( ) ;
// NOTE(review): the name suggests the first argument (0) disables length normalization — confirm
// against the LdpSimilarity constructor.
private static final Similarity luceneLDPNoLengthSimilarityInstance = new LdpSimilarity ( 0 , 0 . 5f ) ;
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity ( ) ;
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity ( ) ;
2021-05-28 16:04:59 +02:00
@SuppressWarnings ( " DuplicatedCode " )
2021-02-04 22:42:57 +01:00
public static Analyzer getAnalyzer ( TextFieldsAnalyzer analyzer ) {
2021-05-28 16:04:59 +02:00
return switch ( analyzer ) {
case N4GramPartialWords - > lucene4GramWordsAnalyzerInstance ;
case N4GramPartialString - > lucene4GramStringAnalyzerInstance ;
case N4GramPartialWordsEdge - > lucene4GramWordsAnalyzerEdgeInstance ;
case N4GramPartialStringEdge - > lucene4GramStringAnalyzerEdgeInstance ;
case N3To5GramPartialWords - > lucene3To5GramWordsAnalyzerInstance ;
case N3To5GramPartialString - > lucene3To5GramStringAnalyzerInstance ;
case N3To5GramPartialWordsEdge - > lucene3To5GramWordsAnalyzerEdgeInstance ;
case N3To5GramPartialStringEdge - > lucene3To5GramStringAnalyzerEdgeInstance ;
case Standard - > luceneStandardAnalyzerInstance ;
case FullText - > luceneWordAnalyzerStopWordsAndStemInstance ;
case WordWithStopwordsStripping - > luceneWordAnalyzerStopWordsInstance ;
case WordWithStemming - > luceneWordAnalyzerStemInstance ;
case WordSimple - > luceneWordAnalyzerSimpleInstance ;
case ICUCollationKey - > luceneICUCollationKeyInstance ;
//noinspection UnnecessaryDefault
default - > throw new UnsupportedOperationException ( " Unknown analyzer: " + analyzer ) ;
} ;
2021-02-04 22:42:57 +01:00
}
2021-05-28 16:04:59 +02:00
@SuppressWarnings ( " DuplicatedCode " )
2021-02-04 22:42:57 +01:00
public static Similarity getSimilarity ( TextFieldsSimilarity similarity ) {
2021-05-28 16:04:59 +02:00
return switch ( similarity ) {
case BM25Classic - > luceneBM25ClassicSimilarityInstance ;
case NGramBM25Classic - > luceneBM25ClassicNGramSimilarityInstance ;
case BM25L - > luceneBM25LSimilarityInstance ;
case NGramBM25L - > luceneBM25LNGramSimilarityInstance ;
case Classic - > luceneClassicSimilarityInstance ;
case NGramClassic - > luceneClassicNGramSimilarityInstance ;
case BM25Plus - > luceneBM25PlusSimilarityInstance ;
case NGramBM25Plus - > luceneBM25PlusNGramSimilarityInstance ;
case BM15Plus - > luceneBM15PlusSimilarityInstance ;
case NGramBM15Plus - > luceneBM15PlusNGramSimilarityInstance ;
case BM11Plus - > luceneBM11PlusSimilarityInstance ;
case NGramBM11Plus - > luceneBM11PlusNGramSimilarityInstance ;
case LTC - > luceneLTCSimilarityInstance ;
case LDP - > luceneLDPSimilarityInstance ;
case LDPNoLength - > luceneLDPNoLengthSimilarityInstance ;
case Robertson - > luceneRobertsonSimilarityInstance ;
case Boolean - > luceneBooleanSimilarityInstance ;
//noinspection UnnecessaryDefault
default - > throw new IllegalStateException ( " Unknown similarity: " + similarity ) ;
} ;
2021-02-04 22:42:57 +01:00
}
/ * *
*
* @param stem Enable stem filters on words .
* Pass false if it will be used with a n - gram filter
* /
public static TokenStream newCommonFilter ( TokenStream tokenStream , boolean stem ) {
tokenStream = newCommonNormalizer ( tokenStream ) ;
if ( stem ) {
tokenStream = new KStemFilter ( tokenStream ) ;
tokenStream = new EnglishPossessiveFilter ( tokenStream ) ;
}
return tokenStream ;
}
/**
 * Wraps a token stream with an {@link ASCIIFoldingFilter} followed by a {@link LowerCaseFilter}.
 *
 * @param tokenStream the stream to wrap
 * @return the wrapped stream
 */
public static TokenStream newCommonNormalizer(TokenStream tokenStream) {
  TokenStream asciiFolded = new ASCIIFoldingFilter(tokenStream);
  return new LowerCaseFilter(asciiFolded);
}
/ * *
* Merge streams together maintaining absolute order
* /
public static < T > Flux < T > mergeStream ( Flux < Flux < T > > mappedMultiResults ,
@Nullable MultiSort < T > sort ,
2021-04-01 19:48:25 +02:00
long offset ,
2021-02-22 01:37:17 +01:00
@Nullable Long limit ) {
2021-02-04 22:42:57 +01:00
if ( limit ! = null & & limit = = 0 ) {
return mappedMultiResults . flatMap ( f - > f ) . ignoreElements ( ) . flux ( ) ;
2021-03-05 16:17:37 +01:00
} else {
return mappedMultiResults . collectList ( ) . flatMapMany ( mappedMultiResultsList - > {
Flux < T > mergedFlux ;
if ( sort = = null ) {
mergedFlux = Flux . merge ( mappedMultiResultsList ) ;
} else {
//noinspection unchecked
mergedFlux = Flux . mergeOrdered ( 32 , sort . getResultSort ( ) , mappedMultiResultsList . toArray ( Flux [ ] : : new ) ) ;
}
2021-04-01 19:48:25 +02:00
Flux < T > offsetedFlux ;
if ( offset > 0 ) {
offsetedFlux = mergedFlux . skip ( offset ) ;
} else {
offsetedFlux = mergedFlux ;
}
2021-03-05 16:17:37 +01:00
if ( limit = = null | | limit = = Long . MAX_VALUE ) {
2021-04-01 19:48:25 +02:00
return offsetedFlux ;
2021-03-05 16:17:37 +01:00
} else {
2021-05-22 14:46:54 +02:00
return offsetedFlux . take ( limit , true ) ;
2021-03-05 16:17:37 +01:00
}
} ) ;
2021-02-04 22:42:57 +01:00
}
}
2021-02-14 13:46:11 +01:00
2021-03-03 00:13:57 +01:00
public static HandleResult collectTopDoc ( Logger logger ,
2021-02-14 13:46:11 +01:00
int docId ,
float score ,
Float minCompetitiveScore ,
IndexSearcher indexSearcher ,
String keyFieldName ,
2021-03-03 00:13:57 +01:00
ResultItemConsumer resultsConsumer ) throws IOException {
2021-02-14 13:46:11 +01:00
if ( minCompetitiveScore = = null | | score > = minCompetitiveScore ) {
Document d = indexSearcher . doc ( docId , Set . of ( keyFieldName ) ) ;
if ( d . getFields ( ) . isEmpty ( ) ) {
logger . error ( " The document docId: {}, score: {} is empty. " , docId , score ) ;
var realFields = indexSearcher . doc ( docId ) . getFields ( ) ;
if ( ! realFields . isEmpty ( ) ) {
logger . error ( " Present fields: " ) ;
for ( IndexableField field : realFields ) {
logger . error ( " - {} " , field . name ( ) ) ;
}
}
} else {
var field = d . getField ( keyFieldName ) ;
if ( field = = null ) {
logger . error ( " Can't get key of document docId: {}, score: {} " , docId , score ) ;
} else {
2021-03-03 00:13:57 +01:00
if ( resultsConsumer . accept ( new LLKeyScore ( field . stringValue ( ) , score ) ) = = HandleResult . HALT ) {
return HandleResult . HALT ;
}
2021-02-14 13:46:11 +01:00
}
}
}
2021-03-03 00:13:57 +01:00
return HandleResult . CONTINUE ;
2021-02-14 13:46:11 +01:00
}
2021-03-03 17:29:14 +01:00
2021-03-27 03:35:27 +01:00
public static < T > Mono < SearchResultKeys < T > > mergeSignalStreamKeys ( Flux < SearchResultKeys < T > > mappedKeys ,
MultiSort < SearchResultKey < T > > sort ,
2021-04-01 19:48:25 +02:00
long offset ,
2021-03-03 17:29:14 +01:00
Long limit ) {
2021-03-27 03:35:27 +01:00
return mappedKeys . reduce (
new SearchResultKeys < > ( Flux . empty ( ) , 0L ) ,
2021-05-21 00:19:40 +02:00
( a , b ) - > new SearchResultKeys < > ( LuceneUtils . mergeStream ( Flux . just ( a . results ( ) , b . results ( ) ) ,
2021-04-03 19:09:06 +02:00
sort ,
offset ,
limit
2021-05-21 00:19:40 +02:00
) , a . totalHitsCount ( ) + b . totalHitsCount ( ) )
2021-03-27 03:35:27 +01:00
) ;
2021-03-03 17:29:14 +01:00
}
2021-03-27 03:35:27 +01:00
public static < T , U > Mono < SearchResult < T , U > > mergeSignalStreamItems ( Flux < SearchResult < T , U > > mappedKeys ,
MultiSort < SearchResultItem < T , U > > sort ,
2021-04-01 19:48:25 +02:00
long offset ,
2021-03-03 17:29:14 +01:00
Long limit ) {
2021-03-27 03:35:27 +01:00
return mappedKeys . reduce (
new SearchResult < > ( Flux . empty ( ) , 0L ) ,
2021-05-21 00:19:40 +02:00
( a , b ) - > new SearchResult < > ( LuceneUtils . mergeStream ( Flux . just ( a . results ( ) , b . results ( ) ) ,
2021-04-03 19:09:06 +02:00
sort ,
offset ,
limit
2021-05-21 00:19:40 +02:00
) , a . totalHitsCount ( ) + b . totalHitsCount ( ) )
2021-03-27 03:35:27 +01:00
) ;
}
2021-03-03 20:00:58 +01:00
2021-03-27 03:35:27 +01:00
public static Mono < LLSearchResultShard > mergeSignalStreamRaw ( Flux < LLSearchResultShard > mappedKeys ,
MultiSort < LLKeyScore > mappedSort ,
2021-04-01 19:48:25 +02:00
long offset ,
2021-03-27 03:35:27 +01:00
Long limit ) {
return mappedKeys . reduce (
new LLSearchResultShard ( Flux . empty ( ) , 0 ) ,
( s1 , s2 ) - > new LLSearchResultShard (
2021-05-21 00:19:40 +02:00
LuceneUtils . mergeStream ( Flux . just ( s1 . results ( ) , s2 . results ( ) ) , mappedSort , offset , limit ) ,
s1 . totalHitsCount ( ) + s2 . totalHitsCount ( )
2021-03-27 03:35:27 +01:00
)
) ;
2021-03-03 17:29:14 +01:00
}
2021-03-11 14:45:45 +01:00
public static < T , U , V > ValueGetter < Entry < T , U > , V > getAsyncDbValueGetterDeep (
CompositeSnapshot snapshot ,
DatabaseMapDictionaryDeep < T , Map < U , V > , DatabaseMapDictionary < U , V > > dictionaryDeep ) {
return entry - > dictionaryDeep
. at ( snapshot , entry . getKey ( ) )
2021-05-12 21:41:47 +02:00
. flatMap ( sub - > sub . getValue ( snapshot , entry . getValue ( ) ) . doAfterTerminate ( sub : : release ) ) ;
2021-03-11 14:45:45 +01:00
}
2021-05-28 16:04:59 +02:00
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper ( IndicizerAnalyzers indicizerAnalyzers ) {
HashMap < String , Analyzer > perFieldAnalyzer = new HashMap < > ( ) ;
indicizerAnalyzers
. fieldAnalyzer ( )
. forEach ( ( key , value ) - > perFieldAnalyzer . put ( key , LuceneUtils . getAnalyzer ( value ) ) ) ;
return new PerFieldAnalyzerWrapper ( LuceneUtils . getAnalyzer ( indicizerAnalyzers . defaultAnalyzer ( ) ) , perFieldAnalyzer ) ;
}
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper ( IndicizerSimilarities indicizerSimilarities ) {
HashMap < String , Similarity > perFieldSimilarity = new HashMap < > ( ) ;
indicizerSimilarities
. fieldSimilarity ( )
. forEach ( ( key , value ) - > perFieldSimilarity . put ( key , LuceneUtils . getSimilarity ( value ) ) ) ;
var defaultSimilarity = LuceneUtils . getSimilarity ( indicizerSimilarities . defaultSimilarity ( ) ) ;
return new PerFieldSimilarityWrapper ( ) {
@Override
public Similarity get ( String name ) {
return perFieldSimilarity . getOrDefault ( name , defaultSimilarity ) ;
}
} ;
}
2021-07-01 21:19:52 +02:00
public static int alignUnsigned ( int number , boolean expand ) {
if ( number % 4096 ! = 0 ) {
if ( expand ) {
return number + ( 4096 - ( number % 4096 ) ) ;
} else {
return number - ( number % 4096 ) ;
}
} else {
return number ;
}
}
public static long alignUnsigned ( long number , boolean expand ) {
if ( number % 4096L ! = 0 ) {
if ( expand ) {
return number + ( 4096L - ( number % 4096L ) ) ;
} else {
return number - ( number % 4096L ) ;
}
} else {
return number ;
}
}
public static void readInternalAligned ( Object ref , FileChannel channel , long pos , ByteBuffer b , int readLength , int usefulLength , long end ) throws IOException {
int startBufPosition = b . position ( ) ;
int readData = 0 ;
int i ;
for ( ; readLength > 0 ; readLength - = i ) {
int toRead = readLength ;
b . limit ( b . position ( ) + toRead ) ;
assert b . remaining ( ) = = toRead ;
var beforeReadBufPosition = b . position ( ) ;
channel . read ( b , pos ) ;
b . limit ( Math . min ( startBufPosition + usefulLength , b . position ( ) + toRead ) ) ;
var afterReadBufPosition = b . position ( ) ;
i = ( afterReadBufPosition - beforeReadBufPosition ) ;
readData + = i ;
if ( i < toRead & & i > 0 ) {
if ( readData < usefulLength ) {
throw new EOFException ( " read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end ) ;
}
if ( readData = = usefulLength ) {
b . limit ( b . position ( ) ) ;
// File end reached
return ;
}
}
if ( i < 0 ) {
throw new EOFException ( " read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end ) ;
}
assert i > 0 : " FileChannel.read with non zero-length bb.remaining() must always read at least one byte (FileChannel is in blocking mode, see spec of ReadableByteChannel) " ;
pos + = ( long ) i ;
}
assert readLength = = 0 ;
}
2021-02-04 22:42:57 +01:00
}