2021-02-04 22:42:57 +01:00
package it.cavallium.dbengine.lucene ;
2021-12-16 16:14:44 +01:00
import static it.cavallium.dbengine.client.UninterruptibleScheduler.uninterruptibleScheduler ;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.client.CompositeSnapshot ;
2021-05-28 16:04:59 +02:00
import it.cavallium.dbengine.client.IndicizerAnalyzers ;
import it.cavallium.dbengine.client.IndicizerSimilarities ;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.client.query.QueryParser ;
import it.cavallium.dbengine.client.query.current.data.QueryParams ;
2021-08-04 01:12:39 +02:00
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount ;
2021-07-08 17:01:56 +02:00
import it.cavallium.dbengine.database.LLKeyScore ;
2021-09-23 20:57:28 +02:00
import it.cavallium.dbengine.database.LLUtils ;
2021-03-11 14:45:45 +01:00
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary ;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep ;
2021-07-17 11:52:08 +02:00
import it.cavallium.dbengine.database.collections.ValueGetter ;
2021-09-20 12:51:27 +02:00
import it.cavallium.dbengine.database.disk.LLIndexSearchers ;
2022-01-12 16:18:31 +01:00
import it.cavallium.dbengine.lucene.analyzer.LegacyWordAnalyzer ;
2021-02-05 20:34:58 +01:00
import it.cavallium.dbengine.lucene.analyzer.NCharGramAnalyzer ;
import it.cavallium.dbengine.lucene.analyzer.NCharGramEdgeAnalyzer ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer ;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity ;
import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer ;
2021-09-29 13:18:06 +02:00
import it.cavallium.dbengine.lucene.mlt.BigCompositeReader ;
2021-09-20 11:35:01 +02:00
import it.cavallium.dbengine.lucene.mlt.MultiMoreLikeThis ;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity ;
2021-12-18 18:16:56 +01:00
import it.unimi.dsi.fastutil.objects.Object2ObjectSortedMap ;
2021-07-01 21:19:52 +02:00
import java.io.EOFException ;
2021-02-14 13:46:11 +01:00
import java.io.IOException ;
2021-07-01 21:19:52 +02:00
import java.nio.ByteBuffer ;
import java.nio.channels.FileChannel ;
2021-12-12 23:40:30 +01:00
import java.time.Duration ;
2021-12-16 16:14:44 +01:00
import java.util.ArrayList ;
2021-05-28 16:04:59 +02:00
import java.util.HashMap ;
2021-09-20 11:35:01 +02:00
import java.util.List ;
2021-03-11 14:45:45 +01:00
import java.util.Map ;
import java.util.Map.Entry ;
2021-07-17 23:06:26 +02:00
import java.util.NoSuchElementException ;
2021-09-20 11:35:01 +02:00
import java.util.Set ;
2021-07-17 23:06:26 +02:00
import java.util.stream.Collectors ;
2021-12-17 01:48:49 +01:00
import org.apache.logging.log4j.LogManager ;
import org.apache.logging.log4j.Logger ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.analysis.Analyzer ;
2022-01-11 22:23:07 +01:00
import org.apache.lucene.analysis.CharArraySet ;
import org.apache.lucene.analysis.en.EnglishAnalyzer ;
import org.apache.lucene.analysis.it.ItalianAnalyzer ;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.analysis.standard.StandardAnalyzer ;
2021-02-14 13:46:11 +01:00
import org.apache.lucene.document.Document ;
2021-07-06 00:30:14 +02:00
import org.apache.lucene.index.IndexReader ;
2021-02-14 13:46:11 +01:00
import org.apache.lucene.index.IndexableField ;
2021-09-20 11:35:01 +02:00
import org.apache.lucene.search.BooleanClause.Occur ;
2021-09-29 13:18:06 +02:00
import org.apache.lucene.search.BooleanQuery.Builder ;
2021-12-12 23:40:30 +01:00
import org.apache.lucene.search.Collector ;
2021-09-20 11:35:01 +02:00
import org.apache.lucene.search.ConstantScoreQuery ;
2021-09-22 11:03:39 +02:00
import org.apache.lucene.search.IndexSearcher ;
2021-09-20 11:35:01 +02:00
import org.apache.lucene.search.MatchAllDocsQuery ;
import org.apache.lucene.search.MatchNoDocsQuery ;
import org.apache.lucene.search.Query ;
2021-07-06 00:30:14 +02:00
import org.apache.lucene.search.ScoreDoc ;
2021-07-08 17:01:56 +02:00
import org.apache.lucene.search.Sort ;
2021-12-12 23:40:30 +01:00
import org.apache.lucene.search.TimeLimitingCollector ;
2021-07-08 17:01:56 +02:00
import org.apache.lucene.search.TopDocs ;
import org.apache.lucene.search.TopFieldDocs ;
2021-08-04 01:12:39 +02:00
import org.apache.lucene.search.TotalHits ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.search.similarities.BooleanSimilarity ;
import org.apache.lucene.search.similarities.ClassicSimilarity ;
2021-05-28 16:04:59 +02:00
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.search.similarities.Similarity ;
2021-09-20 11:35:01 +02:00
import org.apache.lucene.search.similarities.TFIDFSimilarity ;
2021-07-17 23:06:26 +02:00
import org.jetbrains.annotations.NotNull ;
2021-02-04 22:42:57 +01:00
import org.jetbrains.annotations.Nullable ;
import org.novasearch.lucene.search.similarities.BM25Similarity ;
import org.novasearch.lucene.search.similarities.BM25Similarity.BM25Model ;
import org.novasearch.lucene.search.similarities.LdpSimilarity ;
import org.novasearch.lucene.search.similarities.LtcSimilarity ;
import org.novasearch.lucene.search.similarities.RobertsonSimilarity ;
2021-07-08 17:01:56 +02:00
import reactor.core.publisher.Flux ;
2021-09-20 11:35:01 +02:00
import reactor.core.publisher.Mono ;
2021-09-07 11:28:03 +02:00
import reactor.core.scheduler.Schedulers ;
2021-09-18 18:34:21 +02:00
import reactor.util.concurrent.Queues ;
2021-09-20 11:35:01 +02:00
import reactor.util.function.Tuple2 ;
2021-02-04 22:42:57 +01:00
public class LuceneUtils {
2021-07-17 23:06:26 +02:00
2021-12-17 01:48:49 +01:00
private static final Logger logger = LogManager . getLogger ( LuceneUtils . class ) ;
2021-07-17 23:06:26 +02:00
2022-01-11 22:23:07 +01:00
private static final Analyzer luceneEdge4GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( 4 , 4 ) ;
private static final Analyzer lucene4GramAnalyzerInstance = new NCharGramAnalyzer ( 4 , 4 ) ;
private static final Analyzer luceneEdge3To5GramAnalyzerEdgeInstance = new NCharGramEdgeAnalyzer ( 3 , 5 ) ;
private static final Analyzer lucene3To5GramAnalyzerInstance = new NCharGramAnalyzer ( 3 , 5 ) ;
2021-02-04 22:42:57 +01:00
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer ( ) ;
2022-01-12 16:18:31 +01:00
private static final Analyzer luceneWordAnalyzerLegacy1Instance = new LegacyWordAnalyzer ( false , true , true ) ;
private static final Analyzer luceneWordAnalyzerLegacy2Instance = new LegacyWordAnalyzer ( false , false , true ) ;
2022-01-18 00:02:55 +01:00
private static final Analyzer luceneWordAnalyzerLegacy3Instance = new LegacyWordAnalyzer ( false , true , true ) ;
2022-01-11 22:23:07 +01:00
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer ( false , true ) ;
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer ( false , false ) ;
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer ( true , true ) ;
private static final Similarity luceneBM25StandardSimilarityInstance = new org . apache . lucene . search . similarities . BM25Similarity ( ) ;
2021-02-04 22:42:57 +01:00
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity ( BM25Model . CLASSIC ) ;
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity ( BM25Model . PLUS ) ;
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity ( BM25Model . L ) ;
private static final Similarity luceneBM15PlusSimilarityInstance = new BM25Similarity ( 1 . 2f , 0 . 0f , 0 . 5f , BM25Model . PLUS ) ;
private static final Similarity luceneBM11PlusSimilarityInstance = new BM25Similarity ( 1 . 2f , 1 . 0f , 0 . 5f , BM25Model . PLUS ) ;
private static final Similarity luceneBM25ClassicNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . CLASSIC ) ;
private static final Similarity luceneBM25PlusNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . PLUS ) ;
private static final Similarity luceneBM25LNGramSimilarityInstance = NGramSimilarity . bm25 ( BM25Model . L ) ;
private static final Similarity luceneBM15PlusNGramSimilarityInstance = NGramSimilarity . bm15 ( BM25Model . PLUS ) ;
private static final Similarity luceneBM11PlusNGramSimilarityInstance = NGramSimilarity . bm11 ( BM25Model . PLUS ) ;
private static final Similarity luceneClassicSimilarityInstance = new ClassicSimilarity ( ) ;
private static final Similarity luceneClassicNGramSimilarityInstance = NGramSimilarity . classic ( ) ;
private static final Similarity luceneLTCSimilarityInstance = new LtcSimilarity ( ) ;
private static final Similarity luceneLDPSimilarityInstance = new LdpSimilarity ( ) ;
private static final Similarity luceneLDPNoLengthSimilarityInstance = new LdpSimilarity ( 0 , 0 . 5f ) ;
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity ( ) ;
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity ( ) ;
2021-09-20 18:20:59 +02:00
// TODO: remove this default page limits and make the limits configurable into QueryParams
private static final PageLimits DEFAULT_PAGE_LIMITS = new ExponentialPageLimits ( ) ;
2022-01-11 22:23:07 +01:00
/** Unmodifiable, case-insensitive union of the default English and Italian stop word sets. */
private static final CharArraySet ENGLISH_AND_ITALIAN_STOP_WORDS;

static {
  var englishStopWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
  var italianStopWords = ItalianAnalyzer.getDefaultStopSet();
  // Pre-size the set for both sources; the second argument makes lookups ignore case.
  var merged = new CharArraySet(englishStopWords.size() + italianStopWords.size(), true);
  merged.addAll(englishStopWords);
  merged.addAll(italianStopWords);
  ENGLISH_AND_ITALIAN_STOP_WORDS = CharArraySet.unmodifiableSet(merged);
}
2021-02-04 22:42:57 +01:00
2021-05-28 16:04:59 +02:00
@SuppressWarnings ( " DuplicatedCode " )
2021-02-04 22:42:57 +01:00
public static Analyzer getAnalyzer ( TextFieldsAnalyzer analyzer ) {
2021-05-28 16:04:59 +02:00
return switch ( analyzer ) {
2022-01-11 22:23:07 +01:00
case N4Gram - > lucene4GramAnalyzerInstance ;
case N4GramEdge - > luceneEdge4GramAnalyzerEdgeInstance ;
case N3To5Gram - > lucene3To5GramAnalyzerInstance ;
case N3To5GramEdge - > luceneEdge3To5GramAnalyzerEdgeInstance ;
2021-05-28 16:04:59 +02:00
case Standard - > luceneStandardAnalyzerInstance ;
2022-01-11 22:23:07 +01:00
case StandardMultilanguage - > luceneWordAnalyzerStemInstance ;
2022-01-12 16:18:31 +01:00
case LegacyFullText - > luceneWordAnalyzerLegacy1Instance ;
case LegacyWordWithStemming - > luceneWordAnalyzerLegacy2Instance ;
2022-01-18 00:02:55 +01:00
case LegacyICU - > luceneWordAnalyzerLegacy3Instance ;
2022-01-11 22:23:07 +01:00
case StandardSimple - > luceneWordAnalyzerSimpleInstance ;
2021-05-28 16:04:59 +02:00
case ICUCollationKey - > luceneICUCollationKeyInstance ;
//noinspection UnnecessaryDefault
default - > throw new UnsupportedOperationException ( " Unknown analyzer: " + analyzer ) ;
} ;
2021-02-04 22:42:57 +01:00
}
2021-05-28 16:04:59 +02:00
@SuppressWarnings ( " DuplicatedCode " )
2021-02-04 22:42:57 +01:00
public static Similarity getSimilarity ( TextFieldsSimilarity similarity ) {
2021-05-28 16:04:59 +02:00
return switch ( similarity ) {
2022-01-11 22:23:07 +01:00
case BM25Standard - > luceneBM25StandardSimilarityInstance ;
2021-05-28 16:04:59 +02:00
case BM25Classic - > luceneBM25ClassicSimilarityInstance ;
case NGramBM25Classic - > luceneBM25ClassicNGramSimilarityInstance ;
case BM25L - > luceneBM25LSimilarityInstance ;
case NGramBM25L - > luceneBM25LNGramSimilarityInstance ;
case Classic - > luceneClassicSimilarityInstance ;
case NGramClassic - > luceneClassicNGramSimilarityInstance ;
case BM25Plus - > luceneBM25PlusSimilarityInstance ;
case NGramBM25Plus - > luceneBM25PlusNGramSimilarityInstance ;
case BM15Plus - > luceneBM15PlusSimilarityInstance ;
case NGramBM15Plus - > luceneBM15PlusNGramSimilarityInstance ;
case BM11Plus - > luceneBM11PlusSimilarityInstance ;
case NGramBM11Plus - > luceneBM11PlusNGramSimilarityInstance ;
case LTC - > luceneLTCSimilarityInstance ;
case LDP - > luceneLDPSimilarityInstance ;
case LDPNoLength - > luceneLDPNoLengthSimilarityInstance ;
case Robertson - > luceneRobertsonSimilarityInstance ;
case Boolean - > luceneBooleanSimilarityInstance ;
//noinspection UnnecessaryDefault
default - > throw new IllegalStateException ( " Unknown similarity: " + similarity ) ;
} ;
2021-02-04 22:42:57 +01:00
}
2021-07-17 23:06:26 +02:00
/ * *
* @throws NoSuchElementException when the key is not found
* @throws IOException when an error occurs when reading the document
* /
@NotNull
2021-07-08 17:01:56 +02:00
public static String keyOfTopDoc ( int docId , IndexReader indexReader ,
2021-07-17 23:06:26 +02:00
String keyFieldName ) throws IOException , NoSuchElementException {
2021-09-05 14:23:46 +02:00
if ( Schedulers . isInNonBlockingThread ( ) ) {
throw new UnsupportedOperationException ( " Called keyOfTopDoc in a nonblocking thread " ) ;
}
2021-07-06 00:30:14 +02:00
if ( docId > indexReader . maxDoc ( ) ) {
2021-07-08 17:01:56 +02:00
throw new IOException ( " Document " + docId + " > maxDoc ( " + indexReader . maxDoc ( ) + " ) " ) ;
2021-07-06 00:30:14 +02:00
}
2021-07-27 00:32:30 +02:00
DocumentStoredSingleFieldVisitor visitor = new DocumentStoredSingleFieldVisitor ( keyFieldName ) ;
indexReader . document ( docId , visitor ) ;
Document d = visitor . getDocument ( ) ;
2021-07-05 12:05:45 +02:00
if ( d . getFields ( ) . isEmpty ( ) ) {
2021-07-17 23:06:26 +02:00
throw new NoSuchElementException (
" Can't get key (field \" " + keyFieldName + " \" ) of document docId: " + docId + " . Available fields: [] " ) ;
2021-07-04 01:34:17 +02:00
} else {
2021-07-05 12:05:45 +02:00
var field = d . getField ( keyFieldName ) ;
if ( field = = null ) {
2021-07-17 23:06:26 +02:00
throw new NoSuchElementException (
" Can't get key (field \" " + keyFieldName + " \" ) of document docId: " + docId + " . Available fields: " + d
. getFields ( )
. stream ( )
. map ( IndexableField : : name )
. collect ( Collectors . joining ( " , " , " [ " , " ] " ) ) ) ;
2021-07-05 12:05:45 +02:00
} else {
return field . stringValue ( ) ;
}
2021-07-04 01:34:17 +02:00
}
}
2021-03-11 14:45:45 +01:00
public static < T , U , V > ValueGetter < Entry < T , U > , V > getAsyncDbValueGetterDeep (
CompositeSnapshot snapshot ,
2021-12-18 18:16:56 +01:00
DatabaseMapDictionaryDeep < T , Object2ObjectSortedMap < U , V > , DatabaseMapDictionary < U , V > > dictionaryDeep ) {
2021-09-23 20:57:28 +02:00
return entry - > LLUtils . usingResource ( dictionaryDeep
. at ( snapshot , entry . getKey ( ) ) , sub - > sub . getValue ( snapshot , entry . getValue ( ) ) , true ) ;
2021-03-11 14:45:45 +01:00
}
2021-05-28 16:04:59 +02:00
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper ( IndicizerAnalyzers indicizerAnalyzers ) {
HashMap < String , Analyzer > perFieldAnalyzer = new HashMap < > ( ) ;
indicizerAnalyzers
. fieldAnalyzer ( )
. forEach ( ( key , value ) - > perFieldAnalyzer . put ( key , LuceneUtils . getAnalyzer ( value ) ) ) ;
return new PerFieldAnalyzerWrapper ( LuceneUtils . getAnalyzer ( indicizerAnalyzers . defaultAnalyzer ( ) ) , perFieldAnalyzer ) ;
}
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper ( IndicizerSimilarities indicizerSimilarities ) {
HashMap < String , Similarity > perFieldSimilarity = new HashMap < > ( ) ;
indicizerSimilarities
. fieldSimilarity ( )
. forEach ( ( key , value ) - > perFieldSimilarity . put ( key , LuceneUtils . getSimilarity ( value ) ) ) ;
var defaultSimilarity = LuceneUtils . getSimilarity ( indicizerSimilarities . defaultSimilarity ( ) ) ;
return new PerFieldSimilarityWrapper ( ) {
@Override
public Similarity get ( String name ) {
return perFieldSimilarity . getOrDefault ( name , defaultSimilarity ) ;
}
} ;
}
2021-07-01 21:19:52 +02:00
public static int alignUnsigned ( int number , boolean expand ) {
if ( number % 4096 ! = 0 ) {
if ( expand ) {
return number + ( 4096 - ( number % 4096 ) ) ;
} else {
return number - ( number % 4096 ) ;
}
} else {
return number ;
}
}
public static long alignUnsigned ( long number , boolean expand ) {
if ( number % 4096L ! = 0 ) {
if ( expand ) {
return number + ( 4096L - ( number % 4096L ) ) ;
} else {
return number - ( number % 4096L ) ;
}
} else {
return number ;
}
}
2021-09-05 14:23:46 +02:00
public static void readInternalAligned ( Object ref ,
FileChannel channel ,
long pos ,
ByteBuffer b ,
int readLength ,
int usefulLength ,
long end ) throws IOException {
if ( Schedulers . isInNonBlockingThread ( ) ) {
throw new UnsupportedOperationException ( " Called readInternalAligned in a nonblocking thread " ) ;
}
2021-07-01 21:19:52 +02:00
int startBufPosition = b . position ( ) ;
int readData = 0 ;
int i ;
for ( ; readLength > 0 ; readLength - = i ) {
int toRead = readLength ;
b . limit ( b . position ( ) + toRead ) ;
assert b . remaining ( ) = = toRead ;
var beforeReadBufPosition = b . position ( ) ;
channel . read ( b , pos ) ;
b . limit ( Math . min ( startBufPosition + usefulLength , b . position ( ) + toRead ) ) ;
var afterReadBufPosition = b . position ( ) ;
i = ( afterReadBufPosition - beforeReadBufPosition ) ;
readData + = i ;
if ( i < toRead & & i > 0 ) {
if ( readData < usefulLength ) {
throw new EOFException ( " read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end ) ;
}
if ( readData = = usefulLength ) {
b . limit ( b . position ( ) ) ;
// File end reached
return ;
}
}
if ( i < 0 ) {
throw new EOFException ( " read past EOF: " + ref + " buffer: " + b + " chunkLen: " + toRead + " end: " + end ) ;
}
assert i > 0 : " FileChannel.read with non zero-length bb.remaining() must always read at least one byte (FileChannel is in blocking mode, see spec of ReadableByteChannel) " ;
2021-09-22 18:33:28 +02:00
pos + = i ;
2021-07-01 21:19:52 +02:00
}
assert readLength = = 0 ;
}
2021-07-06 00:30:14 +02:00
/**
 * Narrows a {@code long} to an {@code int}, clamping it to the range
 * {@code [-2147483630, 2147483630]} instead of overflowing.
 *
 * @param l the value to narrow
 * @return {@code l} as an int, or the nearest bound when it lies outside the range
 */
public static int safeLongToInt(long l) {
  final long bound = 2147483630L;
  long clamped = Math.max(-bound, Math.min(bound, l));
  return (int) clamped;
}
@Nullable
public static ScoreDoc getLastScoreDoc ( ScoreDoc [ ] scoreDocs ) {
if ( scoreDocs = = null ) {
return null ;
}
if ( scoreDocs . length = = 0 ) {
return null ;
}
return scoreDocs [ scoreDocs . length - 1 ] ;
}
2021-07-06 01:30:37 +02:00
2021-11-16 23:19:23 +01:00
public static LocalQueryParams toLocalQueryParams ( QueryParams queryParams , Analyzer analyzer ) {
return new LocalQueryParams ( QueryParser . toQuery ( queryParams . query ( ) , analyzer ) ,
2021-10-15 22:03:53 +02:00
queryParams . offset ( ) ,
queryParams . limit ( ) ,
2021-09-20 18:20:59 +02:00
DEFAULT_PAGE_LIMITS ,
2021-07-06 01:30:37 +02:00
queryParams . minCompetitiveScore ( ) . getNullable ( ) ,
QueryParser . toSort ( queryParams . sort ( ) ) ,
2021-12-12 23:40:30 +01:00
queryParams . computePreciseHitsCount ( ) ,
Duration . ofMillis ( queryParams . timeoutMilliseconds ( ) )
2021-07-06 01:30:37 +02:00
) ;
}
2021-07-08 17:01:56 +02:00
2021-09-18 18:34:21 +02:00
public static Flux < LLKeyScore > convertHits ( Flux < ScoreDoc > hitsFlux ,
2021-09-22 11:03:39 +02:00
List < IndexSearcher > indexSearchers ,
2021-07-08 17:01:56 +02:00
String keyFieldName ,
2021-07-30 14:01:12 +02:00
boolean preserveOrder ) {
2021-09-18 18:34:21 +02:00
if ( preserveOrder ) {
return hitsFlux
2021-12-16 16:14:44 +01:00
. publishOn ( uninterruptibleScheduler ( Schedulers . boundedElastic ( ) ) )
. mapNotNull ( hit - > mapHitBlocking ( hit , indexSearchers , keyFieldName ) )
. publishOn ( Schedulers . parallel ( ) ) ;
2021-09-18 18:34:21 +02:00
} else {
return hitsFlux
2021-12-16 16:14:44 +01:00
. buffer ( Queues . XS_BUFFER_SIZE , ( ) - > new ArrayList < Object > ( Queues . XS_BUFFER_SIZE ) )
. flatMap ( shardHits - > Mono . fromCallable ( ( ) - > {
for ( int i = 0 , size = shardHits . size ( ) ; i < size ; i + + ) {
shardHits . set ( i , mapHitBlocking ( ( ScoreDoc ) shardHits . get ( i ) , indexSearchers , keyFieldName ) ) ;
}
//noinspection unchecked
return ( List < LLKeyScore > ) ( List < ? > ) shardHits ;
} ) . subscribeOn ( uninterruptibleScheduler ( Schedulers . boundedElastic ( ) ) ) )
. flatMapIterable ( a - > a )
. publishOn ( Schedulers . parallel ( ) ) ;
2021-09-18 18:34:21 +02:00
}
2021-07-30 14:01:12 +02:00
}
@Nullable
private static LLKeyScore mapHitBlocking ( ScoreDoc hit ,
2021-09-22 11:03:39 +02:00
List < IndexSearcher > indexSearchers ,
2021-07-30 14:01:12 +02:00
String keyFieldName ) {
2021-12-16 16:14:44 +01:00
assert ! Schedulers . isInNonBlockingThread ( ) ;
2021-07-30 14:01:12 +02:00
int shardDocId = hit . doc ;
int shardIndex = hit . shardIndex ;
float score = hit . score ;
2021-12-16 16:14:44 +01:00
IndexSearcher indexSearcher ;
2021-09-22 11:03:39 +02:00
if ( shardIndex = = - 1 & & indexSearchers . size ( ) = = 1 ) {
2021-12-16 16:14:44 +01:00
indexSearcher = indexSearchers . get ( 0 ) ;
} else {
indexSearcher = indexSearchers . get ( shardIndex ) ;
2021-09-22 11:03:39 +02:00
}
2021-07-30 14:01:12 +02:00
try {
String collectedDoc = keyOfTopDoc ( shardDocId , indexSearcher . getIndexReader ( ) , keyFieldName ) ;
2021-08-24 11:06:25 +02:00
return new LLKeyScore ( shardDocId , score , collectedDoc ) ;
2021-07-30 14:01:12 +02:00
} catch ( NoSuchElementException ex ) {
2021-08-24 11:06:25 +02:00
logger . debug ( " Error: document {} key is not present! " , shardDocId ) ;
2021-07-30 14:01:12 +02:00
return null ;
} catch ( Exception ex ) {
2021-08-24 11:06:25 +02:00
logger . error ( " Failed to read document {} " , shardDocId , ex ) ;
return new LLKeyScore ( shardDocId , score , null ) ;
2021-07-30 14:01:12 +02:00
}
2021-07-08 17:01:56 +02:00
}
2021-10-13 00:23:56 +02:00
public static TopDocs mergeTopDocs (
@Nullable Sort sort ,
2021-09-05 14:23:46 +02:00
@Nullable Integer startN ,
@Nullable Integer topN ,
2021-11-09 00:54:09 +01:00
TopDocs [ ] topDocs ) {
2021-07-08 18:54:53 +02:00
if ( ( startN = = null ) ! = ( topN = = null ) ) {
throw new IllegalArgumentException ( " You must pass startN and topN together or nothing " ) ;
}
2021-07-08 17:01:56 +02:00
TopDocs result ;
if ( sort ! = null ) {
if ( ! ( topDocs instanceof TopFieldDocs [ ] ) ) {
throw new IllegalStateException ( " Expected TopFieldDocs[], got TopDocs[] " ) ;
}
2021-07-08 18:54:53 +02:00
if ( startN = = null ) {
int defaultTopN = 0 ;
for ( TopDocs td : topDocs ) {
int length = td . scoreDocs . length ;
defaultTopN + = length ;
}
result = TopDocs . merge ( sort , 0 , defaultTopN ,
2021-11-09 00:54:09 +01:00
( TopFieldDocs [ ] ) topDocs
2021-07-08 18:54:53 +02:00
) ;
} else {
result = TopDocs . merge ( sort , startN ,
topN ,
2021-11-09 00:54:09 +01:00
( TopFieldDocs [ ] ) topDocs
2021-07-08 18:54:53 +02:00
) ;
}
2021-07-08 17:01:56 +02:00
} else {
2021-07-08 18:54:53 +02:00
if ( startN = = null ) {
int defaultTopN = 0 ;
for ( TopDocs td : topDocs ) {
int length = td . scoreDocs . length ;
defaultTopN + = length ;
}
result = TopDocs . merge ( 0 ,
defaultTopN ,
2021-11-09 00:54:09 +01:00
topDocs
2021-07-08 18:54:53 +02:00
) ;
} else {
result = TopDocs . merge ( startN ,
topN ,
2021-11-09 00:54:09 +01:00
topDocs
2021-07-08 18:54:53 +02:00
) ;
}
2021-07-08 17:01:56 +02:00
}
return result ;
}
2021-07-17 23:06:26 +02:00
2021-10-14 00:49:21 +02:00
public static int totalHitsThreshold ( boolean complete ) {
return complete ? Integer . MAX_VALUE : 1 ;
2021-07-17 23:06:26 +02:00
}
2021-08-04 01:12:39 +02:00
2021-10-15 22:03:53 +02:00
/**
 * Returns the total hits threshold to use for a search, as a {@code long}.
 *
 * @param complete whether the hit count must be complete
 * @return {@link Long#MAX_VALUE} when {@code complete} is {@code true}, otherwise {@code 1}
 */
public static long totalHitsThresholdLong(boolean complete) {
  if (complete) {
    return Long.MAX_VALUE;
  }
  return 1L;
}
2021-08-04 01:12:39 +02:00
public static TotalHitsCount convertTotalHitsCount ( TotalHits totalHits ) {
return switch ( totalHits . relation ) {
case EQUAL_TO - > TotalHitsCount . of ( totalHits . value , true ) ;
case GREATER_THAN_OR_EQUAL_TO - > TotalHitsCount . of ( totalHits . value , false ) ;
} ;
}
public static TotalHitsCount sum ( TotalHitsCount totalHitsCount , TotalHitsCount totalHitsCount1 ) {
return TotalHitsCount . of ( totalHitsCount . value ( ) + totalHitsCount1 . value ( ) ,
totalHitsCount . exact ( ) & & totalHitsCount1 . exact ( )
) ;
}
2021-08-04 01:16:17 +02:00
/**
 * Formats a hit count for display.
 *
 * @param totalHitsCount the count to format
 * @return the value alone when the count is exact, otherwise the value followed by a
 *     {@code " + "} suffix marking it as a lower bound
 */
@SuppressWarnings("unused")
public static String toHumanReadableString(TotalHitsCount totalHitsCount) {
  if (totalHitsCount.exact()) {
    return Long.toString(totalHitsCount.value());
  }
  return totalHitsCount.value() + " + ";
}
2021-09-08 21:34:52 +02:00
2021-09-20 11:35:01 +02:00
public static Mono < LocalQueryParams > getMoreLikeThisQuery (
2021-10-08 02:13:33 +02:00
LLIndexSearchers inputIndexSearchers ,
2021-09-20 11:35:01 +02:00
LocalQueryParams localQueryParams ,
Analyzer analyzer ,
Similarity similarity ,
Flux < Tuple2 < String , Set < String > > > mltDocumentFieldsFlux ) {
2021-10-08 02:13:33 +02:00
var indexSearchers = inputIndexSearchers . shards ( ) ;
2021-09-20 11:35:01 +02:00
Query luceneAdditionalQuery ;
try {
luceneAdditionalQuery = localQueryParams . query ( ) ;
} catch ( Exception e ) {
return Mono . error ( e ) ;
}
return mltDocumentFieldsFlux
. collectMap ( Tuple2 : : getT1 , Tuple2 : : getT2 , HashMap : : new )
. flatMap ( mltDocumentFields - > Mono . fromCallable ( ( ) - > {
mltDocumentFields . entrySet ( ) . removeIf ( entry - > entry . getValue ( ) . isEmpty ( ) ) ;
if ( mltDocumentFields . isEmpty ( ) ) {
return new LocalQueryParams ( new MatchNoDocsQuery ( ) ,
2021-10-15 22:03:53 +02:00
localQueryParams . offsetLong ( ) ,
localQueryParams . limitLong ( ) ,
2021-09-20 18:20:59 +02:00
DEFAULT_PAGE_LIMITS ,
2021-09-20 11:35:01 +02:00
localQueryParams . minCompetitiveScore ( ) ,
localQueryParams . sort ( ) ,
2021-12-12 23:40:30 +01:00
localQueryParams . computePreciseHitsCount ( ) ,
localQueryParams . timeout ( )
2021-09-20 11:35:01 +02:00
) ;
}
MultiMoreLikeThis mlt ;
if ( indexSearchers . size ( ) = = 1 ) {
2021-09-29 13:18:06 +02:00
mlt = new MultiMoreLikeThis ( new BigCompositeReader < > ( indexSearchers . get ( 0 ) . getIndexReader ( ) , IndexReader [ ] : : new ) , null ) ;
2021-09-20 11:35:01 +02:00
} else {
IndexReader [ ] indexReaders = new IndexReader [ indexSearchers . size ( ) ] ;
for ( int i = 0 , size = indexSearchers . size ( ) ; i < size ; i + + ) {
indexReaders [ i ] = indexSearchers . get ( i ) . getIndexReader ( ) ;
}
2021-09-29 13:18:06 +02:00
mlt = new MultiMoreLikeThis ( new BigCompositeReader < > ( indexReaders , new ArrayIndexComparator ( indexReaders ) ) , null ) ;
2021-09-20 11:35:01 +02:00
}
mlt . setAnalyzer ( analyzer ) ;
mlt . setFieldNames ( mltDocumentFields . keySet ( ) . toArray ( String [ ] : : new ) ) ;
mlt . setMinTermFreq ( 1 ) ;
mlt . setMinDocFreq ( 3 ) ;
mlt . setMaxDocFreqPct ( 20 ) ;
2021-10-14 00:49:21 +02:00
mlt . setBoost ( localQueryParams . needsScores ( ) ) ;
2022-01-11 22:23:07 +01:00
mlt . setStopWords ( ENGLISH_AND_ITALIAN_STOP_WORDS ) ;
2021-09-20 11:35:01 +02:00
if ( similarity instanceof TFIDFSimilarity tfidfSimilarity ) {
mlt . setSimilarity ( tfidfSimilarity ) ;
} else {
mlt . setSimilarity ( new ClassicSimilarity ( ) ) ;
}
// Get the reference docId and apply it to MoreLikeThis, to generate the query
@SuppressWarnings ( { " unchecked " , " rawtypes " } )
var mltQuery = mlt . like ( ( Map ) mltDocumentFields ) ;
Query luceneQuery ;
if ( ! ( luceneAdditionalQuery instanceof MatchAllDocsQuery ) ) {
2021-09-29 13:18:06 +02:00
luceneQuery = new Builder ( )
2021-09-20 11:35:01 +02:00
. add ( mltQuery , Occur . MUST )
. add ( new ConstantScoreQuery ( luceneAdditionalQuery ) , Occur . MUST )
. build ( ) ;
} else {
luceneQuery = mltQuery ;
}
return new LocalQueryParams ( luceneQuery ,
2021-10-15 22:03:53 +02:00
localQueryParams . offsetLong ( ) ,
localQueryParams . limitLong ( ) ,
2021-09-20 18:20:59 +02:00
DEFAULT_PAGE_LIMITS ,
2021-09-20 11:35:01 +02:00
localQueryParams . minCompetitiveScore ( ) ,
localQueryParams . sort ( ) ,
2021-12-12 23:40:30 +01:00
localQueryParams . computePreciseHitsCount ( ) ,
2021-12-16 16:14:44 +01:00
localQueryParams . timeout ( ) ) ;
} ) . subscribeOn ( uninterruptibleScheduler ( Schedulers . boundedElastic ( ) ) ) )
. publishOn ( Schedulers . parallel ( ) ) ;
2021-09-20 11:35:01 +02:00
}
2021-12-12 23:40:30 +01:00
2021-12-16 02:38:56 +01:00
public static Collector withTimeout ( Collector collector , Duration timeout ) {
2021-12-12 23:40:30 +01:00
return new TimeLimitingCollector ( collector , TimeLimitingCollector . getGlobalCounter ( ) , timeout . toMillis ( ) ) ;
}
2021-02-04 22:42:57 +01:00
}