2020-12-07 22:15:18 +01:00
package it.cavallium.dbengine.database.disk ;
2021-02-28 10:57:16 +01:00
import com.google.common.base.Suppliers ;
2021-03-02 01:53:36 +01:00
import it.cavallium.dbengine.client.query.QueryParser ;
import it.cavallium.dbengine.client.query.current.data.QueryParams ;
2021-02-28 14:52:11 +01:00
import it.cavallium.dbengine.database.EnglishItalianStopFilter ;
2020-12-31 12:04:53 +01:00
import it.cavallium.dbengine.database.LLDocument ;
import it.cavallium.dbengine.database.LLKeyScore ;
import it.cavallium.dbengine.database.LLLuceneIndex ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.database.LLSearchCollectionStatisticsGetter ;
2021-01-29 17:19:01 +01:00
import it.cavallium.dbengine.database.LLSearchResult ;
2020-12-31 12:04:53 +01:00
import it.cavallium.dbengine.database.LLSnapshot ;
import it.cavallium.dbengine.database.LLTerm ;
import it.cavallium.dbengine.database.LLUtils ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.LuceneUtils ;
2021-02-03 13:48:30 +01:00
import it.cavallium.dbengine.lucene.ScheduledTaskLifecycle ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer ;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity ;
2021-01-30 22:14:48 +01:00
import it.cavallium.dbengine.lucene.searcher.AdaptiveStreamSearcher ;
2021-02-04 22:42:57 +01:00
import it.cavallium.dbengine.lucene.searcher.AllowOnlyQueryParsingCollectorStreamSearcher ;
2021-01-30 22:14:48 +01:00
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher ;
2021-03-03 00:13:57 +01:00
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult ;
2020-12-07 22:15:18 +01:00
import java.io.IOException ;
import java.nio.file.Path ;
import java.time.Duration ;
2021-01-30 19:57:50 +01:00
import java.util.HashMap ;
2020-12-07 22:15:18 +01:00
import java.util.Map ;
import java.util.Objects ;
2021-01-29 17:19:01 +01:00
import java.util.Optional ;
2021-03-03 00:13:57 +01:00
import java.util.Random ;
2020-12-07 22:15:18 +01:00
import java.util.Set ;
import java.util.concurrent.ConcurrentHashMap ;
import java.util.concurrent.TimeUnit ;
import java.util.concurrent.atomic.AtomicLong ;
2021-02-28 10:57:16 +01:00
import java.util.function.Supplier ;
2020-12-07 22:15:18 +01:00
import org.apache.lucene.index.IndexCommit ;
import org.apache.lucene.index.IndexWriter ;
import org.apache.lucene.index.IndexWriterConfig ;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy ;
import org.apache.lucene.index.SnapshotDeletionPolicy ;
import org.apache.lucene.queries.mlt.MoreLikeThis ;
2021-02-27 17:32:57 +01:00
import org.apache.lucene.search.BooleanClause.Occur ;
import org.apache.lucene.search.BooleanQuery ;
2021-02-27 19:05:13 +01:00
import org.apache.lucene.search.ConstantScoreQuery ;
2020-12-07 22:15:18 +01:00
import org.apache.lucene.search.IndexSearcher ;
import org.apache.lucene.search.Query ;
2021-01-29 17:19:01 +01:00
import org.apache.lucene.search.ScoreMode ;
2020-12-07 22:15:18 +01:00
import org.apache.lucene.search.SearcherManager ;
import org.apache.lucene.search.Sort ;
2021-02-04 22:42:57 +01:00
import org.apache.lucene.search.similarities.Similarity ;
2021-02-28 14:52:11 +01:00
import org.apache.lucene.search.similarities.TFIDFSimilarity ;
2020-12-07 22:15:18 +01:00
import org.apache.lucene.store.Directory ;
2020-12-31 20:10:47 +01:00
import org.apache.lucene.store.FSDirectory ;
2020-12-07 22:15:18 +01:00
import org.jetbrains.annotations.Nullable ;
2021-02-25 00:00:16 +01:00
import org.warp.commonutils.log.Logger ;
import org.warp.commonutils.log.LoggerFactory ;
2020-12-07 22:15:18 +01:00
import reactor.core.publisher.Flux ;
2021-01-30 19:57:50 +01:00
import reactor.core.publisher.GroupedFlux ;
2020-12-07 22:15:18 +01:00
import reactor.core.publisher.Mono ;
2021-01-29 17:19:01 +01:00
import reactor.core.publisher.Sinks ;
import reactor.core.publisher.Sinks.EmissionException ;
import reactor.core.publisher.Sinks.EmitResult ;
import reactor.core.publisher.Sinks.Many ;
import reactor.core.publisher.Sinks.One ;
2021-02-01 02:21:53 +01:00
import reactor.core.scheduler.Scheduler ;
2020-12-07 22:15:18 +01:00
import reactor.core.scheduler.Schedulers ;
2021-01-30 19:57:50 +01:00
import reactor.util.function.Tuple2 ;
2020-12-07 22:15:18 +01:00
import reactor.util.function.Tuples ;
public class LLLocalLuceneIndex implements LLLuceneIndex {
2021-02-25 00:00:16 +01:00
protected static final Logger logger = LoggerFactory . getLogger ( LLLocalLuceneIndex . class ) ;
2020-12-07 22:15:18 +01:00
private static final LuceneStreamSearcher streamSearcher = new AdaptiveStreamSearcher ( ) ;
2021-02-04 22:42:57 +01:00
private static final AllowOnlyQueryParsingCollectorStreamSearcher allowOnlyQueryParsingCollectorStreamSearcher = new AllowOnlyQueryParsingCollectorStreamSearcher ( ) ;
2020-12-12 23:41:09 +01:00
/ * *
* Global lucene index scheduler .
* There is only a single thread globally to not overwhelm the disk with
2021-02-03 14:37:02 +01:00
* concurrent commits or concurrent refreshes .
2020-12-12 23:41:09 +01:00
* /
2021-02-03 14:37:02 +01:00
private static final Scheduler luceneBlockingScheduler = Schedulers . newBoundedElastic ( 1 ,
2021-02-03 13:48:30 +01:00
Schedulers . DEFAULT_BOUNDED_ELASTIC_QUEUESIZE ,
2021-02-28 10:57:16 +01:00
" lucene " ,
2021-02-03 13:48:30 +01:00
120 ,
true
) ;
2021-02-28 10:57:16 +01:00
private static final Supplier < Scheduler > lowMemorySupplier = Suppliers . memoize ( ( ) - >
2021-03-02 12:01:03 +01:00
Schedulers . newBoundedElastic ( 1 , Schedulers . DEFAULT_BOUNDED_ELASTIC_QUEUESIZE , " lucene-low-memory " , Integer . MAX_VALUE ) ) : : get ;
2021-02-28 00:29:56 +01:00
/ * *
* Lucene query scheduler .
* /
2021-02-28 10:57:16 +01:00
private final Scheduler luceneQueryScheduler ;
2020-12-12 23:41:09 +01:00
2020-12-07 22:15:18 +01:00
private final String luceneIndexName ;
private final SnapshotDeletionPolicy snapshotter ;
private final IndexWriter indexWriter ;
private final SearcherManager searcherManager ;
private final Directory directory ;
/ * *
* Last snapshot sequence number . 0 is not used
* /
private final AtomicLong lastSnapshotSeqNo = new AtomicLong ( 0 ) ;
/ * *
2021-03-02 01:53:36 +01:00
* LLSnapshot seq no to index commit point
2020-12-07 22:15:18 +01:00
* /
private final ConcurrentHashMap < Long , LuceneIndexSnapshot > snapshots = new ConcurrentHashMap < > ( ) ;
private final boolean lowMemory ;
2021-02-04 22:42:57 +01:00
private final TextFieldsSimilarity similarity ;
2020-12-07 22:15:18 +01:00
2020-12-12 23:41:09 +01:00
private final ScheduledTaskLifecycle scheduledTasksLifecycle ;
2021-02-04 22:42:57 +01:00
private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter ;
2020-12-12 23:41:09 +01:00
2020-12-07 22:15:18 +01:00
public LLLocalLuceneIndex ( Path luceneBasePath ,
String name ,
TextFieldsAnalyzer analyzer ,
2021-02-04 22:42:57 +01:00
TextFieldsSimilarity similarity ,
2020-12-07 22:15:18 +01:00
Duration queryRefreshDebounceTime ,
Duration commitDebounceTime ,
2021-02-04 22:42:57 +01:00
boolean lowMemory ,
@Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter ) throws IOException {
2020-12-07 22:15:18 +01:00
if ( name . length ( ) = = 0 ) {
throw new IOException ( " Empty lucene database name " ) ;
}
Path directoryPath = luceneBasePath . resolve ( name + " .lucene.db " ) ;
2020-12-31 20:10:47 +01:00
this . directory = FSDirectory . open ( directoryPath ) ;
2020-12-07 22:15:18 +01:00
this . luceneIndexName = name ;
this . snapshotter = new SnapshotDeletionPolicy ( new KeepOnlyLastCommitDeletionPolicy ( ) ) ;
this . lowMemory = lowMemory ;
2021-02-04 22:42:57 +01:00
this . similarity = similarity ;
this . distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter ;
2020-12-07 22:15:18 +01:00
IndexWriterConfig indexWriterConfig = new IndexWriterConfig ( LuceneUtils . getAnalyzer ( analyzer ) ) ;
indexWriterConfig . setOpenMode ( IndexWriterConfig . OpenMode . CREATE_OR_APPEND ) ;
indexWriterConfig . setIndexDeletionPolicy ( snapshotter ) ;
indexWriterConfig . setCommitOnClose ( true ) ;
if ( lowMemory ) {
indexWriterConfig . setRAMBufferSizeMB ( 32 ) ;
indexWriterConfig . setRAMPerThreadHardLimitMB ( 32 ) ;
} else {
indexWriterConfig . setRAMBufferSizeMB ( 128 ) ;
indexWriterConfig . setRAMPerThreadHardLimitMB ( 512 ) ;
}
2021-02-04 22:42:57 +01:00
indexWriterConfig . setSimilarity ( getSimilarity ( ) ) ;
2020-12-07 22:15:18 +01:00
this . indexWriter = new IndexWriter ( directory , indexWriterConfig ) ;
this . searcherManager = new SearcherManager ( indexWriter , false , false , null ) ;
2021-02-28 10:57:16 +01:00
if ( lowMemory ) {
this . luceneQueryScheduler = lowMemorySupplier . get ( ) ;
} else {
2021-02-28 14:52:11 +01:00
this . luceneQueryScheduler = Schedulers . newBoundedElastic ( Schedulers . DEFAULT_BOUNDED_ELASTIC_SIZE ,
Schedulers . DEFAULT_BOUNDED_ELASTIC_QUEUESIZE ,
" lucene-query " ,
60 ,
true
) ;
2021-02-28 10:57:16 +01:00
}
2020-12-12 23:41:09 +01:00
// Create scheduled tasks lifecycle manager
this . scheduledTasksLifecycle = new ScheduledTaskLifecycle ( ) ;
// Start scheduled tasks
registerScheduledFixedTask ( this : : scheduledCommit , commitDebounceTime ) ;
registerScheduledFixedTask ( this : : scheduledQueryRefresh , queryRefreshDebounceTime ) ;
}
2021-02-04 22:42:57 +01:00
/**
 * Resolves the configured {@code similarity} selector to a Lucene {@link Similarity}.
 *
 * @return the value returned by {@code LuceneUtils.getSimilarity} for this index
 */
private Similarity getSimilarity() {
    Similarity resolved = LuceneUtils.getSimilarity(this.similarity);
    return resolved;
}
2020-12-12 23:41:09 +01:00
private void registerScheduledFixedTask ( Runnable task , Duration duration ) {
2021-02-03 14:37:02 +01:00
scheduledTasksLifecycle . registerScheduledTask ( luceneBlockingScheduler . schedulePeriodically ( ( ) - > {
2020-12-31 12:05:04 +01:00
scheduledTasksLifecycle . startScheduledTask ( ) ;
try {
task . run ( ) ;
} finally {
scheduledTasksLifecycle . endScheduledTask ( ) ;
}
} , duration . toMillis ( ) , duration . toMillis ( ) , TimeUnit . MILLISECONDS ) ) ;
2020-12-07 22:15:18 +01:00
}
/**
 * @return the name this index was created with
 */
@Override
public String getLuceneIndexName() {
    return this.luceneIndexName;
}
@Override
2021-01-30 01:41:04 +01:00
public Mono < LLSnapshot > takeSnapshot ( ) {
return Mono
. fromCallable ( lastSnapshotSeqNo : : incrementAndGet )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler )
2021-01-30 01:41:04 +01:00
. flatMap ( snapshotSeqNo - > takeLuceneSnapshot ( )
. flatMap ( snapshot - > Mono
. fromCallable ( ( ) - > {
this . snapshots . put ( snapshotSeqNo , new LuceneIndexSnapshot ( snapshot ) ) ;
return new LLSnapshot ( snapshotSeqNo ) ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler )
2021-01-30 01:41:04 +01:00
)
) ;
2020-12-07 22:15:18 +01:00
}
/ * *
* Use internally . This method commits before taking the snapshot if there are no commits in a new database ,
* avoiding the exception .
* /
2021-01-30 01:41:04 +01:00
private Mono < IndexCommit > takeLuceneSnapshot ( ) {
2021-02-03 14:37:02 +01:00
return Mono
. fromCallable ( ( ) - > {
try {
//noinspection BlockingMethodInNonBlockingContext
return snapshotter . snapshot ( ) ;
} catch ( IllegalStateException ex ) {
if ( " No index commit to snapshot " . equals ( ex . getMessage ( ) ) ) {
//noinspection BlockingMethodInNonBlockingContext
indexWriter . commit ( ) ;
//noinspection BlockingMethodInNonBlockingContext
return snapshotter . snapshot ( ) ;
} else {
throw ex ;
}
}
} )
. subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > releaseSnapshot ( LLSnapshot snapshot ) {
return Mono . < Void > fromCallable ( ( ) - > {
var indexSnapshot = this . snapshots . remove ( snapshot . getSequenceNumber ( ) ) ;
if ( indexSnapshot = = null ) {
2021-03-02 01:53:36 +01:00
throw new IOException ( " LLSnapshot " + snapshot . getSequenceNumber ( ) + " not found! " ) ;
2021-01-30 01:41:04 +01:00
}
2020-12-07 22:15:18 +01:00
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexSnapshot . close ( ) ;
2020-12-07 22:15:18 +01:00
2021-01-30 01:41:04 +01:00
var luceneIndexSnapshot = indexSnapshot . getSnapshot ( ) ;
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
snapshotter . release ( luceneIndexSnapshot ) ;
// Delete unused files after releasing the snapshot
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . deleteUnusedFiles ( ) ;
return null ;
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > addDocument ( LLTerm key , LLDocument doc ) {
return Mono . < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . addDocument ( LLUtils . toDocument ( doc ) ) ;
return null ;
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 19:57:50 +01:00
public Mono < Void > addDocuments ( Flux < GroupedFlux < LLTerm , LLDocument > > documents ) {
return documents
. flatMap ( group - > group
. collectList ( )
. flatMap ( docs - > Mono
. < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 19:57:50 +01:00
indexWriter . addDocuments ( LLUtils . toDocuments ( docs ) ) ;
return null ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler ) )
2021-01-30 19:57:50 +01:00
)
. then ( ) ;
2020-12-07 22:15:18 +01:00
}
2021-01-30 19:57:50 +01:00
2020-12-07 22:15:18 +01:00
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > deleteDocument ( LLTerm id ) {
return Mono . < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . deleteDocuments ( LLUtils . toTerm ( id ) ) ;
return null ;
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > updateDocument ( LLTerm id , LLDocument document ) {
return Mono . < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . updateDocument ( LLUtils . toTerm ( id ) , LLUtils . toDocument ( document ) ) ;
return null ;
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 19:57:50 +01:00
public Mono < Void > updateDocuments ( Flux < GroupedFlux < LLTerm , LLDocument > > documents ) {
return documents . flatMap ( this : : updateDocuments ) . then ( ) ;
}
2021-01-30 01:41:04 +01:00
2021-01-30 19:57:50 +01:00
private Mono < Void > updateDocuments ( GroupedFlux < LLTerm , LLDocument > documents ) {
return documents
. map ( LLUtils : : toDocument )
. collectList ( )
. flatMap ( luceneDocuments - > Mono
. < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 19:57:50 +01:00
indexWriter . updateDocuments ( LLUtils . toTerm ( documents . key ( ) ) , luceneDocuments ) ;
return null ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler )
2021-01-30 19:57:50 +01:00
) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > deleteAll ( ) {
return Mono . < Void > fromCallable ( ( ) - > {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . deleteAll ( ) ;
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . forceMergeDeletes ( true ) ;
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . commit ( ) ;
return null ;
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
2021-02-04 22:42:57 +01:00
private Mono < IndexSearcher > acquireSearcherWrapper ( LLSnapshot snapshot , boolean distributedPre , long actionId ) {
2021-01-29 17:19:01 +01:00
return Mono . fromCallable ( ( ) - > {
2021-02-04 22:42:57 +01:00
IndexSearcher indexSearcher ;
2021-01-29 17:19:01 +01:00
if ( snapshot = = null ) {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-02-04 22:42:57 +01:00
indexSearcher = searcherManager . acquire ( ) ;
indexSearcher . setSimilarity ( getSimilarity ( ) ) ;
} else {
indexSearcher = resolveSnapshot ( snapshot ) . getIndexSearcher ( ) ;
}
if ( distributedCollectionStatisticsGetter ! = null & & actionId ! = - 1 ) {
return new LLIndexSearcherWithCustomCollectionStatistics ( indexSearcher ,
distributedCollectionStatisticsGetter ,
distributedPre ,
actionId
) ;
2021-01-29 17:19:01 +01:00
} else {
2021-02-04 22:42:57 +01:00
return indexSearcher ;
2021-01-29 17:19:01 +01:00
}
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2021-01-29 17:19:01 +01:00
}
2020-12-07 22:15:18 +01:00
2021-01-29 17:19:01 +01:00
private Mono < Void > releaseSearcherWrapper ( LLSnapshot snapshot , IndexSearcher indexSearcher ) {
return Mono . < Void > fromRunnable ( ( ) - > {
if ( snapshot = = null ) {
try {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-29 17:19:01 +01:00
searcherManager . release ( indexSearcher ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
}
2021-02-03 14:37:02 +01:00
} ) . subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
@Override
2021-01-29 17:19:01 +01:00
public Mono < LLSearchResult > moreLikeThis ( @Nullable LLSnapshot snapshot ,
2021-03-02 01:53:36 +01:00
QueryParams queryParams ,
String keyFieldName ,
Flux < Tuple2 < String , Set < String > > > mltDocumentFieldsFlux ) {
2021-02-28 16:50:59 +01:00
return moreLikeThis ( snapshot ,
2021-03-02 01:53:36 +01:00
queryParams ,
2021-02-28 16:50:59 +01:00
keyFieldName ,
2021-03-02 01:53:36 +01:00
mltDocumentFieldsFlux ,
2021-02-28 16:50:59 +01:00
false ,
0 ,
1
) ;
2021-02-04 22:42:57 +01:00
}
public Mono < LLSearchResult > distributedMoreLikeThis ( @Nullable LLSnapshot snapshot ,
2021-03-02 01:53:36 +01:00
QueryParams queryParams ,
2021-02-04 22:42:57 +01:00
String keyFieldName ,
2021-03-02 01:53:36 +01:00
Flux < Tuple2 < String , Set < String > > > mltDocumentFieldsFlux ,
2021-02-04 22:42:57 +01:00
long actionId ,
int scoreDivisor ) {
2021-02-28 16:50:59 +01:00
return moreLikeThis ( snapshot ,
2021-03-02 01:53:36 +01:00
queryParams ,
2021-02-28 16:50:59 +01:00
keyFieldName ,
2021-03-02 01:53:36 +01:00
mltDocumentFieldsFlux ,
2021-02-28 16:50:59 +01:00
false ,
actionId ,
scoreDivisor
) ;
2021-02-04 22:42:57 +01:00
}
public Mono < Void > distributedPreMoreLikeThis ( @Nullable LLSnapshot snapshot ,
2021-03-02 01:53:36 +01:00
QueryParams queryParams ,
String keyFieldName ,
2021-02-04 22:42:57 +01:00
Flux < Tuple2 < String , Set < String > > > mltDocumentFieldsFlux ,
2021-03-02 01:53:36 +01:00
long actionId ) {
2021-02-28 16:50:59 +01:00
return moreLikeThis ( snapshot ,
2021-03-02 01:53:36 +01:00
queryParams ,
2021-02-28 16:50:59 +01:00
keyFieldName ,
2021-03-02 01:53:36 +01:00
mltDocumentFieldsFlux ,
2021-02-28 16:50:59 +01:00
true ,
actionId ,
1
)
2021-02-04 22:42:57 +01:00
. flatMap ( LLSearchResult : : completion ) ;
}
2021-02-27 17:32:57 +01:00
@SuppressWarnings ( { " unchecked " , " rawtypes " } )
2021-02-04 22:42:57 +01:00
private Mono < LLSearchResult > moreLikeThis ( @Nullable LLSnapshot snapshot ,
2021-03-02 01:53:36 +01:00
QueryParams queryParams ,
2021-02-04 22:42:57 +01:00
String keyFieldName ,
2021-03-02 01:53:36 +01:00
Flux < Tuple2 < String , Set < String > > > mltDocumentFieldsFlux ,
2021-02-04 22:42:57 +01:00
boolean doDistributedPre ,
long actionId ,
int scoreDivisor ) {
2021-02-27 17:32:57 +01:00
Query luceneAdditionalQuery ;
try {
2021-03-02 01:53:36 +01:00
luceneAdditionalQuery = QueryParser . toQuery ( queryParams . getQuery ( ) ) ;
} catch ( Exception e ) {
2021-02-27 17:32:57 +01:00
return Mono . error ( e ) ;
}
2021-01-30 19:57:50 +01:00
return mltDocumentFieldsFlux
. collectMap ( Tuple2 : : getT1 , Tuple2 : : getT2 , HashMap : : new )
. flatMap ( mltDocumentFields - > {
2021-02-27 19:05:13 +01:00
mltDocumentFields . entrySet ( ) . removeIf ( entry - > entry . getValue ( ) . isEmpty ( ) ) ;
2021-01-30 19:57:50 +01:00
if ( mltDocumentFields . isEmpty ( ) ) {
return Mono . just ( LLSearchResult . empty ( ) ) ;
}
2021-02-28 14:52:11 +01:00
return acquireSearcherWrapper ( snapshot , doDistributedPre , actionId ) . flatMap ( indexSearcher - > Mono
. fromCallable ( ( ) - > {
var mlt = new MoreLikeThis ( indexSearcher . getIndexReader ( ) ) ;
mlt . setAnalyzer ( indexWriter . getAnalyzer ( ) ) ;
mlt . setFieldNames ( mltDocumentFields . keySet ( ) . toArray ( String [ ] : : new ) ) ;
mlt . setMinTermFreq ( 1 ) ;
mlt . setMinDocFreq ( 3 ) ;
mlt . setMaxDocFreqPct ( 20 ) ;
2021-03-02 01:53:36 +01:00
mlt . setBoost ( QueryParser . isScoringEnabled ( queryParams ) ) ;
2021-02-28 14:52:11 +01:00
mlt . setStopWords ( EnglishItalianStopFilter . getStopWordsString ( ) ) ;
var similarity = getSimilarity ( ) ;
if ( similarity instanceof TFIDFSimilarity ) {
mlt . setSimilarity ( ( TFIDFSimilarity ) similarity ) ;
} else {
logger . trace ( " Using an unsupported similarity algorithm for MoreLikeThis: {}. You must use a similarity instance based on TFIDFSimilarity! " , similarity ) ;
}
// Get the reference doc and apply it to MoreLikeThis, to generate the query
//noinspection BlockingMethodInNonBlockingContext
var mltQuery = mlt . like ( ( Map ) mltDocumentFields ) ;
Query luceneQuery ;
if ( luceneAdditionalQuery ! = null ) {
luceneQuery = new BooleanQuery . Builder ( )
. add ( mltQuery , Occur . MUST )
. add ( new ConstantScoreQuery ( luceneAdditionalQuery ) , Occur . MUST )
. build ( ) ;
} else {
luceneQuery = mltQuery ;
}
return luceneSearch ( doDistributedPre ,
indexSearcher ,
2021-03-02 01:53:36 +01:00
queryParams . getLimit ( ) ,
queryParams . getMinCompetitiveScore ( ) . getNullable ( ) ,
2021-02-28 14:52:11 +01:00
keyFieldName ,
scoreDivisor ,
luceneQuery ,
2021-03-02 01:53:36 +01:00
QueryParser . toSort ( queryParams . getSort ( ) ) ,
QueryParser . toScoreMode ( queryParams . getScoreMode ( ) )
2021-02-28 14:52:11 +01:00
) ;
} )
. subscribeOn ( luceneQueryScheduler )
. materialize ( )
. flatMap ( signal - > {
if ( signal . isOnComplete ( ) | | signal . isOnError ( ) ) {
return releaseSearcherWrapper ( snapshot , indexSearcher ) . thenReturn ( signal ) ;
} else {
return Mono . just ( signal ) ;
}
} ) . < LLSearchResult > dematerialize ( ) ) ;
2021-01-30 19:57:50 +01:00
} ) ;
2020-12-07 22:15:18 +01:00
}
2021-02-04 22:42:57 +01:00
private LLKeyScore fixKeyScore ( LLKeyScore keyScore , int scoreDivisor ) {
return scoreDivisor = = 1 ? keyScore : new LLKeyScore ( keyScore . getKey ( ) , keyScore . getScore ( ) / ( float ) scoreDivisor ) ;
}
2020-12-07 22:15:18 +01:00
@Override
2021-03-02 01:53:36 +01:00
public Mono < LLSearchResult > search ( @Nullable LLSnapshot snapshot , QueryParams queryParams , String keyFieldName ) {
return search ( snapshot , queryParams , keyFieldName , false , 0 , 1 ) ;
2021-02-04 22:42:57 +01:00
}
2021-01-29 17:19:01 +01:00
2021-03-02 01:53:36 +01:00
public Mono < LLSearchResult > distributedSearch ( @Nullable LLSnapshot snapshot , QueryParams queryParams , String keyFieldName , long actionId , int scoreDivisor ) {
return search ( snapshot , queryParams , keyFieldName , false , actionId , scoreDivisor ) ;
2021-02-04 22:42:57 +01:00
}
2021-03-02 01:53:36 +01:00
public Mono < Void > distributedPreSearch ( @Nullable LLSnapshot snapshot , QueryParams queryParams , String keyFieldName , long actionId ) {
2021-02-14 13:46:11 +01:00
return this
2021-03-02 01:53:36 +01:00
. search ( snapshot , queryParams , keyFieldName , true , actionId , 1 )
2021-02-04 22:42:57 +01:00
. flatMap ( LLSearchResult : : completion ) ;
}
private Mono < LLSearchResult > search ( @Nullable LLSnapshot snapshot ,
2021-03-02 01:53:36 +01:00
QueryParams queryParams , String keyFieldName ,
2021-02-04 22:42:57 +01:00
boolean doDistributedPre , long actionId , int scoreDivisor ) {
return acquireSearcherWrapper ( snapshot , doDistributedPre , actionId )
2021-01-29 17:19:01 +01:00
. flatMap ( indexSearcher - > Mono
. fromCallable ( ( ) - > {
2021-03-02 01:53:36 +01:00
Objects . requireNonNull ( queryParams . getScoreMode ( ) , " ScoreMode must not be null " ) ;
Query luceneQuery = QueryParser . toQuery ( queryParams . getQuery ( ) ) ;
Sort luceneSort = QueryParser . toSort ( queryParams . getSort ( ) ) ;
org . apache . lucene . search . ScoreMode luceneScoreMode = QueryParser . toScoreMode ( queryParams . getScoreMode ( ) ) ;
2021-02-03 13:48:30 +01:00
return Tuples . of ( luceneQuery , Optional . ofNullable ( luceneSort ) , luceneScoreMode ) ;
2021-01-29 17:19:01 +01:00
} )
2021-02-28 00:29:56 +01:00
. subscribeOn ( luceneQueryScheduler )
2021-01-29 17:19:01 +01:00
. flatMap ( tuple - > Mono
. fromCallable ( ( ) - > {
2021-02-03 13:48:30 +01:00
Query luceneQuery = tuple . getT1 ( ) ;
2021-01-29 17:19:01 +01:00
Sort luceneSort = tuple . getT2 ( ) . orElse ( null ) ;
ScoreMode luceneScoreMode = tuple . getT3 ( ) ;
2021-02-27 19:05:13 +01:00
return luceneSearch ( doDistributedPre ,
indexSearcher ,
2021-03-02 01:53:36 +01:00
queryParams . getLimit ( ) ,
queryParams . getMinCompetitiveScore ( ) . getNullable ( ) ,
2021-02-27 19:05:13 +01:00
keyFieldName ,
scoreDivisor ,
luceneQuery ,
luceneSort ,
luceneScoreMode
2021-02-25 00:00:16 +01:00
) ;
2021-02-28 00:29:56 +01:00
} ) . subscribeOn ( luceneQueryScheduler )
2021-01-29 17:19:01 +01:00
)
. materialize ( )
2021-02-17 13:59:35 +01:00
. flatMap ( signal - > {
if ( signal . isOnComplete ( ) | | signal . isOnError ( ) ) {
return releaseSearcherWrapper ( snapshot , indexSearcher ) . thenReturn ( signal ) ;
} else {
return Mono . just ( signal ) ;
}
} )
2021-01-29 17:19:01 +01:00
. dematerialize ( )
) ;
2020-12-07 22:15:18 +01:00
}
2021-02-27 19:05:13 +01:00
private LLSearchResult luceneSearch ( boolean doDistributedPre ,
IndexSearcher indexSearcher ,
long limit ,
@Nullable Float minCompetitiveScore ,
String keyFieldName ,
int scoreDivisor ,
Query luceneQuery ,
Sort luceneSort ,
ScoreMode luceneScoreMode ) {
One < Long > totalHitsCountSink = Sinks . one ( ) ;
Many < LLKeyScore > topKeysSink = Sinks
. many ( )
. unicast ( )
. onBackpressureBuffer ( ) ;
2021-03-03 00:13:57 +01:00
var searchFlux = Mono . < Void > create ( sink - > {
2021-02-27 19:05:13 +01:00
try {
2021-03-03 00:13:57 +01:00
var opId = new Random ( ) . nextInt ( ) ;
2021-02-27 19:05:13 +01:00
if ( doDistributedPre ) {
allowOnlyQueryParsingCollectorStreamSearcher . search ( indexSearcher , luceneQuery ) ;
totalHitsCountSink . tryEmitValue ( 0L ) ;
} else {
int boundedLimit = Math . max ( 0 , limit > Integer . MAX_VALUE ? Integer . MAX_VALUE : ( int ) limit ) ;
2021-03-03 00:13:57 +01:00
logger . warn ( opId + " start " ) ;
2021-02-27 19:05:13 +01:00
streamSearcher . search ( indexSearcher ,
luceneQuery ,
boundedLimit ,
luceneSort ,
luceneScoreMode ,
minCompetitiveScore ,
keyFieldName ,
keyScore - > {
2021-03-03 00:13:57 +01:00
logger . warn ( opId + " item " ) ;
2021-02-27 19:05:13 +01:00
EmitResult result = topKeysSink . tryEmitNext ( fixKeyScore ( keyScore , scoreDivisor ) ) ;
2021-03-03 00:13:57 +01:00
if ( result . isSuccess ( ) ) {
return HandleResult . CONTINUE ;
} else {
2021-02-27 19:05:13 +01:00
if ( result = = EmitResult . FAIL_CANCELLED ) {
logger . debug ( " Fail to emit next value: cancelled " ) ;
2021-03-03 00:13:57 +01:00
return HandleResult . HALT ;
2021-02-27 19:05:13 +01:00
} else if ( result = = EmitResult . FAIL_TERMINATED ) {
logger . debug ( " Fail to emit next value: terminated " ) ;
2021-03-03 00:13:57 +01:00
return HandleResult . HALT ;
2021-02-27 19:05:13 +01:00
} else if ( result = = EmitResult . FAIL_ZERO_SUBSCRIBER ) {
logger . error ( " Fail to emit next value: zero subscriber. You must subscribe to results before total hits if you specified a limit > 0! " ) ;
sink . error ( new EmissionException ( result ) ) ;
throw new EmissionException ( result ) ;
} else {
throw new EmissionException ( result ) ;
}
}
} ,
totalHitsCount - > {
2021-03-03 00:13:57 +01:00
logger . warn ( opId + " total-hits-count " ) ;
2021-02-27 19:05:13 +01:00
EmitResult result = totalHitsCountSink . tryEmitValue ( totalHitsCount ) ;
if ( result . isFailure ( ) ) {
if ( result = = EmitResult . FAIL_CANCELLED ) {
logger . debug ( " Fail to emit total hits count: cancelled " ) ;
} else if ( result = = EmitResult . FAIL_TERMINATED ) {
logger . debug ( " Fail to emit total hits count: terminated " ) ;
} else if ( result = = EmitResult . FAIL_ZERO_SUBSCRIBER ) {
logger . debug ( " Fail to emit total hits count: zero subscriber " ) ;
} else {
sink . error ( new EmissionException ( result ) ) ;
throw new EmissionException ( result ) ;
}
}
}
) ;
}
2021-03-03 00:13:57 +01:00
logger . warn ( opId + " complete " ) ;
2021-02-27 19:05:13 +01:00
topKeysSink . tryEmitComplete ( ) ;
2021-03-03 00:13:57 +01:00
sink . success ( ) ;
2021-02-27 19:05:13 +01:00
} catch ( IOException e ) {
topKeysSink . tryEmitError ( e ) ;
totalHitsCountSink . tryEmitError ( e ) ;
sink . error ( e ) ;
}
2021-03-03 00:13:57 +01:00
} ) . subscribeOn ( luceneQueryScheduler ) . cache ( ) ;
2021-02-27 19:05:13 +01:00
return new LLSearchResult (
Mono . < Long > firstWithValue ( searchFlux . then ( Mono . empty ( ) ) , totalHitsCountSink . asMono ( ) ) ,
Flux . < Flux < LLKeyScore > > merge ( searchFlux . then ( Mono . empty ( ) ) , Flux . just ( topKeysSink . asFlux ( ) ) )
) ;
}
2020-12-07 22:15:18 +01:00
@Override
2021-01-30 01:41:04 +01:00
public Mono < Void > close ( ) {
return Mono
. < Void > fromCallable ( ( ) - > {
scheduledTasksLifecycle . cancelAndWait ( ) ;
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
indexWriter . close ( ) ;
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-01-30 01:41:04 +01:00
directory . close ( ) ;
return null ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler ) ;
2020-12-07 22:15:18 +01:00
}
2021-02-03 13:48:30 +01:00
@Override
public Mono < Void > flush ( ) {
return Mono
. < Void > fromCallable ( ( ) - > {
scheduledTasksLifecycle . startScheduledTask ( ) ;
try {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-02-03 13:48:30 +01:00
indexWriter . commit ( ) ;
} finally {
scheduledTasksLifecycle . endScheduledTask ( ) ;
}
return null ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler ) ;
2021-02-03 13:48:30 +01:00
}
@Override
public Mono < Void > refresh ( ) {
return Mono
. < Void > fromCallable ( ( ) - > {
scheduledTasksLifecycle . startScheduledTask ( ) ;
try {
2021-02-03 14:37:02 +01:00
//noinspection BlockingMethodInNonBlockingContext
2021-02-03 13:48:30 +01:00
searcherManager . maybeRefreshBlocking ( ) ;
} finally {
scheduledTasksLifecycle . endScheduledTask ( ) ;
}
return null ;
} )
2021-02-03 14:37:02 +01:00
. subscribeOn ( luceneBlockingScheduler ) ;
2021-02-03 13:48:30 +01:00
}
2020-12-07 22:15:18 +01:00
/**
 * Periodic task: commits the writer when it has uncommitted changes.
 * An I/O failure is logged and swallowed, so it does not propagate out of the periodic task.
 */
private void scheduledCommit() {
    try {
        if (indexWriter.hasUncommittedChanges()) {
            indexWriter.commit();
        }
    } catch (IOException ex) {
        // Report through the class logger instead of stderr.
        logger.error("Scheduled commit failed", ex);
    }
}
2021-01-29 17:19:01 +01:00
@SuppressWarnings ( " unused " )
2020-12-07 22:15:18 +01:00
private void scheduledQueryRefresh ( ) {
try {
2021-01-29 17:19:01 +01:00
boolean refreshStarted = searcherManager . maybeRefresh ( ) ;
// if refreshStarted == false, another thread is currently already refreshing
2020-12-07 22:15:18 +01:00
} catch ( IOException ex ) {
ex . printStackTrace ( ) ;
}
}
private LuceneIndexSnapshot resolveSnapshot ( @Nullable LLSnapshot snapshot ) {
if ( snapshot = = null ) {
return null ;
}
return Objects . requireNonNull ( snapshots . get ( snapshot . getSequenceNumber ( ) ) ,
( ) - > " Can't resolve snapshot " + snapshot . getSequenceNumber ( )
) ;
}
/**
 * @return true when this index was opened in low-memory mode
 */
@Override
public boolean isLowMemoryMode() {
    return this.lowMemory;
}
}