CavalliumDBEngine/src/main/java/it/cavallium/dbengine/database/disk/LLLocalMultiLuceneIndex.java

419 lines
15 KiB
Java
Raw Normal View History

2020-12-07 22:15:18 +01:00
package it.cavallium.dbengine.database.disk;
import static it.cavallium.dbengine.client.UninterruptibleScheduler.uninterruptibleScheduler;
2022-01-28 19:31:25 +01:00
import com.google.common.collect.Multimap;
import io.micrometer.core.instrument.MeterRegistry;
2022-07-15 02:44:50 +02:00
import io.netty5.util.Send;
2021-11-19 19:03:31 +01:00
import it.cavallium.dbengine.client.query.QueryParser;
2022-07-02 11:44:13 +02:00
import it.cavallium.dbengine.client.query.current.data.NoSort;
2021-11-19 19:03:31 +01:00
import it.cavallium.dbengine.client.query.current.data.Query;
2021-03-02 01:53:36 +01:00
import it.cavallium.dbengine.client.query.current.data.QueryParams;
2022-07-02 11:44:13 +02:00
import it.cavallium.dbengine.client.query.current.data.TotalHitsCount;
2021-11-07 17:46:40 +01:00
import it.cavallium.dbengine.database.LLIndexRequest;
2021-01-24 03:15:05 +01:00
import it.cavallium.dbengine.database.LLLuceneIndex;
import it.cavallium.dbengine.database.LLSearchResultShard;
2021-01-24 03:15:05 +01:00
import it.cavallium.dbengine.database.LLSnapshot;
import it.cavallium.dbengine.database.LLTerm;
2022-01-28 19:31:25 +01:00
import it.cavallium.dbengine.database.LLUpdateDocument;
2022-01-26 14:22:54 +01:00
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.lucene.LuceneHacks;
2022-03-05 15:46:40 +01:00
import it.cavallium.dbengine.lucene.LuceneRocksDBManager;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.lucene.LuceneUtils;
2021-11-19 19:03:31 +01:00
import it.cavallium.dbengine.lucene.collector.Buckets;
2022-01-28 19:31:25 +01:00
import it.cavallium.dbengine.lucene.mlt.MoreLikeThisTransformer;
import it.cavallium.dbengine.lucene.searcher.AdaptiveMultiSearcher;
2021-11-18 17:13:53 +01:00
import it.cavallium.dbengine.lucene.searcher.BucketParams;
import it.cavallium.dbengine.lucene.searcher.DecimalBucketMultiSearcher;
2022-01-28 21:12:10 +01:00
import it.cavallium.dbengine.lucene.searcher.GlobalQueryRewrite;
2021-07-06 01:30:37 +02:00
import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
2022-07-02 11:44:13 +02:00
import it.cavallium.dbengine.lucene.searcher.LuceneSearchResult;
import it.cavallium.dbengine.lucene.searcher.MultiSearcher;
2022-03-05 15:46:40 +01:00
import it.cavallium.dbengine.rpc.current.data.IndicizerAnalyzers;
import it.cavallium.dbengine.rpc.current.data.IndicizerSimilarities;
import it.cavallium.dbengine.rpc.current.data.LuceneOptions;
2022-06-30 13:54:55 +02:00
import it.cavallium.dbengine.utils.SimpleResource;
2022-03-05 15:46:40 +01:00
import it.unimi.dsi.fastutil.ints.IntList;
import java.io.Closeable;
2020-12-07 22:15:18 +01:00
import java.io.IOException;
2022-06-30 13:54:55 +02:00
import java.io.UncheckedIOException;
2022-07-02 11:44:13 +02:00
import java.time.Duration;
import java.util.ArrayList;
2022-03-05 15:46:40 +01:00
import java.util.HashSet;
2021-05-28 16:04:59 +02:00
import java.util.List;
import java.util.Map;
2021-05-28 16:04:59 +02:00
import java.util.Map.Entry;
2022-03-05 15:46:40 +01:00
import java.util.Objects;
2021-02-03 13:48:30 +01:00
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
2020-12-07 22:15:18 +01:00
import java.util.concurrent.atomic.AtomicLong;
2022-03-19 00:08:23 +01:00
import java.util.logging.Level;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
2021-11-19 19:03:31 +01:00
import org.jetbrains.annotations.NotNull;
2020-12-07 22:15:18 +01:00
import org.jetbrains.annotations.Nullable;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
2022-03-19 00:08:23 +01:00
import reactor.core.publisher.SignalType;
import reactor.core.scheduler.Schedulers;
2020-12-07 22:15:18 +01:00
2022-06-30 13:54:55 +02:00
public class LLLocalMultiLuceneIndex extends SimpleResource implements LLLuceneIndex {
2020-12-07 22:15:18 +01:00
/** Logger of this class (previously registered under the {@code LLLuceneIndex} interface name). */
private static final Logger LOG = LogManager.getLogger(LLLocalMultiLuceneIndex.class);

/**
 * Read once from the system property {@code it.cavallium.dbengine.bypassGroupByBug} (default {@code false}).
 * When {@code true}, the bulk add/update methods split documents per shard in fixed-size batches instead of
 * using {@code Flux.groupBy}.
 */
private static final boolean BYPASS_GROUPBY_BUG = Boolean.parseBoolean(System.getProperty(
    "it.cavallium.dbengine.bypassGroupByBug",
    "false"
));

static {
  LLUtils.initHooks();
}

/** Name returned by {@code getLuceneIndexName()}; also passed to every shard. */
private final String clusterName;
/** Value of {@code luceneOptions.lowMemory()} captured at construction. */
private final boolean lowMemory;
private final MeterRegistry meterRegistry;
/**
 * Per-shard snapshots keyed by the sequence number of the multi-index snapshot.
 * Each list is in the emission order of {@code luceneIndicesFlux}.
 */
private final ConcurrentHashMap<Long, List<LLSnapshot>> registeredSnapshots = new ConcurrentHashMap<>();
/** Source of sequence numbers for multi-index snapshots; starts at 1. */
private final AtomicLong nextSnapshotNumber = new AtomicLong(1);
/** Shards indexed by shard id; entries of shards that are not active locally are {@code null}. */
private final LLLocalLuceneIndex[] luceneIndicesById;
/** The locally active (non-null) shards. */
private final List<LLLocalLuceneIndex> luceneIndicesSet;
/** Total number of shards of the index, used to route a term to its shard. */
private final int totalShards;
/** Reactive view over the locally active shards. */
private final Flux<LLLocalLuceneIndex> luceneIndicesFlux;
private final PerFieldAnalyzerWrapper luceneAnalyzer;
private final PerFieldSimilarityWrapper luceneSimilarity;
/** Searcher that merges the results of all shards. */
private final MultiSearcher multiSearcher;
private final DecimalBucketMultiSearcher decimalBucketMultiSearcher = new DecimalBucketMultiSearcher();
/**
 * Opens the locally active shards of a sharded Lucene index.
 *
 * <p>One {@code LLLocalLuceneIndex} is created for every shard id in {@code [0, totalShards)} that is
 * contained in {@code activeShards}; the other slots stay {@code null}. If opening a shard fails, the
 * shards opened so far are closed before the failure is rethrown, so no index is left open.
 *
 * @param env temporary environment handed to every shard and to the default multi searcher
 * @param meterRegistry metrics registry handed to every shard
 * @param clusterName name of the index, returned by {@code getLuceneIndexName()}
 * @param activeShards ids of the shards to open in this instance
 * @param totalShards total number of shards; must be in {@code [2, 100]}
 * @param indicizerAnalyzers analyzers configuration
 * @param indicizerSimilarities similarities configuration
 * @param luceneOptions Lucene options
 * @param luceneHacks optional hooks; when it supplies a custom multi searcher, that one is used
 * @param rocksDBManager manager handed to every shard
 * @throws IOException if {@code totalShards} is out of range, or if a shard cannot be opened
 */
public LLLocalMultiLuceneIndex(LLTempHugePqEnv env,
    MeterRegistry meterRegistry,
    String clusterName,
    IntList activeShards,
    int totalShards,
    IndicizerAnalyzers indicizerAnalyzers,
    IndicizerSimilarities indicizerSimilarities,
    LuceneOptions luceneOptions,
    @Nullable LuceneHacks luceneHacks,
    LuceneRocksDBManager rocksDBManager) throws IOException {

  if (totalShards <= 1 || totalShards > 100) {
    throw new IOException("Unsupported instances count: " + totalShards);
  }

  this.meterRegistry = meterRegistry;
  LLLocalLuceneIndex[] luceneIndices = new LLLocalLuceneIndex[totalShards];
  try {
    for (int i = 0; i < totalShards; i++) {
      if (!activeShards.contains(i)) {
        continue;
      }
      luceneIndices[i] = new LLLocalLuceneIndex(env,
          meterRegistry,
          clusterName,
          i,
          indicizerAnalyzers,
          indicizerSimilarities,
          luceneOptions,
          luceneHacks,
          rocksDBManager
      );
    }
  } catch (Throwable ex) {
    // Do not leak the shards that were opened before the failure
    for (LLLocalLuceneIndex opened : luceneIndices) {
      if (opened != null) {
        try {
          opened.close();
        } catch (RuntimeException closeEx) {
          ex.addSuppressed(closeEx);
        }
      }
    }
    // Precise rethrow: only the exception types the try block can actually throw
    throw ex;
  }

  this.clusterName = clusterName;
  this.totalShards = totalShards;
  this.luceneIndicesById = luceneIndices;
  var distinctIndices = new HashSet<LLLocalLuceneIndex>();
  for (var luceneIndex : luceneIndices) {
    if (luceneIndex != null) {
      distinctIndices.add(luceneIndex);
    }
  }
  // The list and the flux share the same element order: snapshot resolution and release
  // address the shards by their position in this order.
  this.luceneIndicesSet = new ArrayList<>(distinctIndices);
  this.luceneIndicesFlux = Flux.fromIterable(this.luceneIndicesSet);
  this.luceneAnalyzer = LuceneUtils.toPerFieldAnalyzerWrapper(indicizerAnalyzers);
  this.luceneSimilarity = LuceneUtils.toPerFieldSimilarityWrapper(indicizerSimilarities);
  this.lowMemory = luceneOptions.lowMemory();

  var useHugePq = luceneOptions.allowNonVolatileCollection();
  var maxInMemoryResultEntries = luceneOptions.maxInMemoryResultEntries();
  if (luceneHacks != null && luceneHacks.customMultiSearcher() != null) {
    multiSearcher = luceneHacks.customMultiSearcher().get();
  } else {
    multiSearcher = new AdaptiveMultiSearcher(env, useHugePq, maxInMemoryResultEntries);
  }
}
private LLLocalLuceneIndex getLuceneIndex(LLTerm id) {
2022-03-05 15:46:40 +01:00
return Objects.requireNonNull(luceneIndicesById[LuceneUtils.getLuceneIndexId(id, totalShards)]);
2020-12-07 22:15:18 +01:00
}
@Override
public String getLuceneIndexName() {
2022-03-05 15:46:40 +01:00
return clusterName;
2020-12-07 22:15:18 +01:00
}
2022-06-14 13:10:38 +02:00
private Mono<LLIndexSearchers> getIndexSearchers(LLSnapshot snapshot) {
2022-06-14 17:46:49 +02:00
return luceneIndicesFlux.index()
2021-09-19 19:59:37 +02:00
// Resolve the snapshot of each shard
.flatMap(tuple -> Mono
.fromCallable(() -> resolveSnapshotOptional(snapshot, (int) (long) tuple.getT1()))
2021-09-22 11:03:39 +02:00
.flatMap(luceneSnapshot -> tuple.getT2().retrieveSearcher(luceneSnapshot.orElse(null)))
)
.collectList()
2022-06-14 17:46:49 +02:00
.doOnDiscard(LLIndexSearcher.class, indexSearcher -> {
try {
indexSearcher.close();
2022-06-30 13:54:55 +02:00
} catch (UncheckedIOException ex) {
2022-06-14 17:46:49 +02:00
LOG.error("Failed to close an index searcher", ex);
}
})
.map(indexSearchers -> LLIndexSearchers.of(indexSearchers));
2021-09-19 19:59:37 +02:00
}
2020-12-07 22:15:18 +01:00
@Override
2021-11-07 17:46:40 +01:00
public Mono<Void> addDocument(LLTerm id, LLUpdateDocument doc) {
return getLuceneIndex(id).addDocument(id, doc);
2020-12-07 22:15:18 +01:00
}
@Override
2022-03-19 00:08:23 +01:00
public Mono<Long> addDocuments(boolean atomic, Flux<Entry<LLTerm, LLUpdateDocument>> documents) {
if (BYPASS_GROUPBY_BUG) {
return documents
.buffer(8192)
.flatMap(inputEntries -> {
List<Entry<LLTerm, LLUpdateDocument>>[] sortedEntries = new List[totalShards];
2022-03-19 00:08:23 +01:00
Mono<Long>[] results = new Mono[totalShards];
// Sort entries
for(var inputEntry : inputEntries) {
int luceneIndexId = LuceneUtils.getLuceneIndexId(inputEntry.getKey(), totalShards);
if (sortedEntries[luceneIndexId] == null) {
sortedEntries[luceneIndexId] = new ArrayList<>();
}
sortedEntries[luceneIndexId].add(inputEntry);
}
// Add documents
int luceneIndexId = 0;
for (List<Entry<LLTerm, LLUpdateDocument>> docs : sortedEntries) {
if (docs != null && !docs.isEmpty()) {
LLLocalLuceneIndex luceneIndex = Objects.requireNonNull(luceneIndicesById[luceneIndexId]);
results[luceneIndexId] = luceneIndex.addDocuments(atomic, Flux.fromIterable(docs));
} else {
results[luceneIndexId] = Mono.empty();
}
luceneIndexId++;
}
2022-03-19 00:08:23 +01:00
return Flux.merge(results).reduce(0L, Long::sum);
})
2022-03-19 00:08:23 +01:00
.reduce(0L, Long::sum);
} else {
return documents
.groupBy(term -> getLuceneIndex(term.getKey()))
.flatMap(group -> group.key().addDocuments(atomic, group))
2022-03-19 00:08:23 +01:00
.reduce(0L, Long::sum);
}
2020-12-07 22:15:18 +01:00
}
@Override
public Mono<Void> deleteDocument(LLTerm id) {
return getLuceneIndex(id).deleteDocument(id);
2020-12-07 22:15:18 +01:00
}
@Override
2021-11-07 17:46:40 +01:00
public Mono<Void> update(LLTerm id, LLIndexRequest request) {
return getLuceneIndex(id).update(id, request);
2020-12-07 22:15:18 +01:00
}
@Override
2022-03-19 00:08:23 +01:00
public Mono<Long> updateDocuments(Flux<Entry<LLTerm, LLUpdateDocument>> documents) {
documents = documents
.log("local-multi-update-documents", Level.FINEST, false, SignalType.ON_NEXT, SignalType.ON_COMPLETE);
if (BYPASS_GROUPBY_BUG) {
2022-03-18 19:16:06 +01:00
int bufferSize = 8192;
return documents
2022-03-18 19:16:06 +01:00
.window(bufferSize)
.flatMap(bufferFlux -> bufferFlux
.collect(Collectors.groupingBy(inputEntry -> LuceneUtils.getLuceneIndexId(inputEntry.getKey(), totalShards),
Collectors.collectingAndThen(Collectors.toList(), docs -> {
2022-03-19 00:08:23 +01:00
var luceneIndex = getLuceneIndex(docs.get(0).getKey());
2022-03-18 19:16:06 +01:00
return luceneIndex.updateDocuments(Flux.fromIterable(docs));
}))
)
.map(Map::values)
2022-03-19 00:08:23 +01:00
.flatMap(parts -> Flux.merge(parts).reduce(0L, Long::sum))
2022-03-18 19:16:06 +01:00
)
2022-03-19 00:08:23 +01:00
.reduce(0L, Long::sum);
} else {
return documents
.groupBy(term -> getLuceneIndex(term.getKey()))
.flatMap(group -> group.key().updateDocuments(group))
2022-03-19 00:08:23 +01:00
.reduce(0L, Long::sum);
}
2020-12-07 22:15:18 +01:00
}
@Override
public Mono<Void> deleteAll() {
Iterable<Mono<Void>> it = () -> luceneIndicesSet.stream().map(llLocalLuceneIndex -> llLocalLuceneIndex.deleteAll()).iterator();
return Mono.whenDelayError(it);
2020-12-07 22:15:18 +01:00
}
private LLSnapshot resolveSnapshot(LLSnapshot multiSnapshot, int instanceId) {
if (multiSnapshot != null) {
2022-03-05 15:46:40 +01:00
return registeredSnapshots.get(multiSnapshot.getSequenceNumber()).get(instanceId);
2020-12-07 22:15:18 +01:00
} else {
return null;
}
}
2021-02-03 13:48:30 +01:00
/**
 * Same as {@code resolveSnapshot}, with the nullable result wrapped into an {@link Optional}.
 *
 * @param multiSnapshot multi-index snapshot, or {@code null}
 * @param instanceId position of the shard in the list registered for the snapshot
 * @return the shard snapshot, empty when there is none
 */
private Optional<LLSnapshot> resolveSnapshotOptional(LLSnapshot multiSnapshot, int instanceId) {
  LLSnapshot shardSnapshot = resolveSnapshot(multiSnapshot, instanceId);
  return Optional.ofNullable(shardSnapshot);
}
2020-12-07 22:15:18 +01:00
@Override
2022-03-05 15:46:40 +01:00
public Flux<LLSearchResultShard> moreLikeThis(@Nullable LLSnapshot snapshot,
2021-03-02 01:53:36 +01:00
QueryParams queryParams,
String keyFieldName,
2022-01-28 19:31:25 +01:00
Multimap<String, String> mltDocumentFields) {
2021-11-16 23:19:23 +01:00
LocalQueryParams localQueryParams = LuceneUtils.toLocalQueryParams(queryParams, luceneAnalyzer);
var searchers = this.getIndexSearchers(snapshot);
2022-01-28 19:31:25 +01:00
var transformer = new MoreLikeThisTransformer(mltDocumentFields, luceneAnalyzer, luceneSimilarity);
2021-09-19 19:59:37 +02:00
// Collect all the shards results into a single global result
return multiSearcher
.collectMulti(searchers, localQueryParams, keyFieldName, transformer)
2021-09-19 19:59:37 +02:00
// Transform the result type
2022-03-05 15:46:40 +01:00
.map(result -> new LLSearchResultShard(result.results(), result.totalHitsCount(), result::close))
.flux();
2020-12-07 22:15:18 +01:00
}
@Override
2022-03-05 15:46:40 +01:00
public Flux<LLSearchResultShard> search(@Nullable LLSnapshot snapshot,
2021-03-02 01:53:36 +01:00
QueryParams queryParams,
2022-02-26 03:28:20 +01:00
@Nullable String keyFieldName) {
2022-07-02 11:44:13 +02:00
return searchInternal(snapshot, queryParams, keyFieldName)
// Transform the result type
.map(result -> new LLSearchResultShard(result.results(), result.totalHitsCount(), result::close))
.flux();
}
private Mono<LuceneSearchResult> searchInternal(@Nullable LLSnapshot snapshot,
QueryParams queryParams,
@Nullable String keyFieldName) {
2021-11-16 23:19:23 +01:00
LocalQueryParams localQueryParams = LuceneUtils.toLocalQueryParams(queryParams, luceneAnalyzer);
var searchers = getIndexSearchers(snapshot);
2021-04-14 02:37:03 +02:00
2021-09-19 19:59:37 +02:00
// Collect all the shards results into a single global result
2022-07-02 11:44:13 +02:00
return multiSearcher.collectMulti(searchers, localQueryParams, keyFieldName, GlobalQueryRewrite.NO_REWRITE);
}
@Override
public Mono<TotalHitsCount> count(@Nullable LLSnapshot snapshot, Query query, @Nullable Duration timeout) {
var params = LuceneUtils.getCountQueryParams(query);
return Mono
.usingWhen(this.searchInternal(snapshot, params, null),
result -> Mono.just(result.totalHitsCount()),
LLUtils::finalizeResource
)
.defaultIfEmpty(TotalHitsCount.of(0, true));
2020-12-07 22:15:18 +01:00
}
2021-11-18 17:13:53 +01:00
@Override
2021-11-19 19:03:31 +01:00
public Mono<Buckets> computeBuckets(@Nullable LLSnapshot snapshot,
@NotNull List<Query> queries,
@Nullable Query normalizationQuery,
2021-11-18 17:13:53 +01:00
BucketParams bucketParams) {
2021-11-19 19:03:31 +01:00
List<org.apache.lucene.search.Query> localQueries = new ArrayList<>(queries.size());
for (Query query : queries) {
localQueries.add(QueryParser.toQuery(query, luceneAnalyzer));
}
var localNormalizationQuery = QueryParser.toQuery(normalizationQuery, luceneAnalyzer);
2021-11-18 17:13:53 +01:00
var searchers = getIndexSearchers(snapshot);
// Collect all the shards results into a single global result
2022-01-26 14:22:54 +01:00
return decimalBucketMultiSearcher.collectMulti(searchers, bucketParams, localQueries, localNormalizationQuery);
2021-11-18 17:13:53 +01:00
}
2020-12-07 22:15:18 +01:00
@Override
2022-06-30 13:54:55 +02:00
protected void onClose() {
Iterable<Mono<Void>> it = () -> luceneIndicesSet
.stream()
.map(part -> Mono
.<Void>fromRunnable(part::close)
.subscribeOn(uninterruptibleScheduler(Schedulers.boundedElastic()))
.publishOn(Schedulers.parallel())
)
.iterator();
var indicesCloseMono = Mono.whenDelayError(it);
2022-06-30 13:54:55 +02:00
indicesCloseMono
.then(Mono.fromCallable(() -> {
if (multiSearcher instanceof Closeable closeable) {
//noinspection BlockingMethodInNonBlockingContext
closeable.close();
}
return null;
}).subscribeOn(uninterruptibleScheduler(Schedulers.boundedElastic())))
.publishOn(Schedulers.parallel())
2022-06-30 13:54:55 +02:00
.then()
2022-07-02 11:44:13 +02:00
.transform(LLUtils::handleDiscard)
2022-06-30 13:54:55 +02:00
.block();
2020-12-07 22:15:18 +01:00
}
2021-02-03 13:48:30 +01:00
/**
 * Flushes every local shard.
 *
 * @return a mono built with {@code Mono.whenDelayError} over the per-shard flushes
 */
@Override
public Mono<Void> flush() {
  Iterable<Mono<Void>> shardFlushes = () -> luceneIndicesSet
      .stream()
      .map(shard -> shard.flush())
      .iterator();
  return Mono.whenDelayError(shardFlushes);
}
/**
 * Calls {@code waitForMerges()} on every local shard.
 *
 * @return a mono built with {@code Mono.whenDelayError} over the per-shard waits
 */
@Override
public Mono<Void> waitForMerges() {
  Iterable<Mono<Void>> shardWaits = () -> luceneIndicesSet
      .stream()
      .map(shard -> shard.waitForMerges())
      .iterator();
  return Mono.whenDelayError(shardWaits);
}
@Override
public Mono<Void> waitForLastMerges() {
Iterable<Mono<Void>> it = () -> luceneIndicesSet.stream().map(LLLuceneIndex::waitForLastMerges).iterator();
return Mono.whenDelayError(it);
2021-02-03 13:48:30 +01:00
}
@Override
2021-07-18 19:37:24 +02:00
public Mono<Void> refresh(boolean force) {
Iterable<Mono<Void>> it = () -> luceneIndicesSet.stream().map(index -> index.refresh(force)).iterator();
return Mono.whenDelayError(it);
2021-02-03 13:48:30 +01:00
}
2020-12-07 22:15:18 +01:00
@Override
public Mono<LLSnapshot> takeSnapshot() {
return Mono
// Generate next snapshot index
.fromCallable(nextSnapshotNumber::getAndIncrement)
2022-03-05 15:46:40 +01:00
.flatMap(snapshotIndex -> luceneIndicesFlux
.flatMapSequential(llLocalLuceneIndex -> llLocalLuceneIndex.takeSnapshot())
.collectList()
.doOnNext(instancesSnapshotsArray -> registeredSnapshots.put(snapshotIndex, instancesSnapshotsArray))
.thenReturn(new LLSnapshot(snapshotIndex))
);
2020-12-07 22:15:18 +01:00
}
@Override
public Mono<Void> releaseSnapshot(LLSnapshot snapshot) {
return Mono
.fromCallable(() -> registeredSnapshots.remove(snapshot.getSequenceNumber()))
2022-03-05 15:46:40 +01:00
.flatMapIterable(list -> list)
.index()
.flatMapSequential(tuple -> {
int index = (int) (long) tuple.getT1();
LLSnapshot instanceSnapshot = tuple.getT2();
2022-03-05 15:46:40 +01:00
return luceneIndicesSet.get(index).releaseSnapshot(instanceSnapshot);
})
.then();
2020-12-07 22:15:18 +01:00
}
@Override
public boolean isLowMemoryMode() {
2022-03-05 15:46:40 +01:00
return lowMemory;
2020-12-07 22:15:18 +01:00
}
}