diff --git a/pom.xml b/pom.xml index 0f94830..97b4772 100644 --- a/pom.xml +++ b/pom.xml @@ -197,6 +197,11 @@ lucene-analysis-common 9.0.0-SNAPSHOT + + org.apache.lucene + lucene-analyzers-icu + 9.0.0-SNAPSHOT + org.apache.lucene lucene-codecs @@ -245,7 +250,7 @@ it.cavallium data-generator - [0.9.26,) + 0.9.0-SNAPSHOT io.soabase.record-builder diff --git a/src/main/java/it/cavallium/dbengine/client/Indicizer.java b/src/main/java/it/cavallium/dbengine/client/Indicizer.java index 76127a4..b751bc2 100644 --- a/src/main/java/it/cavallium/dbengine/client/Indicizer.java +++ b/src/main/java/it/cavallium/dbengine/client/Indicizer.java @@ -2,7 +2,10 @@ package it.cavallium.dbengine.client; import it.cavallium.dbengine.database.LLDocument; import it.cavallium.dbengine.database.LLTerm; +import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; +import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.jetbrains.annotations.NotNull; import reactor.core.publisher.Flux; import reactor.core.publisher.Mono; @@ -18,6 +21,10 @@ public abstract class Indicizer { public abstract @NotNull T getKey(String key); + public abstract IndicizerAnalyzers getPerFieldAnalyzer(); + + public abstract IndicizerSimilarities getPerFieldSimilarity(); + public Flux>> getMoreLikeThisDocumentFields(T key, U value) { return Flux.empty(); } diff --git a/src/main/java/it/cavallium/dbengine/client/IndicizerAnalyzers.java b/src/main/java/it/cavallium/dbengine/client/IndicizerAnalyzers.java new file mode 100644 index 0000000..4d72f06 --- /dev/null +++ b/src/main/java/it/cavallium/dbengine/client/IndicizerAnalyzers.java @@ -0,0 +1,19 @@ +package it.cavallium.dbengine.client; + +import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; +import java.util.Map; + +public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map fieldAnalyzer) { + + public static IndicizerAnalyzers of() { + return of(TextFieldsAnalyzer.FullText); + } + + public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) { + return of(defaultAnalyzer, Map.of()); + } + + public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer, Map fieldAnalyzer) { + return new IndicizerAnalyzers(defaultAnalyzer, fieldAnalyzer); + } +} diff --git a/src/main/java/it/cavallium/dbengine/client/IndicizerSimilarities.java b/src/main/java/it/cavallium/dbengine/client/IndicizerSimilarities.java new file mode 100644 index 0000000..290e721 --- /dev/null +++ b/src/main/java/it/cavallium/dbengine/client/IndicizerSimilarities.java @@ -0,0 +1,20 @@ +package it.cavallium.dbengine.client; + +import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; +import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity; +import java.util.Map; + +public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map fieldSimilarity) { + + public static IndicizerSimilarities of() { + return of(TextFieldsSimilarity.BM25Plus); + } + + public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) { + return of(defaultSimilarity, Map.of()); + } + + public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity, Map fieldSimilarity) { + return new IndicizerSimilarities(defaultSimilarity, fieldSimilarity); + } +} diff --git a/src/main/java/it/cavallium/dbengine/client/LuceneIndexImpl.java b/src/main/java/it/cavallium/dbengine/client/LuceneIndexImpl.java index 7530638..f6a4dea 100644 --- a/src/main/java/it/cavallium/dbengine/client/LuceneIndexImpl.java +++ b/src/main/java/it/cavallium/dbengine/client/LuceneIndexImpl.java @@ -51,7 +51,6 @@ public class LuceneIndexImpl implements LuceneIndex { .flatMap(entry -> indicizer .toDocument(entry.getKey(), entry.getValue()) .map(doc -> Map.entry(indicizer.toIndex(entry.getKey()), doc))) - .collectMap(Entry::getKey, Entry::getValue) ); } diff --git a/src/main/java/it/cavallium/dbengine/database/LLDatabaseConnection.java b/src/main/java/it/cavallium/dbengine/database/LLDatabaseConnection.java index 3a7bc3f..5897dac 100644 --- a/src/main/java/it/cavallium/dbengine/database/LLDatabaseConnection.java +++ b/src/main/java/it/cavallium/dbengine/database/LLDatabaseConnection.java @@ -1,6 +1,8 @@ package it.cavallium.dbengine.database; import io.netty.buffer.ByteBufAllocator; +import it.cavallium.dbengine.client.IndicizerAnalyzers; +import it.cavallium.dbengine.client.IndicizerSimilarities; import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity; import java.time.Duration; @@ -18,8 +20,8 @@ public interface LLDatabaseConnection { Mono getLuceneIndex(String name, int instancesCount, - TextFieldsAnalyzer textFieldsAnalyzer, - TextFieldsSimilarity textFieldsSimilarity, + IndicizerAnalyzers indicizerAnalyzers, + IndicizerSimilarities indicizerSimilarities, Duration queryRefreshDebounceTime, Duration commitDebounceTime, boolean lowMemory, diff --git a/src/main/java/it/cavallium/dbengine/database/LLLuceneIndex.java b/src/main/java/it/cavallium/dbengine/database/LLLuceneIndex.java index 8c46270..d7657e9 100644 --- a/src/main/java/it/cavallium/dbengine/database/LLLuceneIndex.java +++ b/src/main/java/it/cavallium/dbengine/database/LLLuceneIndex.java @@ -6,6 +6,7 @@ import it.cavallium.dbengine.client.query.current.data.Query; import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.client.query.current.data.ScoreMode; import it.cavallium.dbengine.lucene.LuceneUtils; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -21,7 +22,7 @@ public interface LLLuceneIndex extends LLSnapshottable { Mono addDocument(LLTerm id, LLDocument doc); - Mono addDocuments(Mono> documents); + Mono addDocuments(Flux> documents); Mono deleteDocument(LLTerm id); diff --git a/src/main/java/it/cavallium/dbengine/database/LLUtils.java b/src/main/java/it/cavallium/dbengine/database/LLUtils.java index ba38f67..98718a2 100644 --- a/src/main/java/it/cavallium/dbengine/database/LLUtils.java +++ b/src/main/java/it/cavallium/dbengine/database/LLUtils.java @@ -8,15 +8,20 @@ import io.netty.buffer.ByteBufUtil; import io.netty.buffer.CompositeByteBuf; import io.netty.buffer.Unpooled; import io.netty.util.IllegalReferenceCountException; +import it.cavallium.dbengine.client.IndicizerAnalyzers; import it.cavallium.dbengine.lucene.RandomSortField; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.Map.Entry; import java.util.Objects; import java.util.function.Function; import java.util.function.ToIntFunction; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FloatPoint; @@ -115,16 +120,24 @@ public class LLUtils { return d; } - public static Iterable toDocuments(Iterable document) { - List d = new LinkedList<>(); + public static Collection toDocuments(Collection document) { + List d = new ArrayList<>(document.size()); for (LLDocument doc : document) { d.add(LLUtils.toDocument(doc)); } return d; } + public static Collection toDocumentsFromEntries(Collection> documentsList) { + ArrayList results = new ArrayList<>(documentsList.size()); + for (Entry entry : documentsList) { + results.add(LLUtils.toDocument(entry.getValue())); + } + return results; + } + public static Iterable toTerms(Iterable terms) { - List d = new LinkedList<>(); + List d = new ArrayList<>(); for (LLTerm term : terms) { d.add(LLUtils.toTerm(term)); } diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDatabaseConnection.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDatabaseConnection.java index f2bd50f..a35710c 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDatabaseConnection.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDatabaseConnection.java @@ -1,6 +1,8 @@ package it.cavallium.dbengine.database.disk; import io.netty.buffer.ByteBufAllocator; +import it.cavallium.dbengine.client.IndicizerAnalyzers; +import it.cavallium.dbengine.client.IndicizerSimilarities; import it.cavallium.dbengine.database.Column; import it.cavallium.dbengine.database.LLDatabaseConnection; import it.cavallium.dbengine.database.LLLuceneIndex; @@ -70,8 +72,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection { @Override public Mono getLuceneIndex(String name, int instancesCount, - TextFieldsAnalyzer textFieldsAnalyzer, - TextFieldsSimilarity textFieldsSimilarity, + IndicizerAnalyzers indicizerAnalyzers, + IndicizerSimilarities indicizerSimilarities, Duration queryRefreshDebounceTime, Duration commitDebounceTime, boolean lowMemory, @@ -82,8 +84,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection { return new LLLocalMultiLuceneIndex(basePath.resolve("lucene"), name, instancesCount, - textFieldsAnalyzer, - textFieldsSimilarity, + indicizerAnalyzers, + indicizerSimilarities, queryRefreshDebounceTime, commitDebounceTime, lowMemory, @@ -92,8 +94,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection { } else { return new LLLocalLuceneIndex(basePath.resolve("lucene"), name, - textFieldsAnalyzer, - textFieldsSimilarity, + indicizerAnalyzers, + indicizerSimilarities, queryRefreshDebounceTime, commitDebounceTime, lowMemory, diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDictionary.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDictionary.java index c67746b..515dcd1 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDictionary.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDictionary.java @@ -122,12 +122,15 @@ public class LLLocalDictionary implements LLDictionary { private final Striped itemsLock = Striped.readWriteStampedLock(STRIPES); private final UpdateMode updateMode; private final ByteBufAllocator alloc; + private final String getRangeMultiDebugName; + private final String getRangeKeysMultiDebugName; public LLLocalDictionary( ByteBufAllocator allocator, @NotNull RocksDB db, @NotNull ColumnFamilyHandle columnFamilyHandle, String databaseName, + String columnDisplayName, Scheduler dbScheduler, Function snapshotResolver, UpdateMode updateMode) { @@ -139,6 +142,8 @@ public class LLLocalDictionary implements LLDictionary { this.dbScheduler = dbScheduler; this.snapshotResolver = snapshotResolver; this.updateMode = updateMode; + this.getRangeMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeMulti"; + this.getRangeKeysMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeKeysMulti"; alloc = allocator; } @@ -1113,12 +1118,19 @@ public class LLLocalDictionary implements LLDictionary { } } + @SuppressWarnings("Convert2MethodRef") private Flux> getRangeMulti(LLSnapshot snapshot, LLRange range) { try { return Flux .using( - () -> new LLLocalEntryReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)), - LLLocalReactiveRocksIterator::flux, + () -> new LLLocalEntryReactiveRocksIterator(db, + alloc, + cfh, + range.retain(), + resolveSnapshot(snapshot), + getRangeMultiDebugName + ), + llLocalEntryReactiveRocksIterator -> llLocalEntryReactiveRocksIterator.flux(), LLLocalReactiveRocksIterator::release ) .doOnDiscard(Entry.class, entry -> { @@ -1135,6 +1147,7 @@ public class LLLocalDictionary implements LLDictionary { } } + @SuppressWarnings("Convert2MethodRef") private Flux>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range, int prefixLength) { try { return Flux @@ -1147,7 +1160,7 @@ public class LLLocalDictionary implements LLDictionary { resolveSnapshot(snapshot), "getRangeMultiGrouped" ), - LLLocalGroupedReactiveRocksIterator::flux, + llLocalGroupedEntryReactiveRocksIterator -> llLocalGroupedEntryReactiveRocksIterator.flux(), LLLocalGroupedReactiveRocksIterator::release ) .subscribeOn(dbScheduler) @@ -1245,12 +1258,19 @@ public class LLLocalDictionary implements LLDictionary { } } + @SuppressWarnings("Convert2MethodRef") private Flux getRangeKeysMulti(LLSnapshot snapshot, LLRange range) { try { return Flux .using( - () -> new LLLocalKeyReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)), - LLLocalReactiveRocksIterator::flux, + () -> new LLLocalKeyReactiveRocksIterator(db, + alloc, + cfh, + range.retain(), + resolveSnapshot(snapshot), + getRangeKeysMultiDebugName + ), + llLocalKeyReactiveRocksIterator -> llLocalKeyReactiveRocksIterator.flux(), LLLocalReactiveRocksIterator::release ) .doOnDiscard(ByteBuf.class, ReferenceCounted::release) diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalEntryReactiveRocksIterator.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalEntryReactiveRocksIterator.java index 0394112..1dfa6de 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalEntryReactiveRocksIterator.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalEntryReactiveRocksIterator.java @@ -15,8 +15,9 @@ public class LLLocalEntryReactiveRocksIterator extends LLLocalReactiveRocksItera ByteBufAllocator alloc, ColumnFamilyHandle cfh, LLRange range, - ReadOptions readOptions) { - super(db, alloc, cfh, range, readOptions, true); + ReadOptions readOptions, + String debugName) { + super(db, alloc, cfh, range, readOptions, true, debugName); } @Override diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyReactiveRocksIterator.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyReactiveRocksIterator.java index 978df8c..e732d90 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyReactiveRocksIterator.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyReactiveRocksIterator.java @@ -13,8 +13,9 @@ public class LLLocalKeyReactiveRocksIterator extends LLLocalReactiveRocksIterato ByteBufAllocator alloc, ColumnFamilyHandle cfh, LLRange range, - ReadOptions readOptions) { - super(db, alloc, cfh, range, readOptions, false); + ReadOptions readOptions, + String debugName) { + super(db, alloc, cfh, range, readOptions, false, debugName); } @Override diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyValueDatabase.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyValueDatabase.java index af68d93..1450878 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyValueDatabase.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalKeyValueDatabase.java @@ -414,6 +414,7 @@ public class LLLocalKeyValueDatabase implements LLKeyValueDatabase { db, handles.get(Column.special(Column.toString(columnName))), name, + Column.toString(columnName), dbScheduler, (snapshot) -> snapshotsHandles.get(snapshot.getSequenceNumber()), updateMode diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalLuceneIndex.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalLuceneIndex.java index 03a6531..0518fb3 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalLuceneIndex.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalLuceneIndex.java @@ -1,5 +1,7 @@ package it.cavallium.dbengine.database.disk; +import it.cavallium.dbengine.client.IndicizerAnalyzers; +import it.cavallium.dbengine.client.IndicizerSimilarities; import it.cavallium.dbengine.client.query.QueryParser; import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.database.EnglishItalianStopFilter; @@ -24,7 +26,11 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult; import java.io.IOException; import java.nio.file.Path; import java.time.Duration; +import java.time.temporal.ChronoUnit; +import java.time.temporal.TemporalUnit; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; @@ -36,6 +42,8 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -110,15 +118,15 @@ public class LLLocalLuceneIndex implements LLLuceneIndex { */ private final ConcurrentHashMap snapshots = new ConcurrentHashMap<>(); private final boolean lowMemory; - private final TextFieldsSimilarity similarity; + private final Similarity similarity; private final ScheduledTaskLifecycle scheduledTasksLifecycle; private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter; public LLLocalLuceneIndex(Path luceneBasePath, String name, - TextFieldsAnalyzer analyzer, - TextFieldsSimilarity similarity, + IndicizerAnalyzers indicizerAnalyzers, + IndicizerSimilarities indicizerSimilarities, Duration queryRefreshDebounceTime, Duration commitDebounceTime, boolean lowMemory, boolean inMemory, @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException { @@ -159,9 +167,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex { this.luceneIndexName = name; this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()); this.lowMemory = lowMemory; - this.similarity = similarity; + this.similarity = LuceneUtils.toPerFieldSimilarityWrapper(indicizerSimilarities); this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter; - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer)); + ; + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.toPerFieldAnalyzerWrapper(indicizerAnalyzers)); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setIndexDeletionPolicy(snapshotter); indexWriterConfig.setCommitOnClose(true); @@ -186,7 +195,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex { } private Similarity getSimilarity() { - return LuceneUtils.getSimilarity(similarity); + return similarity; } private void registerScheduledFixedTask(Runnable task, Duration duration) { @@ -269,11 +278,12 @@ public class LLLocalLuceneIndex implements LLLuceneIndex { } @Override - public Mono addDocuments(Mono> documents) { + public Mono addDocuments(Flux> documents) { return documents - .flatMap(documentsMap -> Mono + .collectList() + .flatMap(documentsList -> Mono .fromCallable(() -> { - indexWriter.addDocuments(LLUtils.toDocuments(documentsMap.values())); + indexWriter.addDocuments(LLUtils.toDocumentsFromEntries(documentsList)); return null; }) .subscribeOn(Schedulers.boundedElastic()) diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalMultiLuceneIndex.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalMultiLuceneIndex.java index 4df6381..7a78f54 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalMultiLuceneIndex.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalMultiLuceneIndex.java @@ -3,6 +3,8 @@ package it.cavallium.dbengine.database.disk; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader.InvalidCacheLoadException; +import it.cavallium.dbengine.client.IndicizerAnalyzers; +import it.cavallium.dbengine.client.IndicizerSimilarities; import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.database.LLDocument; import it.cavallium.dbengine.database.LLLuceneIndex; @@ -14,12 +16,15 @@ import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity; import it.unimi.dsi.fastutil.longs.Long2ObjectMap; import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap; import java.io.IOException; import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -55,8 +60,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex { public LLLocalMultiLuceneIndex(Path lucene, String name, int instancesCount, - TextFieldsAnalyzer textFieldsAnalyzer, - TextFieldsSimilarity textFieldsSimilarity, + IndicizerAnalyzers indicizerAnalyzers, + IndicizerSimilarities indicizerSimilarities, Duration queryRefreshDebounceTime, Duration commitDebounceTime, boolean lowMemory, boolean inMemory) throws IOException { @@ -76,8 +81,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex { } luceneIndices[i] = new LLLocalLuceneIndex(lucene, instanceName, - textFieldsAnalyzer, - textFieldsSimilarity, + indicizerAnalyzers, + indicizerSimilarities, queryRefreshDebounceTime, commitDebounceTime, lowMemory, inMemory, (indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI, @@ -168,21 +173,37 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex { return getLuceneIndex(id).addDocument(id, doc); } + @SuppressWarnings({"unchecked"}) @Override - public Mono addDocuments(Mono> documents) { + public Mono addDocuments(Flux> documents) { return documents - .flatMapMany(map -> { - var sortedMap = new HashMap>(); - map.forEach((key, value) -> sortedMap - .computeIfAbsent(getLuceneIndex(key), _unused -> new HashMap<>()) - .put(key, value) - ); - return Flux.fromIterable(sortedMap.entrySet()); - }) - .flatMap(luceneIndexWithNewDocuments -> { - var luceneIndex = luceneIndexWithNewDocuments.getKey(); - var docs = luceneIndexWithNewDocuments.getValue(); - return luceneIndex.addDocuments(Mono.just(docs)); + .bufferTimeout(512, Duration.ofSeconds(2)) + .flatMap(inputEntries -> { + List>[] sortedEntries = new List[luceneIndices.length]; + Mono[] results = new Mono[luceneIndices.length]; + + // Sort entries + for(var inputEntry : inputEntries) { + int luceneIndexId = getLuceneIndexId(inputEntry.getKey()); + if (sortedEntries[luceneIndexId] == null) { + sortedEntries[luceneIndexId] = new ArrayList<>(); + } + sortedEntries[luceneIndexId].add(inputEntry); + } + + // Add documents + int luceneIndexId = 0; + for (List> docs : sortedEntries) { + if (docs != null && !docs.isEmpty()) { + LLLocalLuceneIndex luceneIndex = luceneIndices[luceneIndexId]; + results[luceneIndexId] = luceneIndex.addDocuments(Flux.fromIterable(docs)); + } else { + results[luceneIndexId] = Mono.empty(); + } + luceneIndexId++; + } + + return Mono.when(results); }) .then(); } diff --git a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalReactiveRocksIterator.java b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalReactiveRocksIterator.java index bacfac0..aeaaf3a 100644 --- a/src/main/java/it/cavallium/dbengine/database/disk/LLLocalReactiveRocksIterator.java +++ b/src/main/java/it/cavallium/dbengine/database/disk/LLLocalReactiveRocksIterator.java @@ -7,6 +7,7 @@ import io.netty.buffer.ByteBufAllocator; import it.cavallium.dbengine.database.LLRange; import it.cavallium.dbengine.database.LLUtils; import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep; +import java.util.concurrent.atomic.AtomicInteger; import org.jetbrains.annotations.NotNull; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.ReadOptions; @@ -17,6 +18,7 @@ import reactor.core.publisher.Flux; import reactor.util.function.Tuple3; import static io.netty.buffer.Unpooled.*; +import static it.cavallium.dbengine.database.disk.LLLocalDictionary.logger; public abstract class LLLocalReactiveRocksIterator { @@ -26,19 +28,22 @@ public abstract class LLLocalReactiveRocksIterator { private final LLRange range; private final ReadOptions readOptions; private final boolean readValues; + private final String debugName; public LLLocalReactiveRocksIterator(RocksDB db, ByteBufAllocator alloc, ColumnFamilyHandle cfh, LLRange range, ReadOptions readOptions, - boolean readValues) { + boolean readValues, + String debugName) { this.db = db; this.alloc = alloc; this.cfh = cfh; this.range = range; this.readOptions = readOptions; this.readValues = readValues; + this.debugName = debugName; } public Flux flux() { @@ -46,7 +51,7 @@ public abstract class LLLocalReactiveRocksIterator { .>generate(() -> { var readOptions = new ReadOptions(this.readOptions); if (!range.hasMin() || !range.hasMax()) { - readOptions.setReadaheadSize(2 * 1024 * 1024); + readOptions.setReadaheadSize(32 * 1024); // 32KiB readOptions.setFillCache(false); } return getRocksIterator(readOptions, range.retain(), db, cfh); diff --git a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java index 35e0d53..8dc5ecf 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java +++ b/src/main/java/it/cavallium/dbengine/lucene/LuceneUtils.java @@ -1,6 +1,10 @@ package it.cavallium.dbengine.lucene; +import com.ibm.icu.text.Collator; +import com.ibm.icu.util.ULocale; import it.cavallium.dbengine.client.CompositeSnapshot; +import it.cavallium.dbengine.client.IndicizerAnalyzers; +import it.cavallium.dbengine.client.IndicizerSimilarities; import it.cavallium.dbengine.client.MultiSort; import it.cavallium.dbengine.client.SearchResult; import it.cavallium.dbengine.client.SearchResultItem; @@ -8,6 +12,7 @@ import it.cavallium.dbengine.client.SearchResultKey; import it.cavallium.dbengine.client.SearchResultKeys; import it.cavallium.dbengine.database.LLKeyScore; import it.cavallium.dbengine.database.LLSearchResultShard; +import it.cavallium.dbengine.database.LLUtils; import it.cavallium.dbengine.database.collections.DatabaseMapDictionary; import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep; import it.cavallium.dbengine.database.collections.Joiner.ValueGetter; @@ -20,6 +25,7 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult; import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer; import it.cavallium.dbengine.lucene.similarity.NGramSimilarity; import java.io.IOException; +import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -28,13 +34,16 @@ import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.KStemFilter; +import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.similarities.BooleanSimilarity; import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; import org.apache.lucene.search.similarities.Similarity; import org.jetbrains.annotations.Nullable; import org.novasearch.lucene.search.similarities.BM25Similarity; @@ -56,10 +65,11 @@ public class LuceneUtils { private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5); private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5); private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer(); - private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true); - private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false); - private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true); - private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false); + private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true); + private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false); + private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true); + private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false); + private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true); private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC); private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS); private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L); @@ -78,78 +88,51 @@ public class LuceneUtils { private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity(); private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity(); + @SuppressWarnings("DuplicatedCode") public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) { - switch (analyzer) { - case N4GramPartialWords: - return lucene4GramWordsAnalyzerInstance; - case N4GramPartialString: - return lucene4GramStringAnalyzerInstance; - case N4GramPartialWordsEdge: - return lucene4GramWordsAnalyzerEdgeInstance; - case N4GramPartialStringEdge: - return lucene4GramStringAnalyzerEdgeInstance; - case N3To5GramPartialWords: - return lucene3To5GramWordsAnalyzerInstance; - case N3To5GramPartialString: - return lucene3To5GramStringAnalyzerInstance; - case N3To5GramPartialWordsEdge: - return lucene3To5GramWordsAnalyzerEdgeInstance; - case N3To5GramPartialStringEdge: - return lucene3To5GramStringAnalyzerEdgeInstance; - case Standard: - return luceneStandardAnalyzerInstance; - case FullText: - return luceneWordAnalyzerStopWordsAndStemInstance; - case WordWithStopwordsStripping: - return luceneWordAnalyzerStopWordsInstance; - case WordWithStemming: - return luceneWordAnalyzerStemInstance; - case WordSimple: - return luceneWordAnalyzerSimpleInstance; - default: - throw new UnsupportedOperationException("Unknown analyzer: " + analyzer); - } + return switch (analyzer) { + case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance; + case N4GramPartialString -> lucene4GramStringAnalyzerInstance; + case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance; + case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance; + case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance; + case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance; + case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance; + case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance; + case Standard -> luceneStandardAnalyzerInstance; + case FullText -> luceneWordAnalyzerStopWordsAndStemInstance; + case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance; + case WordWithStemming -> luceneWordAnalyzerStemInstance; + case WordSimple -> luceneWordAnalyzerSimpleInstance; + case ICUCollationKey -> luceneICUCollationKeyInstance; + //noinspection UnnecessaryDefault + default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer); + }; } + @SuppressWarnings("DuplicatedCode") public static Similarity getSimilarity(TextFieldsSimilarity similarity) { - switch (similarity) { - case BM25Classic: - return luceneBM25ClassicSimilarityInstance; - case NGramBM25Classic: - return luceneBM25ClassicNGramSimilarityInstance; - case BM25L: - return luceneBM25LSimilarityInstance; - case NGramBM25L: - return luceneBM25LNGramSimilarityInstance; - case Classic: - return luceneClassicSimilarityInstance; - case NGramClassic: - return luceneClassicNGramSimilarityInstance; - case BM25Plus: - return luceneBM25PlusSimilarityInstance; - case NGramBM25Plus: - return luceneBM25PlusNGramSimilarityInstance; - case BM15Plus: - return luceneBM15PlusSimilarityInstance; - case NGramBM15Plus: - return luceneBM15PlusNGramSimilarityInstance; - case BM11Plus: - return luceneBM11PlusSimilarityInstance; - case NGramBM11Plus: - return luceneBM11PlusNGramSimilarityInstance; - case LTC: - return luceneLTCSimilarityInstance; - case LDP: - return luceneLDPSimilarityInstance; - case LDPNoLength: - return luceneLDPNoLengthSimilarityInstance; - case Robertson: - return luceneRobertsonSimilarityInstance; - case Boolean: - return luceneBooleanSimilarityInstance; - default: - throw new IllegalStateException("Unknown similarity: " + similarity); - } + return switch (similarity) { + case BM25Classic -> luceneBM25ClassicSimilarityInstance; + case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance; + case BM25L -> luceneBM25LSimilarityInstance; + case NGramBM25L -> luceneBM25LNGramSimilarityInstance; + case Classic -> luceneClassicSimilarityInstance; + case NGramClassic -> luceneClassicNGramSimilarityInstance; + case BM25Plus -> luceneBM25PlusSimilarityInstance; + case NGramBM25Plus -> luceneBM25PlusNGramSimilarityInstance; + case BM15Plus -> luceneBM15PlusSimilarityInstance; + case NGramBM15Plus -> luceneBM15PlusNGramSimilarityInstance; + case BM11Plus -> luceneBM11PlusSimilarityInstance; + case NGramBM11Plus -> luceneBM11PlusNGramSimilarityInstance; + case LTC -> luceneLTCSimilarityInstance; + case LDP -> luceneLDPSimilarityInstance; + case LDPNoLength -> luceneLDPNoLengthSimilarityInstance; + case Robertson -> luceneRobertsonSimilarityInstance; + case Boolean -> luceneBooleanSimilarityInstance; + //noinspection UnnecessaryDefault + default -> throw new IllegalStateException("Unknown similarity: " + similarity); + }; } /** @@ -285,4 +268,27 @@ public class LuceneUtils { .at(snapshot, entry.getKey()) .flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release)); } + + public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper(IndicizerAnalyzers indicizerAnalyzers) { + HashMap perFieldAnalyzer = new HashMap<>(); + indicizerAnalyzers + .fieldAnalyzer() + .forEach((key, value) -> perFieldAnalyzer.put(key, LuceneUtils.getAnalyzer(value))); + return new PerFieldAnalyzerWrapper(LuceneUtils.getAnalyzer(indicizerAnalyzers.defaultAnalyzer()), perFieldAnalyzer); + } + + public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper(IndicizerSimilarities indicizerSimilarities) { + HashMap perFieldSimilarity = new HashMap<>(); + indicizerSimilarities + .fieldSimilarity() + .forEach((key, value) -> perFieldSimilarity.put(key, LuceneUtils.getSimilarity(value))); + var defaultSimilarity = LuceneUtils.getSimilarity(indicizerSimilarities.defaultSimilarity()); + return new PerFieldSimilarityWrapper() { + + @Override + public Similarity get(String name) { + return perFieldSimilarity.getOrDefault(name, defaultSimilarity); + } + }; + } } diff --git a/src/main/java/it/cavallium/dbengine/lucene/analyzer/TextFieldsAnalyzer.java b/src/main/java/it/cavallium/dbengine/lucene/analyzer/TextFieldsAnalyzer.java index e1a3667..b1cc2c3 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/analyzer/TextFieldsAnalyzer.java +++ b/src/main/java/it/cavallium/dbengine/lucene/analyzer/TextFieldsAnalyzer.java @@ -11,6 +11,7 @@ public enum TextFieldsAnalyzer { N3To5GramPartialStringEdge, Standard, WordSimple, + ICUCollationKey, WordWithStopwordsStripping, WordWithStemming, FullText, diff --git a/src/main/java/it/cavallium/dbengine/lucene/analyzer/WordAnalyzer.java b/src/main/java/it/cavallium/dbengine/lucene/analyzer/WordAnalyzer.java index 4165246..4425438 100644 --- a/src/main/java/it/cavallium/dbengine/lucene/analyzer/WordAnalyzer.java +++ b/src/main/java/it/cavallium/dbengine/lucene/analyzer/WordAnalyzer.java @@ -1,31 +1,47 @@ package it.cavallium.dbengine.lucene.analyzer; +import com.ibm.icu.text.Collator; +import com.ibm.icu.util.ULocale; import it.cavallium.dbengine.database.EnglishItalianStopFilter; import it.cavallium.dbengine.lucene.LuceneUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory; +import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; public class WordAnalyzer extends Analyzer { + private final boolean icu; private final boolean removeStopWords; private final boolean stem; - public WordAnalyzer(boolean removeStopWords, boolean stem) { + public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) { + this.icu = icu; this.removeStopWords = removeStopWords; this.stem = stem; } @Override protected TokenStreamComponents createComponents(final String fieldName) { - Tokenizer tokenizer = new StandardTokenizer(); + Tokenizer tokenizer; + if (icu) { + tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT))); + } else { + tokenizer = new StandardTokenizer(); + } TokenStream tokenStream = tokenizer; - //tokenStream = new LengthFilter(tokenStream, 1, 100); + if (stem) { + tokenStream = new LengthFilter(tokenStream, 1, 120); + } + if (!icu) { + tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem); + } if (removeStopWords) { tokenStream = new EnglishItalianStopFilter(tokenStream); } - tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem); return new TokenStreamComponents(tokenizer, tokenStream); }