Add ICU
This commit is contained in:
parent
4e76073259
commit
6eb531e4f1
7
pom.xml
7
pom.xml
|
@ -197,6 +197,11 @@
|
|||
<artifactId>lucene-analysis-common</artifactId>
|
||||
<version>9.0.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers-icu</artifactId>
|
||||
<version>9.0.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-codecs</artifactId>
|
||||
|
@ -245,7 +250,7 @@
|
|||
<dependency>
|
||||
<groupId>it.cavallium</groupId>
|
||||
<artifactId>data-generator</artifactId>
|
||||
<version>[0.9.26,)</version>
|
||||
<version>0.9.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.soabase.record-builder</groupId>
|
||||
|
|
|
@ -2,7 +2,10 @@ package it.cavallium.dbengine.client;
|
|||
|
||||
import it.cavallium.dbengine.database.LLDocument;
|
||||
import it.cavallium.dbengine.database.LLTerm;
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import reactor.core.publisher.Flux;
|
||||
import reactor.core.publisher.Mono;
|
||||
|
@ -18,6 +21,10 @@ public abstract class Indicizer<T, U> {
|
|||
|
||||
public abstract @NotNull T getKey(String key);
|
||||
|
||||
public abstract IndicizerAnalyzers getPerFieldAnalyzer();
|
||||
|
||||
public abstract IndicizerSimilarities getPerFieldSimilarity();
|
||||
|
||||
public Flux<Tuple2<String, Set<String>>> getMoreLikeThisDocumentFields(T key, U value) {
|
||||
return Flux.empty();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
package it.cavallium.dbengine.client;
|
||||
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||
import java.util.Map;
|
||||
|
||||
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
|
||||
|
||||
public static IndicizerAnalyzers of() {
|
||||
return of(TextFieldsAnalyzer.FullText);
|
||||
}
|
||||
|
||||
public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {
|
||||
return of(defaultAnalyzer, Map.of());
|
||||
}
|
||||
|
||||
public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
|
||||
return new IndicizerAnalyzers(defaultAnalyzer, fieldAnalyzer);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
package it.cavallium.dbengine.client;
|
||||
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||
import java.util.Map;
|
||||
|
||||
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
|
||||
|
||||
public static IndicizerSimilarities of() {
|
||||
return of(TextFieldsSimilarity.BM25Plus);
|
||||
}
|
||||
|
||||
public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {
|
||||
return of(defaultSimilarity, Map.of());
|
||||
}
|
||||
|
||||
public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
|
||||
return new IndicizerSimilarities(defaultSimilarity, fieldSimilarity);
|
||||
}
|
||||
}
|
|
@ -51,7 +51,6 @@ public class LuceneIndexImpl<T, U> implements LuceneIndex<T, U> {
|
|||
.flatMap(entry -> indicizer
|
||||
.toDocument(entry.getKey(), entry.getValue())
|
||||
.map(doc -> Map.entry(indicizer.toIndex(entry.getKey()), doc)))
|
||||
.collectMap(Entry::getKey, Entry::getValue)
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package it.cavallium.dbengine.database;
|
||||
|
||||
import io.netty.buffer.ByteBufAllocator;
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.client.IndicizerSimilarities;
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
||||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||
import java.time.Duration;
|
||||
|
@ -18,8 +20,8 @@ public interface LLDatabaseConnection {
|
|||
|
||||
Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
|
||||
int instancesCount,
|
||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||
TextFieldsSimilarity textFieldsSimilarity,
|
||||
IndicizerAnalyzers indicizerAnalyzers,
|
||||
IndicizerSimilarities indicizerSimilarities,
|
||||
Duration queryRefreshDebounceTime,
|
||||
Duration commitDebounceTime,
|
||||
boolean lowMemory,
|
||||
|
|
|
@ -6,6 +6,7 @@ import it.cavallium.dbengine.client.query.current.data.Query;
|
|||
import it.cavallium.dbengine.client.query.current.data.QueryParams;
|
||||
import it.cavallium.dbengine.client.query.current.data.ScoreMode;
|
||||
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
|
@ -21,7 +22,7 @@ public interface LLLuceneIndex extends LLSnapshottable {
|
|||
|
||||
Mono<Void> addDocument(LLTerm id, LLDocument doc);
|
||||
|
||||
Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents);
|
||||
Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents);
|
||||
|
||||
Mono<Void> deleteDocument(LLTerm id);
|
||||
|
||||
|
|
|
@ -8,15 +8,20 @@ import io.netty.buffer.ByteBufUtil;
|
|||
import io.netty.buffer.CompositeByteBuf;
|
||||
import io.netty.buffer.Unpooled;
|
||||
import io.netty.util.IllegalReferenceCountException;
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.lucene.RandomSortField;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.ToIntFunction;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FloatPoint;
|
||||
|
@ -115,16 +120,24 @@ public class LLUtils {
|
|||
return d;
|
||||
}
|
||||
|
||||
public static Iterable<Document> toDocuments(Iterable<LLDocument> document) {
|
||||
List<Document> d = new LinkedList<>();
|
||||
public static Collection<Document> toDocuments(Collection<LLDocument> document) {
|
||||
List<Document> d = new ArrayList<>(document.size());
|
||||
for (LLDocument doc : document) {
|
||||
d.add(LLUtils.toDocument(doc));
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
public static Collection<Document> toDocumentsFromEntries(Collection<Entry<LLTerm, LLDocument>> documentsList) {
|
||||
ArrayList<Document> results = new ArrayList<>(documentsList.size());
|
||||
for (Entry<LLTerm, LLDocument> entry : documentsList) {
|
||||
results.add(LLUtils.toDocument(entry.getValue()));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
public static Iterable<Term> toTerms(Iterable<LLTerm> terms) {
|
||||
List<Term> d = new LinkedList<>();
|
||||
List<Term> d = new ArrayList<>();
|
||||
for (LLTerm term : terms) {
|
||||
d.add(LLUtils.toTerm(term));
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package it.cavallium.dbengine.database.disk;
|
||||
|
||||
import io.netty.buffer.ByteBufAllocator;
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.client.IndicizerSimilarities;
|
||||
import it.cavallium.dbengine.database.Column;
|
||||
import it.cavallium.dbengine.database.LLDatabaseConnection;
|
||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||
|
@ -70,8 +72,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||
@Override
|
||||
public Mono<LLLuceneIndex> getLuceneIndex(String name,
|
||||
int instancesCount,
|
||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||
TextFieldsSimilarity textFieldsSimilarity,
|
||||
IndicizerAnalyzers indicizerAnalyzers,
|
||||
IndicizerSimilarities indicizerSimilarities,
|
||||
Duration queryRefreshDebounceTime,
|
||||
Duration commitDebounceTime,
|
||||
boolean lowMemory,
|
||||
|
@ -82,8 +84,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||
return new LLLocalMultiLuceneIndex(basePath.resolve("lucene"),
|
||||
name,
|
||||
instancesCount,
|
||||
textFieldsAnalyzer,
|
||||
textFieldsSimilarity,
|
||||
indicizerAnalyzers,
|
||||
indicizerSimilarities,
|
||||
queryRefreshDebounceTime,
|
||||
commitDebounceTime,
|
||||
lowMemory,
|
||||
|
@ -92,8 +94,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
|
|||
} else {
|
||||
return new LLLocalLuceneIndex(basePath.resolve("lucene"),
|
||||
name,
|
||||
textFieldsAnalyzer,
|
||||
textFieldsSimilarity,
|
||||
indicizerAnalyzers,
|
||||
indicizerSimilarities,
|
||||
queryRefreshDebounceTime,
|
||||
commitDebounceTime,
|
||||
lowMemory,
|
||||
|
|
|
@ -122,12 +122,15 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
private final Striped<StampedLock> itemsLock = Striped.readWriteStampedLock(STRIPES);
|
||||
private final UpdateMode updateMode;
|
||||
private final ByteBufAllocator alloc;
|
||||
private final String getRangeMultiDebugName;
|
||||
private final String getRangeKeysMultiDebugName;
|
||||
|
||||
public LLLocalDictionary(
|
||||
ByteBufAllocator allocator,
|
||||
@NotNull RocksDB db,
|
||||
@NotNull ColumnFamilyHandle columnFamilyHandle,
|
||||
String databaseName,
|
||||
String columnDisplayName,
|
||||
Scheduler dbScheduler,
|
||||
Function<LLSnapshot, Snapshot> snapshotResolver,
|
||||
UpdateMode updateMode) {
|
||||
|
@ -139,6 +142,8 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
this.dbScheduler = dbScheduler;
|
||||
this.snapshotResolver = snapshotResolver;
|
||||
this.updateMode = updateMode;
|
||||
this.getRangeMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeMulti";
|
||||
this.getRangeKeysMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeKeysMulti";
|
||||
alloc = allocator;
|
||||
}
|
||||
|
||||
|
@ -1113,12 +1118,19 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("Convert2MethodRef")
|
||||
private Flux<Entry<ByteBuf, ByteBuf>> getRangeMulti(LLSnapshot snapshot, LLRange range) {
|
||||
try {
|
||||
return Flux
|
||||
.using(
|
||||
() -> new LLLocalEntryReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)),
|
||||
LLLocalReactiveRocksIterator::flux,
|
||||
() -> new LLLocalEntryReactiveRocksIterator(db,
|
||||
alloc,
|
||||
cfh,
|
||||
range.retain(),
|
||||
resolveSnapshot(snapshot),
|
||||
getRangeMultiDebugName
|
||||
),
|
||||
llLocalEntryReactiveRocksIterator -> llLocalEntryReactiveRocksIterator.flux(),
|
||||
LLLocalReactiveRocksIterator::release
|
||||
)
|
||||
.doOnDiscard(Entry.class, entry -> {
|
||||
|
@ -1135,6 +1147,7 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("Convert2MethodRef")
|
||||
private Flux<List<Entry<ByteBuf, ByteBuf>>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range, int prefixLength) {
|
||||
try {
|
||||
return Flux
|
||||
|
@ -1147,7 +1160,7 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
resolveSnapshot(snapshot),
|
||||
"getRangeMultiGrouped"
|
||||
),
|
||||
LLLocalGroupedReactiveRocksIterator::flux,
|
||||
llLocalGroupedEntryReactiveRocksIterator -> llLocalGroupedEntryReactiveRocksIterator.flux(),
|
||||
LLLocalGroupedReactiveRocksIterator::release
|
||||
)
|
||||
.subscribeOn(dbScheduler)
|
||||
|
@ -1245,12 +1258,19 @@ public class LLLocalDictionary implements LLDictionary {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("Convert2MethodRef")
|
||||
private Flux<ByteBuf> getRangeKeysMulti(LLSnapshot snapshot, LLRange range) {
|
||||
try {
|
||||
return Flux
|
||||
.using(
|
||||
() -> new LLLocalKeyReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)),
|
||||
LLLocalReactiveRocksIterator::flux,
|
||||
() -> new LLLocalKeyReactiveRocksIterator(db,
|
||||
alloc,
|
||||
cfh,
|
||||
range.retain(),
|
||||
resolveSnapshot(snapshot),
|
||||
getRangeKeysMultiDebugName
|
||||
),
|
||||
llLocalKeyReactiveRocksIterator -> llLocalKeyReactiveRocksIterator.flux(),
|
||||
LLLocalReactiveRocksIterator::release
|
||||
)
|
||||
.doOnDiscard(ByteBuf.class, ReferenceCounted::release)
|
||||
|
|
|
@ -15,8 +15,9 @@ public class LLLocalEntryReactiveRocksIterator extends LLLocalReactiveRocksItera
|
|||
ByteBufAllocator alloc,
|
||||
ColumnFamilyHandle cfh,
|
||||
LLRange range,
|
||||
ReadOptions readOptions) {
|
||||
super(db, alloc, cfh, range, readOptions, true);
|
||||
ReadOptions readOptions,
|
||||
String debugName) {
|
||||
super(db, alloc, cfh, range, readOptions, true, debugName);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -13,8 +13,9 @@ public class LLLocalKeyReactiveRocksIterator extends LLLocalReactiveRocksIterato
|
|||
ByteBufAllocator alloc,
|
||||
ColumnFamilyHandle cfh,
|
||||
LLRange range,
|
||||
ReadOptions readOptions) {
|
||||
super(db, alloc, cfh, range, readOptions, false);
|
||||
ReadOptions readOptions,
|
||||
String debugName) {
|
||||
super(db, alloc, cfh, range, readOptions, false, debugName);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -414,6 +414,7 @@ public class LLLocalKeyValueDatabase implements LLKeyValueDatabase {
|
|||
db,
|
||||
handles.get(Column.special(Column.toString(columnName))),
|
||||
name,
|
||||
Column.toString(columnName),
|
||||
dbScheduler,
|
||||
(snapshot) -> snapshotsHandles.get(snapshot.getSequenceNumber()),
|
||||
updateMode
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package it.cavallium.dbengine.database.disk;
|
||||
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.client.IndicizerSimilarities;
|
||||
import it.cavallium.dbengine.client.query.QueryParser;
|
||||
import it.cavallium.dbengine.client.query.current.data.QueryParams;
|
||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
||||
|
@ -24,7 +26,11 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
|
|||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.time.temporal.TemporalUnit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Objects;
|
||||
|
@ -36,6 +42,8 @@ import java.util.concurrent.Semaphore;
|
|||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexCommit;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
|
@ -110,15 +118,15 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
*/
|
||||
private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>();
|
||||
private final boolean lowMemory;
|
||||
private final TextFieldsSimilarity similarity;
|
||||
private final Similarity similarity;
|
||||
|
||||
private final ScheduledTaskLifecycle scheduledTasksLifecycle;
|
||||
private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter;
|
||||
|
||||
public LLLocalLuceneIndex(Path luceneBasePath,
|
||||
String name,
|
||||
TextFieldsAnalyzer analyzer,
|
||||
TextFieldsSimilarity similarity,
|
||||
IndicizerAnalyzers indicizerAnalyzers,
|
||||
IndicizerSimilarities indicizerSimilarities,
|
||||
Duration queryRefreshDebounceTime,
|
||||
Duration commitDebounceTime,
|
||||
boolean lowMemory, boolean inMemory, @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException {
|
||||
|
@ -159,9 +167,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
this.luceneIndexName = name;
|
||||
this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
|
||||
this.lowMemory = lowMemory;
|
||||
this.similarity = similarity;
|
||||
this.similarity = LuceneUtils.toPerFieldSimilarityWrapper(indicizerSimilarities);
|
||||
this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter;
|
||||
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer));
|
||||
;
|
||||
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.toPerFieldAnalyzerWrapper(indicizerAnalyzers));
|
||||
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
|
||||
indexWriterConfig.setIndexDeletionPolicy(snapshotter);
|
||||
indexWriterConfig.setCommitOnClose(true);
|
||||
|
@ -186,7 +195,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
}
|
||||
|
||||
private Similarity getSimilarity() {
|
||||
return LuceneUtils.getSimilarity(similarity);
|
||||
return similarity;
|
||||
}
|
||||
|
||||
private void registerScheduledFixedTask(Runnable task, Duration duration) {
|
||||
|
@ -269,11 +278,12 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) {
|
||||
public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
|
||||
return documents
|
||||
.flatMap(documentsMap -> Mono
|
||||
.collectList()
|
||||
.flatMap(documentsList -> Mono
|
||||
.<Void>fromCallable(() -> {
|
||||
indexWriter.addDocuments(LLUtils.toDocuments(documentsMap.values()));
|
||||
indexWriter.addDocuments(LLUtils.toDocumentsFromEntries(documentsList));
|
||||
return null;
|
||||
})
|
||||
.subscribeOn(Schedulers.boundedElastic())
|
||||
|
|
|
@ -3,6 +3,8 @@ package it.cavallium.dbengine.database.disk;
|
|||
import com.google.common.cache.Cache;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.common.cache.CacheLoader.InvalidCacheLoadException;
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.client.IndicizerSimilarities;
|
||||
import it.cavallium.dbengine.client.query.current.data.QueryParams;
|
||||
import it.cavallium.dbengine.database.LLDocument;
|
||||
import it.cavallium.dbengine.database.LLLuceneIndex;
|
||||
|
@ -14,12 +16,15 @@ import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
|
|||
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
@ -55,8 +60,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||
public LLLocalMultiLuceneIndex(Path lucene,
|
||||
String name,
|
||||
int instancesCount,
|
||||
TextFieldsAnalyzer textFieldsAnalyzer,
|
||||
TextFieldsSimilarity textFieldsSimilarity,
|
||||
IndicizerAnalyzers indicizerAnalyzers,
|
||||
IndicizerSimilarities indicizerSimilarities,
|
||||
Duration queryRefreshDebounceTime,
|
||||
Duration commitDebounceTime,
|
||||
boolean lowMemory, boolean inMemory) throws IOException {
|
||||
|
@ -76,8 +81,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||
}
|
||||
luceneIndices[i] = new LLLocalLuceneIndex(lucene,
|
||||
instanceName,
|
||||
textFieldsAnalyzer,
|
||||
textFieldsSimilarity,
|
||||
indicizerAnalyzers,
|
||||
indicizerSimilarities,
|
||||
queryRefreshDebounceTime,
|
||||
commitDebounceTime,
|
||||
lowMemory, inMemory, (indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI,
|
||||
|
@ -168,21 +173,37 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
|
|||
return getLuceneIndex(id).addDocument(id, doc);
|
||||
}
|
||||
|
||||
@SuppressWarnings({"unchecked"})
|
||||
@Override
|
||||
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) {
|
||||
public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
|
||||
return documents
|
||||
.flatMapMany(map -> {
|
||||
var sortedMap = new HashMap<LLLocalLuceneIndex, Map<LLTerm, LLDocument>>();
|
||||
map.forEach((key, value) -> sortedMap
|
||||
.computeIfAbsent(getLuceneIndex(key), _unused -> new HashMap<>())
|
||||
.put(key, value)
|
||||
);
|
||||
return Flux.fromIterable(sortedMap.entrySet());
|
||||
})
|
||||
.flatMap(luceneIndexWithNewDocuments -> {
|
||||
var luceneIndex = luceneIndexWithNewDocuments.getKey();
|
||||
var docs = luceneIndexWithNewDocuments.getValue();
|
||||
return luceneIndex.addDocuments(Mono.just(docs));
|
||||
.bufferTimeout(512, Duration.ofSeconds(2))
|
||||
.flatMap(inputEntries -> {
|
||||
List<Entry<LLTerm, LLDocument>>[] sortedEntries = new List[luceneIndices.length];
|
||||
Mono<Void>[] results = new Mono[luceneIndices.length];
|
||||
|
||||
// Sort entries
|
||||
for(var inputEntry : inputEntries) {
|
||||
int luceneIndexId = getLuceneIndexId(inputEntry.getKey());
|
||||
if (sortedEntries[luceneIndexId] == null) {
|
||||
sortedEntries[luceneIndexId] = new ArrayList<>();
|
||||
}
|
||||
sortedEntries[luceneIndexId].add(inputEntry);
|
||||
}
|
||||
|
||||
// Add documents
|
||||
int luceneIndexId = 0;
|
||||
for (List<Entry<LLTerm, LLDocument>> docs : sortedEntries) {
|
||||
if (docs != null && !docs.isEmpty()) {
|
||||
LLLocalLuceneIndex luceneIndex = luceneIndices[luceneIndexId];
|
||||
results[luceneIndexId] = luceneIndex.addDocuments(Flux.fromIterable(docs));
|
||||
} else {
|
||||
results[luceneIndexId] = Mono.empty();
|
||||
}
|
||||
luceneIndexId++;
|
||||
}
|
||||
|
||||
return Mono.when(results);
|
||||
})
|
||||
.then();
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import io.netty.buffer.ByteBufAllocator;
|
|||
import it.cavallium.dbengine.database.LLRange;
|
||||
import it.cavallium.dbengine.database.LLUtils;
|
||||
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.rocksdb.ColumnFamilyHandle;
|
||||
import org.rocksdb.ReadOptions;
|
||||
|
@ -17,6 +18,7 @@ import reactor.core.publisher.Flux;
|
|||
import reactor.util.function.Tuple3;
|
||||
|
||||
import static io.netty.buffer.Unpooled.*;
|
||||
import static it.cavallium.dbengine.database.disk.LLLocalDictionary.logger;
|
||||
|
||||
public abstract class LLLocalReactiveRocksIterator<T> {
|
||||
|
||||
|
@ -26,19 +28,22 @@ public abstract class LLLocalReactiveRocksIterator<T> {
|
|||
private final LLRange range;
|
||||
private final ReadOptions readOptions;
|
||||
private final boolean readValues;
|
||||
private final String debugName;
|
||||
|
||||
public LLLocalReactiveRocksIterator(RocksDB db,
|
||||
ByteBufAllocator alloc,
|
||||
ColumnFamilyHandle cfh,
|
||||
LLRange range,
|
||||
ReadOptions readOptions,
|
||||
boolean readValues) {
|
||||
boolean readValues,
|
||||
String debugName) {
|
||||
this.db = db;
|
||||
this.alloc = alloc;
|
||||
this.cfh = cfh;
|
||||
this.range = range;
|
||||
this.readOptions = readOptions;
|
||||
this.readValues = readValues;
|
||||
this.debugName = debugName;
|
||||
}
|
||||
|
||||
public Flux<T> flux() {
|
||||
|
@ -46,7 +51,7 @@ public abstract class LLLocalReactiveRocksIterator<T> {
|
|||
.<T, @NotNull Tuple3<RocksIterator, ReleasableSlice, ReleasableSlice>>generate(() -> {
|
||||
var readOptions = new ReadOptions(this.readOptions);
|
||||
if (!range.hasMin() || !range.hasMax()) {
|
||||
readOptions.setReadaheadSize(2 * 1024 * 1024);
|
||||
readOptions.setReadaheadSize(32 * 1024); // 32KiB
|
||||
readOptions.setFillCache(false);
|
||||
}
|
||||
return getRocksIterator(readOptions, range.retain(), db, cfh);
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package it.cavallium.dbengine.lucene;
|
||||
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import it.cavallium.dbengine.client.CompositeSnapshot;
|
||||
import it.cavallium.dbengine.client.IndicizerAnalyzers;
|
||||
import it.cavallium.dbengine.client.IndicizerSimilarities;
|
||||
import it.cavallium.dbengine.client.MultiSort;
|
||||
import it.cavallium.dbengine.client.SearchResult;
|
||||
import it.cavallium.dbengine.client.SearchResultItem;
|
||||
|
@ -8,6 +12,7 @@ import it.cavallium.dbengine.client.SearchResultKey;
|
|||
import it.cavallium.dbengine.client.SearchResultKeys;
|
||||
import it.cavallium.dbengine.database.LLKeyScore;
|
||||
import it.cavallium.dbengine.database.LLSearchResultShard;
|
||||
import it.cavallium.dbengine.database.LLUtils;
|
||||
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
|
||||
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
|
||||
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
|
||||
|
@ -20,6 +25,7 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
|
|||
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer;
|
||||
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
|
@ -28,13 +34,16 @@ import org.apache.lucene.analysis.LowerCaseFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
||||
import org.apache.lucene.analysis.en.KStemFilter;
|
||||
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.novasearch.lucene.search.similarities.BM25Similarity;
|
||||
|
@ -56,10 +65,11 @@ public class LuceneUtils {
|
|||
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
|
||||
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
|
||||
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
|
||||
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true);
|
||||
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false);
|
||||
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true);
|
||||
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
|
||||
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
|
||||
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
|
||||
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
|
||||
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
|
||||
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
|
||||
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
|
||||
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
|
||||
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
|
||||
|
@ -78,78 +88,51 @@ public class LuceneUtils {
|
|||
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity();
|
||||
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
|
||||
|
||||
@SuppressWarnings("DuplicatedCode")
|
||||
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
|
||||
switch (analyzer) {
|
||||
case N4GramPartialWords:
|
||||
return lucene4GramWordsAnalyzerInstance;
|
||||
case N4GramPartialString:
|
||||
return lucene4GramStringAnalyzerInstance;
|
||||
case N4GramPartialWordsEdge:
|
||||
return lucene4GramWordsAnalyzerEdgeInstance;
|
||||
case N4GramPartialStringEdge:
|
||||
return lucene4GramStringAnalyzerEdgeInstance;
|
||||
case N3To5GramPartialWords:
|
||||
return lucene3To5GramWordsAnalyzerInstance;
|
||||
case N3To5GramPartialString:
|
||||
return lucene3To5GramStringAnalyzerInstance;
|
||||
case N3To5GramPartialWordsEdge:
|
||||
return lucene3To5GramWordsAnalyzerEdgeInstance;
|
||||
case N3To5GramPartialStringEdge:
|
||||
return lucene3To5GramStringAnalyzerEdgeInstance;
|
||||
case Standard:
|
||||
return luceneStandardAnalyzerInstance;
|
||||
case FullText:
|
||||
return luceneWordAnalyzerStopWordsAndStemInstance;
|
||||
case WordWithStopwordsStripping:
|
||||
return luceneWordAnalyzerStopWordsInstance;
|
||||
case WordWithStemming:
|
||||
return luceneWordAnalyzerStemInstance;
|
||||
case WordSimple:
|
||||
return luceneWordAnalyzerSimpleInstance;
|
||||
default:
|
||||
throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
||||
}
|
||||
return switch (analyzer) {
|
||||
case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
|
||||
case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
|
||||
case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
|
||||
case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
|
||||
case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
|
||||
case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
|
||||
case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
|
||||
case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
|
||||
case Standard -> luceneStandardAnalyzerInstance;
|
||||
case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
|
||||
case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
|
||||
case WordWithStemming -> luceneWordAnalyzerStemInstance;
|
||||
case WordSimple -> luceneWordAnalyzerSimpleInstance;
|
||||
case ICUCollationKey -> luceneICUCollationKeyInstance;
|
||||
//noinspection UnnecessaryDefault
|
||||
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
|
||||
};
|
||||
}
|
||||
|
||||
@SuppressWarnings("DuplicatedCode")
|
||||
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
|
||||
switch (similarity) {
|
||||
case BM25Classic:
|
||||
return luceneBM25ClassicSimilarityInstance;
|
||||
case NGramBM25Classic:
|
||||
return luceneBM25ClassicNGramSimilarityInstance;
|
||||
case BM25L:
|
||||
return luceneBM25LSimilarityInstance;
|
||||
case NGramBM25L:
|
||||
return luceneBM25LNGramSimilarityInstance;
|
||||
case Classic:
|
||||
return luceneClassicSimilarityInstance;
|
||||
case NGramClassic:
|
||||
return luceneClassicNGramSimilarityInstance;
|
||||
case BM25Plus:
|
||||
return luceneBM25PlusSimilarityInstance;
|
||||
case NGramBM25Plus:
|
||||
return luceneBM25PlusNGramSimilarityInstance;
|
||||
case BM15Plus:
|
||||
return luceneBM15PlusSimilarityInstance;
|
||||
case NGramBM15Plus:
|
||||
return luceneBM15PlusNGramSimilarityInstance;
|
||||
case BM11Plus:
|
||||
return luceneBM11PlusSimilarityInstance;
|
||||
case NGramBM11Plus:
|
||||
return luceneBM11PlusNGramSimilarityInstance;
|
||||
case LTC:
|
||||
return luceneLTCSimilarityInstance;
|
||||
case LDP:
|
||||
return luceneLDPSimilarityInstance;
|
||||
case LDPNoLength:
|
||||
return luceneLDPNoLengthSimilarityInstance;
|
||||
case Robertson:
|
||||
return luceneRobertsonSimilarityInstance;
|
||||
case Boolean:
|
||||
return luceneBooleanSimilarityInstance;
|
||||
default:
|
||||
throw new IllegalStateException("Unknown similarity: " + similarity);
|
||||
}
|
||||
return switch (similarity) {
|
||||
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
|
||||
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
|
||||
case BM25L -> luceneBM25LSimilarityInstance;
|
||||
case NGramBM25L -> luceneBM25LNGramSimilarityInstance;
|
||||
case Classic -> luceneClassicSimilarityInstance;
|
||||
case NGramClassic -> luceneClassicNGramSimilarityInstance;
|
||||
case BM25Plus -> luceneBM25PlusSimilarityInstance;
|
||||
case NGramBM25Plus -> luceneBM25PlusNGramSimilarityInstance;
|
||||
case BM15Plus -> luceneBM15PlusSimilarityInstance;
|
||||
case NGramBM15Plus -> luceneBM15PlusNGramSimilarityInstance;
|
||||
case BM11Plus -> luceneBM11PlusSimilarityInstance;
|
||||
case NGramBM11Plus -> luceneBM11PlusNGramSimilarityInstance;
|
||||
case LTC -> luceneLTCSimilarityInstance;
|
||||
case LDP -> luceneLDPSimilarityInstance;
|
||||
case LDPNoLength -> luceneLDPNoLengthSimilarityInstance;
|
||||
case Robertson -> luceneRobertsonSimilarityInstance;
|
||||
case Boolean -> luceneBooleanSimilarityInstance;
|
||||
//noinspection UnnecessaryDefault
|
||||
default -> throw new IllegalStateException("Unknown similarity: " + similarity);
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -285,4 +268,27 @@ public class LuceneUtils {
|
|||
.at(snapshot, entry.getKey())
|
||||
.flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release));
|
||||
}
|
||||
|
||||
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper(IndicizerAnalyzers indicizerAnalyzers) {
|
||||
HashMap<String, Analyzer> perFieldAnalyzer = new HashMap<>();
|
||||
indicizerAnalyzers
|
||||
.fieldAnalyzer()
|
||||
.forEach((key, value) -> perFieldAnalyzer.put(key, LuceneUtils.getAnalyzer(value)));
|
||||
return new PerFieldAnalyzerWrapper(LuceneUtils.getAnalyzer(indicizerAnalyzers.defaultAnalyzer()), perFieldAnalyzer);
|
||||
}
|
||||
|
||||
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper(IndicizerSimilarities indicizerSimilarities) {
|
||||
HashMap<String, Similarity> perFieldSimilarity = new HashMap<>();
|
||||
indicizerSimilarities
|
||||
.fieldSimilarity()
|
||||
.forEach((key, value) -> perFieldSimilarity.put(key, LuceneUtils.getSimilarity(value)));
|
||||
var defaultSimilarity = LuceneUtils.getSimilarity(indicizerSimilarities.defaultSimilarity());
|
||||
return new PerFieldSimilarityWrapper() {
|
||||
|
||||
@Override
|
||||
public Similarity get(String name) {
|
||||
return perFieldSimilarity.getOrDefault(name, defaultSimilarity);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ public enum TextFieldsAnalyzer {
|
|||
N3To5GramPartialStringEdge,
|
||||
Standard,
|
||||
WordSimple,
|
||||
ICUCollationKey,
|
||||
WordWithStopwordsStripping,
|
||||
WordWithStemming,
|
||||
FullText,
|
||||
|
|
|
@ -1,31 +1,47 @@
|
|||
package it.cavallium.dbengine.lucene.analyzer;
|
||||
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
|
||||
import it.cavallium.dbengine.lucene.LuceneUtils;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
|
||||
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
public class WordAnalyzer extends Analyzer {
|
||||
|
||||
private final boolean icu;
|
||||
private final boolean removeStopWords;
|
||||
private final boolean stem;
|
||||
|
||||
public WordAnalyzer(boolean removeStopWords, boolean stem) {
|
||||
public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) {
|
||||
this.icu = icu;
|
||||
this.removeStopWords = removeStopWords;
|
||||
this.stem = stem;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
Tokenizer tokenizer;
|
||||
if (icu) {
|
||||
tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
|
||||
} else {
|
||||
tokenizer = new StandardTokenizer();
|
||||
}
|
||||
TokenStream tokenStream = tokenizer;
|
||||
//tokenStream = new LengthFilter(tokenStream, 1, 100);
|
||||
if (stem) {
|
||||
tokenStream = new LengthFilter(tokenStream, 1, 120);
|
||||
}
|
||||
if (!icu) {
|
||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
|
||||
}
|
||||
if (removeStopWords) {
|
||||
tokenStream = new EnglishItalianStopFilter(tokenStream);
|
||||
}
|
||||
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
|
||||
|
||||
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user