This commit is contained in:
Andrea Cavalli 2021-05-28 16:04:59 +02:00
parent 4e76073259
commit 6eb531e4f1
19 changed files with 277 additions and 127 deletions

View File

@ -197,6 +197,11 @@
<artifactId>lucene-analysis-common</artifactId>
<version>9.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-icu</artifactId>
<version>9.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-codecs</artifactId>
@ -245,7 +250,7 @@
<dependency>
<groupId>it.cavallium</groupId>
<artifactId>data-generator</artifactId>
<version>[0.9.26,)</version>
<version>0.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>io.soabase.record-builder</groupId>

View File

@ -2,7 +2,10 @@ package it.cavallium.dbengine.client;
import it.cavallium.dbengine.database.LLDocument;
import it.cavallium.dbengine.database.LLTerm;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.jetbrains.annotations.NotNull;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;
@ -18,6 +21,10 @@ public abstract class Indicizer<T, U> {
public abstract @NotNull T getKey(String key);
public abstract IndicizerAnalyzers getPerFieldAnalyzer();
public abstract IndicizerSimilarities getPerFieldSimilarity();
public Flux<Tuple2<String, Set<String>>> getMoreLikeThisDocumentFields(T key, U value) {
return Flux.empty();
}

View File

@ -0,0 +1,19 @@
package it.cavallium.dbengine.client;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import java.util.Map;
/**
 * Analyzer configuration for an indicizer: one default text analyzer plus optional
 * per-field overrides.
 *
 * @param defaultAnalyzer analyzer used for every field without an explicit override
 * @param fieldAnalyzer   per-field analyzer overrides, keyed by field name
 */
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {

	/** Creates a configuration using {@link TextFieldsAnalyzer#FullText} everywhere, with no overrides. */
	public static IndicizerAnalyzers of() {
		return new IndicizerAnalyzers(TextFieldsAnalyzer.FullText, Map.of());
	}

	/** Creates a configuration with the given default analyzer and no per-field overrides. */
	public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {
		return new IndicizerAnalyzers(defaultAnalyzer, Map.of());
	}

	/** Creates a configuration with the given default analyzer and per-field overrides. */
	public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
		return new IndicizerAnalyzers(defaultAnalyzer, fieldAnalyzer);
	}
}

View File

@ -0,0 +1,20 @@
package it.cavallium.dbengine.client;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import java.util.Map;
/**
 * Similarity configuration for an indicizer: one default scoring similarity plus
 * optional per-field overrides.
 *
 * @param defaultSimilarity similarity used for every field without an explicit override
 * @param fieldSimilarity   per-field similarity overrides, keyed by field name
 */
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {

	/** Creates a configuration using {@link TextFieldsSimilarity#BM25Plus} everywhere, with no overrides. */
	public static IndicizerSimilarities of() {
		return new IndicizerSimilarities(TextFieldsSimilarity.BM25Plus, Map.of());
	}

	/** Creates a configuration with the given default similarity and no per-field overrides. */
	public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {
		return new IndicizerSimilarities(defaultSimilarity, Map.of());
	}

	/** Creates a configuration with the given default similarity and per-field overrides. */
	public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
		return new IndicizerSimilarities(defaultSimilarity, fieldSimilarity);
	}
}

View File

@ -51,7 +51,6 @@ public class LuceneIndexImpl<T, U> implements LuceneIndex<T, U> {
.flatMap(entry -> indicizer
.toDocument(entry.getKey(), entry.getValue())
.map(doc -> Map.entry(indicizer.toIndex(entry.getKey()), doc)))
.collectMap(Entry::getKey, Entry::getValue)
);
}

View File

@ -1,6 +1,8 @@
package it.cavallium.dbengine.database;
import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import java.time.Duration;
@ -18,8 +20,8 @@ public interface LLDatabaseConnection {
Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer,
TextFieldsSimilarity textFieldsSimilarity,
IndicizerAnalyzers indicizerAnalyzers,
IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime,
Duration commitDebounceTime,
boolean lowMemory,

View File

@ -6,6 +6,7 @@ import it.cavallium.dbengine.client.query.current.data.Query;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.client.query.current.data.ScoreMode;
import it.cavallium.dbengine.lucene.LuceneUtils;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@ -21,7 +22,7 @@ public interface LLLuceneIndex extends LLSnapshottable {
Mono<Void> addDocument(LLTerm id, LLDocument doc);
Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents);
Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents);
Mono<Void> deleteDocument(LLTerm id);

View File

@ -8,15 +8,20 @@ import io.netty.buffer.ByteBufUtil;
import io.netty.buffer.CompositeByteBuf;
import io.netty.buffer.Unpooled;
import io.netty.util.IllegalReferenceCountException;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.lucene.RandomSortField;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.ToIntFunction;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
@ -115,16 +120,24 @@ public class LLUtils {
return d;
}
public static Iterable<Document> toDocuments(Iterable<LLDocument> document) {
List<Document> d = new LinkedList<>();
/**
 * Converts a collection of {@code LLDocument} wrappers into Lucene {@code Document}s.
 *
 * @param document the source documents to convert
 * @return a new mutable list containing one converted document per input element,
 *         pre-sized to the input size to avoid resizing
 */
public static Collection<Document> toDocuments(Collection<LLDocument> document) {
List<Document> d = new ArrayList<>(document.size());
for (LLDocument doc : document) {
d.add(LLUtils.toDocument(doc));
}
return d;
}
/**
 * Converts term/document entries into Lucene {@code Document}s, keeping only the
 * entry values — the {@code LLTerm} keys are ignored here.
 *
 * @param documentsList entries whose values are the documents to convert
 * @return a new mutable list with one converted document per entry, in iteration order
 */
public static Collection<Document> toDocumentsFromEntries(Collection<Entry<LLTerm, LLDocument>> documentsList) {
ArrayList<Document> results = new ArrayList<>(documentsList.size());
for (Entry<LLTerm, LLDocument> entry : documentsList) {
results.add(LLUtils.toDocument(entry.getValue()));
}
return results;
}
public static Iterable<Term> toTerms(Iterable<LLTerm> terms) {
List<Term> d = new LinkedList<>();
List<Term> d = new ArrayList<>();
for (LLTerm term : terms) {
d.add(LLUtils.toTerm(term));
}

View File

@ -1,6 +1,8 @@
package it.cavallium.dbengine.database.disk;
import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.database.Column;
import it.cavallium.dbengine.database.LLDatabaseConnection;
import it.cavallium.dbengine.database.LLLuceneIndex;
@ -70,8 +72,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
@Override
public Mono<LLLuceneIndex> getLuceneIndex(String name,
int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer,
TextFieldsSimilarity textFieldsSimilarity,
IndicizerAnalyzers indicizerAnalyzers,
IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime,
Duration commitDebounceTime,
boolean lowMemory,
@ -82,8 +84,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
return new LLLocalMultiLuceneIndex(basePath.resolve("lucene"),
name,
instancesCount,
textFieldsAnalyzer,
textFieldsSimilarity,
indicizerAnalyzers,
indicizerSimilarities,
queryRefreshDebounceTime,
commitDebounceTime,
lowMemory,
@ -92,8 +94,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
} else {
return new LLLocalLuceneIndex(basePath.resolve("lucene"),
name,
textFieldsAnalyzer,
textFieldsSimilarity,
indicizerAnalyzers,
indicizerSimilarities,
queryRefreshDebounceTime,
commitDebounceTime,
lowMemory,

View File

@ -122,12 +122,15 @@ public class LLLocalDictionary implements LLDictionary {
private final Striped<StampedLock> itemsLock = Striped.readWriteStampedLock(STRIPES);
private final UpdateMode updateMode;
private final ByteBufAllocator alloc;
private final String getRangeMultiDebugName;
private final String getRangeKeysMultiDebugName;
public LLLocalDictionary(
ByteBufAllocator allocator,
@NotNull RocksDB db,
@NotNull ColumnFamilyHandle columnFamilyHandle,
String databaseName,
String columnDisplayName,
Scheduler dbScheduler,
Function<LLSnapshot, Snapshot> snapshotResolver,
UpdateMode updateMode) {
@ -139,6 +142,8 @@ public class LLLocalDictionary implements LLDictionary {
this.dbScheduler = dbScheduler;
this.snapshotResolver = snapshotResolver;
this.updateMode = updateMode;
this.getRangeMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeMulti";
this.getRangeKeysMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeKeysMulti";
alloc = allocator;
}
@ -1113,12 +1118,19 @@ public class LLLocalDictionary implements LLDictionary {
}
}
@SuppressWarnings("Convert2MethodRef")
private Flux<Entry<ByteBuf, ByteBuf>> getRangeMulti(LLSnapshot snapshot, LLRange range) {
try {
return Flux
.using(
() -> new LLLocalEntryReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)),
LLLocalReactiveRocksIterator::flux,
() -> new LLLocalEntryReactiveRocksIterator(db,
alloc,
cfh,
range.retain(),
resolveSnapshot(snapshot),
getRangeMultiDebugName
),
llLocalEntryReactiveRocksIterator -> llLocalEntryReactiveRocksIterator.flux(),
LLLocalReactiveRocksIterator::release
)
.doOnDiscard(Entry.class, entry -> {
@ -1135,6 +1147,7 @@ public class LLLocalDictionary implements LLDictionary {
}
}
@SuppressWarnings("Convert2MethodRef")
private Flux<List<Entry<ByteBuf, ByteBuf>>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range, int prefixLength) {
try {
return Flux
@ -1147,7 +1160,7 @@ public class LLLocalDictionary implements LLDictionary {
resolveSnapshot(snapshot),
"getRangeMultiGrouped"
),
LLLocalGroupedReactiveRocksIterator::flux,
llLocalGroupedEntryReactiveRocksIterator -> llLocalGroupedEntryReactiveRocksIterator.flux(),
LLLocalGroupedReactiveRocksIterator::release
)
.subscribeOn(dbScheduler)
@ -1245,12 +1258,19 @@ public class LLLocalDictionary implements LLDictionary {
}
}
@SuppressWarnings("Convert2MethodRef")
private Flux<ByteBuf> getRangeKeysMulti(LLSnapshot snapshot, LLRange range) {
try {
return Flux
.using(
() -> new LLLocalKeyReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)),
LLLocalReactiveRocksIterator::flux,
() -> new LLLocalKeyReactiveRocksIterator(db,
alloc,
cfh,
range.retain(),
resolveSnapshot(snapshot),
getRangeKeysMultiDebugName
),
llLocalKeyReactiveRocksIterator -> llLocalKeyReactiveRocksIterator.flux(),
LLLocalReactiveRocksIterator::release
)
.doOnDiscard(ByteBuf.class, ReferenceCounted::release)

View File

@ -15,8 +15,9 @@ public class LLLocalEntryReactiveRocksIterator extends LLLocalReactiveRocksItera
ByteBufAllocator alloc,
ColumnFamilyHandle cfh,
LLRange range,
ReadOptions readOptions) {
super(db, alloc, cfh, range, readOptions, true);
ReadOptions readOptions,
String debugName) {
super(db, alloc, cfh, range, readOptions, true, debugName);
}
@Override

View File

@ -13,8 +13,9 @@ public class LLLocalKeyReactiveRocksIterator extends LLLocalReactiveRocksIterato
ByteBufAllocator alloc,
ColumnFamilyHandle cfh,
LLRange range,
ReadOptions readOptions) {
super(db, alloc, cfh, range, readOptions, false);
ReadOptions readOptions,
String debugName) {
super(db, alloc, cfh, range, readOptions, false, debugName);
}
@Override

View File

@ -414,6 +414,7 @@ public class LLLocalKeyValueDatabase implements LLKeyValueDatabase {
db,
handles.get(Column.special(Column.toString(columnName))),
name,
Column.toString(columnName),
dbScheduler,
(snapshot) -> snapshotsHandles.get(snapshot.getSequenceNumber()),
updateMode

View File

@ -1,5 +1,7 @@
package it.cavallium.dbengine.database.disk;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.query.QueryParser;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
@ -24,7 +26,11 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
import java.io.IOException;
import java.nio.file.Path;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.time.temporal.TemporalUnit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
@ -36,6 +42,8 @@ import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@ -110,15 +118,15 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
*/
private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>();
private final boolean lowMemory;
private final TextFieldsSimilarity similarity;
private final Similarity similarity;
private final ScheduledTaskLifecycle scheduledTasksLifecycle;
private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter;
public LLLocalLuceneIndex(Path luceneBasePath,
String name,
TextFieldsAnalyzer analyzer,
TextFieldsSimilarity similarity,
IndicizerAnalyzers indicizerAnalyzers,
IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime,
Duration commitDebounceTime,
boolean lowMemory, boolean inMemory, @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException {
@ -159,9 +167,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
this.luceneIndexName = name;
this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
this.lowMemory = lowMemory;
this.similarity = similarity;
this.similarity = LuceneUtils.toPerFieldSimilarityWrapper(indicizerSimilarities);
this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter;
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer));
;
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.toPerFieldAnalyzerWrapper(indicizerAnalyzers));
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
indexWriterConfig.setIndexDeletionPolicy(snapshotter);
indexWriterConfig.setCommitOnClose(true);
@ -186,7 +195,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
}
private Similarity getSimilarity() {
return LuceneUtils.getSimilarity(similarity);
return similarity;
}
private void registerScheduledFixedTask(Runnable task, Duration duration) {
@ -269,11 +278,12 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
}
@Override
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) {
public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
return documents
.flatMap(documentsMap -> Mono
.collectList()
.flatMap(documentsList -> Mono
.<Void>fromCallable(() -> {
indexWriter.addDocuments(LLUtils.toDocuments(documentsMap.values()));
indexWriter.addDocuments(LLUtils.toDocumentsFromEntries(documentsList));
return null;
})
.subscribeOn(Schedulers.boundedElastic())

View File

@ -3,6 +3,8 @@ package it.cavallium.dbengine.database.disk;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader.InvalidCacheLoadException;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.database.LLDocument;
import it.cavallium.dbengine.database.LLLuceneIndex;
@ -14,12 +16,15 @@ import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;
import java.io.IOException;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
@ -55,8 +60,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
public LLLocalMultiLuceneIndex(Path lucene,
String name,
int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer,
TextFieldsSimilarity textFieldsSimilarity,
IndicizerAnalyzers indicizerAnalyzers,
IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime,
Duration commitDebounceTime,
boolean lowMemory, boolean inMemory) throws IOException {
@ -76,8 +81,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
}
luceneIndices[i] = new LLLocalLuceneIndex(lucene,
instanceName,
textFieldsAnalyzer,
textFieldsSimilarity,
indicizerAnalyzers,
indicizerSimilarities,
queryRefreshDebounceTime,
commitDebounceTime,
lowMemory, inMemory, (indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI,
@ -168,21 +173,37 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
return getLuceneIndex(id).addDocument(id, doc);
}
@SuppressWarnings({"unchecked"})
@Override
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) {
public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
return documents
.flatMapMany(map -> {
var sortedMap = new HashMap<LLLocalLuceneIndex, Map<LLTerm, LLDocument>>();
map.forEach((key, value) -> sortedMap
.computeIfAbsent(getLuceneIndex(key), _unused -> new HashMap<>())
.put(key, value)
);
return Flux.fromIterable(sortedMap.entrySet());
})
.flatMap(luceneIndexWithNewDocuments -> {
var luceneIndex = luceneIndexWithNewDocuments.getKey();
var docs = luceneIndexWithNewDocuments.getValue();
return luceneIndex.addDocuments(Mono.just(docs));
.bufferTimeout(512, Duration.ofSeconds(2))
.flatMap(inputEntries -> {
List<Entry<LLTerm, LLDocument>>[] sortedEntries = new List[luceneIndices.length];
Mono<Void>[] results = new Mono[luceneIndices.length];
// Sort entries
for(var inputEntry : inputEntries) {
int luceneIndexId = getLuceneIndexId(inputEntry.getKey());
if (sortedEntries[luceneIndexId] == null) {
sortedEntries[luceneIndexId] = new ArrayList<>();
}
sortedEntries[luceneIndexId].add(inputEntry);
}
// Add documents
int luceneIndexId = 0;
for (List<Entry<LLTerm, LLDocument>> docs : sortedEntries) {
if (docs != null && !docs.isEmpty()) {
LLLocalLuceneIndex luceneIndex = luceneIndices[luceneIndexId];
results[luceneIndexId] = luceneIndex.addDocuments(Flux.fromIterable(docs));
} else {
results[luceneIndexId] = Mono.empty();
}
luceneIndexId++;
}
return Mono.when(results);
})
.then();
}

View File

@ -7,6 +7,7 @@ import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.database.LLRange;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
import java.util.concurrent.atomic.AtomicInteger;
import org.jetbrains.annotations.NotNull;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.ReadOptions;
@ -17,6 +18,7 @@ import reactor.core.publisher.Flux;
import reactor.util.function.Tuple3;
import static io.netty.buffer.Unpooled.*;
import static it.cavallium.dbengine.database.disk.LLLocalDictionary.logger;
public abstract class LLLocalReactiveRocksIterator<T> {
@ -26,19 +28,22 @@ public abstract class LLLocalReactiveRocksIterator<T> {
private final LLRange range;
private final ReadOptions readOptions;
private final boolean readValues;
private final String debugName;
public LLLocalReactiveRocksIterator(RocksDB db,
ByteBufAllocator alloc,
ColumnFamilyHandle cfh,
LLRange range,
ReadOptions readOptions,
boolean readValues) {
boolean readValues,
String debugName) {
this.db = db;
this.alloc = alloc;
this.cfh = cfh;
this.range = range;
this.readOptions = readOptions;
this.readValues = readValues;
this.debugName = debugName;
}
public Flux<T> flux() {
@ -46,7 +51,7 @@ public abstract class LLLocalReactiveRocksIterator<T> {
.<T, @NotNull Tuple3<RocksIterator, ReleasableSlice, ReleasableSlice>>generate(() -> {
var readOptions = new ReadOptions(this.readOptions);
if (!range.hasMin() || !range.hasMax()) {
readOptions.setReadaheadSize(2 * 1024 * 1024);
readOptions.setReadaheadSize(32 * 1024); // 32KiB
readOptions.setFillCache(false);
}
return getRocksIterator(readOptions, range.retain(), db, cfh);

View File

@ -1,6 +1,10 @@
package it.cavallium.dbengine.lucene;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import it.cavallium.dbengine.client.CompositeSnapshot;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.MultiSort;
import it.cavallium.dbengine.client.SearchResult;
import it.cavallium.dbengine.client.SearchResultItem;
@ -8,6 +12,7 @@ import it.cavallium.dbengine.client.SearchResultKey;
import it.cavallium.dbengine.client.SearchResultKeys;
import it.cavallium.dbengine.database.LLKeyScore;
import it.cavallium.dbengine.database.LLSearchResultShard;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
@ -20,6 +25,7 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer;
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@ -28,13 +34,16 @@ import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.jetbrains.annotations.Nullable;
import org.novasearch.lucene.search.similarities.BM25Similarity;
@ -56,10 +65,11 @@ public class LuceneUtils {
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false);
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
@ -78,78 +88,51 @@ public class LuceneUtils {
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity();
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
@SuppressWarnings("DuplicatedCode")
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
switch (analyzer) {
case N4GramPartialWords:
return lucene4GramWordsAnalyzerInstance;
case N4GramPartialString:
return lucene4GramStringAnalyzerInstance;
case N4GramPartialWordsEdge:
return lucene4GramWordsAnalyzerEdgeInstance;
case N4GramPartialStringEdge:
return lucene4GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords:
return lucene3To5GramWordsAnalyzerInstance;
case N3To5GramPartialString:
return lucene3To5GramStringAnalyzerInstance;
case N3To5GramPartialWordsEdge:
return lucene3To5GramWordsAnalyzerEdgeInstance;
case N3To5GramPartialStringEdge:
return lucene3To5GramStringAnalyzerEdgeInstance;
case Standard:
return luceneStandardAnalyzerInstance;
case FullText:
return luceneWordAnalyzerStopWordsAndStemInstance;
case WordWithStopwordsStripping:
return luceneWordAnalyzerStopWordsInstance;
case WordWithStemming:
return luceneWordAnalyzerStemInstance;
case WordSimple:
return luceneWordAnalyzerSimpleInstance;
default:
throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
}
return switch (analyzer) {
case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
case Standard -> luceneStandardAnalyzerInstance;
case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
case WordWithStemming -> luceneWordAnalyzerStemInstance;
case WordSimple -> luceneWordAnalyzerSimpleInstance;
case ICUCollationKey -> luceneICUCollationKeyInstance;
//noinspection UnnecessaryDefault
default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
};
}
@SuppressWarnings("DuplicatedCode")
public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
switch (similarity) {
case BM25Classic:
return luceneBM25ClassicSimilarityInstance;
case NGramBM25Classic:
return luceneBM25ClassicNGramSimilarityInstance;
case BM25L:
return luceneBM25LSimilarityInstance;
case NGramBM25L:
return luceneBM25LNGramSimilarityInstance;
case Classic:
return luceneClassicSimilarityInstance;
case NGramClassic:
return luceneClassicNGramSimilarityInstance;
case BM25Plus:
return luceneBM25PlusSimilarityInstance;
case NGramBM25Plus:
return luceneBM25PlusNGramSimilarityInstance;
case BM15Plus:
return luceneBM15PlusSimilarityInstance;
case NGramBM15Plus:
return luceneBM15PlusNGramSimilarityInstance;
case BM11Plus:
return luceneBM11PlusSimilarityInstance;
case NGramBM11Plus:
return luceneBM11PlusNGramSimilarityInstance;
case LTC:
return luceneLTCSimilarityInstance;
case LDP:
return luceneLDPSimilarityInstance;
case LDPNoLength:
return luceneLDPNoLengthSimilarityInstance;
case Robertson:
return luceneRobertsonSimilarityInstance;
case Boolean:
return luceneBooleanSimilarityInstance;
default:
throw new IllegalStateException("Unknown similarity: " + similarity);
}
return switch (similarity) {
case BM25Classic -> luceneBM25ClassicSimilarityInstance;
case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
case BM25L -> luceneBM25LSimilarityInstance;
case NGramBM25L -> luceneBM25LNGramSimilarityInstance;
case Classic -> luceneClassicSimilarityInstance;
case NGramClassic -> luceneClassicNGramSimilarityInstance;
case BM25Plus -> luceneBM25PlusSimilarityInstance;
case NGramBM25Plus -> luceneBM25PlusNGramSimilarityInstance;
case BM15Plus -> luceneBM15PlusSimilarityInstance;
case NGramBM15Plus -> luceneBM15PlusNGramSimilarityInstance;
case BM11Plus -> luceneBM11PlusSimilarityInstance;
case NGramBM11Plus -> luceneBM11PlusNGramSimilarityInstance;
case LTC -> luceneLTCSimilarityInstance;
case LDP -> luceneLDPSimilarityInstance;
case LDPNoLength -> luceneLDPNoLengthSimilarityInstance;
case Robertson -> luceneRobertsonSimilarityInstance;
case Boolean -> luceneBooleanSimilarityInstance;
//noinspection UnnecessaryDefault
default -> throw new IllegalStateException("Unknown similarity: " + similarity);
};
}
/**
@ -285,4 +268,27 @@ public class LuceneUtils {
.at(snapshot, entry.getKey())
.flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release));
}
/**
 * Builds a Lucene {@link PerFieldAnalyzerWrapper} from the given indicizer analyzer
 * configuration: every per-field override is resolved to its concrete {@link Analyzer},
 * and the resolved default analyzer handles all remaining fields.
 *
 * @param indicizerAnalyzers the default + per-field analyzer configuration
 * @return a wrapper dispatching each field to its configured analyzer
 */
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper(IndicizerAnalyzers indicizerAnalyzers) {
	var fieldAnalyzers = new HashMap<String, Analyzer>();
	for (var override : indicizerAnalyzers.fieldAnalyzer().entrySet()) {
		fieldAnalyzers.put(override.getKey(), getAnalyzer(override.getValue()));
	}
	var fallbackAnalyzer = getAnalyzer(indicizerAnalyzers.defaultAnalyzer());
	return new PerFieldAnalyzerWrapper(fallbackAnalyzer, fieldAnalyzers);
}
/**
 * Builds a Lucene {@link PerFieldSimilarityWrapper} from the given indicizer similarity
 * configuration. Fields with an explicit override use their resolved {@link Similarity};
 * every other field falls back to the resolved default similarity.
 *
 * @param indicizerSimilarities the default + per-field similarity configuration
 * @return a wrapper dispatching each field name to its configured similarity
 */
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper(IndicizerSimilarities indicizerSimilarities) {
	var fieldSimilarities = new HashMap<String, Similarity>();
	for (var override : indicizerSimilarities.fieldSimilarity().entrySet()) {
		fieldSimilarities.put(override.getKey(), getSimilarity(override.getValue()));
	}
	var fallbackSimilarity = getSimilarity(indicizerSimilarities.defaultSimilarity());
	return new PerFieldSimilarityWrapper() {
		@Override
		public Similarity get(String name) {
			return fieldSimilarities.getOrDefault(name, fallbackSimilarity);
		}
	};
}
}

View File

@ -11,6 +11,7 @@ public enum TextFieldsAnalyzer {
N3To5GramPartialStringEdge,
Standard,
WordSimple,
ICUCollationKey,
WordWithStopwordsStripping,
WordWithStemming,
FullText,

View File

@ -1,31 +1,47 @@
package it.cavallium.dbengine.lucene.analyzer;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import it.cavallium.dbengine.database.EnglishItalianStopFilter;
import it.cavallium.dbengine.lucene.LuceneUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
public class WordAnalyzer extends Analyzer {
private final boolean icu;
private final boolean removeStopWords;
private final boolean stem;
public WordAnalyzer(boolean removeStopWords, boolean stem) {
/**
 * Creates a word analyzer.
 *
 * @param icu             whether to tokenize with an ICU collation attribute factory
 *                        instead of the plain standard tokenizer; when true the common
 *                        token filter chain appears to be skipped — confirm against
 *                        {@code createComponents}
 * @param removeStopWords whether to strip stop words via {@code EnglishItalianStopFilter}
 * @param stem            whether to enable the stemming-related filtering path
 */
public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) {
this.icu = icu;
this.removeStopWords = removeStopWords;
this.stem = stem;
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer tokenizer = new StandardTokenizer();
Tokenizer tokenizer;
if (icu) {
tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
} else {
tokenizer = new StandardTokenizer();
}
TokenStream tokenStream = tokenizer;
//tokenStream = new LengthFilter(tokenStream, 1, 100);
if (stem) {
tokenStream = new LengthFilter(tokenStream, 1, 120);
}
if (!icu) {
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
}
if (removeStopWords) {
tokenStream = new EnglishItalianStopFilter(tokenStream);
}
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
return new TokenStreamComponents(tokenizer, tokenStream);
}