This commit is contained in:
Andrea Cavalli 2021-05-28 16:04:59 +02:00
parent 4e76073259
commit 6eb531e4f1
19 changed files with 277 additions and 127 deletions

View File

@ -197,6 +197,11 @@
<artifactId>lucene-analysis-common</artifactId> <artifactId>lucene-analysis-common</artifactId>
<version>9.0.0-SNAPSHOT</version> <version>9.0.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-icu</artifactId>
<version>9.0.0-SNAPSHOT</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.lucene</groupId> <groupId>org.apache.lucene</groupId>
<artifactId>lucene-codecs</artifactId> <artifactId>lucene-codecs</artifactId>
@ -245,7 +250,7 @@
<dependency> <dependency>
<groupId>it.cavallium</groupId> <groupId>it.cavallium</groupId>
<artifactId>data-generator</artifactId> <artifactId>data-generator</artifactId>
<version>[0.9.26,)</version> <version>0.9.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>io.soabase.record-builder</groupId> <groupId>io.soabase.record-builder</groupId>

View File

@ -2,7 +2,10 @@ package it.cavallium.dbengine.client;
import it.cavallium.dbengine.database.LLDocument; import it.cavallium.dbengine.database.LLDocument;
import it.cavallium.dbengine.database.LLTerm; import it.cavallium.dbengine.database.LLTerm;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import reactor.core.publisher.Flux; import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono; import reactor.core.publisher.Mono;
@ -18,6 +21,10 @@ public abstract class Indicizer<T, U> {
public abstract @NotNull T getKey(String key); public abstract @NotNull T getKey(String key);
public abstract IndicizerAnalyzers getPerFieldAnalyzer();
public abstract IndicizerSimilarities getPerFieldSimilarity();
public Flux<Tuple2<String, Set<String>>> getMoreLikeThisDocumentFields(T key, U value) { public Flux<Tuple2<String, Set<String>>> getMoreLikeThisDocumentFields(T key, U value) {
return Flux.empty(); return Flux.empty();
} }

View File

@ -0,0 +1,19 @@
package it.cavallium.dbengine.client;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import java.util.Map;
/**
 * Pairs a default {@link TextFieldsAnalyzer} with a map of analyzers keyed by string
 * (presumably the field name — confirm against the code that consumes this record).
 *
 * @param defaultAnalyzer the analyzer stored as the default
 * @param fieldAnalyzer the keyed analyzers; the map is stored as given, without copying
 */
public record IndicizerAnalyzers(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {

	/**
	 * Creates a configuration with {@link TextFieldsAnalyzer#FullText} as the default
	 * and an empty map of keyed analyzers.
	 */
	public static IndicizerAnalyzers of() {
		return new IndicizerAnalyzers(TextFieldsAnalyzer.FullText, Map.of());
	}

	/**
	 * Creates a configuration with the given default and an empty map of keyed analyzers.
	 */
	public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer) {
		return new IndicizerAnalyzers(defaultAnalyzer, Map.of());
	}

	/**
	 * Creates a configuration from the given default and the given keyed analyzers.
	 */
	public static IndicizerAnalyzers of(TextFieldsAnalyzer defaultAnalyzer, Map<String, TextFieldsAnalyzer> fieldAnalyzer) {
		return new IndicizerAnalyzers(defaultAnalyzer, fieldAnalyzer);
	}
}

View File

@ -0,0 +1,20 @@
package it.cavallium.dbengine.client;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import java.util.Map;
/**
 * Pairs a default {@link TextFieldsSimilarity} with a map of similarities keyed by string
 * (presumably the field name — confirm against the code that consumes this record).
 *
 * @param defaultSimilarity the similarity stored as the default
 * @param fieldSimilarity the keyed similarities; the map is stored as given, without copying
 */
public record IndicizerSimilarities(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {

	/**
	 * Creates a configuration with {@link TextFieldsSimilarity#BM25Plus} as the default
	 * and an empty map of keyed similarities.
	 */
	public static IndicizerSimilarities of() {
		return new IndicizerSimilarities(TextFieldsSimilarity.BM25Plus, Map.of());
	}

	/**
	 * Creates a configuration with the given default and an empty map of keyed similarities.
	 */
	public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity) {
		return new IndicizerSimilarities(defaultSimilarity, Map.of());
	}

	/**
	 * Creates a configuration from the given default and the given keyed similarities.
	 */
	public static IndicizerSimilarities of(TextFieldsSimilarity defaultSimilarity, Map<String, TextFieldsSimilarity> fieldSimilarity) {
		return new IndicizerSimilarities(defaultSimilarity, fieldSimilarity);
	}
}

View File

@ -51,7 +51,6 @@ public class LuceneIndexImpl<T, U> implements LuceneIndex<T, U> {
.flatMap(entry -> indicizer .flatMap(entry -> indicizer
.toDocument(entry.getKey(), entry.getValue()) .toDocument(entry.getKey(), entry.getValue())
.map(doc -> Map.entry(indicizer.toIndex(entry.getKey()), doc))) .map(doc -> Map.entry(indicizer.toIndex(entry.getKey()), doc)))
.collectMap(Entry::getKey, Entry::getValue)
); );
} }

View File

@ -1,6 +1,8 @@
package it.cavallium.dbengine.database; package it.cavallium.dbengine.database;
import io.netty.buffer.ByteBufAllocator; import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer; import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity; import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import java.time.Duration; import java.time.Duration;
@ -18,8 +20,8 @@ public interface LLDatabaseConnection {
Mono<? extends LLLuceneIndex> getLuceneIndex(String name, Mono<? extends LLLuceneIndex> getLuceneIndex(String name,
int instancesCount, int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer, IndicizerAnalyzers indicizerAnalyzers,
TextFieldsSimilarity textFieldsSimilarity, IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime, Duration queryRefreshDebounceTime,
Duration commitDebounceTime, Duration commitDebounceTime,
boolean lowMemory, boolean lowMemory,

View File

@ -6,6 +6,7 @@ import it.cavallium.dbengine.client.query.current.data.Query;
import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.client.query.current.data.ScoreMode; import it.cavallium.dbengine.client.query.current.data.ScoreMode;
import it.cavallium.dbengine.lucene.LuceneUtils; import it.cavallium.dbengine.lucene.LuceneUtils;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
@ -21,7 +22,7 @@ public interface LLLuceneIndex extends LLSnapshottable {
Mono<Void> addDocument(LLTerm id, LLDocument doc); Mono<Void> addDocument(LLTerm id, LLDocument doc);
Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents); Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents);
Mono<Void> deleteDocument(LLTerm id); Mono<Void> deleteDocument(LLTerm id);

View File

@ -8,15 +8,20 @@ import io.netty.buffer.ByteBufUtil;
import io.netty.buffer.CompositeByteBuf; import io.netty.buffer.CompositeByteBuf;
import io.netty.buffer.Unpooled; import io.netty.buffer.Unpooled;
import io.netty.util.IllegalReferenceCountException; import io.netty.util.IllegalReferenceCountException;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.lucene.RandomSortField; import it.cavallium.dbengine.lucene.RandomSortField;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map.Entry;
import java.util.Objects; import java.util.Objects;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.ToIntFunction; import java.util.function.ToIntFunction;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint; import org.apache.lucene.document.FloatPoint;
@ -115,16 +120,24 @@ public class LLUtils {
return d; return d;
} }
public static Iterable<Document> toDocuments(Iterable<LLDocument> document) { public static Collection<Document> toDocuments(Collection<LLDocument> document) {
List<Document> d = new LinkedList<>(); List<Document> d = new ArrayList<>(document.size());
for (LLDocument doc : document) { for (LLDocument doc : document) {
d.add(LLUtils.toDocument(doc)); d.add(LLUtils.toDocument(doc));
} }
return d; return d;
} }
/**
 * Converts the value of every entry with {@link LLUtils#toDocument} and collects the
 * results in the iteration order of the input. The entry keys are not read.
 *
 * @param documentsList the entries whose values are converted
 * @return a new list, pre-sized to the input size, holding one document per entry
 */
public static Collection<Document> toDocumentsFromEntries(Collection<Entry<LLTerm, LLDocument>> documentsList) {
	List<Document> converted = new ArrayList<>(documentsList.size());
	documentsList.forEach(termAndDocument -> converted.add(LLUtils.toDocument(termAndDocument.getValue())));
	return converted;
}
public static Iterable<Term> toTerms(Iterable<LLTerm> terms) { public static Iterable<Term> toTerms(Iterable<LLTerm> terms) {
List<Term> d = new LinkedList<>(); List<Term> d = new ArrayList<>();
for (LLTerm term : terms) { for (LLTerm term : terms) {
d.add(LLUtils.toTerm(term)); d.add(LLUtils.toTerm(term));
} }

View File

@ -1,6 +1,8 @@
package it.cavallium.dbengine.database.disk; package it.cavallium.dbengine.database.disk;
import io.netty.buffer.ByteBufAllocator; import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.database.Column; import it.cavallium.dbengine.database.Column;
import it.cavallium.dbengine.database.LLDatabaseConnection; import it.cavallium.dbengine.database.LLDatabaseConnection;
import it.cavallium.dbengine.database.LLLuceneIndex; import it.cavallium.dbengine.database.LLLuceneIndex;
@ -70,8 +72,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
@Override @Override
public Mono<LLLuceneIndex> getLuceneIndex(String name, public Mono<LLLuceneIndex> getLuceneIndex(String name,
int instancesCount, int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer, IndicizerAnalyzers indicizerAnalyzers,
TextFieldsSimilarity textFieldsSimilarity, IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime, Duration queryRefreshDebounceTime,
Duration commitDebounceTime, Duration commitDebounceTime,
boolean lowMemory, boolean lowMemory,
@ -82,8 +84,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
return new LLLocalMultiLuceneIndex(basePath.resolve("lucene"), return new LLLocalMultiLuceneIndex(basePath.resolve("lucene"),
name, name,
instancesCount, instancesCount,
textFieldsAnalyzer, indicizerAnalyzers,
textFieldsSimilarity, indicizerSimilarities,
queryRefreshDebounceTime, queryRefreshDebounceTime,
commitDebounceTime, commitDebounceTime,
lowMemory, lowMemory,
@ -92,8 +94,8 @@ public class LLLocalDatabaseConnection implements LLDatabaseConnection {
} else { } else {
return new LLLocalLuceneIndex(basePath.resolve("lucene"), return new LLLocalLuceneIndex(basePath.resolve("lucene"),
name, name,
textFieldsAnalyzer, indicizerAnalyzers,
textFieldsSimilarity, indicizerSimilarities,
queryRefreshDebounceTime, queryRefreshDebounceTime,
commitDebounceTime, commitDebounceTime,
lowMemory, lowMemory,

View File

@ -122,12 +122,15 @@ public class LLLocalDictionary implements LLDictionary {
private final Striped<StampedLock> itemsLock = Striped.readWriteStampedLock(STRIPES); private final Striped<StampedLock> itemsLock = Striped.readWriteStampedLock(STRIPES);
private final UpdateMode updateMode; private final UpdateMode updateMode;
private final ByteBufAllocator alloc; private final ByteBufAllocator alloc;
private final String getRangeMultiDebugName;
private final String getRangeKeysMultiDebugName;
public LLLocalDictionary( public LLLocalDictionary(
ByteBufAllocator allocator, ByteBufAllocator allocator,
@NotNull RocksDB db, @NotNull RocksDB db,
@NotNull ColumnFamilyHandle columnFamilyHandle, @NotNull ColumnFamilyHandle columnFamilyHandle,
String databaseName, String databaseName,
String columnDisplayName,
Scheduler dbScheduler, Scheduler dbScheduler,
Function<LLSnapshot, Snapshot> snapshotResolver, Function<LLSnapshot, Snapshot> snapshotResolver,
UpdateMode updateMode) { UpdateMode updateMode) {
@ -139,6 +142,8 @@ public class LLLocalDictionary implements LLDictionary {
this.dbScheduler = dbScheduler; this.dbScheduler = dbScheduler;
this.snapshotResolver = snapshotResolver; this.snapshotResolver = snapshotResolver;
this.updateMode = updateMode; this.updateMode = updateMode;
this.getRangeMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeMulti";
this.getRangeKeysMultiDebugName = databaseName + "(" + columnDisplayName + ")" + "::getRangeKeysMulti";
alloc = allocator; alloc = allocator;
} }
@ -1113,12 +1118,19 @@ public class LLLocalDictionary implements LLDictionary {
} }
} }
@SuppressWarnings("Convert2MethodRef")
private Flux<Entry<ByteBuf, ByteBuf>> getRangeMulti(LLSnapshot snapshot, LLRange range) { private Flux<Entry<ByteBuf, ByteBuf>> getRangeMulti(LLSnapshot snapshot, LLRange range) {
try { try {
return Flux return Flux
.using( .using(
() -> new LLLocalEntryReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)), () -> new LLLocalEntryReactiveRocksIterator(db,
LLLocalReactiveRocksIterator::flux, alloc,
cfh,
range.retain(),
resolveSnapshot(snapshot),
getRangeMultiDebugName
),
llLocalEntryReactiveRocksIterator -> llLocalEntryReactiveRocksIterator.flux(),
LLLocalReactiveRocksIterator::release LLLocalReactiveRocksIterator::release
) )
.doOnDiscard(Entry.class, entry -> { .doOnDiscard(Entry.class, entry -> {
@ -1135,6 +1147,7 @@ public class LLLocalDictionary implements LLDictionary {
} }
} }
@SuppressWarnings("Convert2MethodRef")
private Flux<List<Entry<ByteBuf, ByteBuf>>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range, int prefixLength) { private Flux<List<Entry<ByteBuf, ByteBuf>>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range, int prefixLength) {
try { try {
return Flux return Flux
@ -1147,7 +1160,7 @@ public class LLLocalDictionary implements LLDictionary {
resolveSnapshot(snapshot), resolveSnapshot(snapshot),
"getRangeMultiGrouped" "getRangeMultiGrouped"
), ),
LLLocalGroupedReactiveRocksIterator::flux, llLocalGroupedEntryReactiveRocksIterator -> llLocalGroupedEntryReactiveRocksIterator.flux(),
LLLocalGroupedReactiveRocksIterator::release LLLocalGroupedReactiveRocksIterator::release
) )
.subscribeOn(dbScheduler) .subscribeOn(dbScheduler)
@ -1245,12 +1258,19 @@ public class LLLocalDictionary implements LLDictionary {
} }
} }
@SuppressWarnings("Convert2MethodRef")
private Flux<ByteBuf> getRangeKeysMulti(LLSnapshot snapshot, LLRange range) { private Flux<ByteBuf> getRangeKeysMulti(LLSnapshot snapshot, LLRange range) {
try { try {
return Flux return Flux
.using( .using(
() -> new LLLocalKeyReactiveRocksIterator(db, alloc, cfh, range.retain(), resolveSnapshot(snapshot)), () -> new LLLocalKeyReactiveRocksIterator(db,
LLLocalReactiveRocksIterator::flux, alloc,
cfh,
range.retain(),
resolveSnapshot(snapshot),
getRangeKeysMultiDebugName
),
llLocalKeyReactiveRocksIterator -> llLocalKeyReactiveRocksIterator.flux(),
LLLocalReactiveRocksIterator::release LLLocalReactiveRocksIterator::release
) )
.doOnDiscard(ByteBuf.class, ReferenceCounted::release) .doOnDiscard(ByteBuf.class, ReferenceCounted::release)

View File

@ -15,8 +15,9 @@ public class LLLocalEntryReactiveRocksIterator extends LLLocalReactiveRocksItera
ByteBufAllocator alloc, ByteBufAllocator alloc,
ColumnFamilyHandle cfh, ColumnFamilyHandle cfh,
LLRange range, LLRange range,
ReadOptions readOptions) { ReadOptions readOptions,
super(db, alloc, cfh, range, readOptions, true); String debugName) {
super(db, alloc, cfh, range, readOptions, true, debugName);
} }
@Override @Override

View File

@ -13,8 +13,9 @@ public class LLLocalKeyReactiveRocksIterator extends LLLocalReactiveRocksIterato
ByteBufAllocator alloc, ByteBufAllocator alloc,
ColumnFamilyHandle cfh, ColumnFamilyHandle cfh,
LLRange range, LLRange range,
ReadOptions readOptions) { ReadOptions readOptions,
super(db, alloc, cfh, range, readOptions, false); String debugName) {
super(db, alloc, cfh, range, readOptions, false, debugName);
} }
@Override @Override

View File

@ -414,6 +414,7 @@ public class LLLocalKeyValueDatabase implements LLKeyValueDatabase {
db, db,
handles.get(Column.special(Column.toString(columnName))), handles.get(Column.special(Column.toString(columnName))),
name, name,
Column.toString(columnName),
dbScheduler, dbScheduler,
(snapshot) -> snapshotsHandles.get(snapshot.getSequenceNumber()), (snapshot) -> snapshotsHandles.get(snapshot.getSequenceNumber()),
updateMode updateMode

View File

@ -1,5 +1,7 @@
package it.cavallium.dbengine.database.disk; package it.cavallium.dbengine.database.disk;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.query.QueryParser; import it.cavallium.dbengine.client.query.QueryParser;
import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.database.EnglishItalianStopFilter; import it.cavallium.dbengine.database.EnglishItalianStopFilter;
@ -24,7 +26,11 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.Duration; import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.time.temporal.TemporalUnit;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Objects; import java.util.Objects;
@ -36,6 +42,8 @@ import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
@ -110,15 +118,15 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
*/ */
private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>(); private final ConcurrentHashMap<Long, LuceneIndexSnapshot> snapshots = new ConcurrentHashMap<>();
private final boolean lowMemory; private final boolean lowMemory;
private final TextFieldsSimilarity similarity; private final Similarity similarity;
private final ScheduledTaskLifecycle scheduledTasksLifecycle; private final ScheduledTaskLifecycle scheduledTasksLifecycle;
private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter; private final @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter;
public LLLocalLuceneIndex(Path luceneBasePath, public LLLocalLuceneIndex(Path luceneBasePath,
String name, String name,
TextFieldsAnalyzer analyzer, IndicizerAnalyzers indicizerAnalyzers,
TextFieldsSimilarity similarity, IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime, Duration queryRefreshDebounceTime,
Duration commitDebounceTime, Duration commitDebounceTime,
boolean lowMemory, boolean inMemory, @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException { boolean lowMemory, boolean inMemory, @Nullable LLSearchCollectionStatisticsGetter distributedCollectionStatisticsGetter) throws IOException {
@ -159,9 +167,10 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
this.luceneIndexName = name; this.luceneIndexName = name;
this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()); this.snapshotter = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
this.lowMemory = lowMemory; this.lowMemory = lowMemory;
this.similarity = similarity; this.similarity = LuceneUtils.toPerFieldSimilarityWrapper(indicizerSimilarities);
this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter; this.distributedCollectionStatisticsGetter = distributedCollectionStatisticsGetter;
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.getAnalyzer(analyzer)); ;
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneUtils.toPerFieldAnalyzerWrapper(indicizerAnalyzers));
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
indexWriterConfig.setIndexDeletionPolicy(snapshotter); indexWriterConfig.setIndexDeletionPolicy(snapshotter);
indexWriterConfig.setCommitOnClose(true); indexWriterConfig.setCommitOnClose(true);
@ -186,7 +195,7 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
} }
private Similarity getSimilarity() { private Similarity getSimilarity() {
return LuceneUtils.getSimilarity(similarity); return similarity;
} }
private void registerScheduledFixedTask(Runnable task, Duration duration) { private void registerScheduledFixedTask(Runnable task, Duration duration) {
@ -269,11 +278,12 @@ public class LLLocalLuceneIndex implements LLLuceneIndex {
} }
@Override @Override
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) { public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
return documents return documents
.flatMap(documentsMap -> Mono .collectList()
.flatMap(documentsList -> Mono
.<Void>fromCallable(() -> { .<Void>fromCallable(() -> {
indexWriter.addDocuments(LLUtils.toDocuments(documentsMap.values())); indexWriter.addDocuments(LLUtils.toDocumentsFromEntries(documentsList));
return null; return null;
}) })
.subscribeOn(Schedulers.boundedElastic()) .subscribeOn(Schedulers.boundedElastic())

View File

@ -3,6 +3,8 @@ package it.cavallium.dbengine.database.disk;
import com.google.common.cache.Cache; import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader.InvalidCacheLoadException; import com.google.common.cache.CacheLoader.InvalidCacheLoadException;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.query.current.data.QueryParams; import it.cavallium.dbengine.client.query.current.data.QueryParams;
import it.cavallium.dbengine.database.LLDocument; import it.cavallium.dbengine.database.LLDocument;
import it.cavallium.dbengine.database.LLLuceneIndex; import it.cavallium.dbengine.database.LLLuceneIndex;
@ -14,12 +16,15 @@ import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity; import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
import it.unimi.dsi.fastutil.longs.Long2ObjectMap; import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.Duration; import java.time.Duration;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
@ -55,8 +60,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
public LLLocalMultiLuceneIndex(Path lucene, public LLLocalMultiLuceneIndex(Path lucene,
String name, String name,
int instancesCount, int instancesCount,
TextFieldsAnalyzer textFieldsAnalyzer, IndicizerAnalyzers indicizerAnalyzers,
TextFieldsSimilarity textFieldsSimilarity, IndicizerSimilarities indicizerSimilarities,
Duration queryRefreshDebounceTime, Duration queryRefreshDebounceTime,
Duration commitDebounceTime, Duration commitDebounceTime,
boolean lowMemory, boolean inMemory) throws IOException { boolean lowMemory, boolean inMemory) throws IOException {
@ -76,8 +81,8 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
} }
luceneIndices[i] = new LLLocalLuceneIndex(lucene, luceneIndices[i] = new LLLocalLuceneIndex(lucene,
instanceName, instanceName,
textFieldsAnalyzer, indicizerAnalyzers,
textFieldsSimilarity, indicizerSimilarities,
queryRefreshDebounceTime, queryRefreshDebounceTime,
commitDebounceTime, commitDebounceTime,
lowMemory, inMemory, (indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI, lowMemory, inMemory, (indexSearcher, field, distributedPre, actionId) -> distributedCustomCollectionStatistics(finalI,
@ -168,21 +173,37 @@ public class LLLocalMultiLuceneIndex implements LLLuceneIndex {
return getLuceneIndex(id).addDocument(id, doc); return getLuceneIndex(id).addDocument(id, doc);
} }
@SuppressWarnings({"unchecked"})
@Override @Override
public Mono<Void> addDocuments(Mono<Map<LLTerm, LLDocument>> documents) { public Mono<Void> addDocuments(Flux<Entry<LLTerm, LLDocument>> documents) {
return documents return documents
.flatMapMany(map -> { .bufferTimeout(512, Duration.ofSeconds(2))
var sortedMap = new HashMap<LLLocalLuceneIndex, Map<LLTerm, LLDocument>>(); .flatMap(inputEntries -> {
map.forEach((key, value) -> sortedMap List<Entry<LLTerm, LLDocument>>[] sortedEntries = new List[luceneIndices.length];
.computeIfAbsent(getLuceneIndex(key), _unused -> new HashMap<>()) Mono<Void>[] results = new Mono[luceneIndices.length];
.put(key, value)
); // Sort entries
return Flux.fromIterable(sortedMap.entrySet()); for(var inputEntry : inputEntries) {
}) int luceneIndexId = getLuceneIndexId(inputEntry.getKey());
.flatMap(luceneIndexWithNewDocuments -> { if (sortedEntries[luceneIndexId] == null) {
var luceneIndex = luceneIndexWithNewDocuments.getKey(); sortedEntries[luceneIndexId] = new ArrayList<>();
var docs = luceneIndexWithNewDocuments.getValue(); }
return luceneIndex.addDocuments(Mono.just(docs)); sortedEntries[luceneIndexId].add(inputEntry);
}
// Add documents
int luceneIndexId = 0;
for (List<Entry<LLTerm, LLDocument>> docs : sortedEntries) {
if (docs != null && !docs.isEmpty()) {
LLLocalLuceneIndex luceneIndex = luceneIndices[luceneIndexId];
results[luceneIndexId] = luceneIndex.addDocuments(Flux.fromIterable(docs));
} else {
results[luceneIndexId] = Mono.empty();
}
luceneIndexId++;
}
return Mono.when(results);
}) })
.then(); .then();
} }

View File

@ -7,6 +7,7 @@ import io.netty.buffer.ByteBufAllocator;
import it.cavallium.dbengine.database.LLRange; import it.cavallium.dbengine.database.LLRange;
import it.cavallium.dbengine.database.LLUtils; import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep; import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
import java.util.concurrent.atomic.AtomicInteger;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.ReadOptions; import org.rocksdb.ReadOptions;
@ -17,6 +18,7 @@ import reactor.core.publisher.Flux;
import reactor.util.function.Tuple3; import reactor.util.function.Tuple3;
import static io.netty.buffer.Unpooled.*; import static io.netty.buffer.Unpooled.*;
import static it.cavallium.dbengine.database.disk.LLLocalDictionary.logger;
public abstract class LLLocalReactiveRocksIterator<T> { public abstract class LLLocalReactiveRocksIterator<T> {
@ -26,19 +28,22 @@ public abstract class LLLocalReactiveRocksIterator<T> {
private final LLRange range; private final LLRange range;
private final ReadOptions readOptions; private final ReadOptions readOptions;
private final boolean readValues; private final boolean readValues;
private final String debugName;
public LLLocalReactiveRocksIterator(RocksDB db, public LLLocalReactiveRocksIterator(RocksDB db,
ByteBufAllocator alloc, ByteBufAllocator alloc,
ColumnFamilyHandle cfh, ColumnFamilyHandle cfh,
LLRange range, LLRange range,
ReadOptions readOptions, ReadOptions readOptions,
boolean readValues) { boolean readValues,
String debugName) {
this.db = db; this.db = db;
this.alloc = alloc; this.alloc = alloc;
this.cfh = cfh; this.cfh = cfh;
this.range = range; this.range = range;
this.readOptions = readOptions; this.readOptions = readOptions;
this.readValues = readValues; this.readValues = readValues;
this.debugName = debugName;
} }
public Flux<T> flux() { public Flux<T> flux() {
@ -46,7 +51,7 @@ public abstract class LLLocalReactiveRocksIterator<T> {
.<T, @NotNull Tuple3<RocksIterator, ReleasableSlice, ReleasableSlice>>generate(() -> { .<T, @NotNull Tuple3<RocksIterator, ReleasableSlice, ReleasableSlice>>generate(() -> {
var readOptions = new ReadOptions(this.readOptions); var readOptions = new ReadOptions(this.readOptions);
if (!range.hasMin() || !range.hasMax()) { if (!range.hasMin() || !range.hasMax()) {
readOptions.setReadaheadSize(2 * 1024 * 1024); readOptions.setReadaheadSize(32 * 1024); // 32KiB
readOptions.setFillCache(false); readOptions.setFillCache(false);
} }
return getRocksIterator(readOptions, range.retain(), db, cfh); return getRocksIterator(readOptions, range.retain(), db, cfh);

View File

@ -1,6 +1,10 @@
package it.cavallium.dbengine.lucene; package it.cavallium.dbengine.lucene;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import it.cavallium.dbengine.client.CompositeSnapshot; import it.cavallium.dbengine.client.CompositeSnapshot;
import it.cavallium.dbengine.client.IndicizerAnalyzers;
import it.cavallium.dbengine.client.IndicizerSimilarities;
import it.cavallium.dbengine.client.MultiSort; import it.cavallium.dbengine.client.MultiSort;
import it.cavallium.dbengine.client.SearchResult; import it.cavallium.dbengine.client.SearchResult;
import it.cavallium.dbengine.client.SearchResultItem; import it.cavallium.dbengine.client.SearchResultItem;
@ -8,6 +12,7 @@ import it.cavallium.dbengine.client.SearchResultKey;
import it.cavallium.dbengine.client.SearchResultKeys; import it.cavallium.dbengine.client.SearchResultKeys;
import it.cavallium.dbengine.database.LLKeyScore; import it.cavallium.dbengine.database.LLKeyScore;
import it.cavallium.dbengine.database.LLSearchResultShard; import it.cavallium.dbengine.database.LLSearchResultShard;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionary; import it.cavallium.dbengine.database.collections.DatabaseMapDictionary;
import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep; import it.cavallium.dbengine.database.collections.DatabaseMapDictionaryDeep;
import it.cavallium.dbengine.database.collections.Joiner.ValueGetter; import it.cavallium.dbengine.database.collections.Joiner.ValueGetter;
@ -20,6 +25,7 @@ import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.HandleResult;
import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer; import it.cavallium.dbengine.lucene.searcher.LuceneStreamSearcher.ResultItemConsumer;
import it.cavallium.dbengine.lucene.similarity.NGramSimilarity; import it.cavallium.dbengine.lucene.similarity.NGramSimilarity;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
@ -28,13 +34,16 @@ import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BooleanSimilarity; import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
import org.novasearch.lucene.search.similarities.BM25Similarity; import org.novasearch.lucene.search.similarities.BM25Similarity;
@ -56,10 +65,11 @@ public class LuceneUtils {
private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5); private static final Analyzer lucene3To5GramWordsAnalyzerInstance = new NCharGramAnalyzer(true, 3, 5);
private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5); private static final Analyzer lucene3To5GramStringAnalyzerInstance = new NCharGramAnalyzer(false, 3, 5);
private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer(); private static final Analyzer luceneStandardAnalyzerInstance = new StandardAnalyzer();
private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(true, true); private static final Analyzer luceneWordAnalyzerStopWordsAndStemInstance = new WordAnalyzer(false,true, true);
private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(true, false); private static final Analyzer luceneWordAnalyzerStopWordsInstance = new WordAnalyzer(false, true, false);
private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, true); private static final Analyzer luceneWordAnalyzerStemInstance = new WordAnalyzer(false, false, true);
private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false); private static final Analyzer luceneWordAnalyzerSimpleInstance = new WordAnalyzer(false, false, false);
private static final Analyzer luceneICUCollationKeyInstance = new WordAnalyzer(false, true, true);
private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC); private static final Similarity luceneBM25ClassicSimilarityInstance = new BM25Similarity(BM25Model.CLASSIC);
private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS); private static final Similarity luceneBM25PlusSimilarityInstance = new BM25Similarity(BM25Model.PLUS);
private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L); private static final Similarity luceneBM25LSimilarityInstance = new BM25Similarity(BM25Model.L);
@ -78,78 +88,51 @@ public class LuceneUtils {
private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity(); private static final Similarity luceneBooleanSimilarityInstance = new BooleanSimilarity();
private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity(); private static final Similarity luceneRobertsonSimilarityInstance = new RobertsonSimilarity();
@SuppressWarnings("DuplicatedCode")
public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) { public static Analyzer getAnalyzer(TextFieldsAnalyzer analyzer) {
switch (analyzer) { return switch (analyzer) {
case N4GramPartialWords: case N4GramPartialWords -> lucene4GramWordsAnalyzerInstance;
return lucene4GramWordsAnalyzerInstance; case N4GramPartialString -> lucene4GramStringAnalyzerInstance;
case N4GramPartialString: case N4GramPartialWordsEdge -> lucene4GramWordsAnalyzerEdgeInstance;
return lucene4GramStringAnalyzerInstance; case N4GramPartialStringEdge -> lucene4GramStringAnalyzerEdgeInstance;
case N4GramPartialWordsEdge: case N3To5GramPartialWords -> lucene3To5GramWordsAnalyzerInstance;
return lucene4GramWordsAnalyzerEdgeInstance; case N3To5GramPartialString -> lucene3To5GramStringAnalyzerInstance;
case N4GramPartialStringEdge: case N3To5GramPartialWordsEdge -> lucene3To5GramWordsAnalyzerEdgeInstance;
return lucene4GramStringAnalyzerEdgeInstance; case N3To5GramPartialStringEdge -> lucene3To5GramStringAnalyzerEdgeInstance;
case N3To5GramPartialWords: case Standard -> luceneStandardAnalyzerInstance;
return lucene3To5GramWordsAnalyzerInstance; case FullText -> luceneWordAnalyzerStopWordsAndStemInstance;
case N3To5GramPartialString: case WordWithStopwordsStripping -> luceneWordAnalyzerStopWordsInstance;
return lucene3To5GramStringAnalyzerInstance; case WordWithStemming -> luceneWordAnalyzerStemInstance;
case N3To5GramPartialWordsEdge: case WordSimple -> luceneWordAnalyzerSimpleInstance;
return lucene3To5GramWordsAnalyzerEdgeInstance; case ICUCollationKey -> luceneICUCollationKeyInstance;
case N3To5GramPartialStringEdge: //noinspection UnnecessaryDefault
return lucene3To5GramStringAnalyzerEdgeInstance; default -> throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
case Standard: };
return luceneStandardAnalyzerInstance;
case FullText:
return luceneWordAnalyzerStopWordsAndStemInstance;
case WordWithStopwordsStripping:
return luceneWordAnalyzerStopWordsInstance;
case WordWithStemming:
return luceneWordAnalyzerStemInstance;
case WordSimple:
return luceneWordAnalyzerSimpleInstance;
default:
throw new UnsupportedOperationException("Unknown analyzer: " + analyzer);
}
} }
@SuppressWarnings("DuplicatedCode")
public static Similarity getSimilarity(TextFieldsSimilarity similarity) { public static Similarity getSimilarity(TextFieldsSimilarity similarity) {
switch (similarity) { return switch (similarity) {
case BM25Classic: case BM25Classic -> luceneBM25ClassicSimilarityInstance;
return luceneBM25ClassicSimilarityInstance; case NGramBM25Classic -> luceneBM25ClassicNGramSimilarityInstance;
case NGramBM25Classic: case BM25L -> luceneBM25LSimilarityInstance;
return luceneBM25ClassicNGramSimilarityInstance; case NGramBM25L -> luceneBM25LNGramSimilarityInstance;
case BM25L: case Classic -> luceneClassicSimilarityInstance;
return luceneBM25LSimilarityInstance; case NGramClassic -> luceneClassicNGramSimilarityInstance;
case NGramBM25L: case BM25Plus -> luceneBM25PlusSimilarityInstance;
return luceneBM25LNGramSimilarityInstance; case NGramBM25Plus -> luceneBM25PlusNGramSimilarityInstance;
case Classic: case BM15Plus -> luceneBM15PlusSimilarityInstance;
return luceneClassicSimilarityInstance; case NGramBM15Plus -> luceneBM15PlusNGramSimilarityInstance;
case NGramClassic: case BM11Plus -> luceneBM11PlusSimilarityInstance;
return luceneClassicNGramSimilarityInstance; case NGramBM11Plus -> luceneBM11PlusNGramSimilarityInstance;
case BM25Plus: case LTC -> luceneLTCSimilarityInstance;
return luceneBM25PlusSimilarityInstance; case LDP -> luceneLDPSimilarityInstance;
case NGramBM25Plus: case LDPNoLength -> luceneLDPNoLengthSimilarityInstance;
return luceneBM25PlusNGramSimilarityInstance; case Robertson -> luceneRobertsonSimilarityInstance;
case BM15Plus: case Boolean -> luceneBooleanSimilarityInstance;
return luceneBM15PlusSimilarityInstance; //noinspection UnnecessaryDefault
case NGramBM15Plus: default -> throw new IllegalStateException("Unknown similarity: " + similarity);
return luceneBM15PlusNGramSimilarityInstance; };
case BM11Plus:
return luceneBM11PlusSimilarityInstance;
case NGramBM11Plus:
return luceneBM11PlusNGramSimilarityInstance;
case LTC:
return luceneLTCSimilarityInstance;
case LDP:
return luceneLDPSimilarityInstance;
case LDPNoLength:
return luceneLDPNoLengthSimilarityInstance;
case Robertson:
return luceneRobertsonSimilarityInstance;
case Boolean:
return luceneBooleanSimilarityInstance;
default:
throw new IllegalStateException("Unknown similarity: " + similarity);
}
} }
/** /**
@ -285,4 +268,27 @@ public class LuceneUtils {
.at(snapshot, entry.getKey()) .at(snapshot, entry.getKey())
.flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release)); .flatMap(sub -> sub.getValue(snapshot, entry.getValue()).doAfterTerminate(sub::release));
} }
public static PerFieldAnalyzerWrapper toPerFieldAnalyzerWrapper(IndicizerAnalyzers indicizerAnalyzers) {
HashMap<String, Analyzer> perFieldAnalyzer = new HashMap<>();
indicizerAnalyzers
.fieldAnalyzer()
.forEach((key, value) -> perFieldAnalyzer.put(key, LuceneUtils.getAnalyzer(value)));
return new PerFieldAnalyzerWrapper(LuceneUtils.getAnalyzer(indicizerAnalyzers.defaultAnalyzer()), perFieldAnalyzer);
}
public static PerFieldSimilarityWrapper toPerFieldSimilarityWrapper(IndicizerSimilarities indicizerSimilarities) {
HashMap<String, Similarity> perFieldSimilarity = new HashMap<>();
indicizerSimilarities
.fieldSimilarity()
.forEach((key, value) -> perFieldSimilarity.put(key, LuceneUtils.getSimilarity(value)));
var defaultSimilarity = LuceneUtils.getSimilarity(indicizerSimilarities.defaultSimilarity());
return new PerFieldSimilarityWrapper() {
@Override
public Similarity get(String name) {
return perFieldSimilarity.getOrDefault(name, defaultSimilarity);
}
};
}
} }

View File

@ -11,6 +11,7 @@ public enum TextFieldsAnalyzer {
N3To5GramPartialStringEdge, N3To5GramPartialStringEdge,
Standard, Standard,
WordSimple, WordSimple,
ICUCollationKey,
WordWithStopwordsStripping, WordWithStopwordsStripping,
WordWithStemming, WordWithStemming,
FullText, FullText,

View File

@ -1,31 +1,47 @@
package it.cavallium.dbengine.lucene.analyzer; package it.cavallium.dbengine.lucene.analyzer;
import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import it.cavallium.dbengine.database.EnglishItalianStopFilter; import it.cavallium.dbengine.database.EnglishItalianStopFilter;
import it.cavallium.dbengine.lucene.LuceneUtils; import it.cavallium.dbengine.lucene.LuceneUtils;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUCollationAttributeFactory;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
public class WordAnalyzer extends Analyzer { public class WordAnalyzer extends Analyzer {
private final boolean icu;
private final boolean removeStopWords; private final boolean removeStopWords;
private final boolean stem; private final boolean stem;
public WordAnalyzer(boolean removeStopWords, boolean stem) { public WordAnalyzer(boolean icu, boolean removeStopWords, boolean stem) {
this.icu = icu;
this.removeStopWords = removeStopWords; this.removeStopWords = removeStopWords;
this.stem = stem; this.stem = stem;
} }
@Override @Override
protected TokenStreamComponents createComponents(final String fieldName) { protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer tokenizer = new StandardTokenizer(); Tokenizer tokenizer;
if (icu) {
tokenizer = new StandardTokenizer(new ICUCollationAttributeFactory(Collator.getInstance(ULocale.ROOT)));
} else {
tokenizer = new StandardTokenizer();
}
TokenStream tokenStream = tokenizer; TokenStream tokenStream = tokenizer;
//tokenStream = new LengthFilter(tokenStream, 1, 100); if (stem) {
tokenStream = new LengthFilter(tokenStream, 1, 120);
}
if (!icu) {
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
}
if (removeStopWords) { if (removeStopWords) {
tokenStream = new EnglishItalianStopFilter(tokenStream); tokenStream = new EnglishItalianStopFilter(tokenStream);
} }
tokenStream = LuceneUtils.newCommonFilter(tokenStream, stem);
return new TokenStreamComponents(tokenizer, tokenStream); return new TokenStreamComponents(tokenizer, tokenStream);
} }