// CavalliumDBEngine/src/main/java/it/cavallium/dbengine/database/disk/LLLocalDictionary.java
package it.cavallium.dbengine.database.disk;
import static it.cavallium.dbengine.database.LLUtils.MARKER_ROCKSDB;
import static it.cavallium.dbengine.database.LLUtils.isBoundedRange;
import static it.cavallium.dbengine.database.LLUtils.mapList;
import static it.cavallium.dbengine.database.LLUtils.toStringSafe;
import static it.cavallium.dbengine.database.disk.UpdateAtomicResultMode.DELTA;
import static it.cavallium.dbengine.utils.StreamUtils.ROCKSDB_POOL;
import static it.cavallium.dbengine.utils.StreamUtils.batches;
import static it.cavallium.dbengine.utils.StreamUtils.collectOn;
import static it.cavallium.dbengine.utils.StreamUtils.executing;
import static it.cavallium.dbengine.utils.StreamUtils.fastSummingLong;
import static it.cavallium.dbengine.utils.StreamUtils.resourceStream;
import static java.util.Objects.requireNonNull;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Timer;
import it.cavallium.buffer.Buf;
import it.cavallium.dbengine.client.DbProgress;
import it.cavallium.dbengine.client.DbProgress.DbSSTProgress;
import it.cavallium.dbengine.client.SSTProgress.SSTOk;
import it.cavallium.dbengine.client.SSTProgress.SSTProgressReport;
import it.cavallium.dbengine.client.SSTProgress.SSTStart;
import it.cavallium.dbengine.client.SSTVerificationProgress;
import it.cavallium.dbengine.client.SSTVerificationProgress.SSTBlockBad;
import it.cavallium.dbengine.database.ColumnUtils;
import it.cavallium.dbengine.database.LLDelta;
import it.cavallium.dbengine.database.LLDictionary;
import it.cavallium.dbengine.database.LLDictionaryResultType;
import it.cavallium.dbengine.database.LLEntry;
import it.cavallium.dbengine.database.LLRange;
import it.cavallium.dbengine.database.LLSnapshot;
import it.cavallium.dbengine.database.LLUtils;
import it.cavallium.dbengine.database.OptionalBuf;
import it.cavallium.dbengine.database.RocksDBLongProperty;
import it.cavallium.dbengine.database.SerializedKey;
import it.cavallium.dbengine.database.UpdateMode;
import it.cavallium.dbengine.database.UpdateReturnMode;
import it.cavallium.dbengine.database.disk.RocksDBFile.IterationMetadata;
import it.cavallium.dbengine.database.disk.rocksdb.LLReadOptions;
import it.cavallium.dbengine.database.disk.rocksdb.LLWriteOptions;
import it.cavallium.dbengine.database.serialization.KVSerializationFunction;
import it.cavallium.dbengine.database.serialization.SerializationFunction;
import it.cavallium.dbengine.rpc.current.data.Column;
import it.cavallium.dbengine.utils.DBException;
import java.io.IOException;
import java.math.BigInteger;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.util.Supplier;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.CompactRangeOptions;
import org.rocksdb.FlushOptions;
import org.rocksdb.RocksDBException;
import org.rocksdb.Snapshot;
import org.rocksdb.WriteBatch;
public class LLLocalDictionary implements LLDictionary {
protected static final Logger logger = LogManager.getLogger(LLLocalDictionary.class);
private static final boolean USE_CURRENT_FASTSIZE_FOR_OLD_SNAPSHOTS = false;
static final int RESERVED_WRITE_BATCH_SIZE = 2 * 1024 * 1024; // 2MiB
static final long MAX_WRITE_BATCH_SIZE = 1024L * 1024L * 1024L; // 1GiB
static final int CAPPED_WRITE_BATCH_CAP = 50000; // 50K operations
static final int MULTI_GET_WINDOW = 16;
static final boolean PREFER_AUTO_SEEK_BOUND = false;
/**
 * It used to be false; now it's true (unless checksum verification is force-disabled)
 * to avoid crashes during iterations on completely corrupted files
 */
static final boolean VERIFY_CHECKSUMS_WHEN_NOT_NEEDED = !LLUtils.FORCE_DISABLE_CHECKSUM_VERIFICATION;
/**
* Default: true. Use false to debug problems with windowing.
*/
static final boolean USE_WINDOW_IN_SET_RANGE = true;
/**
* Default: true. Use false to debug problems with write batches.
*/
static final boolean USE_WRITE_BATCHES_IN_PUT_MULTI = true;
/**
* Default: true. Use false to debug problems with write batches.
*/
static final boolean USE_WRITE_BATCHES_IN_SET_RANGE = true;
/**
* Default: true. Use false to debug problems with capped write batches.
*/
static final boolean USE_CAPPED_WRITE_BATCH_IN_SET_RANGE = true;
/**
 * Default: false. Use true to perform setRange deletes through write batches.
 */
static final boolean USE_WRITE_BATCH_IN_SET_RANGE_DELETE = false;
static final boolean PARALLEL_EXACT_SIZE = true;
private static final boolean USE_NUM_ENTRIES_PRECISE_COUNTER = true;
private static final byte[] FIRST_KEY = new byte[]{};
private final RocksDBColumn db;
private final ColumnFamilyHandle cfh;
private final String databaseName;
private final String columnName;
private final Function<LLSnapshot, Snapshot> snapshotResolver;
private final UpdateMode updateMode;
private final Counter startedUpdates;
private final Counter endedUpdates;
private final Timer updateTime;
private final Counter startedGet;
private final Counter endedGet;
private final Timer getTime;
private final Counter startedContains;
private final Counter endedContains;
private final Timer containsTime;
private final Counter startedPut;
private final Counter endedPut;
private final Timer putTime;
private final Counter startedRemove;
private final Counter endedRemove;
private final Timer removeTime;
public LLLocalDictionary(
@NotNull RocksDBColumn db,
String databaseName,
String columnName,
Function<LLSnapshot, Snapshot> snapshotResolver,
UpdateMode updateMode) {
requireNonNull(db);
this.db = db;
this.cfh = db.getColumnFamilyHandle();
this.databaseName = databaseName;
this.columnName = columnName;
this.snapshotResolver = snapshotResolver;
this.updateMode = updateMode;
var meterRegistry = db.getMeterRegistry();
this.startedGet = meterRegistry.counter("db.read.map.get.started.counter", "db.name", databaseName, "db.column", columnName);
this.endedGet = meterRegistry.counter("db.read.map.get.ended.counter", "db.name", databaseName, "db.column", columnName);
this.getTime = Timer
.builder("db.read.map.get.timer")
.publishPercentiles(0.2, 0.5, 0.95)
.publishPercentileHistogram()
.tags("db.name", databaseName, "db.column", columnName)
.register(meterRegistry);
this.startedContains = meterRegistry.counter("db.read.map.contains.started.counter", "db.name", databaseName, "db.column", columnName);
this.endedContains = meterRegistry.counter("db.read.map.contains.ended.counter", "db.name", databaseName, "db.column", columnName);
this.containsTime = Timer
.builder("db.read.map.contains.timer")
.publishPercentiles(0.2, 0.5, 0.95)
.publishPercentileHistogram()
.tags("db.name", databaseName, "db.column", columnName)
.register(meterRegistry);
this.startedUpdates = meterRegistry.counter("db.write.map.update.started.counter", "db.name", databaseName, "db.column", columnName);
this.endedUpdates = meterRegistry.counter("db.write.map.update.ended.counter", "db.name", databaseName, "db.column", columnName);
this.updateTime = Timer
.builder("db.write.map.update.timer")
.publishPercentiles(0.2, 0.5, 0.95)
.publishPercentileHistogram()
.tags("db.name", databaseName, "db.column", columnName)
.register(meterRegistry);
this.startedPut = meterRegistry.counter("db.write.map.put.started.counter", "db.name", databaseName, "db.column", columnName);
this.endedPut = meterRegistry.counter("db.write.map.put.ended.counter", "db.name", databaseName, "db.column", columnName);
this.putTime = Timer
.builder("db.write.map.put.timer")
.publishPercentiles(0.2, 0.5, 0.95)
.publishPercentileHistogram()
.tags("db.name", databaseName, "db.column", columnName)
.register(meterRegistry);
this.startedRemove = meterRegistry.counter("db.write.map.remove.started.counter", "db.name", databaseName, "db.column", columnName);
this.endedRemove = meterRegistry.counter("db.write.map.remove.ended.counter", "db.name", databaseName, "db.column", columnName);
this.removeTime = Timer
.builder("db.write.map.remove.timer")
.publishPercentiles(0.2, 0.5, 0.95)
.publishPercentileHistogram()
.tags("db.name", databaseName, "db.column", columnName)
.register(meterRegistry);
}
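// The meters registered above can be read back through Micrometer's search API, e.g.
// (illustrative only): meterRegistry.get("db.write.map.put.timer").timer().mean(TimeUnit.MILLISECONDS)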
@Override
public String getDatabaseName() {
return databaseName;
}
public String getColumnName() {
return columnName;
}
@NotNull
private LLReadOptions generateReadOptionsOrNew(LLSnapshot snapshot) {
return generateReadOptions(snapshot != null ? snapshotResolver.apply(snapshot) : null);
}
private LLReadOptions generateReadOptions(Snapshot snapshot) {
if (snapshot != null) {
return new LLReadOptions().setSnapshot(snapshot);
} else {
return new LLReadOptions();
}
}
@Override
public Buf get(@Nullable LLSnapshot snapshot, Buf key) {
return this.getSync(snapshot, key);
}
private Buf getSync(LLSnapshot snapshot, Buf key) {
logger.trace(MARKER_ROCKSDB, "Reading {}", () -> toStringSafe(key));
try {
Buf result;
startedGet.increment();
try (var readOptions = generateReadOptionsOrNew(snapshot)) {
var initTime = System.nanoTime();
result = db.get(readOptions, key);
getTime.record(Duration.ofNanos(System.nanoTime() - initTime));
} finally {
endedGet.increment();
}
logger.trace(MARKER_ROCKSDB, "Read {}: {}", () -> toStringSafe(key), () -> toStringSafe(result));
return result;
} catch (RocksDBException ex) {
throw new DBException("Failed to read " + toStringSafe(key) + ": " + ex.getMessage());
}
}
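/**
 * Checks whether a range contains at least one key. A minimal usage sketch, assuming an
 * initialized {@code dict} instance of this class:
 * <pre>{@code
 * // fillCache = false avoids polluting the block cache with a one-off check
 * boolean empty = dict.isRangeEmpty(null, LLRange.all(), false);
 * }</pre>
 */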
@Override
public boolean isRangeEmpty(@Nullable LLSnapshot snapshot, LLRange range, boolean fillCache) {
assert !LLUtils.isInNonBlockingThread() : "Called isRangeEmpty in a nonblocking thread";
startedContains.increment();
try {
Boolean isRangeEmpty = containsTime.recordCallable(() -> {
if (range.isSingle()) {
return !containsKey(snapshot, range.getSingleUnsafe());
} else {
// Temporary resources, released when the check finishes
try (var readOpts = LLUtils.generateCustomReadOptions(generateReadOptionsOrNew(snapshot),
true,
isBoundedRange(range),
true
)) {
readOpts.setVerifyChecksums(VERIFY_CHECKSUMS_WHEN_NOT_NEEDED);
readOpts.setFillCache(fillCache);
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
var seekArray = LLUtils.asArray(range.getMin());
rocksIterator.seek(seekArray);
} else {
rocksIterator.seekToFirst();
}
return !rocksIterator.isValid();
}
}
}
});
assert isRangeEmpty != null;
return isRangeEmpty;
} catch (Exception ex) {
throw new DBException("Failed to read range " + LLUtils.toStringSafe(range), ex);
} finally {
endedContains.increment();
}
}
private boolean containsKey(@Nullable LLSnapshot snapshot, Buf key) throws RocksDBException {
startedContains.increment();
try {
var result = containsTime.recordCallable(() -> {
try (var readOptions = generateReadOptionsOrNew(snapshot)) {
return db.exists(readOptions, key);
}
});
assert result != null;
return result;
} catch (RocksDBException | RuntimeException e) {
throw e;
} catch (Exception ex) {
throw new RuntimeException(ex);
} finally {
endedContains.increment();
}
}
@Override
public Buf put(Buf key, Buf value, LLDictionaryResultType resultType) {
// Obtain the previous value from the database
var previousData = this.getPreviousData(key, resultType);
putInternal(key, value);
return previousData;
}
private void putInternal(Buf key, Buf value) {
if (logger.isTraceEnabled(MARKER_ROCKSDB)) {
var varargs = new Supplier<?>[]{() -> toStringSafe(key), () -> toStringSafe(value)};
logger.trace(MARKER_ROCKSDB, "Writing {}: {}", varargs);
}
startedPut.increment();
try (var writeOptions = new LLWriteOptions()) {
putTime.recordCallable(() -> {
db.put(writeOptions, key, value);
return null;
});
} catch (RocksDBException ex) {
throw new DBException("Failed to write: " + ex.getMessage());
} catch (RuntimeException ex) {
throw ex;
} catch (Exception ex) {
throw new RuntimeException("Failed to write", ex);
} finally {
endedPut.increment();
}
}
@Override
public UpdateMode getUpdateMode() {
return updateMode;
}
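/**
 * Atomically transforms the value stored at {@code key}. A usage sketch, assuming an
 * initialized {@code dict}; the counter encoding shown here is illustrative only:
 * <pre>{@code
 * Buf key = Buf.wrap(new byte[] {0x01});
 * Buf incremented = dict.update(key, prev -> {
 *     // prev is null when the key is absent
 *     long counter = prev == null ? 0L : new BigInteger(prev.asArray()).longValueExact();
 *     return Buf.wrap(BigInteger.valueOf(counter + 1).toByteArray());
 * }, UpdateReturnMode.GET_NEW_VALUE);
 * }</pre>
 */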
@SuppressWarnings("DuplicatedCode")
@Override
public Buf update(Buf key,
SerializationFunction<@Nullable Buf, @Nullable Buf> updater,
UpdateReturnMode updateReturnMode) {
assert !LLUtils.isInNonBlockingThread() : "Called update in a nonblocking thread";
if (updateMode == UpdateMode.DISALLOW) {
throw new UnsupportedOperationException("update() is disallowed");
}
UpdateAtomicResultMode returnMode = switch (updateReturnMode) {
case NOTHING -> UpdateAtomicResultMode.NOTHING;
case GET_NEW_VALUE -> UpdateAtomicResultMode.CURRENT;
case GET_OLD_VALUE -> UpdateAtomicResultMode.PREVIOUS;
};
UpdateAtomicResult result = null;
try {
startedUpdates.increment();
try (var readOptions = generateReadOptionsOrNew(null);
var writeOptions = new LLWriteOptions()) {
result = updateTime.recordCallable(() -> db.updateAtomic(readOptions, writeOptions, key, updater, returnMode));
} finally {
endedUpdates.increment();
}
assert result != null;
return switch (updateReturnMode) {
case NOTHING -> null;
case GET_NEW_VALUE -> ((UpdateAtomicResultCurrent) result).current();
case GET_OLD_VALUE -> ((UpdateAtomicResultPrevious) result).previous();
};
} catch (Exception ex) {
throw new DBException("Failed to update key-value pair", ex);
}
}
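/**
 * Like {@code update}, but returns the change as a delta. Sketch, assuming an
 * initialized {@code dict}:
 * <pre>{@code
 * LLDelta delta = dict.updateAndGetDelta(key, prev -> Buf.wrap(new byte[] {42}));
 * }</pre>
 */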
@SuppressWarnings("DuplicatedCode")
@Override
public LLDelta updateAndGetDelta(Buf key, SerializationFunction<@Nullable Buf, @Nullable Buf> updater) {
assert !LLUtils.isInNonBlockingThread() : "Called update in a nonblocking thread";
if (updateMode == UpdateMode.DISALLOW) {
throw new UnsupportedOperationException("update() is disallowed");
}
if (updateMode == UpdateMode.ALLOW && !db.supportsTransactions()) {
throw new UnsupportedOperationException("update() is disallowed because the database doesn't support"
+ "safe atomic operations");
}
UpdateAtomicResultDelta result;
try {
startedUpdates.increment();
try (var readOptions = generateReadOptionsOrNew(null);
var writeOptions = new LLWriteOptions()) {
result = updateTime.recordCallable(() -> (UpdateAtomicResultDelta) db.updateAtomic(readOptions,
writeOptions,
key,
updater,
DELTA
));
} finally {
endedUpdates.increment();
}
assert result != null;
return result.delta();
} catch (Exception ex) {
throw new DBException("Failed to update key-value pair and/or return the delta", ex);
}
}
@Override
public Buf remove(Buf key, LLDictionaryResultType resultType) {
// Obtain the previous value from the database
Buf previousData = this.getPreviousData(key, resultType);
// Delete the value from the database
try {
logger.trace(MARKER_ROCKSDB, "Deleting {}", () -> toStringSafe(key));
startedRemove.increment();
try (var writeOptions = new LLWriteOptions()) {
removeTime.recordCallable(() -> {
db.delete(writeOptions, key);
return null;
});
} finally {
endedRemove.increment();
}
return previousData;
} catch (Exception ex) {
throw new DBException("Failed to delete", ex);
}
}
private Buf getPreviousData(Buf key, LLDictionaryResultType resultType) {
try {
return switch (resultType) {
case PREVIOUS_VALUE_EXISTENCE -> {
var contained = containsKey(null, key);
yield LLUtils.booleanToResponseByteBuffer(contained);
}
case PREVIOUS_VALUE -> {
assert !LLUtils.isInNonBlockingThread() : "Called getPreviousData in a nonblocking thread";
Buf result;
try (var readOptions = generateReadOptionsOrNew(null)) {
result = db.get(readOptions, key);
}
logger.trace(MARKER_ROCKSDB, "Read {}: {}", () -> toStringSafe(key), () -> toStringSafe(result));
yield result;
}
case VOID -> null;
};
} catch (RocksDBException ex) {
throw new DBException("Failed to read previous data");
}
}
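/**
 * Batch point reads: results preserve the input key order and absent keys yield
 * {@code OptionalBuf.empty()}. Sketch, assuming an initialized {@code dict}:
 * <pre>{@code
 * List<OptionalBuf> values = dict.getMulti(null, Stream.of(key1, key2)).toList();
 * }</pre>
 */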
@Override
public Stream<OptionalBuf> getMulti(@Nullable LLSnapshot snapshot, Stream<Buf> keys) {
return keys.map(key -> OptionalBuf.ofNullable(getSync(snapshot, key)));
}
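/**
 * Writes the entries in windows, using a capped write batch per window when
 * {@code USE_WRITE_BATCHES_IN_PUT_MULTI} is enabled. Sketch, assuming an initialized
 * {@code dict}:
 * <pre>{@code
 * dict.putMulti(Stream.of(LLEntry.of(key1, value1), LLEntry.of(key2, value2)));
 * }</pre>
 */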
@Override
public void putMulti(Stream<LLEntry> entries) {
collectOn(ROCKSDB_POOL,
batches(entries, Math.min(MULTI_GET_WINDOW, CAPPED_WRITE_BATCH_CAP)),
executing(entriesWindow -> {
try (var writeOptions = new LLWriteOptions()) {
assert !LLUtils.isInNonBlockingThread() : "Called putMulti in a nonblocking thread";
if (USE_WRITE_BATCHES_IN_PUT_MULTI) {
try (var batch = new CappedWriteBatch(db,
CAPPED_WRITE_BATCH_CAP,
RESERVED_WRITE_BATCH_SIZE,
MAX_WRITE_BATCH_SIZE,
writeOptions
)) {
for (LLEntry entry : entriesWindow) {
batch.put(cfh, entry.getKey(), entry.getValue());
}
batch.flush();
}
} else {
for (LLEntry entry : entriesWindow) {
db.put(writeOptions, entry.getKey(), entry.getValue());
}
}
} catch (RocksDBException ex) {
throw new CompletionException(new DBException("Failed to write: " + ex.getMessage()));
}
})
);
}
@Override
public <K> Stream<Boolean> updateMulti(Stream<SerializedKey<K>> keys,
KVSerializationFunction<K, @Nullable Buf, @Nullable Buf> updateFunction) {
record MappedInput<K>(K key, Buf serializedKey, OptionalBuf mapped) {}
return batches(keys, Math.min(MULTI_GET_WINDOW, CAPPED_WRITE_BATCH_CAP))
.flatMap(entriesWindow -> {
try (var writeOptions = new LLWriteOptions()) {
if (LLUtils.isInNonBlockingThread()) {
throw new UnsupportedOperationException("Called updateMulti in a nonblocking thread");
}
List<Buf> keyBufsWindow = new ArrayList<>(entriesWindow.size());
for (var objects : entriesWindow) {
keyBufsWindow.add(objects.serialized());
}
ArrayList<MappedInput<K>> mappedInputs;
{
try (var readOptions = generateReadOptionsOrNew(null)) {
var inputs = db.multiGetAsList(readOptions, mapList(keyBufsWindow, Buf::asArray));
mappedInputs = new ArrayList<>(inputs.size());
for (int i = 0; i < inputs.size(); i++) {
var val = inputs.get(i);
if (val != null) {
inputs.set(i, null);
mappedInputs.add(new MappedInput<>(entriesWindow.get(i).key(),
keyBufsWindow.get(i),
OptionalBuf.of(Buf.wrap(val))
));
} else {
mappedInputs.add(new MappedInput<>(entriesWindow.get(i).key(),
keyBufsWindow.get(i),
OptionalBuf.empty()
));
}
}
}
}
var updatedValuesToWrite = new ArrayList<Buf>(mappedInputs.size());
var valueChangedResult = new ArrayList<Boolean>(mappedInputs.size());
for (var mappedInput : mappedInputs) {
var updatedValue = updateFunction.apply(mappedInput.key(), mappedInput.serializedKey());
var oldValue = mappedInput.mapped().orElse(null);
valueChangedResult.add(!LLUtils.equals(oldValue, updatedValue));
updatedValuesToWrite.add(updatedValue);
}
if (USE_WRITE_BATCHES_IN_PUT_MULTI) {
try (var batch = new CappedWriteBatch(db,
CAPPED_WRITE_BATCH_CAP,
RESERVED_WRITE_BATCH_SIZE,
MAX_WRITE_BATCH_SIZE,
writeOptions
)) {
int i = 0;
for (var entry : entriesWindow) {
var valueToWrite = updatedValuesToWrite.get(i);
if (valueToWrite == null) {
batch.delete(cfh, entry.serialized());
} else {
batch.put(cfh, entry.serialized(), valueToWrite);
}
i++;
}
batch.flush();
}
} else {
int i = 0;
for (var entry : entriesWindow) {
db.put(writeOptions, entry.serialized(), updatedValuesToWrite.get(i));
i++;
}
}
return valueChangedResult.stream();
} catch (RocksDBException e) {
throw new CompletionException(new DBException("Failed to update multiple key-value pairs", e));
}
});
}
@Override
public Stream<LLEntry> getRange(@Nullable LLSnapshot snapshot,
LLRange range,
boolean reverse,
boolean smallRange) {
if (range.isSingle()) {
var rangeSingle = range.getSingle();
return getRangeSingle(snapshot, rangeSingle);
} else {
return getRangeMulti(snapshot, range, reverse, smallRange);
}
}
@Override
public Stream<List<LLEntry>> getRangeGrouped(@Nullable LLSnapshot snapshot,
LLRange range,
int prefixLength,
boolean smallRange) {
if (range.isSingle()) {
var rangeSingle = range.getSingle();
return getRangeSingle(snapshot, rangeSingle).map(Collections::singletonList);
} else {
return getRangeMultiGrouped(snapshot, range, prefixLength, smallRange);
}
}
private Stream<LLEntry> getRangeSingle(LLSnapshot snapshot, Buf key) {
var val = this.get(snapshot, key);
if (val == null) return Stream.empty();
return Stream.of(LLEntry.of(key, val));
}
private Stream<LLEntry> getRangeMulti(LLSnapshot snapshot,
LLRange range,
boolean reverse,
boolean smallRange) {
return new LLLocalEntryReactiveRocksIterator(db,
range,
() -> generateReadOptionsOrNew(snapshot),
reverse,
smallRange
).stream();
}
private Stream<List<LLEntry>> getRangeMultiGrouped(LLSnapshot snapshot, LLRange range,
int prefixLength, boolean smallRange) {
return new LLLocalGroupedEntryReactiveRocksIterator(db,
prefixLength,
range,
() -> generateReadOptionsOrNew(snapshot),
smallRange
).stream();
}
@Override
public Stream<Buf> getRangeKeys(@Nullable LLSnapshot snapshot,
LLRange range,
boolean reverse,
boolean smallRange) {
if (range.isSingle()) {
return this.getRangeKeysSingle(snapshot, range.getSingle());
} else {
return this.getRangeKeysMulti(snapshot, range, reverse, smallRange);
}
}
@Override
public Stream<List<Buf>> getRangeKeysGrouped(@Nullable LLSnapshot snapshot,
LLRange range,
int prefixLength,
boolean smallRange) {
return new LLLocalGroupedKeyReactiveRocksIterator(db,
prefixLength,
range,
() -> generateReadOptionsOrNew(snapshot),
smallRange
).stream();
}
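/**
 * Streams verification progress for every live SST file intersecting the range. Sketch,
 * assuming an initialized {@code dict}, that logs corrupted blocks:
 * <pre>{@code
 * dict.verifyChecksum(LLRange.all())
 *     .filter(p -> p instanceof DbProgress.DbSSTProgress<SSTVerificationProgress> sst
 *         && sst.sstProgress() instanceof SSTBlockBad)
 *     .forEach(bad -> logger.error("Corrupted SST block: {}", bad));
 * }</pre>
 */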
@Override
public Stream<DbProgress<SSTVerificationProgress>> verifyChecksum(LLRange rangeFull) {
Set<String> brokenFiles = ConcurrentHashMap.newKeySet();
LongAdder totalScanned = new LongAdder();
long totalEstimate;
{
long totalEstimateTmp = 0;
try {
totalEstimateTmp = db.getNumEntries();
} catch (Throwable ex) {
logger.warn("Failed to get total entries count", ex);
}
totalEstimate = totalEstimateTmp;
}
Column column = ColumnUtils.special(columnName);
try {
var liveFiles = db.getAllLiveFiles().toList();
var liveFilesCount = liveFiles.size();
return liveFiles.stream()
.sorted(Comparator.reverseOrder())
.flatMap(fr -> {
AtomicReference<IterationMetadata> metadataRef = new AtomicReference<>();
AtomicLong initialEstimate = new AtomicLong();
return fr
.verify(SSTRange.parse(rangeFull))
.map(status -> switch (status) {
case SSTStart fileStart -> {
metadataRef.set(fileStart.metadata());
long numEntriesEstimate = Objects.requireNonNullElse(fileStart.metadata().countEstimate(), totalEstimate / liveFilesCount);
totalScanned.add(numEntriesEstimate);
initialEstimate.set(numEntriesEstimate);
yield status;
}
case SSTOk fileEnd -> {
long initialNumEntriesEstimate = initialEstimate.get();
long numEntriesScanned = fileEnd.scannedCount();
totalScanned.add(numEntriesScanned - initialNumEntriesEstimate);
yield status;
}
case SSTProgressReport ignored -> status;
case SSTBlockBad ignored -> status;
})
.<DbProgress<SSTVerificationProgress>>map(sstProgress -> new DbProgress.DbSSTProgress<>(databaseName,
column,
metadataRef.get().filePath(),
totalScanned.longValue(),
totalEstimate,
sstProgress
));
})
.filter(err -> !(err instanceof DbProgress.DbSSTProgress<SSTVerificationProgress> sstProgress
&& sstProgress.sstProgress() instanceof SSTBlockBad blockBad
&& blockBad.rawKey() == null && !brokenFiles.add(sstProgress.fileString())));
} catch (RocksDBException e) {
return Stream.of(new DbProgress.DbSSTProgress<>(databaseName, column, null, 0, 0, new SSTBlockBad(null, e)));
}
}
@Override
public Stream<Buf> getRangeKeyPrefixes(@Nullable LLSnapshot snapshot, LLRange range,
int prefixLength, boolean smallRange) {
return new LLLocalKeyPrefixReactiveRocksIterator(db,
prefixLength,
range,
() -> generateReadOptionsOrNew(snapshot),
true,
smallRange
).stream();
}
private Stream<Buf> getRangeKeysSingle(LLSnapshot snapshot, Buf key) {
try {
if (containsKey(snapshot, key)) {
return Stream.of(key);
} else {
return Stream.empty();
}
} catch (RocksDBException e) {
throw new DBException("Failed to get range keys", e);
}
}
private Stream<Buf> getRangeKeysMulti(LLSnapshot snapshot,
LLRange range,
boolean reverse,
boolean smallRange) {
return new LLLocalKeyReactiveRocksIterator(db,
range,
() -> generateReadOptionsOrNew(snapshot),
reverse,
smallRange
).stream();
}
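/**
 * Replaces the contents of {@code range}: existing keys in the range are deleted first,
 * then the new entries are written (the two phases are not atomic as a whole). Sketch,
 * assuming an initialized {@code dict} and pre-built {@code Buf} keys/values:
 * <pre>{@code
 * dict.setRange(LLRange.of(minKey, maxKey),
 *     Stream.of(LLEntry.of(k1, v1), LLEntry.of(k2, v2)),
 *     true); // smallRange hint
 * }</pre>
 */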
@Override
public void setRange(LLRange range, Stream<LLEntry> entries, boolean smallRange) {
if (USE_WINDOW_IN_SET_RANGE) {
try (var writeOptions = new LLWriteOptions()) {
assert !LLUtils.isInNonBlockingThread() : "Called setRange in a nonblocking thread";
if (!USE_WRITE_BATCH_IN_SET_RANGE_DELETE || !USE_WRITE_BATCHES_IN_SET_RANGE) {
try (var opts = LLUtils.generateCustomReadOptions(null, true, isBoundedRange(range), smallRange)) {
try (var it = db.newIterator(opts, range.getMin(), range.getMax())) {
if (!PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
it.seekTo(range.getMin());
} else {
it.seekToFirst();
}
while (it.isValid()) {
db.delete(writeOptions, it.key());
it.next();
}
}
}
} else if (USE_CAPPED_WRITE_BATCH_IN_SET_RANGE) {
try (var batch = new CappedWriteBatch(db,
CAPPED_WRITE_BATCH_CAP,
RESERVED_WRITE_BATCH_SIZE,
MAX_WRITE_BATCH_SIZE,
writeOptions
)) {
if (range.isSingle()) {
batch.delete(cfh, range.getSingle());
} else {
deleteSmallRangeWriteBatch(batch, range.copy());
}
batch.flush();
}
} else {
try (var batch = new WriteBatch(RESERVED_WRITE_BATCH_SIZE)) {
if (range.isSingle()) {
batch.delete(cfh, LLUtils.asArray(range.getSingleUnsafe()));
} else {
deleteSmallRangeWriteBatch(batch, range.copy());
}
db.write(writeOptions, batch);
batch.clear();
}
}
} catch (RocksDBException ex) {
throw new DBException("Failed to set a range: " + ex.getMessage());
}
collectOn(ROCKSDB_POOL, batches(entries, MULTI_GET_WINDOW), executing(entriesList -> {
try (var writeOptions = new LLWriteOptions()) {
if (!USE_WRITE_BATCHES_IN_SET_RANGE) {
for (LLEntry entry : entriesList) {
db.put(writeOptions, entry.getKey(), entry.getValue());
}
} else if (USE_CAPPED_WRITE_BATCH_IN_SET_RANGE) {
try (var batch = new CappedWriteBatch(db,
CAPPED_WRITE_BATCH_CAP,
RESERVED_WRITE_BATCH_SIZE,
MAX_WRITE_BATCH_SIZE,
writeOptions
)) {
for (LLEntry entry : entriesList) {
batch.put(cfh, entry.getKey(), entry.getValue());
}
batch.flush();
}
} else {
try (var batch = new WriteBatch(RESERVED_WRITE_BATCH_SIZE)) {
for (LLEntry entry : entriesList) {
batch.put(cfh, LLUtils.asArray(entry.getKey()), LLUtils.asArray(entry.getValue()));
}
db.write(writeOptions, batch);
batch.clear();
}
}
} catch (RocksDBException ex) {
throw new CompletionException(new DBException("Failed to write range", ex));
}
}));
} else {
if (USE_WRITE_BATCHES_IN_SET_RANGE) {
throw new UnsupportedOperationException("Can't use write batches in setRange without window. Please fix the parameters");
}
collectOn(ROCKSDB_POOL, this.getRange(null, range, false, smallRange), executing(oldValue -> {
try (var writeOptions = new LLWriteOptions()) {
db.delete(writeOptions, oldValue.getKey());
} catch (RocksDBException ex) {
throw new CompletionException(new DBException("Failed to write range", ex));
}
}));
collectOn(ROCKSDB_POOL, entries, executing(entry -> {
if (entry.getKey() != null && entry.getValue() != null) {
this.putInternal(entry.getKey(), entry.getValue());
}
}));
}
}
private void deleteSmallRangeWriteBatch(WriteBatch writeBatch, LLRange range)
throws RocksDBException {
try (var readOpts = LLUtils.generateCustomReadOptions(null, false, isBoundedRange(range), true)) {
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
rocksIterator.seekTo(range.getMin());
} else {
rocksIterator.seekToFirst();
}
while (rocksIterator.isValid()) {
writeBatch.delete(cfh, rocksIterator.key());
rocksIterator.next();
}
}
}
}
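/**
 * Deletes every entry in the column. On databases that support transactions each key is
 * deleted individually through a capped write batch; otherwise a single
 * {@code deleteRange} is issued and a range compaction is scheduled to reclaim space.
 */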
public void clear() {
assert !LLUtils.isInNonBlockingThread() : "Called clear in a nonblocking thread";
boolean shouldCompactLater = false;
try (var writeOptions = new LLWriteOptions();
var readOpts = LLUtils.generateCustomReadOptions(null, false, false, false)) {
if (VERIFY_CHECKSUMS_WHEN_NOT_NEEDED) {
readOpts.setVerifyChecksums(true);
}
// readOpts.setIgnoreRangeDeletions(true);
if (LLUtils.MANUAL_READAHEAD) {
readOpts.setReadaheadSize(32 * 1024); // 32KiB
}
try (CappedWriteBatch writeBatch = new CappedWriteBatch(db,
CAPPED_WRITE_BATCH_CAP,
RESERVED_WRITE_BATCH_SIZE,
MAX_WRITE_BATCH_SIZE,
writeOptions
)) {
byte[] firstDeletedKey = null;
byte[] lastDeletedKey = null;
try (var rocksIterator = db.newIterator(readOpts, null, null)) {
// If the database supports transactions, delete each key one by one
if (db.supportsTransactions()) {
rocksIterator.seekToFirst();
while (rocksIterator.isValid()) {
writeBatch.delete(cfh, rocksIterator.key());
rocksIterator.next();
}
} else {
rocksIterator.seekToLast();
if (rocksIterator.isValid()) {
firstDeletedKey = FIRST_KEY;
lastDeletedKey = rocksIterator.key().clone();
writeBatch.deleteRange(cfh, FIRST_KEY, lastDeletedKey);
writeBatch.delete(cfh, lastDeletedKey);
shouldCompactLater = true;
}
}
}
writeBatch.flush();
if (shouldCompactLater) {
// Compact range
db.suggestCompactRange();
if (lastDeletedKey != null) {
try (var cro = new CompactRangeOptions()
.setAllowWriteStall(false)
.setExclusiveManualCompaction(false)
.setChangeLevel(false)) {
db.compactRange(firstDeletedKey, lastDeletedKey, cro);
}
}
}
try (var fo = new FlushOptions().setWaitForFlush(true).setAllowWriteStall(true)) {
db.flush(fo);
}
db.flushWal(true);
}
} catch (RocksDBException ex) {
throw new DBException("Failed to clear", ex);
}
}
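/**
 * Counts the keys in a range. With {@code fast = true} the full-range count may come from
 * a counter or RocksDB estimate, and sub-range scans ignore range deletions; with
 * {@code fast = false} the range is scanned exactly. Sketch, assuming an initialized
 * {@code dict}:
 * <pre>{@code
 * long approx = dict.sizeRange(null, LLRange.all(), true);
 * long exact = dict.sizeRange(null, LLRange.all(), false);
 * }</pre>
 */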
@Override
public long sizeRange(@Nullable LLSnapshot snapshot, LLRange range, boolean fast) {
try {
assert !LLUtils.isInNonBlockingThread() : "Called sizeRange in a nonblocking thread";
if (range.isAll()) {
return fast ? fastSizeAll(snapshot) : exactSizeAll(snapshot);
} else {
try (var readOpts = LLUtils.generateCustomReadOptions(generateReadOptionsOrNew(snapshot),
false,
isBoundedRange(range),
false
)) {
readOpts.setFillCache(false);
readOpts.setVerifyChecksums(VERIFY_CHECKSUMS_WHEN_NOT_NEEDED);
if (fast) {
readOpts.setIgnoreRangeDeletions(true);
}
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
rocksIterator.seekTo(range.getMin());
} else {
rocksIterator.seekToFirst();
}
long i = 0;
while (rocksIterator.isValid()) {
rocksIterator.next();
i++;
}
return i;
}
}
}
} catch (RocksDBException ex) {
throw new DBException("Failed to get size of range", ex);
}
}
@Override
public LLEntry getOne(@Nullable LLSnapshot snapshot, LLRange range) {
try {
assert !LLUtils.isInNonBlockingThread() : "Called getOne in a nonblocking thread";
try (var readOpts = LLUtils.generateCustomReadOptions(generateReadOptionsOrNew(snapshot), true, true, true)) {
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
rocksIterator.seekTo(range.getMin());
} else {
rocksIterator.seekToFirst();
}
if (rocksIterator.isValid()) {
var keyView = rocksIterator.keyBuf();
var valueView = rocksIterator.valueBuf();
return LLEntry.copyOf(keyView, valueView);
} else {
return null;
}
}
}
} catch (RocksDBException ex) {
throw new DBException("Failed to get one entry", ex);
}
}
@Override
public Buf getOneKey(@Nullable LLSnapshot snapshot, LLRange range) {
try {
assert !LLUtils.isInNonBlockingThread() : "Called getOneKey in a nonblocking thread";
try (var readOpts = LLUtils.generateCustomReadOptions(generateReadOptionsOrNew(snapshot), true, true, true)) {
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
rocksIterator.seekTo(range.getMin());
} else {
rocksIterator.seekToFirst();
}
if (rocksIterator.isValid()) {
// Copy the key: the iterator-backed view becomes invalid once the iterator is closed
return rocksIterator.keyBuf().copy();
} else {
return null;
}
}
}
} catch (RocksDBException ex) {
throw new DBException("Failed to get one key", ex);
}
}
private long fastSizeAll(@Nullable LLSnapshot snapshot) throws RocksDBException {
try (var rocksdbSnapshot = generateReadOptionsOrNew(snapshot)) {
if (USE_CURRENT_FASTSIZE_FOR_OLD_SNAPSHOTS || rocksdbSnapshot.snapshot() == null) {
try {
if (USE_NUM_ENTRIES_PRECISE_COUNTER) {
return getRocksDBNumEntries();
}
return db.getLongProperty(RocksDBLongProperty.ESTIMATE_NUM_KEYS.getName());
} catch (RocksDBException e) {
logger.error(MARKER_ROCKSDB, "Failed to get RocksDB estimated keys count property", e);
return 0;
}
} else if (USE_NUM_ENTRIES_PRECISE_COUNTER && snapshot == null) {
return getRocksDBNumEntries();
} else if (PARALLEL_EXACT_SIZE) {
return exactSizeAll(snapshot);
} else {
rocksdbSnapshot.setFillCache(false);
rocksdbSnapshot.setVerifyChecksums(VERIFY_CHECKSUMS_WHEN_NOT_NEEDED);
rocksdbSnapshot.setIgnoreRangeDeletions(true);
long count = 0;
try (var rocksIterator = db.newIterator(rocksdbSnapshot, null, null)) {
rocksIterator.seekToFirst();
// When computing a fast size on an old snapshot, count at most 100'000 elements
while (rocksIterator.isValid() && count < 100_000) {
count++;
rocksIterator.next();
}
return count;
}
}
}
}
private long getRocksDBNumEntries() {
try {
return db.getNumEntries();
} catch (RocksDBException ex) {
throw new IllegalStateException("Failed to read exact size", ex);
}
}
private static BigInteger getRangePointAt(LLRange range,
BigInteger minBi,
BigInteger maxBi,
BigInteger rangeBi,
int pointsCount,
BigInteger pointsCountBi,
int i) {
if (i == 0) {
return range.hasMin() ? minBi : null;
} else if (i + 1 == pointsCount) {
return range.hasMax() ? maxBi : null;
} else {
return minBi.add(rangeBi.multiply(BigInteger.valueOf(i)).divide(pointsCountBi));
}
}
private static Buf getMinBufForParallelization(LLRange range) {
Buf min = range.getMin();
if (min != null) {
return min;
} else {
return Buf.wrap();
}
}
private static Buf getMaxBufForParallelization(LLRange range) {
Buf max = range.getMax();
if (max != null) {
return max;
} else {
max = range.getMin().copy();
max.add(Byte.MAX_VALUE);
return max.freeze();
}
}
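/**
 * Splits {@code range} into disjoint sub-ranges that together cover it, so independent
 * workers can scan them concurrently. Sketch:
 * <pre>{@code
 * List<LLRange> parts = LLLocalDictionary.parallelizeRange(LLRange.all(), 4).toList();
 * }</pre>
 */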
public static Stream<LLRange> parallelizeRange(LLRange range) {
return parallelizeRange(range, Math.max(Runtime.getRuntime().availableProcessors() - 1, 1));
}
public static Stream<LLRange> parallelizeRange(LLRange range, int threads) {
if (threads < 1) {
throw new IllegalArgumentException("threads must be at least 1");
}
if (range.isAll()) {
return IntStream
.range(-1, LLUtils.LEXICONOGRAPHIC_ITERATION_SEEKS.length)
.mapToObj(idx -> LLRange.of(
Buf.wrap(idx == -1 ? new byte[0] : LLUtils.LEXICONOGRAPHIC_ITERATION_SEEKS[idx]),
idx + 1 >= LLUtils.LEXICONOGRAPHIC_ITERATION_SEEKS.length ? null : Buf.wrap(LLUtils.LEXICONOGRAPHIC_ITERATION_SEEKS[idx + 1])));
} else if (range.isSingle()) {
return Stream.of(range);
} else {
Buf minBuf = getMinBufForParallelization(range);
Buf maxBuf = getMaxBufForParallelization(range);
byte[] minUnboundedArray = minBuf.asUnboundedArray();
byte[] maxUnboundedArray = maxBuf.asUnboundedArray();
var minBi = minBuf.isEmpty() ? BigInteger.ZERO : new BigInteger(minUnboundedArray, 0, minBuf.size());
var maxBi = new BigInteger(maxUnboundedArray, 0, maxBuf.size());
BigInteger rangeBi = maxBi.subtract(minBi);
int pointsCount = rangeBi.min(BigInteger.valueOf(threads).add(BigInteger.ONE)).intValueExact();
int segmentsCount = pointsCount - 1;
if (threads > 2 && segmentsCount <= 2) {
return Stream.of(range);
}
BigInteger pointsCountBi = BigInteger.valueOf(pointsCount);
byte[][] points = new byte[pointsCount][];
for (int i = 0; i < pointsCount; i++) {
BigInteger rangePoint = getRangePointAt(range, minBi, maxBi, rangeBi, pointsCount, pointsCountBi, i);
points[i] = rangePoint != null ? rangePoint.toByteArray() : null;
}
LLRange[] ranges = new LLRange[segmentsCount];
for (int i = 0; i < segmentsCount; i++) {
byte[] min = points[i];
byte[] max = points[i + 1];
if (min == null) {
ranges[i] = LLRange.to(Buf.wrap(max));
} else if (max == null) {
ranges[i] = LLRange.from(Buf.wrap(min));
} else {
ranges[i] = LLRange.of(Buf.wrap(min), Buf.wrap(max));
}
}
return Stream.of(ranges);
}
}
private long exactSizeAll(@Nullable LLSnapshot snapshot) {
if (LLUtils.isInNonBlockingThread()) {
throw new UnsupportedOperationException("Called exactSizeAll in a nonblocking thread");
}
try (var readOpts = LLUtils.generateCustomReadOptions(generateReadOptionsOrNew(snapshot), false, false, false)) {
if (LLUtils.MANUAL_READAHEAD) {
readOpts.setReadaheadSize(128 * 1024); // 128KiB
}
readOpts.setVerifyChecksums(VERIFY_CHECKSUMS_WHEN_NOT_NEEDED);
if (PARALLEL_EXACT_SIZE) {
return collectOn(ROCKSDB_POOL, parallelizeRange(LLRange.all()).map(range -> {
long partialCount = 0;
try (var rangeReadOpts = readOpts.copy()) {
try {
try (var rocksIterator = db.newRocksIterator(rangeReadOpts, range, false)) {
rocksIterator.seekToFirst();
while (rocksIterator.isValid()) {
partialCount++;
rocksIterator.next();
}
return partialCount;
}
} catch (RocksDBException ex) {
throw new CompletionException(new IOException("Failed to get size", ex));
}
}
}), fastSummingLong());
} else {
long count = 0;
try (var rocksIterator = db.newIterator(readOpts, null, null)) {
rocksIterator.seekToFirst();
while (rocksIterator.isValid()) {
count++;
rocksIterator.next();
}
return count;
} catch (RocksDBException ex) {
throw new IllegalStateException("Failed to read exact size", ex);
}
}
}
}
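/**
 * Pops the first entry of a range, queue-style. Sketch, assuming an initialized
 * {@code dict}:
 * <pre>{@code
 * LLEntry polled = dict.removeOne(LLRange.all()); // null if the range is empty
 * }</pre>
 */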
@Override
public LLEntry removeOne(LLRange range) {
assert !LLUtils.isInNonBlockingThread() : "Called removeOne in a nonblocking thread";
try (var readOpts = new LLReadOptions();
var writeOpts = new LLWriteOptions()) {
try (var rocksIterator = db.newIterator(readOpts, range.getMin(), range.getMax())) {
if (!LLLocalDictionary.PREFER_AUTO_SEEK_BOUND && range.hasMin()) {
rocksIterator.seekTo(range.getMin());
} else {
rocksIterator.seekToFirst();
}
if (!rocksIterator.isValid()) {
return null;
}
Buf key = rocksIterator.keyBuf().copy();
Buf value = rocksIterator.valueBuf().copy();
db.delete(writeOpts, key);
return LLEntry.of(key, value);
} catch (RocksDBException e) {
throw new DBException("Failed to remove key", e);
}
}
}
}