db536ee045

Summary: In `FileMetaData`, we keep track of the lowest-numbered blob file
referenced by the SST file in question for the purposes of BlobDB's garbage
collection in the `oldest_blob_file_number` field, which is updated in
`UpdateBoundaries`. However, with the current code, `BlobIndex` decoding errors
(or invalid blob file numbers) are swallowed in this method. The patch changes
this by propagating these errors and failing the corresponding
flush/compaction. (Note that since blob references are generated by the BlobDB
code and also parsed by `CompactionIterator`, in reality this can only happen
in the case of memory corruption.)

This change necessitated updating some unit tests that involved fake/corrupt
`BlobIndex` objects. Some of these just used a dummy string like `"blob_index"`
as a placeholder; these were replaced with real `BlobIndex`es. Others relied on
the earlier behavior to simulate corruption; these were replaced with
`SyncPoint`-based test code that corrupts a valid blob reference at read time.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9851

Test Plan: `make check`

Reviewed By: riversand963

Differential Revision: D35683671

Pulled By: ltamasi

fbshipit-source-id: f7387af9945c48e4d5c4cd864f1ba425c7ad51f6
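As an illustration of the error propagation described above, here is a rough
sketch. This is not the actual patch: the real logic lives in
`FileMetaData::UpdateBoundaries` in `db/version_edit.*`, and the exact
signature and the surrounding boundary bookkeeping may differ.

```cpp
// Sketch only: propagate BlobIndex decoding errors (and invalid blob file
// numbers) instead of swallowing them, so the caller can fail the
// flush/compaction.
Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
                                      SequenceNumber seqno,
                                      ValueType value_type) {
  if (value_type == kTypeBlobIndex) {
    BlobIndex blob_index;
    const Status s = blob_index.DecodeFrom(value);
    if (!s.ok()) {
      return s;  // previously, decoding errors were silently ignored here
    }

    if (!blob_index.IsInlined() && !blob_index.HasTTL()) {
      if (blob_index.file_number() == kInvalidBlobFileNumber) {
        return Status::Corruption("Invalid blob file number");
      }

      if (oldest_blob_file_number == kInvalidBlobFileNumber ||
          oldest_blob_file_number > blob_index.file_number()) {
        oldest_blob_file_number = blob_index.file_number();
      }
    }
  }

  // ... smallest/largest key and sequence number bookkeeping elided ...
  return Status::OK();
}
```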
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include <array>
#include <sstream>

#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "test_util/sync_point.h"
#include "utilities/fault_injection_env.h"

namespace ROCKSDB_NAMESPACE {

class DBBlobBasicTest : public DBTestBase {
 protected:
  DBBlobBasicTest()
      : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {}
};

TEST_F(DBBlobBasicTest, GetBlob) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key[] = "key";
  constexpr char blob_value[] = "blob_value";

  ASSERT_OK(Put(key, blob_value));

  ASSERT_OK(Flush());

  ASSERT_EQ(Get(key), blob_value);

  // Try again with no I/O allowed. The table and the necessary blocks should
  // already be in their respective caches; however, the blob itself can only
  // be read from the blob file, so the read should return Incomplete.
  ReadOptions read_options;
  read_options.read_tier = kBlockCacheTier;

  PinnableSlice result;
  ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
                  .IsIncomplete());
}

TEST_F(DBBlobBasicTest, MultiGetBlobs) {
  constexpr size_t min_blob_size = 6;

  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = min_blob_size;

  Reopen(options);

  // Put then retrieve three key-values. The first value is below the size
  // limit and is thus stored inline; the other two are stored separately as
  // blobs.
  constexpr size_t num_keys = 3;

  constexpr char first_key[] = "first_key";
  constexpr char first_value[] = "short";
  static_assert(sizeof(first_value) - 1 < min_blob_size,
                "first_value too long to be inlined");

  ASSERT_OK(Put(first_key, first_value));

  constexpr char second_key[] = "second_key";
  constexpr char second_value[] = "long_value";
  static_assert(sizeof(second_value) - 1 >= min_blob_size,
                "second_value too short to be stored as blob");

  ASSERT_OK(Put(second_key, second_value));

  constexpr char third_key[] = "third_key";
  constexpr char third_value[] = "other_long_value";
  static_assert(sizeof(third_value) - 1 >= min_blob_size,
                "third_value too short to be stored as blob");

  ASSERT_OK(Put(third_key, third_value));

  ASSERT_OK(Flush());

  ReadOptions read_options;

  std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};

  {
    std::array<PinnableSlice, num_keys> values;
    std::array<Status, num_keys> statuses;

    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
                  &values[0], &statuses[0]);

    ASSERT_OK(statuses[0]);
    ASSERT_EQ(values[0], first_value);

    ASSERT_OK(statuses[1]);
    ASSERT_EQ(values[1], second_value);

    ASSERT_OK(statuses[2]);
    ASSERT_EQ(values[2], third_value);
  }

  // Try again with no I/O allowed. The table and the necessary blocks should
  // already be in their respective caches. The first (inlined) value should
  // be successfully read; however, the two blob values could only be read
  // from the blob file, so for those the read should return Incomplete.
  read_options.read_tier = kBlockCacheTier;

  {
    std::array<PinnableSlice, num_keys> values;
    std::array<Status, num_keys> statuses;

    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
                  &values[0], &statuses[0]);

    ASSERT_OK(statuses[0]);
    ASSERT_EQ(values[0], first_value);

    ASSERT_TRUE(statuses[1].IsIncomplete());

    ASSERT_TRUE(statuses[2].IsIncomplete());
  }
}

#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
  Options options = GetDefaultOptions();

  // First, create an external SST file ["b"].
  const std::string file_path = dbname_ + "/test.sst";
  {
    SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions());
    Status s = sst_file_writer.Open(file_path);
    ASSERT_OK(s);
    ASSERT_OK(sst_file_writer.Put("b", "b_value"));
    ASSERT_OK(sst_file_writer.Finish());
  }

  options.enable_blob_files = true;
  options.min_blob_size = 1000;
  options.use_direct_reads = true;
  options.allow_ingest_behind = true;

  // Open DB with fixed-prefix sst-partitioner so that compaction will cut
  // new table file when encountering a new key whose 1-byte prefix changes.
  constexpr size_t key_len = 1;
  options.sst_partitioner_factory =
      NewSstPartitionerFixedPrefixFactory(key_len);

  Status s = TryReopen(options);
  if (s.IsInvalidArgument()) {
    ROCKSDB_GTEST_SKIP("This test requires direct IO support");
    return;
  }
  ASSERT_OK(s);

  constexpr size_t num_keys = 3;
  constexpr size_t blob_size = 3000;

  constexpr char first_key[] = "a";
  const std::string first_blob(blob_size, 'a');
  ASSERT_OK(Put(first_key, first_blob));

  constexpr char second_key[] = "b";
  const std::string second_blob(2 * blob_size, 'b');
  ASSERT_OK(Put(second_key, second_blob));

  constexpr char third_key[] = "d";
  const std::string third_blob(blob_size, 'd');
  ASSERT_OK(Put(third_key, third_blob));

  // first_blob, second_blob and third_blob in the same blob file.
  //       SST                      Blob file
  // L0  ["a",  "b",  "d"]   |'aaaa', 'bbbb', 'dddd'|
  //        |     |     |        ^       ^       ^
  //        |     |     |        |       |       |
  //        |     |     +--------|-------|-------+
  //        |     +--------------|-------+
  //        +--------------------+
  ASSERT_OK(Flush());

  constexpr char fourth_key[] = "c";
  const std::string fourth_blob(blob_size, 'c');
  ASSERT_OK(Put(fourth_key, fourth_blob));
  // fourth_blob in another blob file.
  //       SST                      Blob file          SST     Blob file
  // L0  ["a",  "b",  "d"]   |'aaaa', 'bbbb', 'dddd'|  ["c"]   |'cccc'|
  //        |     |     |        ^       ^       ^       |         ^
  //        |     |     |        |       |       |       |         |
  //        |     |     +--------|-------|-------+       +---------+
  //        |     +--------------|-------+
  //        +--------------------+
  ASSERT_OK(Flush());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
                              /*end=*/nullptr));

  // Due to the above sst partitioner, we get 4 L1 files. The blob files are
  // unchanged.
  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
  //                                ^       ^       ^         ^
  //                                |       |       |         |
  // L0                             |       |       |         |
  // L1  ["a"]  ["b"]  ["c"]        |       |     ["d"]       |
  //       |      |      |          |       |                 |
  //       |      |      +----------|-------|-----------------+
  //       |      +-----------------|-------+
  //       +------------------------+
  ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));

  {
    // Ingest the external SST file into bottommost level.
    std::vector<std::string> ext_files{file_path};
    IngestExternalFileOptions opts;
    opts.ingest_behind = true;
    ASSERT_OK(
        db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts));
  }

  // Now the database becomes as follows.
  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
  //                                ^       ^       ^         ^
  //                                |       |       |         |
  // L0                             |       |       |         |
  // L1  ["a"]  ["b"]  ["c"]        |       |     ["d"]       |
  //       |      |      |          |       |                 |
  //       |      |      +----------|-------|-----------------+
  //       |      +-----------------|-------+
  //       +------------------------+
  //
  // L6         ["b"]

  {
    // Compact ["b"] to bottommost level.
    Slice begin = Slice(second_key);
    Slice end = Slice(second_key);
    CompactRangeOptions cro;
    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
    ASSERT_OK(db_->CompactRange(cro, &begin, &end));
  }

  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
  //                                ^       ^       ^         ^
  //                                |       |       |         |
  // L0                             |       |       |         |
  // L1  ["a"]         ["c"]        |       |     ["d"]       |
  //       |             |          |       |                 |
  //       |             +----------|-------|-----------------+
  //       |      +-----------------|-------+
  //       +------|-----------------+
  //              |
  // L6         ["b"]
  ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1));
  ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6));

  bool called = false;
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->SetCallBack(
      "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) {
        auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg);
        assert(aligned_reqs);
        ASSERT_EQ(1, aligned_reqs->size());
        called = true;
      });
  SyncPoint::GetInstance()->EnableProcessing();

  std::array<Slice, num_keys> keys{{first_key, third_key, second_key}};

  {
    std::array<PinnableSlice, num_keys> values;
    std::array<Status, num_keys> statuses;

    // The MultiGet(), when constructing the KeyContexts, will process the keys
    // in the following order: a, d, b. The reason is that ["a"] and ["d"] are
    // in L1, while ["b"] resides in L6.
    // Consequently, the original FSReadRequest list prepared by
    // Version::MultiGetBlob() will be for "a", "d" and "b". It is unsorted as
    // follows:
    //
    // ["a", offset=30, len=3033],
    // ["d", offset=9096, len=3033],
    // ["b", offset=3063, len=6033]
    //
    // If we do not sort them before calling MultiRead() in DirectIO, then the
    // underlying IO merging logic will yield two requests.
    //
    // [offset=0, len=4096] (for "a")
    // [offset=0, len=12288] (result of merging the requests for "d" and "b")
    //
    // We need to sort them in Version::MultiGetBlob() so that the underlying
    // IO merging logic in DirectIO mode works as expected. The correct
    // behavior will be one aligned request:
    //
    // [offset=0, len=12288]

    db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
                  &values[0], &statuses[0]);

    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();

    ASSERT_TRUE(called);

    ASSERT_OK(statuses[0]);
    ASSERT_EQ(values[0], first_blob);

    ASSERT_OK(statuses[1]);
    ASSERT_EQ(values[1], third_blob);

    ASSERT_OK(statuses[2]);
    ASSERT_EQ(values[2], second_blob);
  }
}
#endif  // !ROCKSDB_LITE

TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr size_t kNumBlobFiles = 3;
  constexpr size_t kNumBlobsPerFile = 3;
  constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles;

  std::vector<std::string> key_strs;
  std::vector<std::string> value_strs;
  for (size_t i = 0; i < kNumBlobFiles; ++i) {
    for (size_t j = 0; j < kNumBlobsPerFile; ++j) {
      std::string key = "key" + std::to_string(i) + "_" + std::to_string(j);
      std::string value =
          "value_as_blob" + std::to_string(i) + "_" + std::to_string(j);
      ASSERT_OK(Put(key, value));
      key_strs.push_back(key);
      value_strs.push_back(value);
    }
    ASSERT_OK(Flush());
  }
  assert(key_strs.size() == kNumKeys);
  std::array<Slice, kNumKeys> keys;
  for (size_t i = 0; i < keys.size(); ++i) {
    keys[i] = key_strs[i];
  }
  std::array<PinnableSlice, kNumKeys> values;
  std::array<Status, kNumKeys> statuses;
  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), kNumKeys, &keys[0],
                &values[0], &statuses[0]);

  for (size_t i = 0; i < kNumKeys; ++i) {
    ASSERT_OK(statuses[i]);
    ASSERT_EQ(value_strs[i], values[i]);
  }
}

TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key[] = "key";
  constexpr char blob[] = "blob";

  ASSERT_OK(Put(key, blob));
  ASSERT_OK(Flush());
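
  // Simulate corruption by tampering with an otherwise valid blob reference
  // at read time: the sync point below strips the first byte of the blob
  // index that Version::Get is about to decode.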
  SyncPoint::GetInstance()->SetCallBack(
      "Version::Get::TamperWithBlobIndex", [](void* arg) {
        Slice* const blob_index = static_cast<Slice*>(arg);
        assert(blob_index);
        assert(!blob_index->empty());
        blob_index->remove_prefix(1);
      });
  SyncPoint::GetInstance()->EnableProcessing();

  PinnableSlice result;
  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
                  .IsCorruption());

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;
  options.create_if_missing = true;

  DestroyAndReopen(options);

  constexpr size_t kNumOfKeys = 3;
  std::array<std::string, kNumOfKeys> key_strs;
  std::array<std::string, kNumOfKeys> value_strs;
  std::array<Slice, kNumOfKeys + 1> keys;
  for (size_t i = 0; i < kNumOfKeys; ++i) {
    key_strs[i] = "foo" + std::to_string(i);
    value_strs[i] = "blob_value" + std::to_string(i);
    ASSERT_OK(Put(key_strs[i], value_strs[i]));
    keys[i] = key_strs[i];
  }

  constexpr char key[] = "key";
  constexpr char blob[] = "blob";
  ASSERT_OK(Put(key, blob));
  keys[kNumOfKeys] = key;

  ASSERT_OK(Flush());
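
  // Corrupt only the blob reference of the last key ("key") at read time; the
  // remaining keys should still be readable.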
  SyncPoint::GetInstance()->SetCallBack(
      "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) {
        KeyContext* const key_context = static_cast<KeyContext*>(arg);
        assert(key_context);
        assert(key_context->key);

        if (*(key_context->key) == key) {
          Slice* const blob_index = key_context->value;
          assert(blob_index);
          assert(!blob_index->empty());
          blob_index->remove_prefix(1);
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();

  std::array<PinnableSlice, kNumOfKeys + 1> values;
  std::array<Status, kNumOfKeys + 1> statuses;
  db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1,
                keys.data(), values.data(), statuses.data(),
                /*sorted_input=*/false);
  for (size_t i = 0; i < kNumOfKeys + 1; ++i) {
    if (i != kNumOfKeys) {
      ASSERT_OK(statuses[i]);
      ASSERT_EQ("blob_value" + std::to_string(i), values[i]);
    } else {
      ASSERT_TRUE(statuses[i].IsCorruption());
    }
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr size_t kNumOfKeys = 3;
  std::array<std::string, kNumOfKeys> key_bufs;
  std::array<std::string, kNumOfKeys> value_bufs;
  std::array<Slice, kNumOfKeys> keys;
  for (size_t i = 0; i < kNumOfKeys; ++i) {
    key_bufs[i] = "foo" + std::to_string(i);
    value_bufs[i] = "blob_value" + std::to_string(i);
    ASSERT_OK(Put(key_bufs[i], value_bufs[i]));
    keys[i] = key_bufs[i];
  }
  ASSERT_OK(Flush());

  std::array<PinnableSlice, kNumOfKeys> values;
  std::array<Status, kNumOfKeys> statuses;
  ReadOptions read_opts;
  read_opts.value_size_soft_limit = 1;
  db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys,
                keys.data(), values.data(), statuses.data(),
                /*sorted_input=*/true);
  for (const auto& s : statuses) {
    ASSERT_TRUE(s.IsAborted());
  }
}

TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) {
  constexpr uint64_t min_blob_size = 10;

  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = min_blob_size;

  Reopen(options);

  constexpr char key[] = "key";
  constexpr char blob[] = "short";
  static_assert(sizeof(blob) - 1 < min_blob_size,
                "Blob too long to be inlined");

  // Fake an inlined TTL blob index.
  std::string blob_index;

  constexpr uint64_t expiration = 1234567890;

  BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
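
  // Write the raw blob index directly via WriteBatchInternal, bypassing the
  // regular blob file writing path.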
  WriteBatch batch;
  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  ASSERT_OK(Flush());

  PinnableSlice result;
  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
                  .IsCorruption());
}

TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key[] = "key";

  // Fake a blob index referencing a non-existent blob file.
  std::string blob_index;

  constexpr uint64_t blob_file_number = 1000;
  constexpr uint64_t offset = 1234;
  constexpr uint64_t size = 5678;

  BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
                        kNoCompression);

  WriteBatch batch;
  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  ASSERT_OK(Flush());

  PinnableSlice result;
  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
                  .IsCorruption());
}

#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, GenerateIOTracing) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;
  std::string trace_file = dbname_ + "/io_trace_file";

  Reopen(options);
  {
    // Create IO trace file
    std::unique_ptr<TraceWriter> trace_writer;
    ASSERT_OK(
        NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
    ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));

    constexpr char key[] = "key";
    constexpr char blob_value[] = "blob_value";

    ASSERT_OK(Put(key, blob_value));
    ASSERT_OK(Flush());
    ASSERT_EQ(Get(key), blob_value);

    ASSERT_OK(db_->EndIOTrace());
    ASSERT_OK(env_->FileExists(trace_file));
  }
  {
    // Parse the trace file to check that file operations related to blob
    // files are recorded.
    std::unique_ptr<TraceReader> trace_reader;
    ASSERT_OK(
        NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader));
    IOTraceReader reader(std::move(trace_reader));

    IOTraceHeader header;
    ASSERT_OK(reader.ReadHeader(&header));
    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));

    // Read records.
    int blob_files_op_count = 0;
    Status status;
    while (true) {
      IOTraceRecord record;
      status = reader.ReadIOOp(&record);
      if (!status.ok()) {
        break;
      }
      if (record.file_name.find("blob") != std::string::npos) {
        blob_files_op_count++;
      }
    }
    // Assuming blob files will have Append, Close and then Read operations.
    ASSERT_GT(blob_files_op_count, 2);
  }
}
#endif  // !ROCKSDB_LITE

TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;
  options.create_if_missing = true;
  Reopen(options);

  ASSERT_OK(dbfull()->DisableFileDeletions());
  constexpr int kNumTableFiles = 2;
  for (int i = 0; i < kNumTableFiles; ++i) {
    for (char ch = 'a'; ch != 'c'; ++ch) {
      std::string key(1, ch);
      ASSERT_OK(Put(key, "value" + std::to_string(i)));
    }
    ASSERT_OK(Flush());
  }

  Close();

  std::vector<std::string> files;
  ASSERT_OK(env_->GetChildren(dbname_, &files));
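  // Find the path of the newest (highest-numbered) blob file; it is deleted
  // below to simulate the missing-file scenario.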
  std::string blob_file_path;
  uint64_t max_blob_file_num = kInvalidBlobFileNumber;
  for (const auto& fname : files) {
    uint64_t file_num = 0;
    FileType type;
    if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
        type == kBlobFile) {
      if (file_num > max_blob_file_num) {
        max_blob_file_num = file_num;
        blob_file_path = dbname_ + "/" + fname;
      }
    }
  }
  ASSERT_OK(env_->DeleteFile(blob_file_path));

  options.best_efforts_recovery = true;
  Reopen(options);
  std::string value;
  ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
  ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
}

TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) {
  Options options = GetDefaultOptions();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  ASSERT_OK(Put("Key1", "v1"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("Key1", "v2"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("Key1", "v3"));
  ASSERT_OK(Flush());

  std::string value;
  ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value));
  ASSERT_EQ(Get("Key1"), "v1,v2,v3");
}

TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
  constexpr size_t num_keys = 3;

  Options options = GetDefaultOptions();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  ASSERT_OK(Put("Key0", "v0_0"));
  ASSERT_OK(Put("Key1", "v1_0"));
  ASSERT_OK(Put("Key2", "v2_0"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("Key0", "v0_1"));
  ASSERT_OK(Merge("Key1", "v1_1"));
  ASSERT_OK(Flush());
  ASSERT_OK(Merge("Key0", "v0_2"));
  ASSERT_OK(Flush());

  std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
  std::array<PinnableSlice, num_keys> values;
  std::array<Status, num_keys> statuses;

  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
                &values[0], &statuses[0]);

  ASSERT_OK(statuses[0]);
  ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");

  ASSERT_OK(statuses[1]);
  ASSERT_EQ(values[1], "v1_0,v1_1");

  ASSERT_OK(statuses[2]);
  ASSERT_EQ(values[2], "v2_0");
}

#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, Properties) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key1[] = "key1";
  constexpr size_t key1_size = sizeof(key1) - 1;

  constexpr char key2[] = "key2";
  constexpr size_t key2_size = sizeof(key2) - 1;

  constexpr char key3[] = "key3";
  constexpr size_t key3_size = sizeof(key3) - 1;

  constexpr char blob[] = "00000000000000";
  constexpr size_t blob_size = sizeof(blob) - 1;

  constexpr char longer_blob[] = "00000000000000000000";
  constexpr size_t longer_blob_size = sizeof(longer_blob) - 1;

  ASSERT_OK(Put(key1, blob));
  ASSERT_OK(Put(key2, longer_blob));
  ASSERT_OK(Flush());

  constexpr size_t first_blob_file_expected_size =
      BlobLogHeader::kSize +
      BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) +
      blob_size +
      BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
      longer_blob_size + BlobLogFooter::kSize;

  ASSERT_OK(Put(key3, blob));
  ASSERT_OK(Flush());

  constexpr size_t second_blob_file_expected_size =
      BlobLogHeader::kSize +
      BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) +
      blob_size + BlobLogFooter::kSize;

  constexpr size_t total_expected_size =
      first_blob_file_expected_size + second_blob_file_expected_size;

  // Number of blob files
  uint64_t num_blob_files = 0;
  ASSERT_TRUE(
      db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files));
  ASSERT_EQ(num_blob_files, 2);

  // Total size of live blob files
  uint64_t live_blob_file_size = 0;
  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize,
                                  &live_blob_file_size));
  ASSERT_EQ(live_blob_file_size, total_expected_size);

  // Total amount of garbage in live blob files
  {
    uint64_t live_blob_file_garbage_size = 0;
    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
                                    &live_blob_file_garbage_size));
    ASSERT_EQ(live_blob_file_garbage_size, 0);
  }

  // Total size of all blob files across all versions
  // Note: this should be the same as above since we only have one
  // version at this point.
  uint64_t total_blob_file_size = 0;
  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
                                  &total_blob_file_size));
  ASSERT_EQ(total_blob_file_size, total_expected_size);

  // Delete key2 to create some garbage
  ASSERT_OK(Delete(key2));
  ASSERT_OK(Flush());

  constexpr Slice* begin = nullptr;
  constexpr Slice* end = nullptr;
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));

  constexpr size_t expected_garbage_size =
      BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
      longer_blob_size;

  constexpr double expected_space_amp =
      static_cast<double>(total_expected_size) /
      (total_expected_size - expected_garbage_size);

  // Blob file stats
  std::string blob_stats;
  ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats));

  std::ostringstream oss;
  oss << "Number of blob files: 2\nTotal size of blob files: "
      << total_expected_size
      << "\nTotal size of garbage in blob files: " << expected_garbage_size
      << "\nBlob file space amplification: " << expected_space_amp << '\n';

  ASSERT_EQ(blob_stats, oss.str());

  // Total amount of garbage in live blob files
  {
    uint64_t live_blob_file_garbage_size = 0;
    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
                                    &live_blob_file_garbage_size));
    ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size);
  }
}

TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key1[] = "key1";
  constexpr char key2[] = "key2";
  constexpr char key3[] = "key3";

  constexpr size_t key_size = sizeof(key1) - 1;
  static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2");
  static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3");

  constexpr char blob[] = "0000000000";
  constexpr size_t blob_size = sizeof(blob) - 1;

  ASSERT_OK(Put(key1, blob));
  ASSERT_OK(Flush());

  ASSERT_OK(Put(key2, blob));
  ASSERT_OK(Flush());

  // Create an iterator to keep the current version alive
  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  ASSERT_OK(iter->status());

  // Note: the Delete and subsequent compaction result in the first blob file
  // not making it to the final version. (It is still part of the previous
  // version kept alive by the iterator, though.) On the other hand, the Put
  // results in a third blob file.
  ASSERT_OK(Delete(key1));
  ASSERT_OK(Put(key3, blob));
  ASSERT_OK(Flush());

  constexpr Slice* begin = nullptr;
  constexpr Slice* end = nullptr;
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));

  // Total size of all blob files across all versions: between the two
  // versions, we should have three blob files of the same size with one blob
  // each. The version kept alive by the iterator contains the first and the
  // second blob file, while the final version contains the second and the
  // third blob file. (The second blob file is thus shared by the two versions
  // but should be counted only once.)
  uint64_t total_blob_file_size = 0;
  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
                                  &total_blob_file_size));
  ASSERT_EQ(total_blob_file_size,
            3 * (BlobLogHeader::kSize +
                 BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
                 blob_size + BlobLogFooter::kSize));
}
#endif  // !ROCKSDB_LITE

class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
                               public testing::WithParamInterface<std::string> {
 protected:
  DBBlobBasicIOErrorTest() : sync_point_(GetParam()) {
    fault_injection_env_.reset(new FaultInjectionTestEnv(env_));
  }
  ~DBBlobBasicIOErrorTest() { Close(); }

  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
  std::string sync_point_;
};

class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest {
 public:
  DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {}
};

INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest,
                        ::testing::ValuesIn(std::vector<std::string>{
                            "BlobFileReader::OpenFile:NewRandomAccessFile",
                            "BlobFileReader::GetBlob:ReadFromFile"}));

INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest,
                        ::testing::ValuesIn(std::vector<std::string>{
                            "BlobFileReader::OpenFile:NewRandomAccessFile",
                            "BlobFileReader::MultiGetBlob:ReadFromFile"}));

TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) {
  Options options;
  options.env = fault_injection_env_.get();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr char key[] = "key";
  constexpr char blob_value[] = "blob_value";

  ASSERT_OK(Put(key, blob_value));

  ASSERT_OK(Flush());
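
  // Depending on the test parameter, fail either opening the blob file or
  // reading the blob itself.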
  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
    fault_injection_env_->SetFilesystemActive(false,
                                              Status::IOError(sync_point_));
  });
  SyncPoint::GetInstance()->EnableProcessing();

  PinnableSlice result;
  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
                  .IsIOError());

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) {
  Options options = GetDefaultOptions();
  options.env = fault_injection_env_.get();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr size_t num_keys = 2;

  constexpr char first_key[] = "first_key";
  constexpr char first_value[] = "first_value";

  ASSERT_OK(Put(first_key, first_value));

  constexpr char second_key[] = "second_key";
  constexpr char second_value[] = "second_value";

  ASSERT_OK(Put(second_key, second_value));

  ASSERT_OK(Flush());

  std::array<Slice, num_keys> keys{{first_key, second_key}};
  std::array<PinnableSlice, num_keys> values;
  std::array<Status, num_keys> statuses;

  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
    fault_injection_env_->SetFilesystemActive(false,
                                              Status::IOError(sync_point_));
  });
  SyncPoint::GetInstance()->EnableProcessing();

  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
                &values[0], &statuses[0]);

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();

  ASSERT_TRUE(statuses[0].IsIOError());
  ASSERT_TRUE(statuses[1].IsIOError());
}

TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) {
  Options options = GetDefaultOptions();
  options.env = fault_injection_env_.get();
  options.enable_blob_files = true;
  options.min_blob_size = 0;

  Reopen(options);

  constexpr size_t num_keys = 2;

  constexpr char key1[] = "key1";
  constexpr char value1[] = "blob1";

  ASSERT_OK(Put(key1, value1));
  ASSERT_OK(Flush());

  constexpr char key2[] = "key2";
  constexpr char value2[] = "blob2";

  ASSERT_OK(Put(key2, value2));
  ASSERT_OK(Flush());

  std::array<Slice, num_keys> keys{{key1, key2}};
  std::array<PinnableSlice, num_keys> values;
  std::array<Status, num_keys> statuses;
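
  // Let the read from the first blob file succeed, then turn on fault
  // injection so that reading the second blob file fails.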
  bool first_blob_file = true;
  SyncPoint::GetInstance()->SetCallBack(
      sync_point_, [&first_blob_file, this](void* /* arg */) {
        if (first_blob_file) {
          first_blob_file = false;
          return;
        }
        fault_injection_env_->SetFilesystemActive(false,
                                                  Status::IOError(sync_point_));
      });
  SyncPoint::GetInstance()->EnableProcessing();

  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
                keys.data(), values.data(), statuses.data());
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  ASSERT_OK(statuses[0]);
  ASSERT_EQ(value1, values[0]);
  ASSERT_TRUE(statuses[1].IsIOError());
}

namespace {
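
// A compaction filter that forces the blob value to be read during
// compaction: it copies the existing value into new_value and returns
// kChangeValue.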
class ReadBlobCompactionFilter : public CompactionFilter {
 public:
  ReadBlobCompactionFilter() = default;
  const char* Name() const override {
    return "rocksdb.compaction.filter.read.blob";
  }
  CompactionFilter::Decision FilterV2(
      int /*level*/, const Slice& /*key*/, ValueType value_type,
      const Slice& existing_value, std::string* new_value,
      std::string* /*skip_until*/) const override {
    if (value_type != CompactionFilter::ValueType::kValue) {
      return CompactionFilter::Decision::kKeep;
    }
    assert(new_value);
    new_value->assign(existing_value.data(), existing_value.size());
    return CompactionFilter::Decision::kChangeValue;
  }
};

}  // anonymous namespace

TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
  Options options = GetDefaultOptions();
  options.env = fault_injection_env_.get();
  options.enable_blob_files = true;
  options.min_blob_size = 0;
  options.create_if_missing = true;
  std::unique_ptr<CompactionFilter> compaction_filter_guard(
      new ReadBlobCompactionFilter);
  options.compaction_filter = compaction_filter_guard.get();

  DestroyAndReopen(options);
  constexpr char key[] = "foo";
  constexpr char blob_value[] = "foo_blob_value";
  ASSERT_OK(Put(key, blob_value));
  ASSERT_OK(Flush());

  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
    fault_injection_env_->SetFilesystemActive(false,
                                              Status::IOError(sync_point_));
  });
  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
                                /*end=*/nullptr)
                  .IsIOError());

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}