Make mempurge a background process (equivalent to in-memory compaction). (#8505)

Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process called `MemPurge` (memtable garbage collection). This PR builds on that mempurge prototype.
In this PR, I turn the `mempurge` process into a background task, which improves performance because the mempurge no longer holds the db_mutex, and it removes severe restrictions of the previous iteration (including a scenario where the old mempurge failed when a memtable was purged while still referenced by an iterator/snapshot/...).
The mempurge process now resembles an in-memory compaction: the stack of immutable memtables is filtered, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% of its capacity (an arbitrary heuristic), the mempurge is aborted and a regular flush takes place instead; otherwise the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush, so the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by setting the `experimental_allow_mempurge` flag to `true`. When activated, the `MemPurge` process always runs when the flush reason is `kWriteBufferFull`.
Three unit tests confirm that this process supports the `Put`, `Get`, `Delete`, and `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
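
As a rough illustration of how the feature is exercised (not part of this commit), a minimal program along the lines of the `MemPurgeBasic` unit test might look like the sketch below. The `experimental_allow_mempurge` flag and the small `write_buffer_size` mirror this PR's tests; the database path, key names, and value sizes are arbitrary choices for the example.

  #include <cassert>
  #include <string>

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    // Small write buffer so that kWriteBufferFull flushes (and therefore
    // mempurge attempts) are triggered quickly: 1 MB, as in the updated tests.
    options.write_buffer_size = 1 << 20;
    // Opt in to the experimental MemPurge path: flushes caused by a full
    // write buffer first attempt an in-memory compaction of the immutable
    // memtables instead of writing an SST file.
    options.experimental_allow_mempurge = true;

    rocksdb::DB* db = nullptr;
    // "/tmp/mempurge_demo" is an arbitrary path chosen for this example.
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/mempurge_demo", &db);
    assert(s.ok());

    // Overwrite-heavy workload: the same small key set is rewritten with
    // large values, so most of the memtable payload is garbage that a
    // mempurge can discard instead of flushing it to level 0.
    const std::string big_value(10 * 1024, 'x');
    for (int i = 0; i < 1000; ++i) {
      for (int k = 1; k <= 9; ++k) {
        s = db->Put(rocksdb::WriteOptions(), "IamKey" + std::to_string(k),
                    big_value + std::to_string(i));
        assert(s.ok());
      }
    }

    delete db;
    return 0;
  }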

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505

Reviewed By: pdillinger

Differential Revision: D29619283

Pulled By: bjlemaire

fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
Author: Baptiste Lemaire, 2021-07-09 17:16:00 -07:00 (committed by Facebook GitHub Bot)
Parent: bb485e986a
Commit: 837705ad80
11 changed files with 497 additions and 356 deletions


@@ -443,7 +443,7 @@ bool SuperVersion::Unref() {
   return previous_refs == 1;
 }
 
-void SuperVersion::Cleanup(const bool noImmMemoryContribution) {
+void SuperVersion::Cleanup() {
   assert(refs.load(std::memory_order_relaxed) == 0);
   // Since this SuperVersion object is being deleted,
   // decrement reference to the immutable MemtableList
@@ -451,18 +451,9 @@ void SuperVersion::Cleanup(const bool noImmMemoryContribution) {
   imm->Unref(&to_delete);
   MemTable* m = mem->Unref();
   if (m != nullptr) {
-    // Typically, if the m memtable was not made
-    // immutable, and therefore was not added to the
-    // imm list, it does not contribute to the imm
-    // memory footprint (and actually is not part of
-    // the 'imm' MemtableList at all).
-    // At the moment, noImmMemoryContribution is only
-    // used by the experimental 'MemPurge' prototype.
-    if (!noImmMemoryContribution) {
-      auto* memory_usage = current->cfd()->imm()->current_memory_usage();
-      assert(*memory_usage >= m->ApproximateMemoryUsage());
-      *memory_usage -= m->ApproximateMemoryUsage();
-    }
+    auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+    assert(*memory_usage >= m->ApproximateMemoryUsage());
+    *memory_usage -= m->ApproximateMemoryUsage();
     to_delete.push_back(m);
   }
   current->Unref();
@@ -1272,7 +1263,7 @@ void ColumnFamilyData::InstallSuperVersion(
 void ColumnFamilyData::InstallSuperVersion(
     SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
-    const MutableCFOptions& mutable_cf_options, bool noImmMemoryContribution) {
+    const MutableCFOptions& mutable_cf_options) {
   SuperVersion* new_superversion = sv_context->new_superversion.release();
   new_superversion->db_mutex = db_mutex;
   new_superversion->mutable_cf_options = mutable_cf_options;
@@ -1302,7 +1293,7 @@ void ColumnFamilyData::InstallSuperVersion(
         new_superversion->write_stall_condition, GetName(), ioptions());
   }
   if (old_superversion->Unref()) {
-    old_superversion->Cleanup(noImmMemoryContribution);
+    old_superversion->Cleanup();
     sv_context->superversions_to_free.push_back(old_superversion);
   }
 }


@@ -222,10 +222,7 @@ struct SuperVersion {
   // Cleanup unrefs mem, imm and current. Also, it stores all memtables
   // that needs to be deleted in to_delete vector. Unrefing those
   // objects needs to be done in the mutex
-  // The 'noImmMemoryContribution' is set to true if the memtable being
-  // dereferenced in this SuperVersion was not added to the Immutable
-  // memtable list.
-  void Cleanup(bool noImmMemoryContribution = false);
+  void Cleanup();
   void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
             MemTableListVersion* new_imm, Version* new_current);
@@ -457,8 +454,7 @@ class ColumnFamilyData {
   // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
   void InstallSuperVersion(SuperVersionContext* sv_context,
                            InstrumentedMutex* db_mutex,
-                           const MutableCFOptions& mutable_cf_options,
-                           bool noImmMemoryContribution = false);
+                           const MutableCFOptions& mutable_cf_options);
   void InstallSuperVersion(SuperVersionContext* sv_context,
                            InstrumentedMutex* db_mutex);
@@ -524,6 +520,7 @@ class ColumnFamilyData {
   }
   ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
 
  private:
   friend class ColumnFamilySet;


@ -692,16 +692,17 @@ TEST_F(DBFlushTest, MemPurgeBasic) {
options.allow_concurrent_memtable_write = true; options.allow_concurrent_memtable_write = true;
// Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
options.write_buffer_size = 64 << 20; options.write_buffer_size = 1 << 20;
// Activate the MemPurge prototype. // Activate the MemPurge prototype.
options.experimental_allow_mempurge = true; options.experimental_allow_mempurge = true;
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
uint32_t mempurge_count = 0; uint32_t mempurge_count = 0;
uint32_t flush_count = 0; uint32_t sst_count = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::MemPurge", [&](void* /*arg*/) { mempurge_count++; }); "DBImpl::FlushJob:MemPurgeSuccessful",
[&](void* /*arg*/) { mempurge_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::FlushJob:Flush", [&](void* /*arg*/) { flush_count++; }); "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
std::string KEY1 = "IamKey1"; std::string KEY1 = "IamKey1";
@ -709,62 +710,120 @@ TEST_F(DBFlushTest, MemPurgeBasic) {
std::string KEY3 = "IamKey3"; std::string KEY3 = "IamKey3";
std::string KEY4 = "IamKey4"; std::string KEY4 = "IamKey4";
std::string KEY5 = "IamKey5"; std::string KEY5 = "IamKey5";
std::string VALUE1 = "IamValue1"; std::string KEY6 = "IamKey6";
std::string VALUE2 = "IamValue2"; std::string KEY7 = "IamKey7";
std::string KEY8 = "IamKey8";
std::string KEY9 = "IamKey9";
std::string RNDKEY1, RNDKEY2, RNDKEY3;
const std::string NOT_FOUND = "NOT_FOUND"; const std::string NOT_FOUND = "NOT_FOUND";
// Check simple operations (put-delete).
ASSERT_OK(Put(KEY1, VALUE1));
ASSERT_OK(Put(KEY2, VALUE2));
ASSERT_OK(Delete(KEY1));
ASSERT_OK(Put(KEY2, VALUE1));
ASSERT_OK(Put(KEY1, VALUE2));
ASSERT_OK(Flush());
ASSERT_EQ(Get(KEY1), VALUE2);
ASSERT_EQ(Get(KEY2), VALUE1);
ASSERT_OK(Delete(KEY1));
ASSERT_EQ(Get(KEY1), NOT_FOUND);
ASSERT_OK(Flush());
ASSERT_EQ(Get(KEY1), NOT_FOUND);
// Heavy overwrite workload, // Heavy overwrite workload,
// more than would fit in maximum allowed memtables. // more than would fit in maximum allowed memtables.
Random rnd(719); Random rnd(719);
const size_t NUM_REPEAT = 100000; const size_t NUM_REPEAT = 100;
const size_t RAND_VALUES_LENGTH = 512; const size_t RAND_KEYS_LENGTH = 57;
std::string p_v1, p_v2, p_v3, p_v4, p_v5; const size_t RAND_VALUES_LENGTH = 10240;
// Insertion of of K-V pairs, multiple times. std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1,
// Also insert DeleteRange p_rv2, p_rv3;
// Insert a very first set of keys that will be
// mempurged at least once.
p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
ASSERT_OK(Put(KEY1, p_v1));
ASSERT_OK(Put(KEY2, p_v2));
ASSERT_OK(Put(KEY3, p_v3));
ASSERT_OK(Put(KEY4, p_v4));
ASSERT_EQ(Get(KEY1), p_v1);
ASSERT_EQ(Get(KEY2), p_v2);
ASSERT_EQ(Get(KEY3), p_v3);
ASSERT_EQ(Get(KEY4), p_v4);
// Insertion of K-V pairs, multiple times (overwrites).
for (size_t i = 0; i < NUM_REPEAT; i++) { for (size_t i = 0; i < NUM_REPEAT; i++) {
// Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
ASSERT_OK(Put(KEY1, p_v1));
ASSERT_OK(Put(KEY2, p_v2));
ASSERT_OK(Put(KEY3, p_v3));
ASSERT_OK(Put(KEY4, p_v4));
ASSERT_OK(Put(KEY5, p_v5)); ASSERT_OK(Put(KEY5, p_v5));
ASSERT_OK(Put(KEY6, p_v6));
ASSERT_OK(Put(KEY7, p_v7));
ASSERT_OK(Put(KEY8, p_v8));
ASSERT_OK(Put(KEY9, p_v9));
ASSERT_EQ(Get(KEY1), p_v1); ASSERT_EQ(Get(KEY1), p_v1);
ASSERT_EQ(Get(KEY2), p_v2); ASSERT_EQ(Get(KEY2), p_v2);
ASSERT_EQ(Get(KEY3), p_v3); ASSERT_EQ(Get(KEY3), p_v3);
ASSERT_EQ(Get(KEY4), p_v4); ASSERT_EQ(Get(KEY4), p_v4);
ASSERT_EQ(Get(KEY5), p_v5); ASSERT_EQ(Get(KEY5), p_v5);
ASSERT_EQ(Get(KEY6), p_v6);
ASSERT_EQ(Get(KEY7), p_v7);
ASSERT_EQ(Get(KEY8), p_v8);
ASSERT_EQ(Get(KEY9), p_v9);
} }
// Check that there was at least one mempurge // Check that there was at least one mempurge
const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
// Check that there was no flush to storage. // Check that there was no SST files created during flush.
const uint32_t EXPECTED_FLUSH_COUNT = 0; const uint32_t EXPECTED_SST_COUNT = 0;
EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT); EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT);
EXPECT_EQ(flush_count, EXPECTED_FLUSH_COUNT); EXPECT_EQ(sst_count, EXPECTED_SST_COUNT);
const uint32_t mempurge_count_record = mempurge_count;
// Insertion of K-V pairs, no overwrites.
for (size_t i = 0; i < NUM_REPEAT; i++) {
// Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH);
RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH);
RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH);
p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH);
p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH);
p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH);
ASSERT_OK(Put(RNDKEY1, p_rv1));
ASSERT_OK(Put(RNDKEY2, p_rv2));
ASSERT_OK(Put(RNDKEY3, p_rv3));
ASSERT_EQ(Get(KEY1), p_v1);
ASSERT_EQ(Get(KEY2), p_v2);
ASSERT_EQ(Get(KEY3), p_v3);
ASSERT_EQ(Get(KEY4), p_v4);
ASSERT_EQ(Get(KEY5), p_v5);
ASSERT_EQ(Get(KEY6), p_v6);
ASSERT_EQ(Get(KEY7), p_v7);
ASSERT_EQ(Get(KEY8), p_v8);
ASSERT_EQ(Get(KEY9), p_v9);
ASSERT_EQ(Get(RNDKEY1), p_rv1);
ASSERT_EQ(Get(RNDKEY2), p_rv2);
ASSERT_EQ(Get(RNDKEY3), p_rv3);
}
// Assert that at least one flush to storage has been performed
ASSERT_GT(sst_count, EXPECTED_SST_COUNT);
// (which will consequently increase the number of mempurges recorded too).
ASSERT_EQ(mempurge_count, mempurge_count_record);
// Assert that there is no data corruption, even with
// a flush to storage.
ASSERT_EQ(Get(KEY1), p_v1);
ASSERT_EQ(Get(KEY2), p_v2);
ASSERT_EQ(Get(KEY3), p_v3);
ASSERT_EQ(Get(KEY4), p_v4);
ASSERT_EQ(Get(KEY5), p_v5);
ASSERT_EQ(Get(KEY6), p_v6);
ASSERT_EQ(Get(KEY7), p_v7);
ASSERT_EQ(Get(KEY8), p_v8);
ASSERT_EQ(Get(KEY9), p_v9);
ASSERT_EQ(Get(RNDKEY1), p_rv1);
ASSERT_EQ(Get(RNDKEY2), p_rv2);
ASSERT_EQ(Get(RNDKEY3), p_rv3);
Close(); Close();
} }
@ -780,17 +839,18 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
options.allow_concurrent_memtable_write = true; options.allow_concurrent_memtable_write = true;
// Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
options.write_buffer_size = 64 << 20; options.write_buffer_size = 1 << 20;
// Activate the MemPurge prototype. // Activate the MemPurge prototype.
options.experimental_allow_mempurge = true; options.experimental_allow_mempurge = true;
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
uint32_t mempurge_count = 0; uint32_t mempurge_count = 0;
uint32_t flush_count = 0; uint32_t sst_count = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::MemPurge", [&](void* /*arg*/) { mempurge_count++; }); "DBImpl::FlushJob:MemPurgeSuccessful",
[&](void* /*arg*/) { mempurge_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::FlushJob:Flush", [&](void* /*arg*/) { flush_count++; }); "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
std::string KEY1 = "ThisIsKey1"; std::string KEY1 = "ThisIsKey1";
@ -801,9 +861,9 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
const std::string NOT_FOUND = "NOT_FOUND"; const std::string NOT_FOUND = "NOT_FOUND";
Random rnd(117); Random rnd(117);
const size_t NUM_REPEAT = 200; const size_t NUM_REPEAT = 100;
const size_t RAND_VALUES_LENGTH = 512; const size_t RAND_VALUES_LENGTH = 10240;
bool atLeastOneFlush = false;
std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5; std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5;
int count = 0; int count = 0;
const int EXPECTED_COUNT_FORLOOP = 3; const int EXPECTED_COUNT_FORLOOP = 3;
@ -813,6 +873,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
ropt.pin_data = true; ropt.pin_data = true;
ropt.total_order_seek = true; ropt.total_order_seek = true;
Iterator* iter = nullptr; Iterator* iter = nullptr;
// Insertion of of K-V pairs, multiple times. // Insertion of of K-V pairs, multiple times.
// Also insert DeleteRange // Also insert DeleteRange
for (size_t i = 0; i < NUM_REPEAT; i++) { for (size_t i = 0; i < NUM_REPEAT; i++) {
@ -836,12 +897,6 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
KEY3)); KEY3));
ASSERT_OK(Delete(KEY1)); ASSERT_OK(Delete(KEY1));
// Flush (MemPurge) with a probability of 50%.
if (rnd.OneIn(2)) {
ASSERT_OK(Flush());
atLeastOneFlush = true;
}
ASSERT_EQ(Get(KEY1), NOT_FOUND); ASSERT_EQ(Get(KEY1), NOT_FOUND);
ASSERT_EQ(Get(KEY2), NOT_FOUND); ASSERT_EQ(Get(KEY2), NOT_FOUND);
ASSERT_EQ(Get(KEY3), p_v3b); ASSERT_EQ(Get(KEY3), p_v3b);
@ -875,19 +930,11 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
// Check that there was at least one mempurge // Check that there was at least one mempurge
const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
// Check that there was no flush to storage. // Check that there was no SST files created during flush.
const uint32_t EXPECTED_FLUSH_COUNT = 0; const uint32_t EXPECTED_SST_COUNT = 0;
if (atLeastOneFlush) { EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT);
EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT); EXPECT_EQ(sst_count, EXPECTED_SST_COUNT);
} else {
// Note that there isn't enough values added to
// automatically trigger a flush/MemPurge in the background.
// Therefore we can make the assumption that if we never
// called "Flush()", no mempurge happened.
EXPECT_EQ(mempurge_count, EXPECTED_FLUSH_COUNT);
}
EXPECT_EQ(flush_count, EXPECTED_FLUSH_COUNT);
// Additional test for the iterator+memPurge. // Additional test for the iterator+memPurge.
ASSERT_OK(Put(KEY2, p_v2)); ASSERT_OK(Put(KEY2, p_v2));
@ -911,6 +958,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
ASSERT_EQ(value, NOT_FOUND); ASSERT_EQ(value, NOT_FOUND);
count++; count++;
} }
// Expected count here is 4: KEY2, KEY3, KEY4, KEY5. // Expected count here is 4: KEY2, KEY3, KEY4, KEY5.
ASSERT_EQ(count, EXPECTED_COUNT_END); ASSERT_EQ(count, EXPECTED_COUNT_END);
if (iter) delete iter; if (iter) delete iter;
@ -974,6 +1022,10 @@ TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
std::string KEY3 = "ThisIsKey3"; std::string KEY3 = "ThisIsKey3";
std::string KEY4 = "ThisIsKey4"; std::string KEY4 = "ThisIsKey4";
std::string KEY5 = "ThisIsKey5"; std::string KEY5 = "ThisIsKey5";
std::string KEY6 = "ThisIsKey6";
std::string KEY7 = "ThisIsKey7";
std::string KEY8 = "ThisIsKey8";
std::string KEY9 = "ThisIsKey9";
const std::string NOT_FOUND = "NOT_FOUND"; const std::string NOT_FOUND = "NOT_FOUND";
options.statistics = CreateDBStatistics(); options.statistics = CreateDBStatistics();
@ -990,43 +1042,68 @@ TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
std::make_shared<ConditionalUpdateFilterFactory>(KEY4); std::make_shared<ConditionalUpdateFilterFactory>(KEY4);
// Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
options.write_buffer_size = 64 << 20; options.write_buffer_size = 1 << 20;
// Activate the MemPurge prototype. // Activate the MemPurge prototype.
options.experimental_allow_mempurge = true; options.experimental_allow_mempurge = true;
ASSERT_OK(TryReopen(options)); ASSERT_OK(TryReopen(options));
uint32_t mempurge_count = 0;
uint32_t sst_count = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::FlushJob:MemPurgeSuccessful",
[&](void* /*arg*/) { mempurge_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Random rnd(53); Random rnd(53);
const size_t NUM_REPEAT = 25; const size_t NUM_REPEAT = 1000;
const size_t RAND_VALUES_LENGTH = 128; const size_t RAND_VALUES_LENGTH = 10240;
std::string p_v1, p_v2, p_v3, p_v4, p_v5; std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9;
p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
ASSERT_OK(Put(KEY1, p_v1));
ASSERT_OK(Put(KEY2, p_v2));
ASSERT_OK(Put(KEY3, p_v3));
ASSERT_OK(Put(KEY4, p_v4));
ASSERT_OK(Put(KEY5, p_v5));
ASSERT_OK(Delete(KEY1));
// Insertion of of K-V pairs, multiple times. // Insertion of of K-V pairs, multiple times.
// Also insert DeleteRange
for (size_t i = 0; i < NUM_REPEAT; i++) { for (size_t i = 0; i < NUM_REPEAT; i++) {
// Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. // Create value strings of arbitrary
p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); // length RAND_VALUES_LENGTH bytes.
p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
ASSERT_OK(Put(KEY1, p_v1)); ASSERT_OK(Put(KEY6, p_v6));
ASSERT_OK(Put(KEY2, p_v2)); ASSERT_OK(Put(KEY7, p_v7));
ASSERT_OK(Put(KEY3, p_v3)); ASSERT_OK(Put(KEY8, p_v8));
ASSERT_OK(Put(KEY4, p_v4)); ASSERT_OK(Put(KEY9, p_v9));
ASSERT_OK(Put(KEY5, p_v5));
ASSERT_OK(Delete(KEY1)); ASSERT_OK(Delete(KEY7));
ASSERT_OK(Flush());
// Verify that the ConditionalUpdateCompactionFilter
// updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
ASSERT_EQ(Get(KEY1), NOT_FOUND);
ASSERT_EQ(Get(KEY2), NEW_VALUE);
ASSERT_EQ(Get(KEY3), NEW_VALUE);
ASSERT_EQ(Get(KEY4), p_v4);
ASSERT_EQ(Get(KEY5), p_v5);
} }
// Check that there was at least one mempurge
const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
// Check that there was no SST files created during flush.
const uint32_t EXPECTED_SST_COUNT = 0;
EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT);
EXPECT_EQ(sst_count, EXPECTED_SST_COUNT);
// Verify that the ConditionalUpdateCompactionFilter
// updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
ASSERT_EQ(Get(KEY1), NOT_FOUND);
ASSERT_EQ(Get(KEY2), NEW_VALUE);
ASSERT_EQ(Get(KEY3), NEW_VALUE);
ASSERT_EQ(Get(KEY4), p_v4);
ASSERT_EQ(Get(KEY5), p_v5);
} }
TEST_P(DBFlushDirectIOTest, DirectIO) { TEST_P(DBFlushDirectIOTest, DirectIO) {


@@ -548,12 +548,45 @@ Status DBImpl::CloseHelper() {
flush_scheduler_.Clear();
trim_history_scheduler_.Clear();
// For now, simply trigger a manual flush at close time
// on all the column families.
// TODO(bjlemaire): Check if this is needed. Also, in the
// future we can contemplate doing a more fine-grained
// flushing by first checking if there is a need for
// flushing (but need to implement something
// else than imm()->IsFlushPending() because the output
// memtables added to imm() dont trigger flushes).
if (immutable_db_options_.experimental_allow_mempurge) {
Status flush_ret;
mutex_.Unlock();
for (ColumnFamilyData* cf : *versions_->GetColumnFamilySet()) {
if (immutable_db_options_.atomic_flush) {
flush_ret = AtomicFlushMemTables({cf}, FlushOptions(),
FlushReason::kManualFlush);
if (!flush_ret.ok()) {
ROCKS_LOG_INFO(
immutable_db_options_.info_log,
"Atomic flush memtables failed upon closing (mempurge).");
}
} else {
flush_ret =
FlushMemTable(cf, FlushOptions(), FlushReason::kManualFlush);
if (!flush_ret.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Flush memtables failed upon closing (mempurge).");
}
}
}
mutex_.Lock();
}
while (!flush_queue_.empty()) {
  const FlushRequest& flush_req = PopFirstFromFlushQueue();
  for (const auto& iter : flush_req) {
    iter.first->UnrefAndTryDelete();
  }
}
while (!compaction_queue_.empty()) {
  auto cfd = PopFirstFromCompactionQueue();
  cfd->UnrefAndTryDelete();


@@ -1612,23 +1612,6 @@ class DBImpl : public DB {
   Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
 
-  // Memtable Garbage Collection algorithm: a MemPurge takes the memtable
-  // and filters (or "purge") the outdated bytes out of it. The output
-  // (the filtered bytes, or "useful payload") is then transfered into
-  // the new memtable "new_mem". This process is typically intended for
-  // workloads with heavy overwrites to save on IO cost resulting from
-  // expensive flush operations.
-  // "MemPurge" is an experimental feature still at a very early stage
-  // of development. At the moment it is only compatible with the Get, Put,
-  // Delete operations as well as Iterators and CompactionFilters.
-  // For this early version, "MemPurge" is called by setting the
-  // options.experimental_allow_mempurge flag as "true". When this is
-  // the case, ALL flush operations will be replaced by MemPurge operations.
-  // (for prototype stress-testing purposes). Therefore, we strongly
-  // recommend all users not to set this flag as true given that the MemPurge
-  // process has not matured yet.
-  Status MemPurge(ColumnFamilyData* cfd, MemTable* new_mem);
-
   void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
 
   // Force current memtable contents to be flushed.
@@ -1854,7 +1837,7 @@ class DBImpl : public DB {
   // state needs flush or compaction.
   void InstallSuperVersionAndScheduleWork(
       ColumnFamilyData* cfd, SuperVersionContext* sv_context,
-      const MutableCFOptions& mutable_cf_options, bool fromMemPurge = false);
+      const MutableCFOptions& mutable_cf_options);
 
   bool GetIntPropertyInternal(ColumnFamilyData* cfd,
                               const DBPropertyInfo& property_info,


@@ -3436,7 +3436,7 @@ void DBImpl::BuildCompactionJobInfo(
 
 void DBImpl::InstallSuperVersionAndScheduleWork(
     ColumnFamilyData* cfd, SuperVersionContext* sv_context,
-    const MutableCFOptions& mutable_cf_options, bool fromMemPurge) {
+    const MutableCFOptions& mutable_cf_options) {
   mutex_.AssertHeld();
 
   // Update max_total_in_memory_state_
@@ -3451,8 +3451,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   if (UNLIKELY(sv_context->new_superversion == nullptr)) {
     sv_context->NewSuperVersion();
   }
-  cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options,
-                           fromMemPurge);
+  cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
 
   // There may be a small data race here. The snapshot tricking bottommost
   // compaction may already be released here. But assuming there will always be


@@ -1737,184 +1737,6 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
}
#endif  // ROCKSDB_LITE
Status DBImpl::MemPurge(ColumnFamilyData* cfd, MemTable* new_mem) {
Status s;
assert(new_mem != nullptr);
JobContext job_context(next_job_id_.fetch_add(1), true);
std::vector<SequenceNumber> snapshot_seqs;
SequenceNumber earliest_write_conflict_snapshot;
SnapshotChecker* snapshot_checker;
GetSnapshotContext(&job_context, &snapshot_seqs,
&earliest_write_conflict_snapshot, &snapshot_checker);
// Grab current memtable
MemTable* m = cfd->mem();
SequenceNumber first_seqno = m->GetFirstSequenceNumber();
SequenceNumber earliest_seqno = m->GetEarliestSequenceNumber();
// Create two iterators, one for the memtable data (contains
// info from puts + deletes), and one for the memtable
// Range Tombstones (from DeleteRanges).
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
std::vector<InternalIterator*> memtables(1, m->NewIterator(ro, &arena));
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
auto* range_del_iter = m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
ScopedArenaIterator iter(
NewMergingIterator(&(cfd->internal_comparator()), memtables.data(),
static_cast<int>(memtables.size()), &arena));
auto* ioptions = cfd->ioptions();
// Place iterator at the First (meaning most recent) key node.
iter->SeekToFirst();
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
new CompactionRangeDelAggregator(&(cfd->internal_comparator()),
snapshot_seqs));
for (auto& rd_iter : range_del_iters) {
range_del_agg->AddTombstones(std::move(rd_iter));
}
// If there is valid data in the memtable,
// or at least range tombstones, copy over the info
// to the new memtable.
if (iter->Valid() || !range_del_agg->IsEmpty()) {
std::unique_ptr<CompactionFilter> compaction_filter;
if (ioptions->compaction_filter_factory != nullptr &&
ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
TableFileCreationReason::kFlush)) {
CompactionFilter::Context ctx;
ctx.is_full_compaction = false;
ctx.is_manual_compaction = false;
ctx.column_family_id = cfd->GetID();
ctx.reason = TableFileCreationReason::kFlush;
compaction_filter =
ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
if (compaction_filter != nullptr &&
!compaction_filter->IgnoreSnapshots()) {
s = Status::NotSupported(
"CompactionFilter::IgnoreSnapshots() = false is not supported "
"anymore.");
return s;
}
}
Env* env = immutable_db_options_.env;
assert(env);
MergeHelper merge(
env, (cfd->internal_comparator()).user_comparator(),
(ioptions->merge_operator).get(), compaction_filter.get(),
ioptions->logger, true /* internal key corruption is not ok */,
snapshot_seqs.empty() ? 0 : snapshot_seqs.back(), snapshot_checker);
CompactionIterator c_iter(
iter.get(), (cfd->internal_comparator()).user_comparator(), &merge,
kMaxSequenceNumber, &snapshot_seqs, earliest_write_conflict_snapshot,
snapshot_checker, env, ShouldReportDetailedTime(env, ioptions->stats),
true /* internal key corruption is not ok */, range_del_agg.get(),
nullptr, ioptions->allow_data_in_errors,
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr,
/*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
/*manual_compaction_canceled=*/nullptr, immutable_db_options_.info_log,
&(cfd->GetFullHistoryTsLow()));
c_iter.SeekToFirst();
mutex_.AssertHeld();
// Set earliest sequence number in the new memtable
// to be equal to the earliest sequence number of the
// memtable being flushed (See later if there is a need
// to update this number!).
new_mem->SetEarliestSequenceNumber(earliest_seqno);
// Likewise for first seq number.
new_mem->SetFirstSequenceNumber(first_seqno);
SequenceNumber new_first_seqno = kMaxSequenceNumber;
// Key transfer
for (; c_iter.Valid(); c_iter.Next()) {
const ParsedInternalKey ikey = c_iter.ikey();
const Slice value = c_iter.value();
new_first_seqno =
ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
// Should we update "OldestKeyTime" ????
s = new_mem->Add(
ikey.sequence, ikey.type, ikey.user_key, value,
nullptr, // KV protection info set as nullptr since it
// should only be useful for the first add to
// the original memtable.
false, // : allow concurrent_memtable_writes_
// Not seen as necessary for now.
nullptr, // get_post_process_info(m) must be nullptr
// when concurrent_memtable_writes is switched off.
nullptr); // hint, only used when concurrent_memtable_writes_
// is switched on.
if (!s.ok()) {
break;
}
}
// Check status and propagate
// potential error status from c_iter
if (!s.ok()) {
c_iter.status().PermitUncheckedError();
} else if (!c_iter.status().ok()) {
s = c_iter.status();
}
// Range tombstone transfer.
if (s.ok()) {
auto range_del_it = range_del_agg->NewIterator();
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
new_first_seqno =
tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
s = new_mem->Add(
tombstone.seq_, // Sequence number
kTypeRangeDeletion, // KV type
tombstone.start_key_, // Key is start key.
tombstone.end_key_, // Value is end key.
nullptr, // KV protection info set as nullptr since it
// should only be useful for the first add to
// the original memtable.
false, // : allow concurrent_memtable_writes_
// Not seen as necessary for now.
nullptr, // get_post_process_info(m) must be nullptr
// when concurrent_memtable_writes is switched off.
nullptr); // hint, only used when concurrent_memtable_writes_
// is switched on.
if (!s.ok()) {
break;
}
}
}
// Rectify the first sequence number, which (unlike the earliest seq
// number) needs to be present in the new memtable.
new_mem->SetFirstSequenceNumber(new_first_seqno);
}
// Note: if the mempurge was ineffective, meaning that there was no
// garbage to remove, and this new_mem needs to be flushed again,
// the new_mem->Add would have updated the flush status when it
// called "UpdateFlushState()" internally at the last Add() call.
// Therefore if the new mem needs to be flushed again, we mark
// the return status as "aborted", which will trigger the regular
// flush operation.
if (s.ok() && new_mem->ShouldScheduleFlush()) {
s = Status::Aborted(Slice("No garbage collected."));
}
return s;
}
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
// REQUIRES: this thread is currently at the front of the 2nd writer queue if
@@ -2114,48 +1936,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
   cfd->mem()->SetNextLogNumber(logfile_number_);
   assert(new_mem != nullptr);
 
-  // By default, it is assumed that the 'old' memtable
-  // will be added to the Imm memtable list and will therefore
-  // contribute to the Imm memory footprint.
-  bool noImmMemoryContribution = false;
-  // If MemPurge activated, purge and delete current memtable.
-  if (immutable_db_options_.experimental_allow_mempurge &&
-      ((cfd->GetFlushReason() == FlushReason::kOthers) ||
-       (cfd->GetFlushReason() == FlushReason::kManualFlush))) {
-    Status mempurge_s = MemPurge(cfd, new_mem);
-    if (mempurge_s.ok()) {
-      // If mempurge worked successfully,
-      // create sync point and decrement current memtable reference.
-      TEST_SYNC_POINT("DBImpl::MemPurge");
-      cfd->mem()->Unref();
-      // If the MemPurge is successful, the 'old' (purged) memtable
-      // is not added to the Imm memtable list and therefore
-      // does not contribute to the Imm memory cost anymore.
-      noImmMemoryContribution = true;
-    } else {
-      // If mempurge failed, go back to regular mem->imm->flush workflow.
-      assert(new_mem != nullptr);
-      delete new_mem;
-      SuperVersion* new_superversion =
-          context->superversion_context.new_superversion.release();
-      if (new_superversion != nullptr) {
-        delete new_superversion;
-      }
-      SequenceNumber seq = versions_->LastSequence();
-      new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
-      assert(new_mem != nullptr);
-      context->superversion_context.NewSuperVersion();
-      cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
-    }
-  } else {
-    // Else make the memtable immutable and proceed as usual.
-    cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
-  }
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
   new_mem->Ref();
   cfd->SetMemtable(new_mem);
   InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
-                                     mutable_cf_options,
-                                     noImmMemoryContribution);
+                                     mutable_cf_options);
 #ifndef ROCKSDB_LITE
   mutex_.Unlock();


@@ -227,9 +227,25 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
     prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
     prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
   }
-  // This will release and re-acquire the mutex.
-  Status s = WriteLevel0Table();
+  Status mempurge_s = Status::NotFound("No MemPurge.");
+  if (db_options_.experimental_allow_mempurge &&
+      (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
+      (!mems_.empty())) {
+    mempurge_s = MemPurge();
+    if (!mempurge_s.ok()) {
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "Mempurge process unsuccessful: %s\n",
+                     mempurge_s.ToString().c_str());
+    }
+  }
+  Status s;
+  if (mempurge_s.ok()) {
+    base_->Unref();
+    s = Status::OK();
+  } else {
+    // This will release and re-acquire the mutex.
+    s = WriteLevel0Table();
+  }
 
   if (s.ok() && cfd_->IsDropped()) {
     s = Status::ColumnFamilyDropped("Column family dropped during compaction");
@@ -306,6 +322,237 @@ void FlushJob::Cancel() {
  base_->Unref();
}
Status FlushJob::MemPurge() {
Status s;
db_mutex_->AssertHeld();
db_mutex_->Unlock();
assert(!mems_.empty());
MemTable* new_mem = nullptr;
// Create two iterators, one for the memtable data (contains
// info from puts + deletes), and one for the memtable
// Range Tombstones (from DeleteRanges).
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
std::vector<InternalIterator*> memtables;
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
for (MemTable* m : mems_) {
memtables.push_back(m->NewIterator(ro, &arena));
auto* range_del_iter = m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
}
assert(!memtables.empty());
SequenceNumber first_seqno = mems_[0]->GetFirstSequenceNumber();
SequenceNumber earliest_seqno = mems_[0]->GetEarliestSequenceNumber();
ScopedArenaIterator iter(
NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
static_cast<int>(memtables.size()), &arena));
auto* ioptions = cfd_->ioptions();
// Place iterator at the First (meaning most recent) key node.
iter->SeekToFirst();
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
existing_snapshots_));
for (auto& rd_iter : range_del_iters) {
range_del_agg->AddTombstones(std::move(rd_iter));
}
// If there is valid data in the memtable,
// or at least range tombstones, copy over the info
// to the new memtable.
if (iter->Valid() || !range_del_agg->IsEmpty()) {
// Arbitrary heuristic: maxSize is 60% of capacity.
size_t maxSize = ((mutable_cf_options_.write_buffer_size + 6U) / 10U);
std::unique_ptr<CompactionFilter> compaction_filter;
if (ioptions->compaction_filter_factory != nullptr &&
ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
TableFileCreationReason::kFlush)) {
CompactionFilter::Context ctx;
ctx.is_full_compaction = false;
ctx.is_manual_compaction = false;
ctx.column_family_id = cfd_->GetID();
ctx.reason = TableFileCreationReason::kFlush;
compaction_filter =
ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
if (compaction_filter != nullptr &&
!compaction_filter->IgnoreSnapshots()) {
s = Status::NotSupported(
"CompactionFilter::IgnoreSnapshots() = false is not supported "
"anymore.");
return s;
}
}
// mems are ordered by increasing ID, so mems_[0]->GetID
// returns the smallest memtable ID.
new_mem =
new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
mutable_cf_options_, cfd_->write_buffer_mgr(),
mems_[0]->GetEarliestSequenceNumber(), cfd_->GetID());
assert(new_mem != nullptr);
Env* env = db_options_.env;
assert(env);
MergeHelper merge(
env, (cfd_->internal_comparator()).user_comparator(),
(ioptions->merge_operator).get(), compaction_filter.get(),
ioptions->logger, true /* internal key corruption is not ok */,
existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
snapshot_checker_);
CompactionIterator c_iter(
iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
kMaxSequenceNumber, &existing_snapshots_,
earliest_write_conflict_snapshot_, snapshot_checker_, env,
ShouldReportDetailedTime(env, ioptions->stats),
true /* internal key corruption is not ok */, range_del_agg.get(),
nullptr, ioptions->allow_data_in_errors,
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr,
/*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
/*manual_compaction_canceled=*/nullptr, ioptions->info_log,
&(cfd_->GetFullHistoryTsLow()));
// Set earliest sequence number in the new memtable
// to be equal to the earliest sequence number of the
// memtable being flushed (See later if there is a need
// to update this number!).
new_mem->SetEarliestSequenceNumber(earliest_seqno);
// Likewise for first seq number.
new_mem->SetFirstSequenceNumber(first_seqno);
SequenceNumber new_first_seqno = kMaxSequenceNumber;
c_iter.SeekToFirst();
// Key transfer
for (; c_iter.Valid(); c_iter.Next()) {
const ParsedInternalKey ikey = c_iter.ikey();
const Slice value = c_iter.value();
new_first_seqno =
ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
// Should we update "OldestKeyTime" ???? -> timestamp appear
// to still be an "experimental" feature.
s = new_mem->Add(
ikey.sequence, ikey.type, ikey.user_key, value,
nullptr, // KV protection info set as nullptr since it
// should only be useful for the first add to
// the original memtable.
false, // : allow concurrent_memtable_writes_
// Not seen as necessary for now.
nullptr, // get_post_process_info(m) must be nullptr
// when concurrent_memtable_writes is switched off.
nullptr); // hint, only used when concurrent_memtable_writes_
// is switched on.
if (!s.ok()) {
break;
}
// If new_mem has size greater than maxSize,
// then rollback to regular flush operation,
// and destroy new_mem.
if (new_mem->ApproximateMemoryUsage() > maxSize) {
s = Status::Aborted("Mempurge filled more than one memtable.");
break;
}
}
// Check status and propagate
// potential error status from c_iter
if (!s.ok()) {
c_iter.status().PermitUncheckedError();
} else if (!c_iter.status().ok()) {
s = c_iter.status();
}
// Range tombstone transfer.
if (s.ok()) {
auto range_del_it = range_del_agg->NewIterator();
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
new_first_seqno =
tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
s = new_mem->Add(
tombstone.seq_, // Sequence number
kTypeRangeDeletion, // KV type
tombstone.start_key_, // Key is start key.
tombstone.end_key_, // Value is end key.
nullptr, // KV protection info set as nullptr since it
// should only be useful for the first add to
// the original memtable.
false, // : allow concurrent_memtable_writes_
// Not seen as necessary for now.
nullptr, // get_post_process_info(m) must be nullptr
// when concurrent_memtable_writes is switched off.
nullptr); // hint, only used when concurrent_memtable_writes_
// is switched on.
if (!s.ok()) {
break;
}
// If new_mem has size greater than maxSize,
// then rollback to regular flush operation,
// and destroy new_mem.
if (new_mem->ApproximateMemoryUsage() > maxSize) {
s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
break;
}
}
}
// If everything happened smoothly and new_mem contains valid data,
// decide if it is flushed to storage or kept in the imm()
// memtable list (memory).
if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) {
// Rectify the first sequence number, which (unlike the earliest seq
// number) needs to be present in the new memtable.
new_mem->SetFirstSequenceNumber(new_first_seqno);
// The new_mem is added to the list of immutable memtables
// only if it filled at less than 60% capacity (arbitrary heuristic).
if (new_mem->ApproximateMemoryUsage() < maxSize) {
db_mutex_->Lock();
cfd_->imm()
->Add(new_mem, &job_context_->memtables_to_free, false /* trigger_flush. Adding this memtable will not trigger any flush */);
new_mem->Ref();
db_mutex_->Unlock();
} else {
s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
if (new_mem) {
job_context_->memtables_to_free.push_back(new_mem);
}
}
} else {
// In this case, the newly allocated new_mem is empty.
assert(new_mem != nullptr);
job_context_->memtables_to_free.push_back(new_mem);
}
}
// Reacquire the mutex for WriteLevel0 function.
db_mutex_->Lock();
// If mempurge successful, don't write input tables to level0,
// but write any full output table to level0.
if (s.ok()) {
TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful");
} else {
TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
}
return s;
}
Status FlushJob::WriteLevel0Table() {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_FLUSH_WRITE_L0);
@@ -362,7 +609,7 @@ Status FlushJob::WriteLevel0Table() {
   {
     ScopedArenaIterator iter(
-        NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+        NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
                            static_cast<int>(memtables.size()), &arena));
     ROCKS_LOG_INFO(db_options_.info_log,
                    "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
@@ -470,6 +717,7 @@ Status FlushJob::WriteLevel0Table() {
   const bool has_output = meta_.fd.GetFileSize() > 0;
 
   if (s.ok() && has_output) {
+    TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
     // if we have more than 1 background thread, then we cannot
     // insert files directly into higher levels because some other
     // threads could be concurrently producing compacted files for


@@ -101,6 +101,28 @@ class FlushJob {
   void ReportFlushInputSize(const autovector<MemTable*>& mems);
   void RecordFlushIOStats();
   Status WriteLevel0Table();
+  // Memtable Garbage Collection algorithm: a MemPurge takes the list
+  // of immutable memtables and filters out (or "purges") the outdated bytes.
+  // The output (the filtered bytes, or "useful payload") is then
+  // transferred into a new memtable. If this memtable is filled, the
+  // mempurge is aborted and rerouted to a regular flush process. Else,
+  // depending on the heuristics, it is placed onto the immutable memtable
+  // list. The addition to the imm list will not trigger a flush operation.
+  // The flush of the imm list will instead be triggered once the mutable
+  // memtable is added to the imm list.
+  // This process is typically intended for workloads with heavy overwrites
+  // when we want to avoid SSD writes (and reads) as much as possible.
+  // "MemPurge" is an experimental feature still at a very early stage
+  // of development. At the moment it is only compatible with the Get, Put,
+  // Delete operations as well as Iterators and CompactionFilters.
+  // For this early version, "MemPurge" is called by setting the
+  // options.experimental_allow_mempurge flag as "true". When this is
+  // the case, ALL automatic flush operations (kWriteBufferFull) will
+  // first go through the MemPurge process. Therefore, we strongly
+  // recommend all users not to set this flag as true given that the MemPurge
+  // process has not matured yet.
+  Status MemPurge();
 #ifndef ROCKSDB_LITE
   std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
 #endif  // !ROCKSDB_LITE


@@ -516,7 +516,8 @@ Status MemTableList::TryInstallMemtableFlushResults(
 }
 
 // New memtables are inserted at the front of the list.
-void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete,
+                       bool trigger_flush) {
   assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
   InstallNewVersion();
   // this method is used to move mutable memtable into an immutable list.
@@ -527,7 +528,8 @@ void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
   current_->Add(m, to_delete);
   m->MarkImmutable();
   num_flush_not_started_++;
-  if (num_flush_not_started_ == 1) {
+  if (num_flush_not_started_ > 0 && trigger_flush) {
     imm_flush_needed.store(true, std::memory_order_release);
   }
   UpdateCachedValuesFromMemTableListVersion();


@@ -272,7 +272,11 @@ class MemTableList {
   // New memtables are inserted at the front of the list.
   // Takes ownership of the referenced held on *m by the caller of Add().
-  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+  // By default, adding memtables will flag that the memtable list needs to be
+  // flushed, but in certain situations, like after a mempurge, we may want to
+  // avoid flushing the memtable list upon addition of a memtable.
+  void Add(MemTable* m, autovector<MemTable*>* to_delete,
+           bool trigger_flush = true);
 
   // Returns an estimate of the number of bytes of data in use.
   size_t ApproximateMemoryUsage();