Add a DB property to indicate number of background errors encountered
Summary: Add a property that reports the number of background errors encountered, to help users build their monitoring. Test Plan: Add a unit test; run `make all check`. Reviewers: haobo, igor, dhruba. Reviewed By: igor. CC: ljin, nkg-, yhchiang, leveldb. Differential Revision: https://reviews.facebook.net/D16959
This commit is contained in:
parent
1ec72b37b1
commit
71e6a34271
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
#include "db/db_impl.h"
|
#include "db/db_impl.h"
|
||||||
|
|
||||||
|
#define __STDC_FORMAT_MACROS
|
||||||
|
#include <inttypes.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -1806,8 +1808,10 @@ Status DBImpl::WaitForFlushMemTable() {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DBImpl::TEST_FlushMemTable() {
|
// Test-only entry point: flushes the current memtable through FlushMemTable(),
// forwarding `wait` as FlushOptions::wait.
// NOTE(review): presumably wait=false returns without blocking on the flush
// finishing -- confirm against FlushMemTable().
Status DBImpl::TEST_FlushMemTable(bool wait) {
|
||||||
return FlushMemTable(FlushOptions());
|
FlushOptions fo;
|
||||||
|
fo.wait = wait;
|
||||||
|
return FlushMemTable(fo);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DBImpl::TEST_WaitForFlushMemTable() {
|
Status DBImpl::TEST_WaitForFlushMemTable() {
|
||||||
@ -1904,10 +1908,13 @@ void DBImpl::BackgroundCallFlush() {
|
|||||||
// case this is an environmental problem and we do not want to
|
// case this is an environmental problem and we do not want to
|
||||||
// chew up resources for failed compactions for the duration of
|
// chew up resources for failed compactions for the duration of
|
||||||
// the problem.
|
// the problem.
|
||||||
|
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
|
||||||
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
||||||
Log(options_.info_log, "Waiting after background flush error: %s",
|
|
||||||
s.ToString().c_str());
|
|
||||||
mutex_.Unlock();
|
mutex_.Unlock();
|
||||||
|
// Report the flush failure together with the running background-error total.
// The first literal must end with ", ": adjacent string literals are
// concatenated verbatim, so without it the status text and "Accumulated..."
// would run together in the emitted log line.
Log(options_.info_log,
    "Waiting after background flush error: %s, "
    "Accumulated background error counts: %" PRIu64,
    s.ToString().c_str(), error_cnt);
|
||||||
log_buffer.FlushBufferToLog();
|
log_buffer.FlushBufferToLog();
|
||||||
LogFlush(options_.info_log);
|
LogFlush(options_.info_log);
|
||||||
env_->SleepForMicroseconds(1000000);
|
env_->SleepForMicroseconds(1000000);
|
||||||
@ -1978,11 +1985,14 @@ void DBImpl::BackgroundCallCompaction() {
|
|||||||
// case this is an environmental problem and we do not want to
|
// case this is an environmental problem and we do not want to
|
||||||
// chew up resources for failed compactions for the duration of
|
// chew up resources for failed compactions for the duration of
|
||||||
// the problem.
|
// the problem.
|
||||||
|
uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount();
|
||||||
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
||||||
mutex_.Unlock();
|
mutex_.Unlock();
|
||||||
log_buffer.FlushBufferToLog();
|
log_buffer.FlushBufferToLog();
|
||||||
Log(options_.info_log, "Waiting after background compaction error: %s",
|
Log(options_.info_log,
|
||||||
s.ToString().c_str());
|
"Waiting after background compaction error: %s, "
|
||||||
|
"Accumulated background error counts: %" PRIu64,
|
||||||
|
s.ToString().c_str(), error_cnt);
|
||||||
LogFlush(options_.info_log);
|
LogFlush(options_.info_log);
|
||||||
env_->SleepForMicroseconds(1000000);
|
env_->SleepForMicroseconds(1000000);
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
|
@ -109,7 +109,7 @@ class DBImpl : public DB {
|
|||||||
const Slice* end);
|
const Slice* end);
|
||||||
|
|
||||||
// Force current memtable contents to be flushed.
|
// Force current memtable contents to be flushed.
|
||||||
Status TEST_FlushMemTable();
|
Status TEST_FlushMemTable(bool wait = true);
|
||||||
|
|
||||||
// Wait for memtable compaction
|
// Wait for memtable compaction
|
||||||
Status TEST_WaitForFlushMemTable();
|
Status TEST_WaitForFlushMemTable();
|
||||||
|
@ -4188,6 +4188,11 @@ TEST(DBTest, NoSpace) {
|
|||||||
dbfull()->TEST_CompactRange(level, nullptr, nullptr);
|
dbfull()->TEST_CompactRange(level, nullptr, nullptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string property_value;
|
||||||
|
ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
|
||||||
|
ASSERT_EQ("5", property_value);
|
||||||
|
|
||||||
env_->no_space_.Release_Store(nullptr);
|
env_->no_space_.Release_Store(nullptr);
|
||||||
ASSERT_LT(CountFiles(), num_files + 3);
|
ASSERT_LT(CountFiles(), num_files + 3);
|
||||||
|
|
||||||
@ -4196,6 +4201,43 @@ TEST(DBTest, NoSpace) {
|
|||||||
} while (ChangeCompactOptions());
|
} while (ChangeCompactOptions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check background error counter bumped on flush failures.
|
||||||
|
// Verifies that a failing background flush moves the
// "rocksdb.background-errors" property from "0" to "1".
TEST(DBTest, NoSpaceFlush) {
  do {
    Options opts = CurrentOptions();
    opts.env = env_;
    opts.max_background_flushes = 1;
    Reopen(&opts);

    ASSERT_OK(Put("foo", "v1"));
    // Force out-of-space errors from here on.
    env_->no_space_.Release_Store(env_);

    // Nothing has failed in the background yet, so the counter reads 0.
    std::string bg_errors;
    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &bg_errors));
    ASSERT_EQ("0", bg_errors);

    // Request a flush, passing wait = false.
    dbfull()->TEST_FlushMemTable(false);

    // Poll for at most 300 milliseconds (300000 us), sleeping in slices of
    // at most 1000 us, until the counter turns from 0 to 1.
    for (int remaining_us = 300000; remaining_us > 0;) {
      int slice_us = remaining_us;
      if (slice_us > 1000) {
        slice_us = 1000;
      }
      remaining_us -= slice_us;
      env_->SleepForMicroseconds(slice_us);

      ASSERT_TRUE(
          db_->GetProperty("rocksdb.background-errors", &bg_errors));
      if (bg_errors == "1") {
        break;
      }
    }
    ASSERT_EQ("1", bg_errors);

    // Stop injecting out-of-space errors before the next option set.
    env_->no_space_.Release_Store(nullptr);
  } while (ChangeCompactOptions());
}
|
||||||
|
|
||||||
TEST(DBTest, NonWritableFileSystem) {
|
TEST(DBTest, NonWritableFileSystem) {
|
||||||
do {
|
do {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
|
@ -30,9 +30,11 @@ DBPropertyType GetPropertyType(const Slice& property) {
|
|||||||
} else if (in == "num-immutable-mem-table") {
|
} else if (in == "num-immutable-mem-table") {
|
||||||
return kNumImmutableMemTable;
|
return kNumImmutableMemTable;
|
||||||
} else if (in == "mem-table-flush-pending") {
|
} else if (in == "mem-table-flush-pending") {
|
||||||
return MemtableFlushPending;
|
return kMemtableFlushPending;
|
||||||
} else if (in == "compaction-pending") {
|
} else if (in == "compaction-pending") {
|
||||||
return CompactionPending;
|
return kCompactionPending;
|
||||||
|
} else if (in == "background-errors") {
|
||||||
|
return kBackgroundErrors;
|
||||||
}
|
}
|
||||||
return kUnknown;
|
return kUnknown;
|
||||||
}
|
}
|
||||||
@ -330,15 +332,21 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
|||||||
case kNumImmutableMemTable:
|
case kNumImmutableMemTable:
|
||||||
*value = std::to_string(imm.size());
|
*value = std::to_string(imm.size());
|
||||||
return true;
|
return true;
|
||||||
case MemtableFlushPending:
|
case kMemtableFlushPending:
|
||||||
// Return number of mem tables that are ready to flush (made immutable)
|
// Return number of mem tables that are ready to flush (made immutable)
|
||||||
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
|
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
|
||||||
return true;
|
return true;
|
||||||
case CompactionPending:
|
case kCompactionPending:
|
||||||
// 1 if the system already determines at least one compaction is needed.
|
// 1 if the system already determines at least one compaction is needed.
|
||||||
// 0 otherwise,
|
// 0 otherwise,
|
||||||
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
|
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
|
||||||
return true;
|
return true;
|
||||||
|
/////////////
|
||||||
|
case kBackgroundErrors:
|
||||||
|
// Accumulated number of errors in background flushes or compactions.
|
||||||
|
*value = std::to_string(GetBackgroundErrorCount());
|
||||||
|
return true;
|
||||||
|
/////////
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -26,9 +26,11 @@ enum DBPropertyType {
|
|||||||
kStats, // Return general statitistics of DB
|
kStats, // Return general statitistics of DB
|
||||||
kSsTables, // Return a human readable string of current SST files
|
kSsTables, // Return a human readable string of current SST files
|
||||||
kNumImmutableMemTable, // Return number of immutable mem tables
|
kNumImmutableMemTable, // Return number of immutable mem tables
|
||||||
MemtableFlushPending, // Return 1 if mem table flushing is pending, otherwise
|
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
|
||||||
|
// otherwise
|
||||||
// 0.
|
// 0.
|
||||||
CompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
|
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
|
||||||
|
kBackgroundErrors, // Return accumulated background errors encountered.
|
||||||
kUnknown,
|
kUnknown,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -49,6 +51,7 @@ class InternalStats {
|
|||||||
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
|
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
|
||||||
stall_leveln_slowdown_(num_levels, 0),
|
stall_leveln_slowdown_(num_levels, 0),
|
||||||
stall_leveln_slowdown_count_(num_levels, 0),
|
stall_leveln_slowdown_count_(num_levels, 0),
|
||||||
|
bg_error_count_(0),
|
||||||
number_levels_(num_levels),
|
number_levels_(num_levels),
|
||||||
statistics_(statistics),
|
statistics_(statistics),
|
||||||
env_(env),
|
env_(env),
|
||||||
@ -116,6 +119,10 @@ class InternalStats {
|
|||||||
stall_leveln_slowdown_count_[level] += micros;
|
stall_leveln_slowdown_count_[level] += micros;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read-only view of the running background-error total (bg_error_count_).
uint64_t GetBackgroundErrorCount() const {
  return bg_error_count_;
}
|
||||||
|
|
||||||
|
// Records one more background error and hands back the updated total.
uint64_t BumpAndGetBackgroundErrorCount() {
  bg_error_count_ += 1;
  return bg_error_count_;
}
|
||||||
|
|
||||||
bool GetProperty(DBPropertyType property_type, const Slice& property,
|
bool GetProperty(DBPropertyType property_type, const Slice& property,
|
||||||
std::string* value, VersionSet* version_set,
|
std::string* value, VersionSet* version_set,
|
||||||
const MemTableList& imm);
|
const MemTableList& imm);
|
||||||
@ -158,6 +165,13 @@ class InternalStats {
|
|||||||
std::vector<uint64_t> stall_leveln_slowdown_;
|
std::vector<uint64_t> stall_leveln_slowdown_;
|
||||||
std::vector<uint64_t> stall_leveln_slowdown_count_;
|
std::vector<uint64_t> stall_leveln_slowdown_count_;
|
||||||
|
|
||||||
|
// Total number of background errors encountered. Every time a flush task
|
||||||
|
// or compaction task fails, this counter is incremented. The failure can
|
||||||
|
// be caused by any possible reason, including file system errors, out of
|
||||||
|
// resources, or input file corruption. Failing when retrying the same flush
|
||||||
|
// or compaction will cause the counter to increase too.
|
||||||
|
uint64_t bg_error_count_;
|
||||||
|
|
||||||
int number_levels_;
|
int number_levels_;
|
||||||
Statistics* statistics_;
|
Statistics* statistics_;
|
||||||
Env* env_;
|
Env* env_;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user