Blob db create a snapshot before every read

Summary:
If GC kicks in between

* A Get() reads index entry from base db.
* The Get() read from a blob file

The GC can delete the corresponding blob file, making the key not found. Fortunately we have existing logic to avoid deleting a blob file if it is referenced by a snapshot. So the fix is to explicitly create a snapshot before reading index entry from base db.
Closes https://github.com/facebook/rocksdb/pull/2754

Differential Revision: D5655956

Pulled By: yiwu-arbug

fbshipit-source-id: e4ccbc51331362542e7343175bbcbdea5830f544
This commit is contained in:
yiwu-arbug 2017-08-20 18:12:38 -07:00 committed by Facebook Github Bot
parent 4624ae52c9
commit 5b68b114f1
3 changed files with 197 additions and 38 deletions

View File

@ -305,8 +305,8 @@ void BlobDBImpl::StartBackgroundTasks() {
kDeleteCheckPeriodMillisecs,
std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1));
tqueue_.add(
kDeleteObsoletedFilesPeriodMillisecs,
std::bind(&BlobDBImpl::DeleteObsFiles, this, std::placeholders::_1));
kDeleteObsoleteFilesPeriodMillisecs,
std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
tqueue_.add(kSanityCheckPeriodMillisecs,
std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
tqueue_.add(kWriteAmplificationStatsPeriodMillisecs,
@ -1117,15 +1117,25 @@ Status BlobDBImpl::AppendSN(const std::shared_ptr<BlobFile>& bfile,
}
std::vector<Status> BlobDBImpl::MultiGet(
const ReadOptions& options,
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) {
// Get a snapshot to avoid blob file get deleted between we
// fetch and index entry and reading from the file.
ReadOptions ro(read_options);
bool snapshot_created = SetSnapshotIfNeeded(&ro);
std::vector<std::string> values_lsm;
values_lsm.resize(keys.size());
auto statuses = db_->MultiGet(options, column_family, keys, &values_lsm);
auto statuses = db_->MultiGet(ro, column_family, keys, &values_lsm);
TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:1");
TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:2");
values->resize(keys.size());
assert(statuses.size() == keys.size());
for (size_t i = 0; i < keys.size(); ++i) {
if (!statuses[i].ok()) continue;
if (!statuses[i].ok()) {
continue;
}
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
auto cfd = cfh->cfd();
@ -1133,9 +1143,21 @@ std::vector<Status> BlobDBImpl::MultiGet(
Status s = CommonGet(cfd, keys[i], values_lsm[i], &((*values)[i]));
statuses[i] = s;
}
if (snapshot_created) {
db_->ReleaseSnapshot(ro.snapshot);
}
return statuses;
}
bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) {
assert(read_options != nullptr);
if (read_options->snapshot != nullptr) {
return false;
}
read_options->snapshot = db_->GetSnapshot();
return true;
}
Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key,
const std::string& index_entry, std::string* value,
SequenceNumber* sequence) {
@ -1172,11 +1194,6 @@ Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key,
bfile = hitr->second;
}
if (bfile->Obsolete()) {
return Status::NotFound(
"Blob Not Found as blob file was garbage collected");
}
// 0 - size
if (!handle.size() && value != nullptr) {
value->clear();
@ -1274,25 +1291,30 @@ Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key,
return s;
}
Status BlobDBImpl::Get(const ReadOptions& options,
Status BlobDBImpl::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
// Get a snapshot to avoid blob file get deleted between we
// fetch and index entry and reading from the file.
// TODO(yiwu): For Get() retry if file not found would be a simpler strategy.
ReadOptions ro(read_options);
bool snapshot_created = SetSnapshotIfNeeded(&ro);
Status s;
std::string index_entry;
s = db_->Get(options, column_family, key, &index_entry);
if (!s.ok()) {
if (debug_level_ >= 3)
ROCKS_LOG_WARN(db_options_.info_log,
"Get Failed on LSM KEY: %s status: '%s'",
key.ToString().c_str(), s.ToString().c_str());
return s;
s = db_->Get(ro, column_family, key, &index_entry);
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
if (s.ok()) {
s = CommonGet(cfd, key, index_entry, value->GetSelf());
value->PinSelf();
}
if (snapshot_created) {
db_->ReleaseSnapshot(ro.snapshot);
}
s = CommonGet(cfd, key, index_entry, value->GetSelf());
value->PinSelf();
return s;
}
@ -1302,6 +1324,8 @@ Slice BlobDBIterator::value() const {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh_);
auto cfd = cfh->cfd();
TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:1");
TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:2");
Status s = db_impl_->CommonGet(cfd, iter_->key(), index_entry.ToString(false),
&vpart_);
return Slice(vpart_);
@ -1977,7 +2001,7 @@ bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, uint64_t now,
return false;
}
std::pair<bool, int64_t> BlobDBImpl::DeleteObsFiles(bool aborted) {
std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
if (aborted) return std::make_pair(false, -1);
{
@ -2002,6 +2026,7 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsFiles(bool aborted) {
}
}
blob_files_.erase(bfile->BlobFileNumber());
Status s = env_->DeleteFile(bfile->PathName());
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
@ -2026,7 +2051,9 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsFiles(bool aborted) {
// put files back into obsolete if for some reason, delete failed
if (!tobsolete.empty()) {
WriteLock wl(&mutex_);
for (auto bfile : tobsolete) obsolete_files_.push_front(bfile);
for (auto bfile : tobsolete) {
obsolete_files_.push_front(bfile);
}
}
return std::make_pair(!aborted, -1);
@ -2212,8 +2239,6 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
WriteLock wl(&mutex_);
for (auto bfile : obsoletes) {
bool last_file = (bfile == obsoletes.back());
// remove from global list so writers
blob_files_.erase(bfile->BlobFileNumber());
if (!evict_cb) {
bfile->SetCanBeDeleted();
@ -2231,10 +2256,14 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
return std::make_pair(true, -1);
}
Iterator* BlobDBImpl::NewIterator(const ReadOptions& opts,
Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
return new BlobDBIterator(db_->NewIterator(opts, column_family),
column_family, this);
// Get a snapshot to avoid blob file get deleted between we
// fetch and index entry and reading from the file.
ReadOptions ro(read_options);
bool snapshot_created = SetSnapshotIfNeeded(&ro);
return new BlobDBIterator(db_->NewIterator(ro, column_family), column_family,
this, snapshot_created, ro.snapshot);
}
Status DestroyBlobDB(const std::string& dbname, const Options& options,
@ -2283,6 +2312,7 @@ Status BlobDBImpl::TEST_GetSequenceNumber(const Slice& key,
}
std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetBlobFiles() const {
ReadLock l(&mutex_);
std::vector<std::shared_ptr<BlobFile>> blob_files;
for (auto& p : blob_files_) {
blob_files.emplace_back(p.second);
@ -2300,6 +2330,10 @@ std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetObsoleteFiles()
return obsolete_files;
}
void BlobDBImpl::TEST_DeleteObsoleteFiles() {
DeleteObsoleteFiles(false /*abort*/);
}
void BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile) {
CloseSeqWrite(bfile, false /*abort*/);
}
@ -2310,6 +2344,16 @@ Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr<BlobFile>& bfile,
}
void BlobDBImpl::TEST_RunGC() { RunGC(false /*abort*/); }
void BlobDBImpl::TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile) {
uint64_t number = bfile->BlobFileNumber();
assert(blob_files_.count(number) > 0);
bfile->SetCanBeDeleted();
{
WriteLock l(&mutex_);
obsolete_files_.push_back(bfile);
}
}
#endif // !NDEBUG
} // namespace blob_db

View File

@ -200,7 +200,7 @@ class BlobDBImpl : public BlobDB {
static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000;
// how often to schedule delete obs files periods
static constexpr uint32_t kDeleteObsoletedFilesPeriodMillisecs = 10 * 1000;
static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000;
// how often to schedule check seq files period
static constexpr uint32_t kCheckSeqFilesPeriodMillisecs = 10 * 1000;
@ -219,16 +219,16 @@ class BlobDBImpl : public BlobDB {
const Slice& key) override;
using rocksdb::StackableDB::Get;
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* value) override;
using rocksdb::StackableDB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& opts,
virtual Iterator* NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) override;
using rocksdb::StackableDB::MultiGet;
virtual std::vector<Status> MultiGet(
const ReadOptions& options,
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override;
@ -269,11 +269,19 @@ class BlobDBImpl : public BlobDB {
GCStats* gc_stats);
void TEST_RunGC();
void TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile);
void TEST_DeleteObsoleteFiles();
#endif // !NDEBUG
private:
Status OpenPhase1();
// Create a snapshot if there isn't one in read options.
// Return true if a snapshot is created.
bool SetSnapshotIfNeeded(ReadOptions* read_options);
Status CommonGet(const ColumnFamilyData* cfd, const Slice& key,
const std::string& index_entry, std::string* value,
SequenceNumber* sequence = nullptr);
@ -332,7 +340,7 @@ class BlobDBImpl : public BlobDB {
// delete files which have been garbage collected and marked
// obsolete. Check whether any snapshots exist which refer to
// the same
std::pair<bool, int64_t> DeleteObsFiles(bool aborted);
std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
// Major task to garbage collect expired and deleted blobs
std::pair<bool, int64_t> RunGC(bool aborted);
@ -593,7 +601,7 @@ class BlobFile {
// This Read-Write mutex is per file specific and protects
// all the datastructures
port::RWMutex mutex_;
mutable port::RWMutex mutex_;
// time when the random access reader was last created.
std::atomic<std::time_t> last_access_;
@ -700,12 +708,23 @@ class BlobFile {
class BlobDBIterator : public Iterator {
public:
explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family,
BlobDBImpl* impl)
: iter_(iter), cfh_(column_family), db_impl_(impl) {
assert(iter_);
BlobDBImpl* impl, bool own_snapshot,
const Snapshot* snapshot)
: iter_(iter),
cfh_(column_family),
db_impl_(impl),
own_snapshot_(own_snapshot),
snapshot_(snapshot) {
assert(iter != nullptr);
assert(snapshot != nullptr);
}
~BlobDBIterator() { delete iter_; }
~BlobDBIterator() {
if (own_snapshot_) {
db_impl_->ReleaseSnapshot(snapshot_);
}
delete iter_;
}
bool Valid() const override { return iter_->Valid(); }
@ -727,10 +746,14 @@ class BlobDBIterator : public Iterator {
Status status() const override { return iter_->status(); }
// Iterator::Refresh() not supported.
private:
Iterator* iter_;
ColumnFamilyHandle* cfh_;
BlobDBImpl* db_impl_;
bool own_snapshot_;
const Snapshot* snapshot_;
mutable std::string vpart_;
};

View File

@ -723,6 +723,98 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
obsolete_files[0]->BlobFileNumber());
}
TEST_F(BlobDBTest, ReadWhileGC) {
// run the same test for Get(), MultiGet() and Iterator each.
for (int i = 0; i < 3; i++) {
BlobDBOptions bdb_options;
bdb_options.disable_background_tasks = true;
Open(bdb_options);
blob_db_->Put(WriteOptions(), "foo", "bar");
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
std::shared_ptr<BlobFile> bfile = blob_files[0];
uint64_t bfile_number = bfile->BlobFileNumber();
blob_db_impl->TEST_CloseBlobFile(bfile);
switch (i) {
case 0:
SyncPoint::GetInstance()->LoadDependency(
{{"BlobDBImpl::Get:AfterIndexEntryGet:1",
"BlobDBTest::ReadWhileGC:1"},
{"BlobDBTest::ReadWhileGC:2",
"BlobDBImpl::Get:AfterIndexEntryGet:2"}});
break;
case 1:
SyncPoint::GetInstance()->LoadDependency(
{{"BlobDBImpl::MultiGet:AfterIndexEntryGet:1",
"BlobDBTest::ReadWhileGC:1"},
{"BlobDBTest::ReadWhileGC:2",
"BlobDBImpl::MultiGet:AfterIndexEntryGet:2"}});
break;
case 2:
SyncPoint::GetInstance()->LoadDependency(
{{"BlobDBIterator::value:BeforeGetBlob:1",
"BlobDBTest::ReadWhileGC:1"},
{"BlobDBTest::ReadWhileGC:2",
"BlobDBIterator::value:BeforeGetBlob:2"}});
break;
}
SyncPoint::GetInstance()->EnableProcessing();
auto reader = port::Thread([this, i]() {
std::string value;
std::vector<std::string> values;
std::vector<Status> statuses;
switch (i) {
case 0:
ASSERT_OK(blob_db_->Get(ReadOptions(), "foo", &value));
ASSERT_EQ("bar", value);
break;
case 1:
statuses = blob_db_->MultiGet(ReadOptions(), {"foo"}, &values);
ASSERT_EQ(1, statuses.size());
ASSERT_EQ(1, values.size());
ASSERT_EQ("bar", values[0]);
break;
case 2:
// VerifyDB use iterator to scan the DB.
VerifyDB({{"foo", "bar"}});
break;
}
});
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:1");
GCStats gc_stats;
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(1, gc_stats.num_relocate);
ASSERT_EQ(1, gc_stats.relocate_succeeded);
blob_db_impl->TEST_ObsoleteFile(blob_files[0]);
blob_db_impl->TEST_DeleteObsoleteFiles();
// The file shouln't be deleted
blob_files = blob_db_impl->TEST_GetBlobFiles();
ASSERT_EQ(2, blob_files.size());
ASSERT_EQ(bfile_number, blob_files[0]->BlobFileNumber());
auto obsolete_files = blob_db_impl->TEST_GetObsoleteFiles();
ASSERT_EQ(1, obsolete_files.size());
ASSERT_EQ(bfile_number, obsolete_files[0]->BlobFileNumber());
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:2");
reader.join();
SyncPoint::GetInstance()->DisableProcessing();
// The file is deleted this time
blob_db_impl->TEST_DeleteObsoleteFiles();
blob_files = blob_db_impl->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_NE(bfile_number, blob_files[0]->BlobFileNumber());
ASSERT_EQ(0, blob_db_impl->TEST_GetObsoleteFiles().size());
VerifyDB({{"foo", "bar"}});
Destroy();
}
}
} // namespace blob_db
} // namespace rocksdb