Stable cache keys on ingested SST files (#8669)

Summary:
Extends https://github.com/facebook/rocksdb/issues/8659 to work for ingested external SST files, even
when the same file is ingested into different DBs sharing a block cache.

Note: These new cache keys are currently only enabled when FileSystem
does not provide GetUniqueId. For now, they are typically larger,
so slightly less efficient.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8669

Test Plan: Extended unit test

Reviewed By: zhichao-cao

Differential Revision: D30398532

Pulled By: pdillinger

fbshipit-source-id: 1f13e2af4b8bfff5741953a69466e9589fbc23c7
This commit is contained in:
Peter Dillinger 2021-08-18 11:32:00 -07:00 committed by Facebook GitHub Bot
parent 2b367fa8cc
commit b6269b078a
7 changed files with 73 additions and 29 deletions

View File

@ -18,6 +18,7 @@
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "util/compression.h" #include "util/compression.h"
#include "util/defer.h"
#include "util/random.h" #include "util/random.h"
#include "utilities/fault_injection_fs.h" #include "utilities/fault_injection_fs.h"
@ -1310,7 +1311,7 @@ class StableCacheKeyTestFS : public FaultInjectionTestFS {
SetFailGetUniqueId(true); SetFailGetUniqueId(true);
} }
virtual ~StableCacheKeyTestFS() {} virtual ~StableCacheKeyTestFS() override {}
IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
IODebugContext*) override { IODebugContext*) override {
@ -1342,16 +1343,17 @@ TEST_F(DBBlockCacheTest, StableCacheKeys) {
table_options.no_block_cache = true; table_options.no_block_cache = true;
table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false); table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
verify_stats = [&options] { verify_stats = [&options] {
// One for ordinary SST file and one for external SST file
ASSERT_EQ( ASSERT_EQ(
1, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD)); 2, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
}; };
} else { } else {
table_options.cache_index_and_filter_blocks = true; table_options.cache_index_and_filter_blocks = true;
table_options.block_cache = NewLRUCache(1 << 25, 0, false); table_options.block_cache = NewLRUCache(1 << 25, 0, false);
verify_stats = [&options] { verify_stats = [&options] {
ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); ASSERT_EQ(2, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); ASSERT_EQ(2, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
ASSERT_EQ(1, ASSERT_EQ(2,
options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
}; };
} }
@ -1360,18 +1362,41 @@ TEST_F(DBBlockCacheTest, StableCacheKeys) {
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options); DestroyAndReopen(options);
// Ordinary SST file
ASSERT_OK(Put("key1", "abc")); ASSERT_OK(Put("key1", "abc"));
std::string something_compressible(500U, 'x'); std::string something_compressible(500U, 'x');
ASSERT_OK(Put("key2", something_compressible)); ASSERT_OK(Put("key1a", something_compressible));
ASSERT_OK(Flush()); ASSERT_OK(Flush());
#ifndef ROCKSDB_LITE
// External SST file
std::string external = dbname_ + "/external.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external));
ASSERT_OK(sst_file_writer.Put("key2", "abc"));
ASSERT_OK(sst_file_writer.Put("key2a", something_compressible));
ExternalSstFileInfo external_info;
ASSERT_OK(sst_file_writer.Finish(&external_info));
IngestExternalFileOptions ingest_opts;
ASSERT_OK(db_->IngestExternalFile({external}, ingest_opts));
}
#else
// Another ordinary SST file
ASSERT_OK(Put("key2", "abc"));
ASSERT_OK(Put("key2a", something_compressible));
ASSERT_OK(Flush());
#endif
ASSERT_EQ(Get("key1"), std::string("abc")); ASSERT_EQ(Get("key1"), std::string("abc"));
ASSERT_EQ(Get("key2"), std::string("abc"));
verify_stats(); verify_stats();
// Make sure we can cache hit after re-open // Make sure we can cache hit after re-open
Reopen(options); Reopen(options);
ASSERT_EQ(Get("key1"), std::string("abc")); ASSERT_EQ(Get("key1"), std::string("abc"));
ASSERT_EQ(Get("key2"), std::string("abc"));
verify_stats(); verify_stats();
// Make sure we can cache hit even on a full copy of the DB. Using // Make sure we can cache hit even on a full copy of the DB. Using
@ -1386,14 +1411,26 @@ TEST_F(DBBlockCacheTest, StableCacheKeys) {
Close(); Close();
Destroy(options); Destroy(options);
dbname_ = db_copy_name; SaveAndRestore<std::string> save_dbname(&dbname_, db_copy_name);
Reopen(options); Reopen(options);
ASSERT_EQ(Get("key1"), std::string("abc")); ASSERT_EQ(Get("key1"), std::string("abc"));
ASSERT_EQ(Get("key2"), std::string("abc"));
verify_stats();
// And ensure that re-ingesting the same external file into a different DB
// uses the same cache keys
DestroyAndReopen(options);
IngestExternalFileOptions ingest_opts;
ASSERT_OK(db_->IngestExternalFile({external}, ingest_opts));
ASSERT_EQ(Get("key2"), std::string("abc"));
verify_stats(); verify_stats();
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE
Close(); Close();
Destroy(options);
} }
} }

View File

@ -3945,10 +3945,10 @@ Status DBImpl::GetDbSessionId(std::string& session_id) const {
return Status::OK(); return Status::OK();
} }
void DBImpl::SetDbSessionId() { std::string DBImpl::GenerateDbSessionId(Env* env) {
// GenerateUniqueId() generates an identifier that has a negligible // GenerateUniqueId() generates an identifier that has a negligible
// probability of being duplicated, ~128 bits of entropy // probability of being duplicated, ~128 bits of entropy
std::string uuid = env_->GenerateUniqueId(); std::string uuid = env->GenerateUniqueId();
// Hash and reformat that down to a more compact format, 20 characters // Hash and reformat that down to a more compact format, 20 characters
// in base-36 ([0-9A-Z]), which is ~103 bits of entropy, which is enough // in base-36 ([0-9A-Z]), which is ~103 bits of entropy, which is enough
@ -3959,15 +3959,21 @@ void DBImpl::SetDbSessionId() {
// * Visually distinct from DB id format // * Visually distinct from DB id format
uint64_t a = NPHash64(uuid.data(), uuid.size(), 1234U); uint64_t a = NPHash64(uuid.data(), uuid.size(), 1234U);
uint64_t b = NPHash64(uuid.data(), uuid.size(), 5678U); uint64_t b = NPHash64(uuid.data(), uuid.size(), 5678U);
db_session_id_.resize(20); std::string db_session_id;
db_session_id.resize(20);
static const char* const base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; static const char* const base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
size_t i = 0; size_t i = 0;
for (; i < 10U; ++i, a /= 36U) { for (; i < 10U; ++i, a /= 36U) {
db_session_id_[i] = base36[a % 36]; db_session_id[i] = base36[a % 36];
} }
for (; i < 20U; ++i, b /= 36U) { for (; i < 20U; ++i, b /= 36U) {
db_session_id_[i] = base36[b % 36]; db_session_id[i] = base36[b % 36];
} }
return db_session_id;
}
void DBImpl::SetDbSessionId() {
db_session_id_ = GenerateDbSessionId(env_);
TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_); TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_);
} }

View File

@ -1114,6 +1114,8 @@ class DBImpl : public DB {
State state_; State state_;
}; };
static std::string GenerateDbSessionId(Env* env);
protected: protected:
const std::string dbname_; const std::string dbname_;
std::string db_id_; std::string db_id_;

View File

@ -93,6 +93,7 @@ class Repairer {
const ColumnFamilyOptions& default_cf_opts, const ColumnFamilyOptions& default_cf_opts,
const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
: dbname_(dbname), : dbname_(dbname),
db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
env_(db_options.env), env_(db_options.env),
file_options_(), file_options_(),
db_options_(SanitizeOptions(dbname_, db_options)), db_options_(SanitizeOptions(dbname_, db_options)),
@ -109,21 +110,16 @@ class Repairer {
// TableCache can be small since we expect each table to be opened // TableCache can be small since we expect each table to be opened
// once. // once.
NewLRUCache(10, db_options_.table_cache_numshardbits)), NewLRUCache(10, db_options_.table_cache_numshardbits)),
table_cache_( table_cache_(new TableCache(default_iopts_, &file_options_,
// TODO: db_session_id for TableCache should be initialized after raw_table_cache_.get(),
// db_session_id_ is set. /*block_cache_tracer=*/nullptr,
new TableCache(default_iopts_, &file_options_, /*io_tracer=*/nullptr, db_session_id_)),
raw_table_cache_.get(),
/*block_cache_tracer=*/nullptr,
/*io_tracer=*/nullptr, /*db_session_id*/ "")),
wb_(db_options_.db_write_buffer_size), wb_(db_options_.db_write_buffer_size),
wc_(db_options_.delayed_write_rate), wc_(db_options_.delayed_write_rate),
// TODO: db_session_id for VersionSet should be initialized after
// db_session_id_ is set and use it for initialization.
vset_(dbname_, &immutable_db_options_, file_options_, vset_(dbname_, &immutable_db_options_, file_options_,
raw_table_cache_.get(), &wb_, &wc_, raw_table_cache_.get(), &wb_, &wc_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_session_id*/ ""), db_session_id_),
next_file_number_(1), next_file_number_(1),
db_lock_(nullptr), db_lock_(nullptr),
closed_(false) { closed_(false) {
@ -198,10 +194,6 @@ class Repairer {
} }
// Just create a DBImpl temporarily so we can reuse NewDB() // Just create a DBImpl temporarily so we can reuse NewDB()
db_impl = new DBImpl(db_options_, dbname_); db_impl = new DBImpl(db_options_, dbname_);
// Also use this temp DBImpl to get a session id
status = db_impl->GetDbSessionId(db_session_id_);
}
if (status.ok()) {
status = db_impl->NewDB(/*new_filenames=*/nullptr); status = db_impl->NewDB(/*new_filenames=*/nullptr);
} }
delete db_impl; delete db_impl;

View File

@ -649,8 +649,6 @@ Status BlockBasedTable::Open(
// under these portable/stable keys. // under these portable/stable keys.
// Note: For now, each external SST file gets its own unique session id, // Note: For now, each external SST file gets its own unique session id,
// so we can use a fixed file number under that session id. // so we can use a fixed file number under that session id.
// ... except FIXME (peterd): sst_file_writer currently uses wrong
// format for db_session_ids so this approach doesn't work yet.
db_session_id = rep->table_properties->db_session_id; db_session_id = rep->table_properties->db_session_id;
file_num = 1; file_num = 1;
} }

View File

@ -7,6 +7,7 @@
#include <vector> #include <vector>
#include "db/db_impl/db_impl.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "file/writable_file_writer.h" #include "file/writable_file_writer.h"
#include "rocksdb/file_system.h" #include "rocksdb/file_system.h"
@ -245,7 +246,7 @@ Status SstFileWriter::Open(const std::string& file_path) {
// Here we mimic the way db_session_id behaves by resetting the db_session_id // Here we mimic the way db_session_id behaves by resetting the db_session_id
// every time SstFileWriter is used, and in this case db_id is set to be "SST // every time SstFileWriter is used, and in this case db_id is set to be "SST
// Writer". // Writer".
std::string db_session_id = r->ioptions.env->GenerateUniqueId(); std::string db_session_id = DBImpl::GenerateDbSessionId(r->ioptions.env);
if (!db_session_id.empty() && db_session_id.back() == '\n') { if (!db_session_id.empty() && db_session_id.back() == '\n') {
db_session_id.pop_back(); db_session_id.pop_back();
} }

View File

@ -57,6 +57,14 @@ class SaveAndRestore {
public: public:
// obj is non-null pointer to value to be saved and later restored. // obj is non-null pointer to value to be saved and later restored.
explicit SaveAndRestore(T* obj) : obj_(obj), saved_(*obj) {} explicit SaveAndRestore(T* obj) : obj_(obj), saved_(*obj) {}
// new_value is stored in *obj
SaveAndRestore(T* obj, const T& new_value)
: obj_(obj), saved_(std::move(*obj)) {
*obj = new_value;
}
SaveAndRestore(T* obj, T&& new_value) : obj_(obj), saved_(std::move(*obj)) {
*obj = std::move(new_value);
}
~SaveAndRestore() { *obj_ = std::move(saved_); } ~SaveAndRestore() { *obj_ = std::move(saved_); }
// No copies // No copies