[Performance Branch] If options.max_open_files is set to -1, cache table readers in FileMetaData for Get() and NewIterator()

Summary:
In some use cases, table readers for all live files should always be cached. In that case, there is an opportunity to avoid the table cache look-up when executing Get() and NewIterator().

We define options.max_open_files = -1 as the mode in which table readers for live files are always kept open. In that mode, table readers are cached in FileMetaData (with a reference count held by the table cache), so that when executing table_cache.Get() and table_cache.NewIterator(), the LRU cache check can be bypassed to reduce latency.

Test Plan: add a test case in db_test

Reviewers: haobo, kailiu

Reviewed By: haobo

CC: dhruba, igor, leveldb

Differential Revision: https://reviews.facebook.net/D15039
This commit is contained in:
Siying Dong 2014-01-06 20:29:17 -08:00
parent 5b5ab0c1a8
commit aa0ef6602d
10 changed files with 124 additions and 58 deletions

View File

@ -204,8 +204,7 @@ Status BuildTable(const std::string& dbname,
// Verify that the table is usable // Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), Iterator* it = table_cache->NewIterator(ReadOptions(),
soptions, soptions,
meta->number, *meta);
meta->file_size);
s = it->status(); s = it->status();
delete it; delete it;
} }

View File

@ -126,7 +126,10 @@ Options SanitizeOptions(const std::string& dbname,
Options result = src; Options result = src;
result.comparator = icmp; result.comparator = icmp;
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
ClipToRange(&result.max_open_files, 20, 1000000); // result.max_open_files means an "infinite" open files.
if (result.max_open_files != -1) {
ClipToRange(&result.max_open_files, 20, 1000000);
}
ClipToRange(&result.write_buffer_size, ((size_t)64)<<10, ClipToRange(&result.write_buffer_size, ((size_t)64)<<10,
((size_t)64)<<30); ((size_t)64)<<30);
ClipToRange(&result.block_size, 1<<10, 4<<20); ClipToRange(&result.block_size, 1<<10, 4<<20);
@ -278,7 +281,10 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
} }
// Reserve ten files or so for other uses and give the rest to TableCache. // Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options_.max_open_files - 10; // Give a large number for setting of "infinite" open files.
const int table_cache_size =
(options_.max_open_files == -1) ?
4194304 : options_.max_open_files - 10;
table_cache_.reset(new TableCache(dbname_, &options_, table_cache_.reset(new TableCache(dbname_, &options_,
storage_options_, table_cache_size)); storage_options_, table_cache_size));
versions_.reset(new VersionSet(dbname_, &options_, storage_options_, versions_.reset(new VersionSet(dbname_, &options_, storage_options_,
@ -335,6 +341,9 @@ DBImpl::~DBImpl() {
for (MemTable* m: to_delete) { for (MemTable* m: to_delete) {
delete m; delete m;
} }
// versions need to be destroyed before table_cache since it can holds
// references to table_cache.
versions_.reset();
LogFlush(options_.info_log); LogFlush(options_.info_log);
} }
@ -2095,10 +2104,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
if (s.ok() && current_entries > 0) { if (s.ok() && current_entries > 0) {
// Verify that the table is usable // Verify that the table is usable
FileMetaData meta(output_number, current_bytes);
Iterator* iter = table_cache_->NewIterator(ReadOptions(), Iterator* iter = table_cache_->NewIterator(ReadOptions(),
storage_options_, storage_options_,
output_number, meta);
current_bytes);
s = iter->status(); s = iter->status();
delete iter; delete iter;
if (s.ok()) { if (s.ok()) {
@ -3701,7 +3710,7 @@ Status DBImpl::DeleteFile(std::string name) {
} }
int level; int level;
FileMetaData metadata; FileMetaData* metadata;
int maxlevel = NumberLevels(); int maxlevel = NumberLevels();
VersionEdit edit(maxlevel); VersionEdit edit(maxlevel);
DeletionState deletion_state(true); DeletionState deletion_state(true);
@ -3716,7 +3725,7 @@ Status DBImpl::DeleteFile(std::string name) {
assert((level > 0) && (level < maxlevel)); assert((level > 0) && (level < maxlevel));
// If the file is being compacted no need to delete. // If the file is being compacted no need to delete.
if (metadata.being_compacted) { if (metadata->being_compacted) {
Log(options_.info_log, Log(options_.info_log,
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); "DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
return Status::OK(); return Status::OK();

View File

@ -265,6 +265,7 @@ class DBTest {
kHashSkipList, kHashSkipList,
kUniversalCompaction, kUniversalCompaction,
kCompressedBlockCache, kCompressedBlockCache,
kInfiniteMaxOpenFiles,
kEnd kEnd
}; };
int option_config_; int option_config_;
@ -415,6 +416,9 @@ class DBTest {
case kCompressedBlockCache: case kCompressedBlockCache:
options.block_cache_compressed = NewLRUCache(8*1024*1024); options.block_cache_compressed = NewLRUCache(8*1024*1024);
break; break;
case kInfiniteMaxOpenFiles:
options.max_open_files = -1;
break;
default: default:
break; break;
} }

View File

@ -265,8 +265,9 @@ class Repairer {
int counter = 0; int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size); Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) { if (status.ok()) {
FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
Iterator* iter = table_cache_->NewIterator( Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, t->meta.number, t->meta.file_size); ReadOptions(), storage_options_, dummy_meta);
bool empty = true; bool empty = true;
ParsedInternalKey parsed; ParsedInternalKey parsed;
t->min_sequence = 0; t->min_sequence = 0;

View File

@ -10,6 +10,7 @@
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/filename.h" #include "db/filename.h"
#include "db/version_edit.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
@ -50,6 +51,14 @@ TableCache::TableCache(const std::string& dbname,
TableCache::~TableCache() { TableCache::~TableCache() {
} }
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
return reinterpret_cast<TableReader*>(cache_->Value(handle));
}
void TableCache::ReleaseHandle(Cache::Handle* handle) {
cache_->Release(handle);
}
Status TableCache::FindTable(const EnvOptions& toptions, Status TableCache::FindTable(const EnvOptions& toptions,
uint64_t file_number, uint64_t file_size, uint64_t file_number, uint64_t file_size,
Cache::Handle** handle, bool* table_io, Cache::Handle** handle, bool* table_io,
@ -94,25 +103,27 @@ Status TableCache::FindTable(const EnvOptions& toptions,
Iterator* TableCache::NewIterator(const ReadOptions& options, Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions, const EnvOptions& toptions,
uint64_t file_number, const FileMetaData& file_meta,
uint64_t file_size,
TableReader** table_reader_ptr, TableReader** table_reader_ptr,
bool for_compaction) { bool for_compaction) {
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
*table_reader_ptr = nullptr; *table_reader_ptr = nullptr;
} }
Cache::Handle* handle = file_meta.table_reader_handle;
Cache::Handle* handle = nullptr; Status s;
Status s = FindTable(toptions, file_number, file_size, &handle, if (!handle) {
nullptr, options.read_tier == kBlockCacheTier); s = FindTable(toptions, file_meta.number, file_meta.file_size, &handle,
nullptr, options.read_tier == kBlockCacheTier);
}
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s); return NewErrorIterator(s);
} }
TableReader* table_reader = TableReader* table_reader = GetTableReaderFromHandle(handle);
reinterpret_cast<TableReader*>(cache_->Value(handle));
Iterator* result = table_reader->NewIterator(options); Iterator* result = table_reader->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); if (!file_meta.table_reader_handle) {
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
}
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader; *table_reader_ptr = table_reader;
} }
@ -125,22 +136,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
} }
Status TableCache::Get(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options,
uint64_t file_number, const FileMetaData& file_meta,
uint64_t file_size,
const Slice& k, const Slice& k,
void* arg, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool), bool (*saver)(void*, const Slice&, const Slice&, bool),
bool* table_io, bool* table_io,
void (*mark_key_may_exist)(void*)) { void (*mark_key_may_exist)(void*)) {
Cache::Handle* handle = nullptr; Cache::Handle* handle = file_meta.table_reader_handle;
Status s = FindTable(storage_options_, file_number, file_size, Status s;
&handle, table_io, if (!handle) {
options.read_tier == kBlockCacheTier); s = FindTable(storage_options_, file_meta.number, file_meta.file_size,
&handle, table_io, options.read_tier == kBlockCacheTier);
}
if (s.ok()) { if (s.ok()) {
TableReader* t = TableReader* t = GetTableReaderFromHandle(handle);
reinterpret_cast<TableReader*>(cache_->Value(handle));
s = t->Get(options, k, arg, saver, mark_key_may_exist); s = t->Get(options, k, arg, saver, mark_key_may_exist);
cache_->Release(handle); if (!file_meta.table_reader_handle) {
ReleaseHandle(handle);
}
} else if (options.read_tier && s.IsIncomplete()) { } else if (options.read_tier && s.IsIncomplete()) {
// Couldnt find Table in cache but treat as kFound if no_io set // Couldnt find Table in cache but treat as kFound if no_io set
(*mark_key_may_exist)(arg); (*mark_key_may_exist)(arg);
@ -159,10 +172,9 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
file_size, &handle, table_io); file_size, &handle, table_io);
bool may_match = true; bool may_match = true;
if (s.ok()) { if (s.ok()) {
TableReader* t = TableReader* t = GetTableReaderFromHandle(handle);
reinterpret_cast<TableReader*>(cache_->Value(handle));
may_match = t->PrefixMayMatch(internal_prefix); may_match = t->PrefixMayMatch(internal_prefix);
cache_->Release(handle); ReleaseHandle(handle);
} }
return may_match; return may_match;
} }

View File

@ -21,6 +21,7 @@
namespace rocksdb { namespace rocksdb {
class Env; class Env;
struct FileMetaData;
class TableCache { class TableCache {
public: public:
@ -37,8 +38,7 @@ class TableCache {
// returned iterator is live. // returned iterator is live.
Iterator* NewIterator(const ReadOptions& options, Iterator* NewIterator(const ReadOptions& options,
const EnvOptions& toptions, const EnvOptions& toptions,
uint64_t file_number, const FileMetaData& file_meta,
uint64_t file_size,
TableReader** table_reader_ptr = nullptr, TableReader** table_reader_ptr = nullptr,
bool for_compaction = false); bool for_compaction = false);
@ -46,8 +46,7 @@ class TableCache {
// call (*handle_result)(arg, found_key, found_value) repeatedly until // call (*handle_result)(arg, found_key, found_value) repeatedly until
// it returns false. // it returns false.
Status Get(const ReadOptions& options, Status Get(const ReadOptions& options,
uint64_t file_number, const FileMetaData& file_meta,
uint64_t file_size,
const Slice& k, const Slice& k,
void* arg, void* arg,
bool (*handle_result)(void*, const Slice&, const Slice&, bool), bool (*handle_result)(void*, const Slice&, const Slice&, bool),
@ -63,16 +62,23 @@ class TableCache {
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); void Evict(uint64_t file_number);
// Find table reader
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
const bool no_io = false);
// Get TableReader from a cache handle.
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
// Release the handle from a cache
void ReleaseHandle(Cache::Handle* handle);
private: private:
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* options_; const Options* options_;
const EnvOptions& storage_options_; const EnvOptions& storage_options_;
std::shared_ptr<Cache> cache_; std::shared_ptr<Cache> cache_;
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
const bool no_io = false);
}; };
} // namespace rocksdb } // namespace rocksdb

View File

@ -11,6 +11,7 @@
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "rocksdb/cache.h"
#include "db/dbformat.h" #include "db/dbformat.h"
namespace rocksdb { namespace rocksdb {
@ -28,8 +29,14 @@ struct FileMetaData {
SequenceNumber smallest_seqno;// The smallest seqno in this file SequenceNumber smallest_seqno;// The smallest seqno in this file
SequenceNumber largest_seqno; // The largest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0), // Needs to be disposed when refs becomes 0.
being_compacted(false) { } Cache::Handle* table_reader_handle;
FileMetaData(uint64_t number, uint64_t file_size) :
refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size),
being_compacted(false), table_reader_handle(nullptr) {
}
FileMetaData() : FileMetaData(0, 0) { }
}; };
class VersionEdit { class VersionEdit {

View File

@ -51,6 +51,10 @@ Version::~Version() {
assert(f->refs > 0); assert(f->refs > 0);
f->refs--; f->refs--;
if (f->refs <= 0) { if (f->refs <= 0) {
if (f->table_reader_handle) {
vset_->table_cache_->ReleaseHandle(f->table_reader_handle);
f->table_reader_handle = nullptr;
}
vset_->obsolete_files_.push_back(f); vset_->obsolete_files_.push_back(f);
} }
} }
@ -202,10 +206,11 @@ static Iterator* GetFileIterator(void* arg,
options_copy = options; options_copy = options;
options_copy.prefix = nullptr; options_copy.prefix = nullptr;
} }
FileMetaData meta(DecodeFixed64(file_value.data()),
DecodeFixed64(file_value.data() + 8));
return cache->NewIterator(options.prefix ? options_copy : options, return cache->NewIterator(options.prefix ? options_copy : options,
soptions, soptions,
DecodeFixed64(file_value.data()), meta,
DecodeFixed64(file_value.data() + 8),
nullptr /* don't need reference to table*/, nullptr /* don't need reference to table*/,
for_compaction); for_compaction);
} }
@ -257,9 +262,8 @@ void Version::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iters) { std::vector<Iterator*>* iters) {
// Merge all level zero files together since they may overlap // Merge all level zero files together since they may overlap
for (const FileMetaData* file : files_[0]) { for (const FileMetaData* file : files_[0]) {
iters->push_back( iters->push_back(vset_->table_cache_->NewIterator(options, soptions,
vset_->table_cache_->NewIterator( *file));
options, soptions, file->number, file->file_size));
} }
// For levels > 0, we can use a concatenating iterator that sequentially // For levels > 0, we can use a concatenating iterator that sequentially
@ -513,9 +517,8 @@ void Version::Get(const ReadOptions& options,
prev_file = f; prev_file = f;
#endif #endif
bool tableIO = false; bool tableIO = false;
*status = vset_->table_cache_->Get(options, f->number, f->file_size, *status = vset_->table_cache_->Get(options, *f, ikey, &saver, SaveValue,
ikey, &saver, SaveValue, &tableIO, &tableIO, MarkKeyMayExist);
MarkKeyMayExist);
// TODO: examine the behavior for corrupted key // TODO: examine the behavior for corrupted key
if (!status->ok()) { if (!status->ok()) {
return; return;
@ -954,6 +957,11 @@ class VersionSet::Builder {
FileMetaData* f = to_unref[i]; FileMetaData* f = to_unref[i];
f->refs--; f->refs--;
if (f->refs <= 0) { if (f->refs <= 0) {
if (f->table_reader_handle) {
vset_->table_cache_->ReleaseHandle(
f->table_reader_handle);
f->table_reader_handle = nullptr;
}
delete f; delete f;
} }
} }
@ -1113,6 +1121,20 @@ class VersionSet::Builder {
CheckConsistency(v); CheckConsistency(v);
} }
void LoadTableHandlers() {
for (int level = 0; level < vset_->NumberLevels(); level++) {
for (auto& file_meta : *(levels_[level].added_files)) {
assert (!file_meta->table_reader_handle);
bool table_io;
vset_->table_cache_->FindTable(vset_->storage_options_,
file_meta->number,
file_meta->file_size,
&file_meta->table_reader_handle,
&table_io, false);
}
}
}
void MaybeAddFile(Version* v, int level, FileMetaData* f) { void MaybeAddFile(Version* v, int level, FileMetaData* f) {
if (levels_[level].deleted_files.count(f->number) > 0) { if (levels_[level].deleted_files.count(f->number) > 0) {
// File is deleted: do nothing // File is deleted: do nothing
@ -1258,7 +1280,7 @@ Status VersionSet::LogAndApply(
edit->SetNextFile(next_file_number_); edit->SetNextFile(next_file_number_);
} }
// Unlock during expensive MANIFEST log write. New writes cannot get here // Unlock during expensive operations. New writes cannot get here
// because &w is ensuring that all new writes get queued. // because &w is ensuring that all new writes get queued.
{ {
// calculate the amount of data being compacted at every level // calculate the amount of data being compacted at every level
@ -1267,6 +1289,12 @@ Status VersionSet::LogAndApply(
mu->Unlock(); mu->Unlock();
if (options_->max_open_files == -1) {
// unlimited table cache. Pre-load table handle now.
// Need to do it out of the mutex.
builder.LoadTableHandlers();
}
// This is fine because everything inside of this block is serialized -- // This is fine because everything inside of this block is serialized --
// only one thread can be here at the same time // only one thread can be here at the same time
if (!new_manifest_filename.empty()) { if (!new_manifest_filename.empty()) {
@ -1966,8 +1994,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
// approximate offset of "ikey" within the table. // approximate offset of "ikey" within the table.
TableReader* table_reader_ptr; TableReader* table_reader_ptr;
Iterator* iter = table_cache_->NewIterator( Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, files[i]->number, ReadOptions(), storage_options_, *(files[i]), &table_reader_ptr);
files[i]->file_size, &table_reader_ptr);
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
} }
@ -2092,8 +2119,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
for (size_t i = 0; i < files.size(); i++) { for (size_t i = 0; i < files.size(); i++) {
list[num++] = table_cache_->NewIterator( list[num++] = table_cache_->NewIterator(
options, storage_options_compactions_, options, storage_options_compactions_,
files[i]->number, files[i]->file_size, nullptr, *(files[i]), nullptr, true /* for compaction */);
true /* for compaction */);
} }
} else { } else {
// Create concatenating iterator for the files from this level // Create concatenating iterator for the files from this level
@ -2876,12 +2902,12 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
Status VersionSet::GetMetadataForFile( Status VersionSet::GetMetadataForFile(
uint64_t number, uint64_t number,
int *filelevel, int *filelevel,
FileMetaData *meta) { FileMetaData **meta) {
for (int level = 0; level < NumberLevels(); level++) { for (int level = 0; level < NumberLevels(); level++) {
const std::vector<FileMetaData*>& files = current_->files_[level]; const std::vector<FileMetaData*>& files = current_->files_[level];
for (size_t i = 0; i < files.size(); i++) { for (size_t i = 0; i < files.size(); i++) {
if (files[i]->number == number) { if (files[i]->number == number) {
*meta = *files[i]; *meta = files[i];
*filelevel = level; *filelevel = level;
return Status::OK(); return Status::OK();
} }

View File

@ -431,7 +431,7 @@ class VersionSet {
double MaxBytesForLevel(int level); double MaxBytesForLevel(int level);
Status GetMetadataForFile( Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData *metadata); uint64_t number, int *filelevel, FileMetaData **metadata);
void GetLiveFilesMetaData( void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata); std::vector<LiveFileMetaData> *metadata);

View File

@ -182,8 +182,10 @@ struct Options {
int min_write_buffer_number_to_merge; int min_write_buffer_number_to_merge;
// Number of open files that can be used by the DB. You may need to // Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set (budget // increase this if your database has a large working set. Value -1 means
// one open file per 2MB of working set). // files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
// //
// Default: 1000 // Default: 1000
int max_open_files; int max_open_files;