[RocksDB] [Performance] Allow different posix advice to be applied to the same table file

Summary:
The current posix advice implementation ties the access pattern hint to the creation of a file.
It is not possible to apply different advice for different kinds of access (random get vs. compaction read)
without keeping two open files for the same table. This patch extends the RandomAccessFile interface
to accept a new access hint at any time. In particular, we are able to set different access hints on the same
table file based on when/how the file is used.
Two options are added to set the access hint: one applied when the file is first opened and one applied
when the file starts being compacted.

Test Plan: make check; db_stress; db_bench

Reviewers: dhruba

Reviewed By: dhruba

CC: MarkCallaghan, leveldb

Differential Revision: https://reviews.facebook.net/D10905
This commit is contained in:
Haobo Xu 2013-05-17 15:53:01 -07:00
parent 2df65c118c
commit ab8d2f6ab2
12 changed files with 146 additions and 22 deletions

View File

@ -293,6 +293,14 @@ static bool FLAGS_use_mmap_writes;
// Allow readaheads to occur for compactions // Allow readaheads to occur for compactions
static bool FLAGS_use_readahead_compactions; static bool FLAGS_use_readahead_compactions;
// Advise random access on table file open.
// Starts out as the value a default-constructed leveldb::Options carries and
// can be overridden with --advise_random_on_open=0|1.
static bool FLAGS_advise_random_on_open =
leveldb::Options().advise_random_on_open;
// Access pattern advice when a file is compacted.
// Starts out as the value a default-constructed leveldb::Options carries and
// can be overridden with --compaction_fadvice=NONE|NORMAL|SEQUENTIAL|WILLNEED
// (the value is matched case-insensitively).
static auto FLAGS_compaction_fadvice =
leveldb::Options().access_hint_on_compaction_start;
namespace leveldb { namespace leveldb {
// Helper for quickly generating random data. // Helper for quickly generating random data.
@ -900,6 +908,7 @@ unique_ptr<char []> GenerateKeyFromInt(int v, const char* suffix = "")
} }
if (method != nullptr) { if (method != nullptr) {
fprintf(stdout, "DB path: [%s]\n", FLAGS_db);
RunBenchmark(num_threads, name, method); RunBenchmark(num_threads, name, method);
} }
} }
@ -1138,6 +1147,8 @@ unique_ptr<char []> GenerateKeyFromInt(int v, const char* suffix = "")
options.allow_mmap_reads = FLAGS_use_mmap_reads; options.allow_mmap_reads = FLAGS_use_mmap_reads;
options.allow_mmap_writes = FLAGS_use_mmap_writes; options.allow_mmap_writes = FLAGS_use_mmap_writes;
options.allow_readahead_compactions = FLAGS_use_readahead_compactions; options.allow_readahead_compactions = FLAGS_use_readahead_compactions;
options.advise_random_on_open = FLAGS_advise_random_on_open;
options.access_hint_on_compaction_start = FLAGS_compaction_fadvice;
Status s; Status s;
if(FLAGS_read_only) { if(FLAGS_read_only) {
s = DB::OpenForReadOnly(options, FLAGS_db, &db_); s = DB::OpenForReadOnly(options, FLAGS_db, &db_);
@ -1731,8 +1742,9 @@ int main(int argc, char** argv) {
int n; int n;
long l; long l;
char junk; char junk;
char hdfsname[2048]; char buf[2048];
char str[512]; char str[512];
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
@ -1848,8 +1860,8 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--get_approx=%d%c", &n, &junk) == 1 && } else if (sscanf(argv[i], "--get_approx=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) { (n == 0 || n == 1)) {
FLAGS_get_approx = n; FLAGS_get_approx = n;
} else if (sscanf(argv[i], "--hdfs=%s", hdfsname) == 1) { } else if (sscanf(argv[i], "--hdfs=%s", buf) == 1) {
FLAGS_env = new leveldb::HdfsEnv(hdfsname); FLAGS_env = new leveldb::HdfsEnv(buf);
} else if (sscanf(argv[i], "--num_levels=%d%c", } else if (sscanf(argv[i], "--num_levels=%d%c",
&n, &junk) == 1) { &n, &junk) == 1) {
FLAGS_num_levels = n; FLAGS_num_levels = n;
@ -1931,6 +1943,21 @@ int main(int argc, char** argv) {
FLAGS_source_compaction_factor = n; FLAGS_source_compaction_factor = n;
} else if (sscanf(argv[i], "--wal_ttl=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--wal_ttl=%d%c", &n, &junk) == 1) {
FLAGS_WAL_ttl_seconds = static_cast<uint64_t>(n); FLAGS_WAL_ttl_seconds = static_cast<uint64_t>(n);
} else if (sscanf(argv[i], "--advise_random_on_open=%d%c", &n, &junk) == 1
&& (n == 0 || n ==1 )) {
FLAGS_advise_random_on_open = n;
} else if (sscanf(argv[i], "--compaction_fadvice=%s", buf) == 1) {
if (!strcasecmp(buf, "NONE"))
FLAGS_compaction_fadvice = leveldb::Options::NONE;
else if (!strcasecmp(buf, "NORMAL"))
FLAGS_compaction_fadvice = leveldb::Options::NORMAL;
else if (!strcasecmp(buf, "SEQUENTIAL"))
FLAGS_compaction_fadvice = leveldb::Options::SEQUENTIAL;
else if (!strcasecmp(buf, "WILLNEED"))
FLAGS_compaction_fadvice = leveldb::Options::WILLNEED;
else {
fprintf(stdout, "Unknown compaction fadvice:%s\n", buf);
}
} else { } else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]); fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1); exit(1);

View File

@ -54,6 +54,9 @@ Status TableCache::FindTable(const EnvOptions& toptions,
s = env_->NewRandomAccessFile(fname, &file, toptions); s = env_->NewRandomAccessFile(fname, &file, toptions);
RecordTick(options_->statistics, NO_FILE_OPENS); RecordTick(options_->statistics, NO_FILE_OPENS);
if (s.ok()) { if (s.ok()) {
if (options_->advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM);
}
s = Table::Open(*options_, toptions, std::move(file), file_size, &table); s = Table::Open(*options_, toptions, std::move(file), file_size, &table);
} }
@ -74,7 +77,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions, const EnvOptions& toptions,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
Table** tableptr) { Table** tableptr,
bool for_compaction) {
if (tableptr != nullptr) { if (tableptr != nullptr) {
*tableptr = nullptr; *tableptr = nullptr;
} }
@ -92,6 +96,11 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
if (tableptr != nullptr) { if (tableptr != nullptr) {
*tableptr = table; *tableptr = table;
} }
if (for_compaction) {
table->SetAccessHintForCompaction();
}
return result; return result;
} }

View File

@ -37,7 +37,8 @@ class TableCache {
const EnvOptions& toptions, const EnvOptions& toptions,
uint64_t file_number, uint64_t file_number,
uint64_t file_size, uint64_t file_size,
Table** tableptr = nullptr); Table** tableptr = nullptr,
bool for_compaction = false);
// If a seek to internal key "k" in specified file finds an entry, // If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until // call (*handle_result)(arg, found_key, found_value) repeatedly until

View File

@ -180,7 +180,8 @@ class Version::LevelFileNumIterator : public Iterator {
static Iterator* GetFileIterator(void* arg, static Iterator* GetFileIterator(void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions, const EnvOptions& soptions,
const Slice& file_value) { const Slice& file_value,
bool for_compaction) {
TableCache* cache = reinterpret_cast<TableCache*>(arg); TableCache* cache = reinterpret_cast<TableCache*>(arg);
if (file_value.size() != 16) { if (file_value.size() != 16) {
return NewErrorIterator( return NewErrorIterator(
@ -189,7 +190,9 @@ static Iterator* GetFileIterator(void* arg,
return cache->NewIterator(options, return cache->NewIterator(options,
soptions, soptions,
DecodeFixed64(file_value.data()), DecodeFixed64(file_value.data()),
DecodeFixed64(file_value.data() + 8)); DecodeFixed64(file_value.data() + 8),
nullptr /* don't need reference to table*/,
for_compaction);
} }
} }
@ -1834,13 +1837,15 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
for (size_t i = 0; i < files.size(); i++) { for (size_t i = 0; i < files.size(); i++) {
list[num++] = table_cache_->NewIterator( list[num++] = table_cache_->NewIterator(
options, storage_options_compactions_, options, storage_options_compactions_,
files[i]->number, files[i]->file_size); files[i]->number, files[i]->file_size, nullptr,
true /* for compaction */);
} }
} else { } else {
// Create concatenating iterator for the files from this level // Create concatenating iterator for the files from this level
list[num++] = NewTwoLevelIterator( list[num++] = NewTwoLevelIterator(
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
&GetFileIterator, table_cache_, options, storage_options_); &GetFileIterator, table_cache_, options, storage_options_,
true /* for compaction */);
} }
} }
} }

View File

@ -249,6 +249,12 @@ class RandomAccessFile {
return 0; // Default implementation to prevent issues with backwards return 0; // Default implementation to prevent issues with backwards
// compatibility. // compatibility.
}; };
// Access patterns a caller can announce through Hint().
// NOTE(review): the names mirror the POSIX_FADV_* advice values that the
// posix implementation forwards to posix_fadvise() -- confirm for other envs.
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
// Announces the access pattern the caller expects to use on this file.
// May be called at any point after the file is opened. The default
// implementation does nothing, so implementations that cannot make use of
// the hint do not need to override it.
virtual void Hint(AccessPattern pattern) {}
}; };
// A file abstraction for sequential writing. The implementation // A file abstraction for sequential writing. The implementation

View File

@ -441,6 +441,16 @@ struct Options {
// new record will be written to the next block. // new record will be written to the next block.
// Default is 10. // Default is 10.
int block_size_deviation; int block_size_deviation;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// NONE means no hint is issued when the compaction starts; the other
// values are forwarded to RandomAccessFile::Hint() on the table's file.
// Default: NORMAL
enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
}; };
// Options that control read operations // Options that control read operations

View File

@ -141,6 +141,24 @@ Status Table::Open(const Options& options,
return s; return s;
} }
// Passes the access hint configured in
// Options::access_hint_on_compaction_start to this table's underlying file.
// Options::NONE issues no hint at all; any value outside the enum is treated
// as a programming error and trips an assertion.
void Table::SetAccessHintForCompaction() {
  const auto configured = rep_->options.access_hint_on_compaction_start;
  if (configured == Options::NONE) {
    return;  // Nothing requested for compaction reads.
  }
  if (configured == Options::NORMAL) {
    rep_->file->Hint(RandomAccessFile::NORMAL);
  } else if (configured == Options::SEQUENTIAL) {
    rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
  } else if (configured == Options::WILLNEED) {
    rep_->file->Hint(RandomAccessFile::WILLNEED);
  } else {
    assert(false);
  }
}
void Table::ReadMeta(const Footer& footer) { void Table::ReadMeta(const Footer& footer) {
if (rep_->options.filter_policy == nullptr) { if (rep_->options.filter_policy == nullptr) {
return; // Do not need any metadata return; // Do not need any metadata
@ -273,7 +291,8 @@ Iterator* Table::BlockReader(void* arg,
Iterator* Table::BlockReader(void* arg, Iterator* Table::BlockReader(void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions, const EnvOptions& soptions,
const Slice& index_value) { const Slice& index_value,
bool for_compaction) {
return BlockReader(arg, options, index_value, nullptr); return BlockReader(arg, options, index_value, nullptr);
} }
@ -285,7 +304,8 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
Status Table::InternalGet(const ReadOptions& options, const Slice& k, Status Table::InternalGet(const ReadOptions& options, const Slice& k,
void* arg, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool)) { bool (*saver)(void*, const Slice&, const Slice&,
bool)) {
Status s; Status s;
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
bool done = false; bool done = false;

View File

@ -64,13 +64,16 @@ class Table {
// REQUIRES: key is in this table. // REQUIRES: key is in this table.
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
void SetAccessHintForCompaction();
private: private:
struct Rep; struct Rep;
Rep* rep_; Rep* rep_;
explicit Table(Rep* rep) { rep_ = rep; } explicit Table(Rep* rep) { rep_ = rep; }
static Iterator* BlockReader(void*, const ReadOptions&, static Iterator* BlockReader(void*, const ReadOptions&,
const EnvOptions& soptions, const Slice&); const EnvOptions& soptions, const Slice&,
bool for_compaction);
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
bool* didIO); bool* didIO);

View File

@ -14,7 +14,8 @@ namespace leveldb {
namespace { namespace {
typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
const EnvOptions& soptions, const Slice&); const EnvOptions& soptions, const Slice&,
bool for_compaction);
class TwoLevelIterator: public Iterator { class TwoLevelIterator: public Iterator {
public: public:
@ -23,7 +24,8 @@ class TwoLevelIterator: public Iterator {
BlockFunction block_function, BlockFunction block_function,
void* arg, void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions); const EnvOptions& soptions,
bool for_compaction);
virtual ~TwoLevelIterator(); virtual ~TwoLevelIterator();
@ -74,6 +76,7 @@ class TwoLevelIterator: public Iterator {
// If data_iter_ is non-nullptr, then "data_block_handle_" holds the // If data_iter_ is non-nullptr, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the data_iter_. // "index_value" passed to block_function_ to create the data_iter_.
std::string data_block_handle_; std::string data_block_handle_;
bool for_compaction_;
}; };
TwoLevelIterator::TwoLevelIterator( TwoLevelIterator::TwoLevelIterator(
@ -81,13 +84,15 @@ TwoLevelIterator::TwoLevelIterator(
BlockFunction block_function, BlockFunction block_function,
void* arg, void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions) const EnvOptions& soptions,
bool for_compaction)
: block_function_(block_function), : block_function_(block_function),
arg_(arg), arg_(arg),
options_(options), options_(options),
soptions_(soptions), soptions_(soptions),
index_iter_(index_iter), index_iter_(index_iter),
data_iter_(nullptr) { data_iter_(nullptr),
for_compaction_(for_compaction) {
} }
TwoLevelIterator::~TwoLevelIterator() { TwoLevelIterator::~TwoLevelIterator() {
@ -168,7 +173,8 @@ void TwoLevelIterator::InitDataBlock() {
// data_iter_ is already constructed with this iterator, so // data_iter_ is already constructed with this iterator, so
// no need to change anything // no need to change anything
} else { } else {
Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle); Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
for_compaction_);
data_block_handle_.assign(handle.data(), handle.size()); data_block_handle_.assign(handle.data(), handle.size());
SetDataIterator(iter); SetDataIterator(iter);
} }
@ -182,9 +188,10 @@ Iterator* NewTwoLevelIterator(
BlockFunction block_function, BlockFunction block_function,
void* arg, void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions) { const EnvOptions& soptions,
bool for_compaction) {
return new TwoLevelIterator(index_iter, block_function, arg, return new TwoLevelIterator(index_iter, block_function, arg,
options, soptions); options, soptions, for_compaction);
} }
} // namespace leveldb } // namespace leveldb

View File

@ -27,10 +27,12 @@ extern Iterator* NewTwoLevelIterator(
void* arg, void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions, const EnvOptions& soptions,
const Slice& index_value), const Slice& index_value,
bool for_compaction),
void* arg, void* arg,
const ReadOptions& options, const ReadOptions& options,
const EnvOptions& soptions); const EnvOptions& soptions,
bool for_compaction = false);
} // namespace leveldb } // namespace leveldb

View File

@ -207,6 +207,30 @@ class PosixRandomAccessFile: public RandomAccessFile {
return static_cast<size_t>(rid-id); return static_cast<size_t>(rid-id);
} }
#endif #endif
// Translates the requested access pattern into the matching POSIX_FADV_*
// advice and applies it with posix_fadvise() using offset 0 and length 0.
// The return value of posix_fadvise() is not inspected. An unrecognised
// pattern trips an assertion and issues no advice.
virtual void Hint(AccessPattern pattern) {
  int advice;
  switch (pattern) {
    case NORMAL:
      advice = POSIX_FADV_NORMAL;
      break;
    case RANDOM:
      advice = POSIX_FADV_RANDOM;
      break;
    case SEQUENTIAL:
      advice = POSIX_FADV_SEQUENTIAL;
      break;
    case WILLNEED:
      advice = POSIX_FADV_WILLNEED;
      break;
    case DONTNEED:
      advice = POSIX_FADV_DONTNEED;
      break;
    default:
      assert(false);
      return;
  }
  posix_fadvise(fd_, 0, 0, advice);
}
}; };
// mmap() based random-access // mmap() based random-access

View File

@ -71,9 +71,15 @@ Options::Options()
is_fd_close_on_exec(true), is_fd_close_on_exec(true),
skip_log_error_on_recovery(false), skip_log_error_on_recovery(false),
stats_dump_period_sec(3600), stats_dump_period_sec(3600),
block_size_deviation (10) { block_size_deviation (10),
advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL) {
} }
// Printable names for Options::access_hint_on_compaction_start, used when
// dumping the options to the log. The table is indexed directly by the enum
// value, so the entries must stay in the same order as the enum declaration
// (NONE, NORMAL, SEQUENTIAL, WILLNEED).
static const char* const access_hints[] = {
"NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
};
void void
Options::Dump(Logger* log) const Options::Dump(Logger* log) const
{ {
@ -198,6 +204,10 @@ Options::Dump(Logger* log) const
stats_dump_period_sec); stats_dump_period_sec);
Log(log," Options.block_size_deviation: %d", Log(log," Options.block_size_deviation: %d",
block_size_deviation); block_size_deviation);
Log(log," Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log," Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
} // Options::Dump } // Options::Dump
// //