Improve CPU Efficiency of ApproximateSize (part 2) (#5609)
Summary: In some cases, we don't need a really accurate number; something like 10% off is fine, so we can create a new option for that use case. In this case, we can calculate the size for full files first, and avoid estimation inside SST files if the full files already account for a huge number. For example, if we already covered 100GB of data, we should be able to skip partial dives into 10 SST files of 30MB. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609 Differential Revision: D16433481 Pulled By: elipoz fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b
This commit is contained in:
parent
b538e756c2
commit
4834dab578
@ -22,6 +22,7 @@
|
||||
* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact.
|
||||
* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
|
||||
* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled.
|
||||
* Added new option in SizeApproximationOptions used with DB::GetApproximateSizes. When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin.
|
||||
|
||||
### Performance Improvements
|
||||
* Reduce iterator key comparisons for upper/lower bound checks.
|
||||
|
@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() {
|
||||
// to the index block and may incur I/O cost in the process. Unlock db
|
||||
// mutex to reduce contention
|
||||
db_mutex_->Unlock();
|
||||
uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1,
|
||||
uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
|
||||
b, start_lvl, out_lvl + 1,
|
||||
TableReaderCaller::kCompaction);
|
||||
db_mutex_->Lock();
|
||||
ranges.emplace_back(a, b, size);
|
||||
|
@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
|
||||
sizes[i] = 0;
|
||||
if (options.include_files) {
|
||||
sizes[i] += versions_->ApproximateSize(
|
||||
v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1,
|
||||
TableReaderCaller::kUserApproximateSize);
|
||||
options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
|
||||
/*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
|
||||
}
|
||||
if (options.include_memtabtles) {
|
||||
sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
|
||||
|
@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
options.compression = kNoCompression;
|
||||
options.create_if_missing = true;
|
||||
DestroyAndReopen(options);
|
||||
auto default_cf = db_->DefaultColumnFamily();
|
||||
|
||||
const int N = 128;
|
||||
Random rnd(301);
|
||||
@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
std::string start = Key(50);
|
||||
std::string end = Key(60);
|
||||
Range r(start, end);
|
||||
uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES |
|
||||
DB::SizeApproximationFlags::INCLUDE_MEMTABLES;
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
SizeApproximationOptions size_approx_options;
|
||||
size_approx_options.include_memtabtles = true;
|
||||
size_approx_options.include_files = true;
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_GT(size, 6000);
|
||||
ASSERT_LT(size, 204800);
|
||||
// Zero if not including mem table
|
||||
@ -1280,7 +1282,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
start = Key(500);
|
||||
end = Key(600);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_EQ(size, 0);
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
@ -1290,19 +1292,20 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
start = Key(500);
|
||||
end = Key(600);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_EQ(size, 0);
|
||||
|
||||
start = Key(100);
|
||||
end = Key(1020);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_GT(size, 6000);
|
||||
|
||||
options.max_write_buffer_number = 8;
|
||||
options.min_write_buffer_number_to_merge = 5;
|
||||
options.write_buffer_size = 1024 * N; // Not very large
|
||||
DestroyAndReopen(options);
|
||||
default_cf = db_->DefaultColumnFamily();
|
||||
|
||||
int keys[N * 3];
|
||||
for (int i = 0; i < N; i++) {
|
||||
@ -1319,26 +1322,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
start = Key(100);
|
||||
end = Key(300);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_EQ(size, 0);
|
||||
|
||||
start = Key(1050);
|
||||
end = Key(1080);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_GT(size, 6000);
|
||||
|
||||
start = Key(2100);
|
||||
end = Key(2300);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_EQ(size, 0);
|
||||
|
||||
start = Key(1050);
|
||||
end = Key(1080);
|
||||
r = Range(start, end);
|
||||
uint64_t size_with_mt, size_without_mt;
|
||||
db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
|
||||
&size_with_mt);
|
||||
ASSERT_GT(size_with_mt, 6000);
|
||||
db_->GetApproximateSizes(&r, 1, &size_without_mt);
|
||||
ASSERT_EQ(size_without_mt, 0);
|
||||
@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||
start = Key(1050);
|
||||
end = Key(1080);
|
||||
r = Range(start, end);
|
||||
db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
|
||||
&size_with_mt);
|
||||
db_->GetApproximateSizes(&r, 1, &size_without_mt);
|
||||
ASSERT_GT(size_with_mt, size_without_mt);
|
||||
ASSERT_GT(size_without_mt, 6000);
|
||||
|
||||
// Check that include_memtabtles flag works as expected
|
||||
size_approx_options.include_memtabtles = false;
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_EQ(size, size_without_mt);
|
||||
|
||||
// Check that files_size_error_margin works as expected, when the heuristic
|
||||
// conditions are not met
|
||||
start = Key(1);
|
||||
end = Key(1000 + N - 2);
|
||||
r = Range(start, end);
|
||||
size_approx_options.files_size_error_margin = -1.0; // disabled
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
uint64_t size2;
|
||||
size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
|
||||
ASSERT_EQ(size, size2);
|
||||
}
|
||||
|
||||
// Checks that GetApproximateSizes() with files_size_error_margin enabled
// returns a files-only estimate that stays within the requested margin of the
// estimate obtained with the heuristic disabled.
// Outline: write N keys, flush and compact them, write N / 4 more keys, flush
// again, wait for compaction, then compare the two estimates for one range.
TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
|
||||
Options options = CurrentOptions();
|
||||
options.write_buffer_size = 1024 * 1024;
|
||||
options.compression = kNoCompression;
|
||||
options.create_if_missing = true;
|
||||
options.target_file_size_base = 1024 * 1024;
|
||||
DestroyAndReopen(options);
|
||||
const auto default_cf = db_->DefaultColumnFamily();
|
||||
|
||||
const int N = 64000;
|
||||
Random rnd(301);
|
||||
// First batch: keys Key(0) .. Key(N - 1), each with a value produced by
// RandomString(&rnd, 1024).
for (int i = 0; i < N; i++) {
|
||||
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
|
||||
}
|
||||
// Flush everything to files
|
||||
Flush();
|
||||
// Compact the entire key space into the next level
// (nullptr begin/end are passed as the range bounds)
|
||||
db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
|
||||
|
||||
// Write more keys
// (second batch: Key(N) .. Key(N + N / 4 - 1))
|
||||
for (int i = N; i < (N + N / 4); i++) {
|
||||
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
|
||||
}
|
||||
// Flush everything to files again
|
||||
Flush();
|
||||
|
||||
// Wait for compaction to finish
|
||||
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
||||
|
||||
// Query range [Key(0), Key(2 * N)). The index 2 * N is larger than any index
// written above; NOTE(review): this presumably makes the range cover every
// written key, assuming Key() preserves numeric order - confirm.
const std::string start = Key(0);
|
||||
const std::string end = Key(2 * N);
|
||||
const Range r(start, end);
|
||||
|
||||
// Only file sizes are requested; memtables are excluded from the estimate.
SizeApproximationOptions size_approx_options;
|
||||
size_approx_options.include_memtabtles = false;
|
||||
size_approx_options.include_files = true;
|
||||
size_approx_options.files_size_error_margin = -1.0; // disabled
|
||||
|
||||
// Get the precise size without any approximation heuristic
// (this value is the baseline the second estimate is compared against)
|
||||
uint64_t size;
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||
ASSERT_NE(size, 0);
|
||||
|
||||
// Get the size with an approximation heuristic
|
||||
uint64_t size2;
|
||||
const double error_margin = 0.2;
|
||||
size_approx_options.files_size_error_margin = error_margin;
|
||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
|
||||
// The heuristic estimate must lie strictly inside
// (size * (1 - error_margin), size * (1 + error_margin)).
ASSERT_LT(size2, size * (1 + error_margin));
|
||||
ASSERT_GT(size2, size * (1 - error_margin));
|
||||
}
|
||||
|
||||
TEST_F(DBTest, GetApproximateMemTableStats) {
|
||||
|
@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
|
||||
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
|
||||
// we avoid doing binary search for the keys b and c twice and instead somehow
|
||||
// maintain state of where they first appear in the files.
|
||||
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
|
||||
uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
||||
Version* v, const Slice& start,
|
||||
const Slice& end, int start_level,
|
||||
int end_level, TableReaderCaller caller) {
|
||||
// pre-condition
|
||||
assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
|
||||
const auto& icmp = v->cfd_->internal_comparator();
|
||||
|
||||
uint64_t size = 0;
|
||||
// pre-condition
|
||||
assert(icmp.Compare(start, end) <= 0);
|
||||
|
||||
uint64_t total_full_size = 0;
|
||||
const auto* vstorage = v->storage_info();
|
||||
end_level = end_level == -1
|
||||
? vstorage->num_non_empty_levels()
|
||||
: std::min(end_level, vstorage->num_non_empty_levels());
|
||||
const int num_non_empty_levels = vstorage->num_non_empty_levels();
|
||||
end_level = (end_level == -1) ? num_non_empty_levels
|
||||
: std::min(end_level, num_non_empty_levels);
|
||||
|
||||
assert(start_level <= end_level);
|
||||
|
||||
for (int level = start_level; level < end_level; level++) {
|
||||
// Outline of the optimization that uses options.files_size_error_margin.
|
||||
// When approximating the files total size that is used to store a keys range,
|
||||
// we first sum up the sizes of the files that fully fall into the range.
|
||||
// Then we sum up the sizes of all the files that may intersect with the range
|
||||
// (this includes all files in L0 as well). Then, if total_intersecting_size
|
||||
// is smaller than total_full_size * options.files_size_error_margin - we can
|
||||
// infer that the intersecting files have a sufficiently negligible
|
||||
// contribution to the total size, and we can approximate the storage required
|
||||
// for the keys in range as just half of the total_intersecting_size.
|
||||
// E.g., if the value of files_size_error_margin is 0.1, then the error of the
|
||||
// approximation is limited to only ~10% of the total size of files that fully
|
||||
// fall into the keys range. In such case, this helps to avoid a costly
|
||||
// process of binary searching the intersecting files that is required only
|
||||
// for a more precise calculation of the total size.
|
||||
|
||||
autovector<FdWithKeyRange*, 32> first_files;
|
||||
autovector<FdWithKeyRange*, 16> last_files;
|
||||
|
||||
// scan all the levels
|
||||
for (int level = start_level; level < end_level; ++level) {
|
||||
const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
|
||||
if (!files_brief.num_files) {
|
||||
if (files_brief.num_files == 0) {
|
||||
// empty level, skip exploration
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!level) {
|
||||
// level 0 data is sorted order, handle the use case explicitly
|
||||
size += ApproximateSizeLevel0(v, files_brief, start, end, caller);
|
||||
if (level == 0) {
|
||||
// level 0 files are not in sorted order, we need to iterate through
|
||||
// the list to compute the total bytes that require scanning,
|
||||
// so handle the case explicitly (similarly to first_files case)
|
||||
for (size_t i = 0; i < files_brief.num_files; i++) {
|
||||
first_files.push_back(&files_brief.files[i]);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(level > 0);
|
||||
assert(files_brief.num_files > 0);
|
||||
|
||||
// identify the file position for starting key
|
||||
const uint64_t idx_start = FindFileInRange(
|
||||
v->cfd_->internal_comparator(), files_brief, start,
|
||||
/*start=*/0, static_cast<uint32_t>(files_brief.num_files - 1));
|
||||
assert(idx_start < files_brief.num_files);
|
||||
// identify the file position for start key
|
||||
const int idx_start =
|
||||
FindFileInRange(icmp, files_brief, start, 0,
|
||||
static_cast<uint32_t>(files_brief.num_files - 1));
|
||||
assert(static_cast<size_t>(idx_start) < files_brief.num_files);
|
||||
|
||||
// scan all files from the starting position until the ending position
|
||||
// inferred from the sorted order
|
||||
for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
|
||||
uint64_t val;
|
||||
val = ApproximateSize(v, files_brief.files[i], end, caller);
|
||||
if (!val) {
|
||||
// the files after this will not have the range
|
||||
break;
|
||||
}
|
||||
// identify the file position for end key
|
||||
int idx_end = idx_start;
|
||||
if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
|
||||
idx_end =
|
||||
FindFileInRange(icmp, files_brief, end, idx_start,
|
||||
static_cast<uint32_t>(files_brief.num_files - 1));
|
||||
}
|
||||
assert(idx_end >= idx_start &&
|
||||
static_cast<size_t>(idx_end) < files_brief.num_files);
|
||||
|
||||
size += val;
|
||||
// scan all files from the starting index to the ending index
|
||||
// (inferred from the sorted order)
|
||||
|
||||
if (i == idx_start) {
|
||||
// subtract the bytes needed to be scanned to get to the starting
|
||||
// key
|
||||
val = ApproximateSize(v, files_brief.files[i], start, caller);
|
||||
assert(size >= val);
|
||||
size -= val;
|
||||
}
|
||||
// first scan all the intermediate full files (excluding first and last)
|
||||
for (int i = idx_start + 1; i < idx_end; ++i) {
|
||||
uint64_t file_size = files_brief.files[i].fd.GetFileSize();
|
||||
// The entire file falls into the range, so we can just take its size.
|
||||
assert(file_size ==
|
||||
ApproximateSize(v, files_brief.files[i], end, caller));
|
||||
total_full_size += file_size;
|
||||
}
|
||||
|
||||
// save the first and the last files (which may be the same file), so we
|
||||
// can scan them later.
|
||||
first_files.push_back(&files_brief.files[idx_start]);
|
||||
if (idx_start != idx_end) {
|
||||
// we need to estimate size for both files, only if they are different
|
||||
last_files.push_back(&files_brief.files[idx_end]);
|
||||
}
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
|
||||
const LevelFilesBrief& files_brief,
|
||||
const Slice& key_start,
|
||||
const Slice& key_end,
|
||||
TableReaderCaller caller) {
|
||||
// level 0 files are not in sorted order, we need to iterate through
|
||||
// the list to compute the total bytes that require scanning
|
||||
uint64_t size = 0;
|
||||
for (size_t i = 0; i < files_brief.num_files; i++) {
|
||||
const uint64_t start =
|
||||
ApproximateSize(v, files_brief.files[i], key_start, caller);
|
||||
const uint64_t end =
|
||||
ApproximateSize(v, files_brief.files[i], key_end, caller);
|
||||
assert(end >= start);
|
||||
size += end - start;
|
||||
// The sum of all file sizes that intersect the [start, end] keys range.
|
||||
uint64_t total_intersecting_size = 0;
|
||||
for (const auto* file_ptr : first_files) {
|
||||
total_intersecting_size += file_ptr->fd.GetFileSize();
|
||||
}
|
||||
return size;
|
||||
for (const auto* file_ptr : last_files) {
|
||||
total_intersecting_size += file_ptr->fd.GetFileSize();
|
||||
}
|
||||
|
||||
// Now scan all the first & last files at each level, and estimate their size.
|
||||
// If the total_intersecting_size is less than X% of the total_full_size - we
|
||||
// want to approximate the result in order to avoid the costly binary search
|
||||
// inside ApproximateSize. We use half of file size as an approximation below.
|
||||
|
||||
const double margin = options.files_size_error_margin;
|
||||
if (margin > 0 && total_intersecting_size <
|
||||
static_cast<uint64_t>(total_full_size * margin)) {
|
||||
total_full_size += total_intersecting_size / 2;
|
||||
} else {
|
||||
// Estimate for all the first files, at each level
|
||||
for (const auto file_ptr : first_files) {
|
||||
total_full_size += ApproximateSize(v, *file_ptr, end, caller);
|
||||
// subtract the bytes needed to be scanned to get to the starting key
|
||||
uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
|
||||
assert(total_full_size >= val);
|
||||
total_full_size -= val;
|
||||
}
|
||||
|
||||
// Estimate for all the last files, at each level
|
||||
for (const auto file_ptr : last_files) {
|
||||
total_full_size += ApproximateSize(v, *file_ptr, end, caller);
|
||||
}
|
||||
}
|
||||
|
||||
return total_full_size;
|
||||
}
|
||||
|
||||
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
TableReaderCaller caller) {
|
||||
// pre-condition
|
||||
assert(v);
|
||||
const auto& icmp = v->cfd_->internal_comparator();
|
||||
|
||||
uint64_t result = 0;
|
||||
if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
|
||||
if (icmp.Compare(f.largest_key, key) <= 0) {
|
||||
// Entire file is before "key", so just add the file size
|
||||
result = f.fd.GetFileSize();
|
||||
} else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
|
||||
} else if (icmp.Compare(f.smallest_key, key) > 0) {
|
||||
// Entire file is after "key", so ignore
|
||||
result = 0;
|
||||
} else {
|
||||
@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
TableCache* table_cache = v->cfd_->table_cache();
|
||||
if (table_cache != nullptr) {
|
||||
result = table_cache->ApproximateOffsetOf(
|
||||
key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(),
|
||||
key, f.file_metadata->fd, caller, icmp,
|
||||
v->GetMutableCFOptions().prefix_extractor.get());
|
||||
}
|
||||
}
|
||||
|
@ -983,7 +983,8 @@ class VersionSet {
|
||||
// Return the approximate size of data to be scanned for range [start, end)
|
||||
// in levels [start_level, end_level). If end_level == -1 it will search
|
||||
// through all non-empty levels
|
||||
uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
|
||||
uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
|
||||
const Slice& start, const Slice& end,
|
||||
int start_level, int end_level,
|
||||
TableReaderCaller caller);
|
||||
|
||||
@ -1033,11 +1034,6 @@ class VersionSet {
|
||||
}
|
||||
};
|
||||
|
||||
// ApproximateSize helper
|
||||
uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
|
||||
const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller);
|
||||
|
||||
uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& key, TableReaderCaller caller);
|
||||
|
||||
|
@ -1514,6 +1514,16 @@ struct SizeApproximationOptions {
|
||||
// Defines whether the returned size should include data serialized to disk.
|
||||
// If set to false, include_memtabtles must be true.
|
||||
bool include_files = true;
|
||||
// When approximating the total size of the files used to store a key range
|
||||
// using DB::GetApproximateSizes, allow approximation with an error margin of
|
||||
// up to total_files_size * files_size_error_margin. This allows taking some
|
||||
// shortcuts in files size approximation, resulting in better performance,
|
||||
// while guaranteeing the resulting error is within a reasonable margin.
|
||||
// E.g., if the value is 0.1, then the error margin of the returned files size
|
||||
// approximation will be within 10%.
|
||||
// If the value is non-positive - a more precise yet more CPU intensive
|
||||
// estimation is performed.
|
||||
double files_size_error_margin = -1.0;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
Loading…
Reference in New Issue
Block a user