Optimizing ApproximateSize to create index iterator just once (#5693)

Summary:
VersionSet::ApproximateSize previously created two separate index iterators in BlockBasedTable and ran a binary search in each, one for the start key and one for the end key. BlockBasedTable::ApproximateSize was added so that the index iterator is created once and reused to calculate the data size between the start and end keys.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5693

Differential Revision: D16774056

Pulled By: elipoz

fbshipit-source-id: 53ce262e1a057788243bf30cd9b8aa6581df1a18
Eli Pozniansky, 2019-08-16 14:16:49 -07:00 (committed by Facebook Github Bot)
parent c762efc4a9
commit c2404d9928
11 changed files with 170 additions and 29 deletions
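
For context, the code paths below are driven by the public size-estimation API. A minimal usage sketch (the database path and key names are invented for illustration):

#include <cassert>
#include <iostream>

#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/approx_size_demo", &db);
  assert(s.ok());

  // Estimate the on-disk size of one key range. With this change, each table
  // file consulted creates its index iterator once and Seek()s it twice,
  // instead of creating two iterators.
  rocksdb::Range r("key000", "key999");
  uint64_t size = 0;
  db->GetApproximateSizes(&r, 1, &size);
  std::cout << "approximate size: " << size << " bytes" << std::endl;

  delete db;
  return 0;
}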


@@ -553,4 +553,31 @@ uint64_t TableCache::ApproximateOffsetOf(
   return result;
 }
 
+uint64_t TableCache::ApproximateSize(
+    const Slice& start, const Slice& end, const FileDescriptor& fd,
+    TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+    const SliceTransform* prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s = FindTable(env_options_, internal_comparator, fd, &table_handle,
+                         prefix_extractor, false /* no_io */,
+                         !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateSize(start, end, caller);
+  }
+
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+
 }  // namespace rocksdb


@@ -167,6 +167,13 @@ class TableCache {
       const InternalKeyComparator& internal_comparator,
       const SliceTransform* prefix_extractor = nullptr);
 
+  // Returns approximated data size between start and end keys in a file
+  // represented by fd (the start key must not be greater than the end key).
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           const FileDescriptor& fd, TableReaderCaller caller,
+                           const InternalKeyComparator& internal_comparator,
+                           const SliceTransform* prefix_extractor = nullptr);
+
   // Release the handle from a cache
   void ReleaseHandle(Cache::Handle* handle);


@@ -5011,7 +5011,7 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
       uint64_t file_size = files_brief.files[i].fd.GetFileSize();
       // The entire file falls into the range, so we can just take its size.
       assert(file_size ==
-             ApproximateSize(v, files_brief.files[i], end, caller));
+             ApproximateSize(v, files_brief.files[i], start, end, caller));
       total_full_size += file_size;
     }
@@ -5045,25 +5045,23 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
   } else {
     // Estimate for all the first files, at each level
     for (const auto file_ptr : first_files) {
-      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
-      // subtract the bytes needed to be scanned to get to the starting key
-      uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
-      assert(total_full_size >= val);
-      total_full_size -= val;
+      total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
     }
 
     // Estimate for all the last files, at each level
     for (const auto file_ptr : last_files) {
-      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
+      // We could use ApproximateSize here, but calling ApproximateOffsetOf
+      // directly is just more efficient.
+      total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
     }
   }
 
   return total_full_size;
 }
 
-uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
-                                     const Slice& key,
-                                     TableReaderCaller caller) {
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+                                         const Slice& key,
+                                         TableReaderCaller caller) {
   // pre-condition
   assert(v);
   const auto& icmp = v->cfd_->internal_comparator();
@@ -5088,6 +5086,43 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
   return result;
 }
 
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+                                     const Slice& start, const Slice& end,
+                                     TableReaderCaller caller) {
+  // pre-condition
+  assert(v);
+  const auto& icmp = v->cfd_->internal_comparator();
+  assert(icmp.Compare(start, end) <= 0);
+
+  if (icmp.Compare(f.largest_key, start) <= 0 ||
+      icmp.Compare(f.smallest_key, end) > 0) {
+    // Entire file is before or after the start/end keys range
+    return 0;
+  }
+
+  if (icmp.Compare(f.smallest_key, start) >= 0) {
+    // Start of the range is before the file start - approximate by end offset
+    return ApproximateOffsetOf(v, f, end, caller);
+  }
+
+  if (icmp.Compare(f.largest_key, end) < 0) {
+    // End of the range is after the file end - approximate by subtracting
+    // start offset from the file size
+    uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+    assert(f.fd.GetFileSize() >= start_offset);
+    return f.fd.GetFileSize() - start_offset;
+  }
+
+  // The interval falls entirely in the range for this file.
+  TableCache* table_cache = v->cfd_->table_cache();
+  if (table_cache == nullptr) {
+    return 0;
+  }
+
+  return table_cache->ApproximateSize(
+      start, end, f.file_metadata->fd, caller, icmp,
+      v->GetMutableCFOptions().prefix_extractor.get());
+}
+
 void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
   // pre-calculate space requirement
   int64_t total_files = 0;
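
To make the case analysis above concrete (numbers invented for illustration): suppose a 100 MB file covers keys [d, q]. For a query range [a, k] the range starts before the file, so the contribution is ApproximateOffsetOf(k). For [k, z] the range ends after the file, so it is the 100 MB file size minus ApproximateOffsetOf(k). Only for a range such as [f, k], which falls entirely within the file's key span, does the new TableCache::ApproximateSize path run, and that path now costs a single index-iterator creation inside the table reader.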


@@ -1043,8 +1043,15 @@ class VersionSet {
     }
   };
 
+  // Returns approximated offset of a key in a file for a given version.
+  uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+                               const Slice& key, TableReaderCaller caller);
+
+  // Returns approximated data size between start and end keys in a file
+  // for a given version.
   uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
-                           const Slice& key, TableReaderCaller caller);
+                           const Slice& start, const Slice& end,
+                           TableReaderCaller caller);
 
   // Save current contents to *log
   Status WriteSnapshot(log::Writer* log);


@@ -3921,25 +3921,16 @@ Status BlockBasedTable::CreateIndexReader(
   }
 }
 
-uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
-                                              TableReaderCaller caller) {
-  BlockCacheLookupContext context(caller);
-  IndexBlockIter iiter_on_stack;
-  auto index_iter =
-      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
-                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
-                       /*lookup_context=*/&context);
-  index_iter->Seek(key);
-  uint64_t result;
-  if (index_iter->Valid()) {
-    BlockHandle handle = index_iter->value().handle;
+uint64_t BlockBasedTable::ApproximateOffsetOf(
+    const InternalIteratorBase<IndexValue>& index_iter) const {
+  uint64_t result = 0;
+  if (index_iter.Valid()) {
+    BlockHandle handle = index_iter.value().handle;
     result = handle.offset();
   } else {
-    // key is past the last key in the file. If table_properties is not
+    // The iterator is past the last key in the file. If table_properties is not
     // available, approximate the offset by returning the offset of the
     // metaindex block (which is right near the end of the file).
-    result = 0;
     if (rep_->table_properties) {
       result = rep_->table_properties->data_size;
     }
@@ -3949,11 +3940,48 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
     }
   }
+  return result;
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
+                                              TableReaderCaller caller) {
+  BlockCacheLookupContext context(caller);
+  IndexBlockIter iiter_on_stack;
+  auto index_iter =
+      NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false,
+                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
   if (index_iter != &iiter_on_stack) {
-    delete index_iter;
+    iiter_unique_ptr.reset(index_iter);
   }
-  return result;
+
+  index_iter->Seek(key);
+  return ApproximateOffsetOf(*index_iter);
 }
+
+uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
+                                          TableReaderCaller caller) {
+  assert(rep_->internal_comparator.Compare(start, end) <= 0);
+
+  BlockCacheLookupContext context(caller);
+  IndexBlockIter iiter_on_stack;
+  auto index_iter =
+      NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false,
+                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (index_iter != &iiter_on_stack) {
+    iiter_unique_ptr.reset(index_iter);
+  }
+
+  index_iter->Seek(start);
+  uint64_t start_offset = ApproximateOffsetOf(*index_iter);
+  index_iter->Seek(end);
+  uint64_t end_offset = ApproximateOffsetOf(*index_iter);
+
+  assert(end_offset >= start_offset);
+  return end_offset - start_offset;
+}
 
 bool BlockBasedTable::TEST_FilterBlockInCache() const {
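
The two Seek() calls above reuse a single index iterator where the old code performed one full NewIndexIterator() per endpoint. A self-contained sketch of the same idea (IndexEntry and ApproximateRangeSize are hypothetical names, not the RocksDB index format):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical flattened index: each entry maps a block's last key to the
// file offset at which that block starts.
struct IndexEntry {
  std::string last_key;
  uint64_t offset;
};

// Two lookups over one sorted index -- analogous to a single index iterator
// Seek()'d first to start and then to end.
uint64_t ApproximateRangeSize(const std::vector<IndexEntry>& index,
                              const std::string& start, const std::string& end,
                              uint64_t data_size) {
  auto offset_of = [&](const std::string& key) -> uint64_t {
    auto it = std::lower_bound(
        index.begin(), index.end(), key,
        [](const IndexEntry& e, const std::string& k) { return e.last_key < k; });
    // Past the last block: fall back to the total data size, mirroring the
    // table_properties->data_size fallback in ApproximateOffsetOf above.
    return it == index.end() ? data_size : it->offset;
  };
  uint64_t start_offset = offset_of(start);
  uint64_t end_offset = offset_of(end);
  return end_offset - start_offset;
}

int main() {
  std::vector<IndexEntry> index = {{"f", 0}, {"m", 4096}, {"t", 8192}};
  // Range [g, n]: start lands in the block ending at "m" (offset 4096), end
  // lands in the block ending at "t" (offset 8192) -> prints 4096.
  std::cout << ApproximateRangeSize(index, "g", "n", 12288) << std::endl;
  return 0;
}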


@@ -151,13 +151,20 @@ class BlockBasedTable : public TableReader {
   // Given a key, return an approximate byte offset in the file where
   // the data for that key begins (or would begin if the key were
   // present in the file). The returned value is in terms of file
   // bytes, and so includes effects like compression of the underlying data.
   // E.g., the approximate offset of the last key in the table will
   // be close to the file length.
   uint64_t ApproximateOffsetOf(const Slice& key,
                                TableReaderCaller caller) override;
 
+  // Given start and end keys, return the approximate data size in the file
+  // between the keys. The returned value is in terms of file bytes, and so
+  // includes effects like compression of the underlying data.
+  // The start key must not be greater than the end key.
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           TableReaderCaller caller) override;
+
   bool TEST_BlockInCache(const BlockHandle& handle) const;
 
   // Returns true if the block for the specified key is in cache.
@@ -438,6 +445,10 @@ class BlockBasedTable : public TableReader {
   static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer,
                                   size_t* size);
 
+  // Given an iterator return its offset in file.
+  uint64_t ApproximateOffsetOf(
+      const InternalIteratorBase<IndexValue>& index_iter) const;
+
   // Helper functions for DumpTable()
   Status DumpIndexBlock(WritableFile* out_file);
   Status DumpDataBlocks(WritableFile* out_file);


@@ -62,6 +62,12 @@ class CuckooTableReader: public TableReader {
                            TableReaderCaller /*caller*/) override {
     return 0;
   }
 
+  uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+                           TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
   void SetupForCompaction() override {}
   // End of methods not implemented.


@@ -55,6 +55,11 @@ class MockTableReader : public TableReader {
     return 0;
   }
 
+  uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+                           TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
   size_t ApproximateMemoryUsage() const override { return 0; }
 
   void SetupForCompaction() override {}


@@ -620,6 +620,12 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
   return 0;
 }
 
+uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
+                                           const Slice& /*end*/,
+                                           TableReaderCaller /*caller*/) {
+  return 0;
+}
+
 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
                                        bool use_prefix_seek)
     : table_(table),


@@ -94,6 +94,9 @@ class PlainTableReader: public TableReader {
   uint64_t ApproximateOffsetOf(const Slice& key,
                                TableReaderCaller caller) override;
 
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           TableReaderCaller caller) override;
+
   uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
 
   void SetupForCompaction() override;


@@ -65,6 +65,12 @@ class TableReader {
   virtual uint64_t ApproximateOffsetOf(const Slice& key,
                                        TableReaderCaller caller) = 0;
 
+  // Given start and end keys, return the approximate data size in the file
+  // between the keys. The returned value is in terms of file bytes, and so
+  // includes effects like compression of the underlying data.
+  virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                                   TableReaderCaller caller) = 0;
+
   // Set up the table for Compaction. Might change some parameters with
   // posix_fadvise
   virtual void SetupForCompaction() = 0;