Optimizing ApproximateSize to create index iterator just once (#5693)
Summary: VersionSet::ApproximateSize doesn't need to create two separate index iterators and do binary search for each in BlockBasedTable. So BlockBasedTable::ApproximateSize was added that creates the iterator once and uses it to calculate the data size between start and end keys. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5693 Differential Revision: D16774056 Pulled By: elipoz fbshipit-source-id: 53ce262e1a057788243bf30cd9b8aa6581df1a18
This commit is contained in:
parent
c762efc4a9
commit
c2404d9928
@ -553,4 +553,31 @@ uint64_t TableCache::ApproximateOffsetOf(
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
uint64_t TableCache::ApproximateSize(
|
||||
const Slice& start, const Slice& end, const FileDescriptor& fd,
|
||||
TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
|
||||
const SliceTransform* prefix_extractor) {
|
||||
uint64_t result = 0;
|
||||
TableReader* table_reader = fd.table_reader;
|
||||
Cache::Handle* table_handle = nullptr;
|
||||
if (table_reader == nullptr) {
|
||||
const bool for_compaction = (caller == TableReaderCaller::kCompaction);
|
||||
Status s = FindTable(env_options_, internal_comparator, fd, &table_handle,
|
||||
prefix_extractor, false /* no_io */,
|
||||
!for_compaction /* record_read_stats */);
|
||||
if (s.ok()) {
|
||||
table_reader = GetTableReaderFromHandle(table_handle);
|
||||
}
|
||||
}
|
||||
|
||||
if (table_reader != nullptr) {
|
||||
result = table_reader->ApproximateSize(start, end, caller);
|
||||
}
|
||||
if (table_handle != nullptr) {
|
||||
ReleaseHandle(table_handle);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
} // namespace rocksdb
|
||||
|
@ -167,6 +167,13 @@ class TableCache {
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
const SliceTransform* prefix_extractor = nullptr);
|
||||
|
||||
// Returns approximated data size between start and end keys in a file
|
||||
// represented by fd (the start key must not be greater than the end key).
|
||||
uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
||||
const FileDescriptor& fd, TableReaderCaller caller,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
const SliceTransform* prefix_extractor = nullptr);
|
||||
|
||||
// Release the handle from a cache
|
||||
void ReleaseHandle(Cache::Handle* handle);
|
||||
|
||||
|
@ -5011,7 +5011,7 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
||||
uint64_t file_size = files_brief.files[i].fd.GetFileSize();
|
||||
// The entire file falls into the range, so we can just take its size.
|
||||
assert(file_size ==
|
||||
ApproximateSize(v, files_brief.files[i], end, caller));
|
||||
ApproximateSize(v, files_brief.files[i], start, end, caller));
|
||||
total_full_size += file_size;
|
||||
}
|
||||
|
||||
@ -5045,25 +5045,23 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
||||
} else {
|
||||
// Estimate for all the first files, at each level
|
||||
for (const auto file_ptr : first_files) {
|
||||
total_full_size += ApproximateSize(v, *file_ptr, end, caller);
|
||||
// subtract the bytes needed to be scanned to get to the starting key
|
||||
uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
|
||||
assert(total_full_size >= val);
|
||||
total_full_size -= val;
|
||||
total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
|
||||
}
|
||||
|
||||
// Estimate for all the last files, at each level
|
||||
for (const auto file_ptr : last_files) {
|
||||
total_full_size += ApproximateSize(v, *file_ptr, end, caller);
|
||||
// We could use ApproximateSize here, but calling ApproximateOffsetOf
|
||||
// directly is just more efficient.
|
||||
total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
|
||||
}
|
||||
}
|
||||
|
||||
return total_full_size;
|
||||
}
|
||||
|
||||
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& key,
|
||||
TableReaderCaller caller) {
|
||||
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& key,
|
||||
TableReaderCaller caller) {
|
||||
// pre-condition
|
||||
assert(v);
|
||||
const auto& icmp = v->cfd_->internal_comparator();
|
||||
@ -5088,6 +5086,43 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
return result;
|
||||
}
|
||||
|
||||
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller) {
|
||||
// pre-condition
|
||||
assert(v);
|
||||
const auto& icmp = v->cfd_->internal_comparator();
|
||||
assert(icmp.Compare(start, end) <= 0);
|
||||
|
||||
if (icmp.Compare(f.largest_key, start) <= 0 ||
|
||||
icmp.Compare(f.smallest_key, end) > 0) {
|
||||
// Entire file is before or after the start/end keys range
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (icmp.Compare(f.smallest_key, start) >= 0) {
|
||||
// Start of the range is before the file start - approximate by end offset
|
||||
return ApproximateOffsetOf(v, f, end, caller);
|
||||
}
|
||||
|
||||
if (icmp.Compare(f.largest_key, end) < 0) {
|
||||
// End of the range is after the file end - approximate by subtracting
|
||||
// start offset from the file size
|
||||
uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
|
||||
assert(f.fd.GetFileSize() >= start_offset);
|
||||
return f.fd.GetFileSize() - start_offset;
|
||||
}
|
||||
|
||||
// The interval falls entirely in the range for this file.
|
||||
TableCache* table_cache = v->cfd_->table_cache();
|
||||
if (table_cache == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
return table_cache->ApproximateSize(
|
||||
start, end, f.file_metadata->fd, caller, icmp,
|
||||
v->GetMutableCFOptions().prefix_extractor.get());
|
||||
}
|
||||
|
||||
void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
|
||||
// pre-calculate space requirement
|
||||
int64_t total_files = 0;
|
||||
|
@ -1043,8 +1043,15 @@ class VersionSet {
|
||||
}
|
||||
};
|
||||
|
||||
// Returns approximated offset of a key in a file for a given version.
|
||||
uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& key, TableReaderCaller caller);
|
||||
|
||||
// Returns approximated data size between start and end keys in a file
|
||||
// for a given version.
|
||||
uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
|
||||
const Slice& key, TableReaderCaller caller);
|
||||
const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller);
|
||||
|
||||
// Save current contents to *log
|
||||
Status WriteSnapshot(log::Writer* log);
|
||||
|
@ -3921,25 +3921,16 @@ Status BlockBasedTable::CreateIndexReader(
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
||||
TableReaderCaller caller) {
|
||||
BlockCacheLookupContext context(caller);
|
||||
IndexBlockIter iiter_on_stack;
|
||||
auto index_iter =
|
||||
NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
|
||||
/*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
|
||||
/*lookup_context=*/&context);
|
||||
|
||||
index_iter->Seek(key);
|
||||
uint64_t result;
|
||||
if (index_iter->Valid()) {
|
||||
BlockHandle handle = index_iter->value().handle;
|
||||
uint64_t BlockBasedTable::ApproximateOffsetOf(
|
||||
const InternalIteratorBase<IndexValue>& index_iter) const {
|
||||
uint64_t result = 0;
|
||||
if (index_iter.Valid()) {
|
||||
BlockHandle handle = index_iter.value().handle;
|
||||
result = handle.offset();
|
||||
} else {
|
||||
// key is past the last key in the file. If table_properties is not
|
||||
// The iterator is past the last key in the file. If table_properties is not
|
||||
// available, approximate the offset by returning the offset of the
|
||||
// metaindex block (which is right near the end of the file).
|
||||
result = 0;
|
||||
if (rep_->table_properties) {
|
||||
result = rep_->table_properties->data_size;
|
||||
}
|
||||
@ -3949,11 +3940,48 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
||||
TableReaderCaller caller) {
|
||||
BlockCacheLookupContext context(caller);
|
||||
IndexBlockIter iiter_on_stack;
|
||||
auto index_iter =
|
||||
NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false,
|
||||
/*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
|
||||
/*lookup_context=*/&context);
|
||||
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
||||
if (index_iter != &iiter_on_stack) {
|
||||
delete index_iter;
|
||||
iiter_unique_ptr.reset(index_iter);
|
||||
}
|
||||
|
||||
return result;
|
||||
index_iter->Seek(key);
|
||||
return ApproximateOffsetOf(*index_iter);
|
||||
}
|
||||
|
||||
uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller) {
|
||||
assert(rep_->internal_comparator.Compare(start, end) <= 0);
|
||||
|
||||
BlockCacheLookupContext context(caller);
|
||||
IndexBlockIter iiter_on_stack;
|
||||
auto index_iter =
|
||||
NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false,
|
||||
/*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
|
||||
/*lookup_context=*/&context);
|
||||
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
||||
if (index_iter != &iiter_on_stack) {
|
||||
iiter_unique_ptr.reset(index_iter);
|
||||
}
|
||||
|
||||
index_iter->Seek(start);
|
||||
uint64_t start_offset = ApproximateOffsetOf(*index_iter);
|
||||
index_iter->Seek(end);
|
||||
uint64_t end_offset = ApproximateOffsetOf(*index_iter);
|
||||
|
||||
assert(end_offset >= start_offset);
|
||||
return end_offset - start_offset;
|
||||
}
|
||||
|
||||
bool BlockBasedTable::TEST_FilterBlockInCache() const {
|
||||
|
@ -151,13 +151,20 @@ class BlockBasedTable : public TableReader {
|
||||
|
||||
// Given a key, return an approximate byte offset in the file where
|
||||
// the data for that key begins (or would begin if the key were
|
||||
// present in the file). The returned value is in terms of file
|
||||
// present in the file). The returned value is in terms of file
|
||||
// bytes, and so includes effects like compression of the underlying data.
|
||||
// E.g., the approximate offset of the last key in the table will
|
||||
// be close to the file length.
|
||||
uint64_t ApproximateOffsetOf(const Slice& key,
|
||||
TableReaderCaller caller) override;
|
||||
|
||||
// Given start and end keys, return the approximate data size in the file
|
||||
// between the keys. The returned value is in terms of file bytes, and so
|
||||
// includes effects like compression of the underlying data.
|
||||
// The start key must not be greater than the end key.
|
||||
uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller) override;
|
||||
|
||||
bool TEST_BlockInCache(const BlockHandle& handle) const;
|
||||
|
||||
// Returns true if the block for the specified key is in cache.
|
||||
@ -438,6 +445,10 @@ class BlockBasedTable : public TableReader {
|
||||
static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer,
|
||||
size_t* size);
|
||||
|
||||
// Given an iterator return its offset in file.
|
||||
uint64_t ApproximateOffsetOf(
|
||||
const InternalIteratorBase<IndexValue>& index_iter) const;
|
||||
|
||||
// Helper functions for DumpTable()
|
||||
Status DumpIndexBlock(WritableFile* out_file);
|
||||
Status DumpDataBlocks(WritableFile* out_file);
|
||||
|
@ -62,6 +62,12 @@ class CuckooTableReader: public TableReader {
|
||||
TableReaderCaller /*caller*/) override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
|
||||
TableReaderCaller /*caller*/) override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void SetupForCompaction() override {}
|
||||
// End of methods not implemented.
|
||||
|
||||
|
@ -55,6 +55,11 @@ class MockTableReader : public TableReader {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
|
||||
TableReaderCaller /*caller*/) override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t ApproximateMemoryUsage() const override { return 0; }
|
||||
|
||||
void SetupForCompaction() override {}
|
||||
|
@ -620,6 +620,12 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
|
||||
const Slice& /*end*/,
|
||||
TableReaderCaller /*caller*/) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
PlainTableIterator::PlainTableIterator(PlainTableReader* table,
|
||||
bool use_prefix_seek)
|
||||
: table_(table),
|
||||
|
@ -94,6 +94,9 @@ class PlainTableReader: public TableReader {
|
||||
uint64_t ApproximateOffsetOf(const Slice& key,
|
||||
TableReaderCaller caller) override;
|
||||
|
||||
uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller) override;
|
||||
|
||||
uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
|
||||
void SetupForCompaction() override;
|
||||
|
||||
|
@ -65,6 +65,12 @@ class TableReader {
|
||||
virtual uint64_t ApproximateOffsetOf(const Slice& key,
|
||||
TableReaderCaller caller) = 0;
|
||||
|
||||
// Given start and end keys, return the approximate data size in the file
|
||||
// between the keys. The returned value is in terms of file bytes, and so
|
||||
// includes effects like compression of the underlying data.
|
||||
virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
||||
TableReaderCaller caller) = 0;
|
||||
|
||||
// Set up the table for Compaction. Might change some parameters with
|
||||
// posix_fadvise
|
||||
virtual void SetupForCompaction() = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user