2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2014-05-30 14:31:55 -07:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <queue>
|
|
|
|
|
2019-05-30 17:39:43 -07:00
|
|
|
#include "memory/arena.h"
|
2014-05-30 14:31:55 -07:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/options.h"
|
2015-10-12 15:06:38 -07:00
|
|
|
#include "table/internal_iterator.h"
|
2014-05-30 14:31:55 -07:00
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2014-05-30 14:31:55 -07:00
|
|
|
|
|
|
|
class DBImpl;
|
|
|
|
class Env;
|
|
|
|
struct SuperVersion;
|
|
|
|
class ColumnFamilyData;
|
2018-02-13 13:44:22 -08:00
|
|
|
class ForwardLevelIterator;
|
Reuse file iterators in tailing iterator when memtable is flushed
Summary:
Under a tailing workload, there were increased block cache
misses when a memtable was flushed because we were rebuilding iterators
in that case since the version set changed. This was exacerbated in the
case of iterate_upper_bound, since file iterators which were over the
iterate_upper_bound would have been deleted and are now brought back as
part of the Rebuild, only to be deleted again. We now renew the iterators
and only build iterators for files which are added and delete file
iterators for files which are deleted.
Refer to https://reviews.facebook.net/D50463 for previous version
Test Plan: DBTestTailingIterator.TailingIteratorTrimSeekToNext
Reviewers: anthony, IslamAbdelRahman, igor, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: yhchiang, march, dhruba, leveldb, lovro
Differential Revision: https://reviews.facebook.net/D50679
2015-11-13 15:50:59 -08:00
|
|
|
class VersionStorageInfo;
|
2014-06-03 12:28:58 -07:00
|
|
|
struct FileMetaData;
|
2014-05-30 14:31:55 -07:00
|
|
|
|
|
|
|
class MinIterComparator {
|
|
|
|
public:
|
|
|
|
explicit MinIterComparator(const Comparator* comparator) :
|
|
|
|
comparator_(comparator) {}
|
|
|
|
|
2015-10-12 15:06:38 -07:00
|
|
|
bool operator()(InternalIterator* a, InternalIterator* b) {
|
2014-05-30 14:31:55 -07:00
|
|
|
return comparator_->Compare(a->key(), b->key()) > 0;
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
const Comparator* comparator_;
|
|
|
|
};
|
|
|
|
|
2021-09-07 11:31:12 -07:00
|
|
|
using MinIterHeap =
|
|
|
|
std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
|
|
|
|
MinIterComparator>;
|
2014-05-30 14:31:55 -07:00
|
|
|
|
|
|
|
/**
 * ForwardIterator is a special type of iterator that only supports Seek()
 * and Next(). It is expected to perform better than TailingIterator by
 * removing the encapsulation and making all information accessible within
 * the iterator. In the current implementation, a snapshot is taken at the
 * time Seek() is called. Subsequent Next() calls do not see values written
 * after that point.
 */
|
2015-10-12 15:06:38 -07:00
|
|
|
class ForwardIterator : public InternalIterator {
|
2014-05-30 14:31:55 -07:00
|
|
|
public:
|
2014-06-03 12:28:58 -07:00
|
|
|
ForwardIterator(DBImpl* db, const ReadOptions& read_options,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr,
|
|
|
|
bool allow_unprepared_value = false);
|
2014-05-30 14:31:55 -07:00
|
|
|
virtual ~ForwardIterator();
|
|
|
|
|
2018-03-05 13:08:17 -08:00
|
|
|
void SeekForPrev(const Slice& /*target*/) override {
|
2016-09-27 18:20:57 -07:00
|
|
|
status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
|
|
|
|
valid_ = false;
|
|
|
|
}
|
2014-05-30 14:31:55 -07:00
|
|
|
void SeekToLast() override {
|
|
|
|
status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
|
|
|
|
valid_ = false;
|
|
|
|
}
|
2015-02-26 11:28:41 -08:00
|
|
|
void Prev() override {
|
2014-05-30 14:31:55 -07:00
|
|
|
status_ = Status::NotSupported("ForwardIterator::Prev");
|
|
|
|
valid_ = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual bool Valid() const override;
|
|
|
|
void SeekToFirst() override;
|
|
|
|
virtual void Seek(const Slice& target) override;
|
|
|
|
virtual void Next() override;
|
|
|
|
virtual Slice key() const override;
|
|
|
|
virtual Slice value() const override;
|
|
|
|
virtual Status status() const override;
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
virtual bool PrepareValue() override;
|
2016-02-29 18:38:03 -08:00
|
|
|
virtual Status GetProperty(std::string prop_name, std::string* prop) override;
|
2016-08-11 19:10:16 -07:00
|
|
|
virtual void SetPinnedItersMgr(
|
|
|
|
PinnedIteratorsManager* pinned_iters_mgr) override;
|
|
|
|
virtual bool IsKeyPinned() const override;
|
|
|
|
virtual bool IsValuePinned() const override;
|
2016-02-29 18:38:03 -08:00
|
|
|
|
2015-09-04 14:28:45 -07:00
|
|
|
bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
|
2014-05-30 14:31:55 -07:00
|
|
|
|
|
|
|
private:
|
2014-10-23 15:34:21 -07:00
|
|
|
void Cleanup(bool release_sv);
|
2018-02-02 21:16:26 -08:00
|
|
|
// Unreference and, if needed, clean up the current SuperVersion. This is
|
|
|
|
// either done immediately or deferred until this iterator is unpinned by
|
|
|
|
// PinnedIteratorsManager.
|
Reuse file iterators in tailing iterator when memtable is flushed
Summary:
Under a tailing workload, there were increased block cache
misses when a memtable was flushed because we were rebuilding iterators
in that case since the version set changed. This was exacerbated in the
case of iterate_upper_bound, since file iterators which were over the
iterate_upper_bound would have been deleted and are now brought back as
part of the Rebuild, only to be deleted again. We now renew the iterators
and only build iterators for files which are added and delete file
iterators for files which are deleted.
Refer to https://reviews.facebook.net/D50463 for previous version
Test Plan: DBTestTailingIterator.TailingIteratorTrimSeekToNext
Reviewers: anthony, IslamAbdelRahman, igor, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: yhchiang, march, dhruba, leveldb, lovro
Differential Revision: https://reviews.facebook.net/D50679
2015-11-13 15:50:59 -08:00
|
|
|
void SVCleanup();
|
2018-02-02 21:16:26 -08:00
|
|
|
static void SVCleanup(
|
|
|
|
DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup);
|
|
|
|
static void DeferredSVCleanup(void* arg);
|
|
|
|
|
2014-10-23 15:34:21 -07:00
|
|
|
void RebuildIterators(bool refresh_sv);
|
Reuse file iterators in tailing iterator when memtable is flushed
Summary:
Under a tailing workload, there were increased block cache
misses when a memtable was flushed because we were rebuilding iterators
in that case since the version set changed. This was exacerbated in the
case of iterate_upper_bound, since file iterators which were over the
iterate_upper_bound would have been deleted and are now brought back as
part of the Rebuild, only to be deleted again. We now renew the iterators
and only build iterators for files which are added and delete file
iterators for files which are deleted.
Refer to https://reviews.facebook.net/D50463 for previous version
Test Plan: DBTestTailingIterator.TailingIteratorTrimSeekToNext
Reviewers: anthony, IslamAbdelRahman, igor, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: yhchiang, march, dhruba, leveldb, lovro
Differential Revision: https://reviews.facebook.net/D50679
2015-11-13 15:50:59 -08:00
|
|
|
void RenewIterators();
|
2022-01-21 11:36:36 -08:00
|
|
|
void BuildLevelIterators(const VersionStorageInfo* vstorage,
|
|
|
|
SuperVersion* sv);
|
2014-08-29 14:32:37 -07:00
|
|
|
void ResetIncompleteIterators();
|
2014-05-30 14:31:55 -07:00
|
|
|
void SeekInternal(const Slice& internal_key, bool seek_to_first);
|
|
|
|
void UpdateCurrent();
|
|
|
|
bool NeedToSeekImmutable(const Slice& internal_key);
|
2015-08-19 16:05:51 -07:00
|
|
|
void DeleteCurrentIter();
|
2014-05-30 14:31:55 -07:00
|
|
|
uint32_t FindFileInRange(
|
|
|
|
const std::vector<FileMetaData*>& files, const Slice& internal_key,
|
|
|
|
uint32_t left, uint32_t right);
|
|
|
|
|
2015-08-19 16:05:51 -07:00
|
|
|
bool IsOverUpperBound(const Slice& internal_key) const;
|
|
|
|
|
2016-08-11 19:10:16 -07:00
|
|
|
// Set PinnedIteratorsManager for all children Iterators, this function should
|
|
|
|
// be called whenever we update children Iterators or pinned_iters_mgr_.
|
|
|
|
void UpdateChildrenPinnedItersMgr();
|
|
|
|
|
|
|
|
// A helper function that will release iter in the proper manner, or pass it
|
|
|
|
// to pinned_iters_mgr_ to release it later if pinning is enabled.
|
|
|
|
void DeleteIterator(InternalIterator* iter, bool is_arena = false);
|
|
|
|
|
2014-05-30 14:31:55 -07:00
|
|
|
DBImpl* const db_;
|
|
|
|
const ReadOptions read_options_;
|
|
|
|
ColumnFamilyData* const cfd_;
|
|
|
|
const SliceTransform* const prefix_extractor_;
|
|
|
|
const Comparator* user_comparator_;
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
const bool allow_unprepared_value_;
|
2014-05-30 14:31:55 -07:00
|
|
|
MinIterHeap immutable_min_heap_;
|
|
|
|
|
|
|
|
SuperVersion* sv_;
|
2015-10-12 15:06:38 -07:00
|
|
|
InternalIterator* mutable_iter_;
|
|
|
|
std::vector<InternalIterator*> imm_iters_;
|
|
|
|
std::vector<InternalIterator*> l0_iters_;
|
2018-02-13 13:44:22 -08:00
|
|
|
std::vector<ForwardLevelIterator*> level_iters_;
|
2015-10-12 15:06:38 -07:00
|
|
|
InternalIterator* current_;
|
2015-08-31 16:44:34 -07:00
|
|
|
bool valid_;
|
|
|
|
|
|
|
|
// Internal iterator status; set only by one of the unsupported methods.
|
2014-05-30 14:31:55 -07:00
|
|
|
Status status_;
|
2015-08-31 16:44:34 -07:00
|
|
|
// Status of immutable iterators, maintained here to avoid iterating over
|
|
|
|
// all of them in status().
|
2014-09-26 14:20:24 -07:00
|
|
|
Status immutable_status_;
|
2015-08-31 16:44:34 -07:00
|
|
|
// Indicates that at least one of the immutable iterators pointed to a key
|
|
|
|
// larger than iterate_upper_bound and was therefore destroyed. Seek() may
|
|
|
|
// need to rebuild such iterators.
|
2015-08-19 16:05:51 -07:00
|
|
|
bool has_iter_trimmed_for_upper_bound_;
|
2015-08-31 16:44:34 -07:00
|
|
|
// Is current key larger than iterate_upper_bound? If so, makes Valid()
|
|
|
|
// return false.
|
|
|
|
bool current_over_upper_bound_;
|
|
|
|
|
|
|
|
// Left endpoint of the range of keys that immutable iterators currently
|
|
|
|
// cover. When Seek() is called with a key that's within that range, immutable
|
|
|
|
// iterators don't need to be moved; see NeedToSeekImmutable(). This key is
|
|
|
|
// included in the range after a Seek(), but excluded when advancing the
|
|
|
|
// iterator using Next().
|
2014-05-30 14:31:55 -07:00
|
|
|
IterKey prev_key_;
|
|
|
|
bool is_prev_set_;
|
ForwardIterator: update prev_key_ only if prefix hasn't changed
Summary:
Since ForwardIterator is on a level below DBIter, the latter may call Next() on
it (e.g. in order to skip deletion markers). Since this also updates
`prev_key_`, it may prevent the Seek() optimization.
For example, assume that there's only one SST file and it contains the following
entries: 0101, 0201 (`ValueType::kTypeDeletion`, i.e. a tombstone record), 0201
(`kTypeValue`), 0202. Memtable is empty. `Seek(0102)` will result in `prev_key_`
being set to `0201` instead of `0102`, since `DBIter::Seek()` will call
`ForwardIterator::Next()` to skip record 0201. Therefore, when `Seek(0102)` is
called again, `NeedToSeekImmutable()` will return true.
This fix relies on `prefix_extractor_` to detect prefix changes. `prev_key_` is
only set to `current_->key()` as long as they have the same prefix.
I also made a small change to `NeedToSeekImmutable()` so it no longer returns
true when the db is empty (i.e. there's nothing but a memtable).
Test Plan:
$ TEST_TMPDIR=/dev/shm/rocksdbtest ROCKSDB_TESTS=TailingIterator ./db_test
Reviewers: sdong, igor, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23823
2014-09-22 15:20:03 -07:00
|
|
|
bool is_prev_inclusive_;
|
2015-08-31 16:44:34 -07:00
|
|
|
|
2016-08-11 19:10:16 -07:00
|
|
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
2014-09-04 17:40:41 -07:00
|
|
|
Arena arena_;
|
2014-05-30 14:31:55 -07:00
|
|
|
};
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2014-05-30 14:31:55 -07:00
|
|
|
#endif // ROCKSDB_LITE
|