rocksdb/table/two_level_iterator.cc
Mike Kolupaev b4d7209428 Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.

Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.

So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.

Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.

This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289

Differential Revision: D15256423

Pulled By: al13n321

fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:54:04 -07:00

212 lines
6.3 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/two_level_iterator.h"
#include "db/pinned_iterators_manager.h"
#include "memory/arena.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/format.h"
namespace rocksdb {
namespace {
class TwoLevelIndexIterator : public InternalIteratorBase<IndexValue> {
public:
explicit TwoLevelIndexIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<IndexValue>* first_level_iter);
~TwoLevelIndexIterator() override {
first_level_iter_.DeleteIter(false /* is_arena_mode */);
second_level_iter_.DeleteIter(false /* is_arena_mode */);
delete state_;
}
void Seek(const Slice& target) override;
void SeekForPrev(const Slice& target) override;
void SeekToFirst() override;
void SeekToLast() override;
void Next() override;
void Prev() override;
bool Valid() const override { return second_level_iter_.Valid(); }
Slice key() const override {
assert(Valid());
return second_level_iter_.key();
}
IndexValue value() const override {
assert(Valid());
return second_level_iter_.value();
}
Status status() const override {
if (!first_level_iter_.status().ok()) {
assert(second_level_iter_.iter() == nullptr);
return first_level_iter_.status();
} else if (second_level_iter_.iter() != nullptr &&
!second_level_iter_.status().ok()) {
return second_level_iter_.status();
} else {
return status_;
}
}
void SetPinnedItersMgr(
PinnedIteratorsManager* /*pinned_iters_mgr*/) override {}
bool IsKeyPinned() const override { return false; }
bool IsValuePinned() const override { return false; }
private:
void SaveError(const Status& s) {
if (status_.ok() && !s.ok()) status_ = s;
}
void SkipEmptyDataBlocksForward();
void SkipEmptyDataBlocksBackward();
void SetSecondLevelIterator(InternalIteratorBase<IndexValue>* iter);
void InitDataBlock();
TwoLevelIteratorState* state_;
IteratorWrapperBase<IndexValue> first_level_iter_;
IteratorWrapperBase<IndexValue> second_level_iter_; // May be nullptr
Status status_;
// If second_level_iter is non-nullptr, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the second_level_iter.
BlockHandle data_block_handle_;
};
TwoLevelIndexIterator::TwoLevelIndexIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<IndexValue>* first_level_iter)
: state_(state), first_level_iter_(first_level_iter) {}
void TwoLevelIndexIterator::Seek(const Slice& target) {
first_level_iter_.Seek(target);
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.Seek(target);
}
SkipEmptyDataBlocksForward();
}
void TwoLevelIndexIterator::SeekForPrev(const Slice& target) {
first_level_iter_.Seek(target);
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekForPrev(target);
}
if (!Valid()) {
if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) {
first_level_iter_.SeekToLast();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekForPrev(target);
}
}
SkipEmptyDataBlocksBackward();
}
}
void TwoLevelIndexIterator::SeekToFirst() {
first_level_iter_.SeekToFirst();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekToFirst();
}
SkipEmptyDataBlocksForward();
}
void TwoLevelIndexIterator::SeekToLast() {
first_level_iter_.SeekToLast();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekToLast();
}
SkipEmptyDataBlocksBackward();
}
void TwoLevelIndexIterator::Next() {
assert(Valid());
second_level_iter_.Next();
SkipEmptyDataBlocksForward();
}
void TwoLevelIndexIterator::Prev() {
assert(Valid());
second_level_iter_.Prev();
SkipEmptyDataBlocksBackward();
}
void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() {
while (second_level_iter_.iter() == nullptr ||
(!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
// Move to next block
if (!first_level_iter_.Valid()) {
SetSecondLevelIterator(nullptr);
return;
}
first_level_iter_.Next();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekToFirst();
}
}
}
void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() {
while (second_level_iter_.iter() == nullptr ||
(!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
// Move to next block
if (!first_level_iter_.Valid()) {
SetSecondLevelIterator(nullptr);
return;
}
first_level_iter_.Prev();
InitDataBlock();
if (second_level_iter_.iter() != nullptr) {
second_level_iter_.SeekToLast();
}
}
}
void TwoLevelIndexIterator::SetSecondLevelIterator(
InternalIteratorBase<IndexValue>* iter) {
InternalIteratorBase<IndexValue>* old_iter = second_level_iter_.Set(iter);
delete old_iter;
}
void TwoLevelIndexIterator::InitDataBlock() {
if (!first_level_iter_.Valid()) {
SetSecondLevelIterator(nullptr);
} else {
BlockHandle handle = first_level_iter_.value().handle;
if (second_level_iter_.iter() != nullptr &&
!second_level_iter_.status().IsIncomplete() &&
handle.offset() == data_block_handle_.offset()) {
// second_level_iter is already constructed with this iterator, so
// no need to change anything
} else {
InternalIteratorBase<IndexValue>* iter =
state_->NewSecondaryIterator(handle);
data_block_handle_ = handle;
SetSecondLevelIterator(iter);
}
}
}
} // namespace
InternalIteratorBase<IndexValue>* NewTwoLevelIterator(
TwoLevelIteratorState* state,
InternalIteratorBase<IndexValue>* first_level_iter) {
return new TwoLevelIndexIterator(state, first_level_iter);
}
} // namespace rocksdb