dd29ad4223
Summary: Replace `BlockIter::comparator_` and `IndexBlockIter::user_comparator_wrapper_` with a concrete `UserComparatorWrapper` and `InternalKeyComparator`. The motivation for this change was the inconvenience of not knowing the concrete type of `BlockIter::comparator_`, which prevented calling specialized internal key comparison functions to optimize comparison of keys with global seqno applied.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/6944

Test Plan: benchmark setup -- single-file DBs, in-memory, no compression. "normal_db" was created by a regular flush; "ingestion_db" was created by ingesting a file. Both DBs have the same contents.

```
$ TEST_TMPDIR=/dev/shm/normal_db/ ./db_bench -benchmarks=fillrandom,compact -write_buffer_size=10485760000 -disable_auto_compactions=true -compression_type=none -num=1000000
$ ./ldb write_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ --compression_type=no --hex --create_if_missing < <(./sst_dump --command=scan --output_hex --file=/dev/shm/normal_db/dbbench/000007.sst | awk 'began {print "0x" substr($1, 2, length($1) - 2), "==>", "0x" $5} ; /^Sst file format: block-based/ {began=1}')
$ ./ldb ingest_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/
```

benchmark run command:

```
$ TEST_TMPDIR=/dev/shm/$DB/ ./db_bench -benchmarks=seekrandom -seek_nexts=$SEEK_NEXT -use_existing_db=true -cache_index_and_filter_blocks=false -num=1000000 -cache_size=0 -threads=1 -reads=200000000 -mmap_read=1 -verify_checksum=false
```

results: perf improved marginally for ingestion_db and did not change significantly for normal_db:

SEEK_NEXT | DB | code | ops/sec | % change
-- | -- | -- | -- | --
0 | normal_db | master | 350880 |
0 | normal_db | PR6944 | 351040 | 0.0
0 | ingestion_db | master | 343255 |
0 | ingestion_db | PR6944 | 349424 | 1.8
10 | normal_db | master | 218711 |
10 | normal_db | PR6944 | 217892 | -0.4
10 | ingestion_db | master | 220334 |
10 | ingestion_db | PR6944 | 226437 | 2.8

Reviewed By: pdillinger

Differential Revision: D21924676

Pulled By: ajkr

fbshipit-source-id: ea4288a2eefa8112eb6c651a671c1de18c12e538
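For context on the summary above, here is a minimal, illustrative sketch of the shape of the change; the struct layouts below are simplified stand-ins, not the actual RocksDB declarations. Storing the comparator by concrete type, rather than only behind an abstract `Comparator*`, is what allows the block iterators to call specialized internal key comparison helpers (for example, ones that account for a global seqno).

```
// Illustrative only -- simplified stand-ins for the types named in the
// summary, not the real RocksDB class layouts.
#include <string_view>

struct Comparator {
  virtual ~Comparator() = default;
  virtual int Compare(std::string_view a, std::string_view b) const = 0;
};

struct UserComparatorWrapper {
  const Comparator* user_cmp = nullptr;
  int Compare(std::string_view a, std::string_view b) const {
    return user_cmp->Compare(a, b);
  }
  // A concrete wrapper can expose additional, non-virtual helpers, e.g. a
  // comparison that understands internal keys with a global seqno applied.
};

// Before: only an abstract comparator pointer was stored, so callers could
// not reach comparator-specific fast paths.
struct BlockIterBefore {
  const Comparator* comparator_ = nullptr;
};

// After: the concrete wrapper (and, for index iterators, an internal key
// comparator) is stored directly, so specialized comparison functions can be
// called without going through the abstract interface.
struct BlockIterAfter {
  UserComparatorWrapper comparator_;
};
```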
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/partitioned_index_reader.h"

#include "file/file_util.h"
#include "table/block_based/partitioned_index_iterator.h"

namespace ROCKSDB_NAMESPACE {
Status PartitionIndexReader::Create(
    const BlockBasedTable* table, const ReadOptions& ro,
    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
    bool pin, BlockCacheLookupContext* lookup_context,
    std::unique_ptr<IndexReader>* index_reader) {
  assert(table != nullptr);
  assert(table->get_rep());
  assert(!pin || prefetch);
  assert(index_reader != nullptr);

  CachableEntry<Block> index_block;
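  // When prefetching is requested, or the block cache is bypassed, read the
  // top-level index block eagerly. Unless the caller asked to pin it, the
  // local reference is released below; later lookups go through
  // GetOrReadIndexBlock().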
  if (prefetch || !use_cache) {
    const Status s =
        ReadIndexBlock(table, prefetch_buffer, ro, use_cache,
                       /*get_context=*/nullptr, lookup_context, &index_block);
    if (!s.ok()) {
      return s;
    }

    if (use_cache && !pin) {
      index_block.Reset();
    }
  }

  index_reader->reset(new PartitionIndexReader(table, std::move(index_block)));

  return Status::OK();
}

InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator(
    const ReadOptions& read_options, bool /* disable_prefix_seek */,
    IndexBlockIter* iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) {
  const bool no_io = (read_options.read_tier == kBlockCacheTier);
  CachableEntry<Block> index_block;
  const Status s =
      GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
  if (!s.ok()) {
    if (iter != nullptr) {
      iter->Invalidate(s);
      return iter;
    }

    return NewErrorInternalIterator<IndexValue>(s);
  }

  const BlockBasedTable::Rep* rep = table()->rep_;
  InternalIteratorBase<IndexValue>* it = nullptr;

  Statistics* kNullStats = nullptr;
  // Filters are already checked before seeking the index
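  // If CacheDependencies() pinned the index partitions into partition_map_,
  // serve them from memory via a two-level iterator; otherwise build a
  // partitioned index iterator that reads partitions on demand.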
  if (!partition_map_.empty()) {
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    it = NewTwoLevelIterator(
        new BlockBasedTable::PartitionedIndexIteratorState(table(),
                                                           &partition_map_),
        index_block.GetValue()->NewIndexIterator(
            internal_comparator()->user_comparator(),
            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
            index_has_first_key(), index_key_includes_seq(),
            index_value_is_full()));
  } else {
    ReadOptions ro;
    ro.fill_cache = read_options.fill_cache;
    ro.deadline = read_options.deadline;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(
        index_block.GetValue()->NewIndexIterator(
            internal_comparator()->user_comparator(),
            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
            index_has_first_key(), index_key_includes_seq(),
            index_value_is_full()));

    it = new ParititionedIndexIterator(
        table(), ro, *internal_comparator(), std::move(index_iter),
        lookup_context ? lookup_context->caller
                       : TableReaderCaller::kUncategorized);
  }

  assert(it != nullptr);
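  // Hand the top-level index block over to the iterator so it stays alive
  // (along with any cache handle it holds) for the iterator's lifetime.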
  index_block.TransferTo(it);

  return it;

  // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
  // on-stack BlockIter while the state is on heap. Currently it assumes
  // the first level iter is always on heap and will attempt to delete it
  // in its destructor.
}
void PartitionIndexReader::CacheDependencies(const ReadOptions& ro, bool pin) {
  // Before reading the partitions, prefetch them to avoid lots of IOs
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  const BlockBasedTable::Rep* rep = table()->rep_;
  IndexBlockIter biter;
  BlockHandle handle;
  Statistics* kNullStats = nullptr;

  CachableEntry<Block> index_block;
  Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
                                 &lookup_context, &index_block);
  if (!s.ok()) {
    ROCKS_LOG_WARN(rep->ioptions.info_log,
                   "Error retrieving top-level index block while trying to "
                   "cache index partitions: %s",
                   s.ToString().c_str());
    IGNORE_STATUS_IF_ERROR(s);
    return;
  }

  // We don't return pinned data from index blocks, so no need
  // to set `block_contents_pinned`.
  index_block.GetValue()->NewIndexIterator(
      internal_comparator()->user_comparator(),
      rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true,
      index_has_first_key(), index_key_includes_seq(), index_value_is_full());
  // Index partitions are assumed to be consecutive. Prefetch them all.
  // Read the first block offset
  biter.SeekToFirst();
  if (!biter.Valid()) {
    // Empty index.
    return;
  }
  handle = biter.value().handle;
  uint64_t prefetch_off = handle.offset();

  // Read the last block's offset
  biter.SeekToLast();
  if (!biter.Valid()) {
    // Empty index.
    return;
  }
  handle = biter.value().handle;
  uint64_t last_off = handle.offset() + block_size(handle);
  uint64_t prefetch_len = last_off - prefetch_off;
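  // Prefetch the contiguous [prefetch_off, last_off) range in one request so
  // the per-partition reads below can be served from the prefetch buffer
  // instead of separate IOs.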
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
  rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
  IOOptions opts;
  s = PrepareIOFromReadOptions(ro, rep->file->env(), opts);
  if (s.ok()) {
    s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
                                  static_cast<size_t>(prefetch_len));
  }

  // After prefetch, read the partitions one by one
  biter.SeekToFirst();
  for (; biter.Valid(); biter.Next()) {
    handle = biter.value().handle;
    CachableEntry<Block> block;
    // TODO: Support counter batch update for partitioned index and
    // filter blocks
    s = table()->MaybeReadBlockAndLoadToCache(
        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
        &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
        /*contents=*/nullptr);

    IGNORE_STATUS_IF_ERROR(s);

    assert(s.ok() || block.GetValue() == nullptr);
    if (s.ok() && block.GetValue() != nullptr) {
      if (block.IsCached()) {
        if (pin) {
          partition_map_[handle.offset()] = std::move(block);
        }
      }
    }
  }
}

}  // namespace ROCKSDB_NAMESPACE