Copy Get() result when file reads use mmap
Summary: For iterator reads, a `SuperVersion` is pinned to preserve a snapshot of SST files, and `Block`s are pinned to allow `key()` and `value()` to return pointers directly into a RocksDB memory region. This works for both non-mmap reads, where the block owns the memory region, and mmap reads, where the file owns the memory region. For point reads with `PinnableSlice`, only the `Block` object is pinned. This works for non-mmap reads because the block owns the memory region, so even if the file is deleted after compaction, the memory region survives. However, for mmap reads, file deletion causes the memory region to which the `PinnableSlice` refers to be unmapped. The result is usually a segfault upon accessing the `PinnableSlice`, although sometimes it returned wrong results (I repro'd this a bunch of times with `db_stress`). This PR copies the value into the `PinnableSlice` when it comes from mmap'd memory. We can tell whether the `Block` owns its memory using `Block::cachable()`, which is unset when reads do not use the provided buffer as is the case with mmap file reads. When that is false we ensure the result of `Get()` is copied. This feels like a short-term solution as ideally we'd have the `PinnableSlice` pin the mmap'd memory so we can do zero-copy reads. It seemed hard so I chose this approach to fix correctness in the meantime. Closes https://github.com/facebook/rocksdb/pull/3881 Differential Revision: D8076288 Pulled By: ajkr fbshipit-source-id: 31d78ec010198723522323dbc6ea325122a46b08
This commit is contained in:
parent
88c3ee2d31
commit
fea2b1dfb2
@ -9,6 +9,7 @@
|
||||
|
||||
### Bug Fixes
|
||||
* fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
|
||||
* Fix corruption in non-iterator reads when mmap is used for file reads
|
||||
|
||||
## 5.14.0 (5/16/2018)
|
||||
### Public API Change
|
||||
|
@ -2502,6 +2502,27 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
TEST_F(DBTest2, PinnableSliceAndMmapReads) {
|
||||
Options options = CurrentOptions();
|
||||
options.allow_mmap_reads = true;
|
||||
Reopen(options);
|
||||
|
||||
ASSERT_OK(Put("foo", "bar"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
PinnableSlice pinned_value;
|
||||
ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
|
||||
ASSERT_EQ(pinned_value.ToString(), "bar");
|
||||
|
||||
dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
|
||||
nullptr /* end */, nullptr /* column_family */,
|
||||
true /* disallow_trivial_move */);
|
||||
|
||||
// Ensure pinned_value doesn't rely on memory munmap'd by the above
|
||||
// compaction.
|
||||
ASSERT_EQ(pinned_value.ToString(), "bar");
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
@ -464,7 +464,7 @@ BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
|
||||
total_order_seek ? nullptr : prefix_index_.get();
|
||||
ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
|
||||
prefix_index_ptr, global_seqno_,
|
||||
read_amp_bitmap_.get(), key_includes_seq);
|
||||
read_amp_bitmap_.get(), key_includes_seq, cachable());
|
||||
|
||||
if (read_amp_bitmap_) {
|
||||
if (read_amp_bitmap_->GetStatistics() != stats) {
|
||||
|
@ -220,22 +220,26 @@ class BlockIter final : public InternalIterator {
|
||||
key_includes_seq_(true),
|
||||
global_seqno_(kDisableGlobalSequenceNumber),
|
||||
read_amp_bitmap_(nullptr),
|
||||
last_bitmap_offset_(0) {}
|
||||
last_bitmap_offset_(0),
|
||||
block_contents_pinned_(false) {}
|
||||
|
||||
BlockIter(const Comparator* comparator, const Comparator* user_comparator,
|
||||
const char* data, uint32_t restarts, uint32_t num_restarts,
|
||||
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
|
||||
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq)
|
||||
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
|
||||
bool block_contents_pinned)
|
||||
: BlockIter() {
|
||||
Initialize(comparator, user_comparator, data, restarts, num_restarts,
|
||||
prefix_index, global_seqno, read_amp_bitmap, key_includes_seq);
|
||||
prefix_index, global_seqno, read_amp_bitmap, key_includes_seq,
|
||||
block_contents_pinned);
|
||||
}
|
||||
|
||||
void Initialize(const Comparator* comparator,
|
||||
const Comparator* user_comparator, const char* data,
|
||||
uint32_t restarts, uint32_t num_restarts,
|
||||
BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
|
||||
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) {
|
||||
BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq,
|
||||
bool block_contents_pinned) {
|
||||
assert(data_ == nullptr); // Ensure it is called only once
|
||||
assert(num_restarts > 0); // Ensure the param is valid
|
||||
|
||||
@ -251,6 +255,7 @@ class BlockIter final : public InternalIterator {
|
||||
read_amp_bitmap_ = read_amp_bitmap;
|
||||
last_bitmap_offset_ = current_ + 1;
|
||||
key_includes_seq_ = key_includes_seq;
|
||||
block_contents_pinned_ = block_contents_pinned;
|
||||
}
|
||||
|
||||
// Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
|
||||
@ -312,9 +317,11 @@ class BlockIter final : public InternalIterator {
|
||||
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
|
||||
#endif
|
||||
|
||||
virtual bool IsKeyPinned() const override { return key_pinned_; }
|
||||
virtual bool IsKeyPinned() const override {
|
||||
return block_contents_pinned_ && key_pinned_;
|
||||
}
|
||||
|
||||
virtual bool IsValuePinned() const override { return true; }
|
||||
virtual bool IsValuePinned() const override { return block_contents_pinned_; }
|
||||
|
||||
size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
|
||||
|
||||
@ -349,6 +356,8 @@ class BlockIter final : public InternalIterator {
|
||||
BlockReadAmpBitmap* read_amp_bitmap_;
|
||||
// last `current_` value we report to read-amp bitmp
|
||||
mutable uint32_t last_bitmap_offset_;
|
||||
// whether the block data is guaranteed to outlive this iterator
|
||||
bool block_contents_pinned_;
|
||||
|
||||
struct CachedPrevEntry {
|
||||
explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
|
||||
|
@ -2226,8 +2226,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
|
||||
s = Status::Corruption(Slice());
|
||||
}
|
||||
|
||||
if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
|
||||
&biter)) {
|
||||
if (!get_context->SaveValue(
|
||||
parsed_key, biter.value(), &matched,
|
||||
biter.IsValuePinned() ? &biter : nullptr)) {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user