row_cache to share entry for recent snapshots (#5600)

Summary:
Right now, users cannot take advantage of row cache, unless no snapshot is used, or Get() is repeated for the same snapshots. This limits the usage of row cache.
This change eliminate this restriction in some cases. If the snapshot used is newer than the largest sequence number in the file, and write callback function is not registered, the same row cache key is used as no snapshot is given. We still need the callback function restriction for now because the callback function may filter out different keys for different snapshots even if the snapshots are new.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5600

Test Plan: Add a unit test.

Differential Revision: D16386616

fbshipit-source-id: 6b7d214bd215d191b03ccf55926ad4b703ec2e53
This commit is contained in:
sdong 2019-07-22 18:53:03 -07:00 committed by Facebook Github Bot
parent 3778470061
commit 66b5613d0c
4 changed files with 68 additions and 2 deletions

View File

@ -33,6 +33,7 @@
* Reduce iterator key comparision for upper/lower bound check.
* Log Writer will flush after finishing the whole record, rather than a fragment.
* Lower MultiGet batching API latency by reading data blocks from disk in parallel
* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.
### General Improvements
* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.

View File

@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
delete db_;
db_ = nullptr;
}
#ifndef ROCKSDB_LITE
TEST_F(DBTest2, RowCacheSnapshot) {
Options options = CurrentOptions();
options.statistics = rocksdb::CreateDBStatistics();
options.row_cache = NewLRUCache(8192);
DestroyAndReopen(options);
ASSERT_OK(Put("foo", "bar1"));
const Snapshot* s1 = db_->GetSnapshot();
ASSERT_OK(Put("foo", "bar2"));
ASSERT_OK(Flush());
ASSERT_OK(Put("foo2", "bar"));
const Snapshot* s2 = db_->GetSnapshot();
ASSERT_OK(Put("foo3", "bar"));
const Snapshot* s3 = db_->GetSnapshot();
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
ASSERT_EQ(Get("foo"), "bar2");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
ASSERT_EQ(Get("foo"), "bar2");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
ASSERT_EQ(Get("foo", s1), "bar1");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
ASSERT_EQ(Get("foo", s2), "bar2");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
ASSERT_EQ(Get("foo", s1), "bar1");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
ASSERT_EQ(Get("foo", s3), "bar2");
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
db_->ReleaseSnapshot(s1);
db_->ReleaseSnapshot(s2);
db_->ReleaseSnapshot(s3);
}
#endif // ROCKSDB_LITE
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -11,6 +11,7 @@
#include "db/dbformat.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/snapshot_impl.h"
#include "db/version_edit.h"
#include "file/filename.h"
@ -24,6 +25,7 @@
#include "table/table_builder.h"
#include "table/table_reader.h"
#include "test_util/sync_point.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
// sequence key increases. However, to support caching snapshot
// reads, we append the sequence number (incremented by 1 to
// distinguish from 0) only in this case.
uint64_t seq_no =
options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
// If the snapshot is larger than the largest seqno in the file,
// all data should be exposed to the snapshot, so we treat it
// the same as there is no snapshot. The exception is that if
// a seq-checking callback is registered, some internal keys
// may still be filtered out.
uint64_t seq_no = 0;
// Maybe we can include the whole file ifsnapshot == fd.largest_seqno.
if (options.snapshot != nullptr &&
(get_context->has_callback() ||
static_cast_with_check<const SnapshotImpl, const Snapshot>(
options.snapshot)
->GetSequenceNumber() <= fd.largest_seqno)) {
// We should consider to use options.snapshot->GetSequenceNumber()
// instead of GetInternalKeySeqno(k), which will make the code
// easier to understand.
seq_no = 1 + GetInternalKeySeqno(k);
}
// Compute row cache key.
row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),

View File

@ -136,6 +136,8 @@ class GetContext {
void ReportCounters();
bool has_callback() const { return callback_ != nullptr; }
uint64_t get_tracing_get_id() const { return tracing_get_id_; }
private: