In-place updates for existing keys and similar-sized values

Summary:
Currently, each put allocates fresh memory and adds a new entry to the memtable with a new sequence number, irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value in place for existing keys. It currently handles a very simple case (see the usage sketch after this list):
1. The key already exists in the current memtable. Values in immutable memtables or snapshots are not updated in place.
2. The latest value type is a put, i.e. kTypeValue.
3. The new value is no larger than the existing value, so no memory has to be reallocated.
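
A minimal usage sketch (not part of this diff) of how an application might exercise the new behavior. The option names inplace_update_support and inplace_update_num_locks come from this diff; the database path and values are made up, and the rest is the standard rocksdb API.

  // Hypothetical example: build against the rocksdb headers from this tree.
  #include <cassert>
  #include <string>
  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    options.inplace_update_support = true;     // option added by this diff
    options.inplace_update_num_locks = 10000;  // default when support is enabled

    rocksdb::DB* db;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/inplace_demo", &db);
    assert(s.ok());

    // The first put of a key allocates a new memtable entry as usual.
    s = db->Put(rocksdb::WriteOptions(), "key", "a fairly long initial value");
    assert(s.ok());

    // A second put of the same key with a value that is no larger should be
    // rewritten in place in the memtable instead of appending a new entry.
    s = db->Put(rocksdb::WriteOptions(), "key", "shorter value");
    assert(s.ok());

    std::string value;
    s = db->Get(rocksdb::ReadOptions(), "key", &value);
    assert(s.ok() && value == "shorter value");

    delete db;
    return 0;
  }

With inplace_update_support left at its default of false, both puts go through the normal Add() path and receive fresh sequence numbers.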

TODO: For a put of an existing key, deallocate the memory taken by the values of other value types until a kTypeValue is found, i.e. remove kTypeMerge entries.
TODO: Update the transaction log to allow consistent reload of the memtable.

Test Plan: Added a unit test verifying the in-place update. Some other unit tests are broken due to invalid sequence-number checks; will fix them next.

Reviewers: xinyaohu, sumeet, haobo, dhruba

CC: leveldb

Differential Revision: https://reviews.facebook.net/D12423

Automatic commit by arc
Naman Gupta 2013-08-19 14:12:47 -07:00
parent 17991cd5a0
commit fe25070242
12 changed files with 212 additions and 21 deletions


@@ -438,11 +438,11 @@ class DBTest {
     return DB::Open(opts, dbname_, &db_);
   }
 
-  Status Put(const Slice& k, const Slice& v) {
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
     if (kMergePut == option_config_ ) {
-      return db_->Merge(WriteOptions(), k, v);
+      return db_->Merge(wo, k, v);
     } else {
-      return db_->Put(WriteOptions(), k, v);
+      return db_->Put(wo, k, v);
     }
   }
@@ -2306,6 +2306,71 @@ TEST(DBTest, RepeatedWritesToSameKey) {
   } while (ChangeCompactOptions());
 }
 
+TEST(DBTest, InPlaceUpdate) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+
+    // Update key with values of smaller size
+    Reopen(&options);
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put("key", value));
+      ASSERT_EQ(value, Get("key"));
+    }
+
+    int count = 0;
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    while (iter->Valid()) {
+      ParsedInternalKey ikey;
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      count++;
+      // All updates with the same sequence number.
+      ASSERT_EQ(ikey.sequence, (unsigned)1);
+      iter->Next();
+    }
+    // Only 1 instance for that key.
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // Update key with values of larger size
+    DestroyAndReopen(&options);
+    numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put("key", value));
+      ASSERT_EQ(value, Get("key"));
+    }
+
+    count = 0;
+    iter = dbfull()->TEST_NewInternalIterator();
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    int seq = numValues;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey;
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      count++;
+      // No inplace updates. All updates are puts with new seq number
+      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+      iter->Next();
+    }
+    // All 10 updates exist in the internal iterator
+    ASSERT_EQ(count, numValues);
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
 // This is a static filter used for filtering
 // kvs during the compaction process.
 static int cfilter_count;


@@ -17,8 +17,18 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "util/coding.h"
+#include "util/mutexlock.h"
 #include "util/murmurhash.h"
 
+namespace std {
+template <>
+struct hash<rocksdb::Slice> {
+  size_t operator()(const rocksdb::Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0);
+  }
+};
+}
+
 namespace rocksdb {
 
 MemTable::MemTable(const InternalKeyComparator& cmp,
@@ -35,7 +45,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       edit_(numlevel),
       first_seqno_(0),
       mem_next_logfile_number_(0),
-      mem_logfile_number_(0) { }
+      mem_logfile_number_(0),
+      locks_(options.inplace_update_support
+             ? options.inplace_update_num_locks
+             : 0) { }
 
 MemTable::~MemTable() {
   assert(refs_ == 0);
@@ -110,6 +123,10 @@ Iterator* MemTable::NewIterator(const Slice* prefix) {
   }
 }
 
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  return &locks_[std::hash<Slice>()(key) % locks_.size()];
+}
+
 void MemTable::Add(SequenceNumber s, ValueType type,
                    const Slice& key,
                    const Slice& value) {
@@ -171,12 +188,14 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
     uint32_t key_length;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
     if (comparator_.comparator.user_comparator()->Compare(
-            Slice(key_ptr, key_length - 8),
-            key.user_key()) == 0) {
+            Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
       // Correct user key
       const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
       switch (static_cast<ValueType>(tag & 0xff)) {
         case kTypeValue: {
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->ReadLock();
+          }
           Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
           *s = Status::OK();
           if (merge_in_progress) {
@@ -189,6 +208,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
           } else {
             value->assign(v.data(), v.size());
           }
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->Unlock();
+          }
           return true;
         }
         case kTypeDeletion: {
@@ -243,4 +265,65 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   return false;
 }
 
+bool MemTable::Update(SequenceNumber seq, ValueType type,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::shared_ptr<MemTableRep::Iterator> iter(
+      table_.get()->GetIterator(lkey.user_key()));
+  iter->Seek(memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key. We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          uint32_t vlength;
+          GetVarint32Ptr(key_ptr + key_length,
+                         key_ptr + key_length + 5, &vlength);
+          // Update value, if newValue size <= curValue size
+          if (value.size() <= vlength) {
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     value.size());
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert(
+              (p + value.size()) - entry ==
+              (unsigned) (VarintLength(key_length) +
+                          key_length +
+                          VarintLength(value.size()) +
+                          value.size())
+            );
+            return true;
+          }
+        }
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // then we probably don't have enough space to update in-place
+          // Maybe do something later
+          // Return false, and do normal Add()
+          return false;
+      }
+    }
+  }
+
+  // Key doesn't exist
+  return false;
+}
+
 } // namespace rocksdb


@@ -88,6 +88,16 @@ class MemTable {
   bool Get(const LookupKey& key, std::string* value, Status* s,
            std::deque<std::string>* operands, const Options& options);
 
+  // Update the value and return status ok,
+  //   if key exists in current memtable
+  //     if new sizeof(new_value) <= sizeof(old_value) &&
+  //        old_value for that key is a put i.e. kTypeValue
+  //     else return false, and status - NotUpdatable()
+  //   else return false, and status - NotFound()
+  bool Update(SequenceNumber seq, ValueType type,
+              const Slice& key,
+              const Slice& value);
+
   // Returns the edits area that is needed for flushing the memtable
   VersionEdit* GetEdits() { return &edit_; }
@@ -144,9 +154,15 @@
   // memtable flush is done)
   uint64_t mem_logfile_number_;
 
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
   // No copying allowed
   MemTable(const MemTable&);
   void operator=(const MemTable&);
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
 };
 
 } // namespace rocksdb


@@ -208,7 +208,7 @@ class Repairer {
         continue;
       }
       WriteBatchInternal::SetContents(&batch, record);
-      status = WriteBatchInternal::InsertInto(&batch, mem);
+      status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
       if (status.ok()) {
         counter += WriteBatchInternal::Count(&batch);
       } else {


@@ -188,7 +188,12 @@ class MemTableInserter : public WriteBatch::Handler {
   }
 
   virtual void Put(const Slice& key, const Slice& value) {
-    mem_->Add(sequence_, kTypeValue, key, value);
+    if (options_->inplace_update_support
+        && mem_->Update(sequence_, kTypeValue, key, value)) {
+      RecordTick(options_->statistics, NUMBER_KEYS_UPDATED);
+    } else {
+      mem_->Add(sequence_, kTypeValue, key, value);
+    }
     sequence_++;
   }
   virtual void Merge(const Slice& key, const Slice& value) {


@@ -48,7 +48,7 @@ class WriteBatchInternal {
   // Drops deletes in batch if filter_del is set to true and
   // db->KeyMayExist returns false
   static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
-                           const Options* opts = nullptr, DB* db = nullptr,
+                           const Options* opts, DB* db = nullptr,
                            const bool filter_del = false);
 
   static void Append(WriteBatch* dst, const WriteBatch* src);


@@ -25,7 +25,8 @@ static std::string PrintContents(WriteBatch* b) {
   MemTable* mem = new MemTable(cmp, factory);
   mem->Ref();
   std::string state;
-  Status s = WriteBatchInternal::InsertInto(b, mem);
+  Options options;
+  Status s = WriteBatchInternal::InsertInto(b, mem, &options);
   int count = 0;
   Iterator* iter = mem->NewIterator();
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {


@@ -81,8 +81,8 @@ class DB {
   DB() { }
   virtual ~DB();
 
-  // Set the database entry for "key" to "value". Returns OK on success,
-  // and a non-OK status on error.
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
   // Note: consider setting options.sync = true.
   virtual Status Put(const WriteOptions& options,
                      const Slice& key,


@@ -594,6 +594,17 @@ struct Options {
   // Default: emtpy vector -- no user-defined statistics collection will be
   // performed.
   std::vector<std::shared_ptr<TableStatsCollector>> table_stats_collectors;
+
+  // Allows thread-safe inplace updates. Requires Updates iff
+  // * key exists in current memtable
+  // * new sizeof(new_value) <= sizeof(old_value)
+  // * old_value for that key is a put i.e. kTypeValue
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
 };
 
 //


@@ -39,6 +39,8 @@ enum Tickers {
   NUMBER_KEYS_WRITTEN,
   // Number of Keys read,
   NUMBER_KEYS_READ,
+  // Number keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
   // Bytes written / read
   BYTES_WRITTEN,
   BYTES_READ,
@@ -94,6 +96,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
   { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
   { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
   { NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
+  { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
   { BYTES_WRITTEN, "rocksdb.bytes.written" },
   { BYTES_READ, "rocksdb.bytes.read" },
   { NO_FILE_CLOSES, "rocksdb.no.file.closes" },
@@ -144,7 +147,7 @@ enum Histograms {
   HARD_RATE_LIMIT_DELAY_COUNT,
   SOFT_RATE_LIMIT_DELAY_COUNT,
   NUM_FILES_IN_SINGLE_COMPACTION,
-  HISTOGRAM_ENUM_MAX
+  HISTOGRAM_ENUM_MAX,
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -165,7 +168,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
   { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
   { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
   { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
-  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" }
+  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
 };
 
 struct HistogramData {


@@ -821,12 +821,13 @@ TEST(MemTableTest, Simple) {
   MemTable* memtable = new MemTable(cmp, table_factory);
   memtable->Ref();
   WriteBatch batch;
+  Options options;
   WriteBatchInternal::SetSequence(&batch, 100);
   batch.Put(std::string("k1"), std::string("v1"));
   batch.Put(std::string("k2"), std::string("v2"));
   batch.Put(std::string("k3"), std::string("v3"));
   batch.Put(std::string("largekey"), std::string("vlarge"));
-  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable).ok());
+  ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok());
 
   Iterator* iter = memtable->NewIterator();
   iter->SeekToFirst();


@@ -94,7 +94,9 @@ Options::Options()
       compaction_filter_factory(
           std::shared_ptr<CompactionFilterFactory>(
               new DefaultCompactionFilterFactory())),
-      purge_log_after_memtable_flush(true) {
+      purge_log_after_memtable_flush(true),
+      inplace_update_support(false),
+      inplace_update_num_locks(10000) {
   assert(memtable_factory.get() != nullptr);
 }
@@ -271,6 +273,10 @@ Options::Dump(Logger* log) const
   }
   Log(log," Options.table_stats_collectors: %s",
       collector_names.c_str());
+  Log(log," Options.inplace_update_support: %d",
+      inplace_update_support);
+  Log(log," Options.inplace_update_num_locks: %zd",
+      inplace_update_num_locks);
 } // Options::Dump
 
 //