Merge branch 'master' into performance

Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/memtable.cc
This commit is contained in:
kailiu 2014-02-05 20:56:14 -08:00
commit 84f8185fc0
18 changed files with 158 additions and 142 deletions

View File

@ -1,10 +1,13 @@
# Rocksdb Change Log
## 2.8.0 (01/28/2014)
## Unreleased
### Public API changes
* Removed arena.h from public header files.
* By default, checksums are verified on every read from database
>>>>>>> master
## 2.7.0 (01/28/2014)

View File

@ -551,7 +551,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version) {
return nullptr;
}
Version::FileSummaryStorage tmp;
Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
Log(options_->info_log, "Universal: candidate files(%zu): %s\n",
version->files_[level].size(),
version->LevelFileSummary(&tmp, 0));

View File

@ -95,7 +95,12 @@ class CorruptionTest {
int bad_values = 0;
int correct = 0;
std::string value_space;
Iterator* iter = db_->NewIterator(ReadOptions());
// Do not verify checksums. If we verify checksums then the
// db itself will raise errors because data is corrupted.
// Instead, we want the reads to be successful and this test
// will detect whether the appropriate corruptions have
// occured.
Iterator* iter = db_->NewIterator(ReadOptions(false, true));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());

View File

@ -100,6 +100,7 @@ DEFINE_string(benchmarks,
"Must be used with merge_operator\n"
"\treadrandommergerandom -- perform N random read-or-merge "
"operations. Must be used with merge_operator\n"
"\tnewiterator -- repeated iterator creation\n"
"\tseekrandom -- N random seeks\n"
"\tcrc32c -- repeated crc32c of 4K of data\n"
"\tacquireload -- load N*1000 times\n"
@ -1094,6 +1095,8 @@ class Benchmark {
method = &Benchmark::ReadRandom;
} else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing;
} else if (name == Slice("newiterator")) {
method = &Benchmark::IteratorCreation;
} else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) {
@ -1532,14 +1535,14 @@ class Benchmark {
++count;
char* tab = std::find(linep, linep + bufferLen, columnSeparator);
if (tab == linep + bufferLen) {
fprintf(stderr, "[Error] No Key delimiter TAB at line %ld\n", count);
fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
continue;
}
Slice key(linep, tab - linep);
tab++;
char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
if (endLine == linep + bufferLen) {
fprintf(stderr, "[Error] No ENTER at end of line # %ld\n", count);
fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
continue;
}
Slice value(tab, endLine - tab);
@ -1883,6 +1886,16 @@ class Benchmark {
thread->stats.AddMessage(msg);
}
void IteratorCreation(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true);
while (!duration.Done(1)) {
Iterator* iter = db_->NewIterator(options);
delete iter;
thread->stats.FinishedSingleOp(db_);
}
}
void SeekRandom(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true);

View File

@ -1083,6 +1083,11 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
// Since we already recovered log_number, we want all logs
// with numbers `<= log_number` (includes this one) to be ignored
edit.SetLogNumber(log_number + 1);
// we must mark the next log number as used, even though it's
// not actually used. that is because VersionSet assumes
// VersionSet::next_file_number_ always to be strictly greater than any log
// number
versions_->MarkFileNumberUsed(log_number + 1);
status = versions_->LogAndApply(&edit, &mutex_);
}
@ -2676,33 +2681,29 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
namespace {
struct IterState {
IterState(DBImpl* db, port::Mutex* mu, DBImpl::SuperVersion* super_version)
: db(db), mu(mu), super_version(super_version) {}
DBImpl* db;
port::Mutex* mu;
Version* version = nullptr;
MemTable* mem = nullptr;
MemTableListVersion* imm = nullptr;
DBImpl *db;
DBImpl::SuperVersion* super_version;
};
static void CleanupIteratorState(void* arg1, void* arg2) {
IterState* state = reinterpret_cast<IterState*>(arg1);
DBImpl::DeletionState deletion_state;
state->mu->Lock();
if (state->mem) { // not set for immutable iterator
MemTable* m = state->mem->Unref();
if (m != nullptr) {
deletion_state.memtables_to_free.push_back(m);
}
DBImpl::DeletionState deletion_state(state->db->GetOptions().
max_write_buffer_number);
bool need_cleanup = state->super_version->Unref();
if (need_cleanup) {
state->mu->Lock();
state->super_version->Cleanup();
state->db->FindObsoleteFiles(deletion_state, false, true);
state->mu->Unlock();
delete state->super_version;
state->db->PurgeObsoleteFiles(deletion_state);
}
if (state->version) { // not set for memtable-only iterator
state->version->Unref();
}
if (state->imm) { // not set for memtable-only iterator
state->imm->Unref(&deletion_state.memtables_to_free);
}
// fast path FindObsoleteFiles
state->db->FindObsoleteFiles(deletion_state, false, true);
state->mu->Unlock();
state->db->PurgeObsoleteFiles(deletion_state);
delete state;
}
@ -2710,36 +2711,23 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot) {
IterState* cleanup = new IterState;
MemTable* mutable_mem;
MemTableListVersion* immutable_mems;
Version* version;
// Collect together all needed child iterators for mem
mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
mem_->Ref();
mutable_mem = mem_;
// Collect together all needed child iterators for imm_
immutable_mems = imm_.current();
immutable_mems->Ref();
versions_->current()->Ref();
version = versions_->current();
SuperVersion* super_version = super_version_->Ref();
mutex_.Unlock();
std::vector<Iterator*> iterator_list;
iterator_list.push_back(mutable_mem->NewIterator(options));
cleanup->mem = mutable_mem;
cleanup->imm = immutable_mems;
// Collect iterator for mutable mem
iterator_list.push_back(super_version->mem->NewIterator(options));
// Collect all needed child iterators for immutable memtables
immutable_mems->AddIterators(options, &iterator_list);
super_version->imm->AddIterators(options, &iterator_list);
// Collect iterators for files in L0 - Ln
version->AddIterators(options, storage_options_, &iterator_list);
super_version->current->AddIterators(options, storage_options_,
&iterator_list);
Iterator* internal_iter = NewMergingIterator(
env_, &internal_comparator_, &iterator_list[0], iterator_list.size());
cleanup->version = version;
cleanup->mu = &mutex_;
cleanup->db = this;
IterState* cleanup = new IterState(this, &mutex_, super_version);
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
return internal_iter;
@ -2754,53 +2742,36 @@ std::pair<Iterator*, Iterator*> DBImpl::GetTailingIteratorPair(
const ReadOptions& options,
uint64_t* superversion_number) {
MemTable* mutable_mem;
MemTableListVersion* immutable_mems;
Version* version;
// get all child iterators and bump their refcounts under lock
mutex_.Lock();
mutable_mem = mem_;
mutable_mem->Ref();
immutable_mems = imm_.current();
immutable_mems->Ref();
version = versions_->current();
version->Ref();
SuperVersion* super_version = super_version_->Ref();
if (superversion_number != nullptr) {
*superversion_number = CurrentVersionNumber();
}
mutex_.Unlock();
Iterator* mutable_iter = mutable_mem->NewIterator(options);
IterState* mutable_cleanup = new IterState();
mutable_cleanup->mem = mutable_mem;
mutable_cleanup->db = this;
mutable_cleanup->mu = &mutex_;
mutable_iter->RegisterCleanup(CleanupIteratorState, mutable_cleanup, nullptr);
Iterator* mutable_iter = super_version->mem->NewIterator(options);
// create a DBIter that only uses memtable content; see NewIterator()
mutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(),
mutable_iter, kMaxSequenceNumber);
Iterator* immutable_iter;
IterState* immutable_cleanup = new IterState();
std::vector<Iterator*> list;
immutable_mems->AddIterators(options, &list);
immutable_cleanup->imm = immutable_mems;
version->AddIterators(options, storage_options_, &list);
immutable_cleanup->version = version;
immutable_cleanup->db = this;
immutable_cleanup->mu = &mutex_;
immutable_iter =
NewMergingIterator(env_, &internal_comparator_, &list[0], list.size());
immutable_iter->RegisterCleanup(CleanupIteratorState, immutable_cleanup,
nullptr);
super_version->imm->AddIterators(options, &list);
super_version->current->AddIterators(options, storage_options_, &list);
Iterator* immutable_iter =
NewMergingIterator(env_, &internal_comparator_, &list[0], list.size());
// create a DBIter that only uses memtable content; see NewIterator()
immutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(),
immutable_iter, kMaxSequenceNumber);
// register cleanups
mutable_iter->RegisterCleanup(CleanupIteratorState,
new IterState(this, &mutex_, super_version), nullptr);
// bump the ref one more time since it will be Unref'ed twice
immutable_iter->RegisterCleanup(CleanupIteratorState,
new IterState(this, &mutex_, super_version->Ref()), nullptr);
return std::make_pair(mutable_iter, immutable_iter);
}
@ -2946,7 +2917,6 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot;
autovector<MemTable*> to_delete;
mutex_.Lock();
if (options.snapshot != nullptr) {
@ -2955,16 +2925,9 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
snapshot = versions_->LastSequence();
}
MemTable* mem = mem_;
MemTableListVersion* imm = imm_.current();
Version* current = versions_->current();
mem->Ref();
imm->Ref();
current->Ref();
// Unlock while reading from files and memtables
SuperVersion* get_version = super_version_->Ref();
mutex_.Unlock();
bool have_stat_update = false;
Version::GetStats stats;
@ -2990,12 +2953,14 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
std::string* value = &(*values)[i];
LookupKey lkey(keys[i], snapshot);
if (mem->Get(lkey, value, &s, merge_context, options_)) {
if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
// Done
} else if (imm->Get(lkey, value, &s, merge_context, options_)) {
} else if (get_version->imm->Get(lkey, value, &s, merge_context,
options_)) {
// Done
} else {
current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
get_version->current->Get(options, lkey, value, &s, &merge_context,
&stats, options_);
have_stat_update = true;
}
@ -3007,19 +2972,28 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
// Post processing (decrement reference counts and record statistics)
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock();
if (!options_.disable_seek_compaction &&
have_stat_update && current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction();
bool delete_get_version = false;
if (!options_.disable_seek_compaction && have_stat_update) {
mutex_.Lock();
if (get_version->current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction();
}
if (get_version->Unref()) {
get_version->Cleanup();
delete_get_version = true;
}
mutex_.Unlock();
} else {
if (get_version->Unref()) {
mutex_.Lock();
get_version->Cleanup();
mutex_.Unlock();
delete_get_version = true;
}
}
if (delete_get_version) {
delete get_version;
}
MemTable* m = mem->Unref();
imm->Unref(&to_delete);
current->Unref();
mutex_.Unlock();
// free up all obsolete memtables outside the mutex
delete m;
for (MemTable* v: to_delete) delete v;
RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys);

View File

@ -249,8 +249,8 @@ class DBImpl : public DB {
return internal_comparator_.user_comparator();
}
MemTable* GetMemTable() {
return mem_;
SuperVersion* GetSuperVersion() {
return super_version_;
}
Iterator* NewInternalIterator(const ReadOptions&,

View File

@ -56,15 +56,15 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
const Slice& key,
std::string* value) {
Status s;
MemTable* mem = GetMemTable();
Version* current = versions_->current();
SequenceNumber snapshot = versions_->LastSequence();
SuperVersion* super_version = GetSuperVersion();
MergeContext merge_context;
LookupKey lkey(key, snapshot);
if (mem->Get(lkey, value, &s, merge_context, options_)) {
if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) {
} else {
Version::GetStats stats;
current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
super_version->current->Get(options, lkey, value, &s, &merge_context,
&stats, options_);
}
return s;
}
@ -87,6 +87,9 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
impl->mutex_.Lock();
Status s = impl->Recover(true /* read only */, error_if_log_file_exist);
if (s.ok()) {
delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
}
impl->mutex_.Unlock();
if (s.ok()) {
*dbptr = impl;

View File

@ -458,6 +458,10 @@ class DBTest {
return DB::Open(*options, dbname_, db);
}
Status ReadOnlyReopen(Options* options) {
return DB::OpenForReadOnly(*options, dbname_, &db_);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
@ -834,6 +838,26 @@ TEST(DBTest, Empty) {
} while (ChangeOptions());
}
TEST(DBTest, ReadOnlyDB) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
Close();
Options options;
ASSERT_OK(ReadOnlyReopen(&options));
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
Iterator* iter = db_->NewIterator(ReadOptions());
int count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(iter->status());
++count;
}
ASSERT_EQ(count, 2);
delete iter;
}
// Make sure that when options.block_cache is set, after a new table is
// created its index/filter blocks are added to block cache.
TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {

View File

@ -192,7 +192,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((p + val_size) - buf == (unsigned)encoded_len);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(buf);
if (prefix_bloom_) {
@ -363,15 +363,10 @@ void MemTable::Update(SequenceNumber seq,
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
new_size);
WriteLock wl(GetLock(lkey.user_key()));
memcpy(p, value.data(), new_size);
assert(
(p + new_size) - entry ==
(unsigned) (VarintLength(key_length) +
key_length +
VarintLength(new_size) +
new_size)
);
// no need to update bloom, as user key does not change.
memcpy(p, value.data(), value.size());
assert((unsigned)((p + value.size()) - entry) ==
(unsigned)(VarintLength(key_length) + key_length +
VarintLength(value.size()) + value.size()));
return;
}
}

View File

@ -2110,7 +2110,6 @@ Compaction* VersionSet::CompactRange(int input_level, int output_level,
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
ReadOptions options;
options.verify_checksums = options_->paranoid_checks;
options.fill_cache = false;
// Level-0 files have to be merged together. For other levels,

View File

@ -734,7 +734,7 @@ enum ReadTier {
struct ReadOptions {
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: false
// Default: true
bool verify_checksums;
// Should the "data block"/"index block"/"filter block" read for this
@ -775,11 +775,13 @@ struct ReadOptions {
// Specify to create a tailing iterator -- a special iterator that has a
// view of the complete database (i.e. it can also be used to read newly
// added data) and is optimized for sequential reads.
// added data) and is optimized for sequential reads. It will return records
// that were inserted into the database after the creation of the iterator.
// Default: false
bool tailing;
ReadOptions()
: verify_checksums(false),
: verify_checksums(true),
fill_cache(true),
prefix_seek(false),
snapshot(nullptr),

View File

@ -174,7 +174,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
if (index < num_) {
uint32_t start = DecodeFixed32(offset_ + index*4);
uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
if (start <= limit && limit <= (offset_ - data_)) {
if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
Slice filter = Slice(data_ + start, limit - start);
return policy_->KeyMayMatch(entry, filter);
} else if (start == limit) {

View File

@ -149,9 +149,11 @@ Status ReadProperties(
}
BlockContents block_contents;
ReadOptions read_options;
read_options.verify_checksums = false;
Status s = ReadBlockContents(
file,
ReadOptions(),
read_options,
handle,
&block_contents,
env,
@ -240,9 +242,11 @@ Status ReadTableProperties(
auto metaindex_handle = footer.metaindex_handle();
BlockContents metaindex_contents;
ReadOptions read_options;
read_options.verify_checksums = false;
s = ReadBlockContents(
file,
ReadOptions(),
read_options,
metaindex_handle,
&metaindex_contents,
env,

View File

@ -125,8 +125,8 @@ int main(int argc, const char** argv) {
replThread.stop.Release_Store(nullptr);
if (replThread.no_read < dataPump.no_records) {
// no. read should be => than inserted.
fprintf(stderr, "No. of Record's written and read not same\nRead : %ld"
" Written : %ld\n", replThread.no_read, dataPump.no_records);
fprintf(stderr, "No. of Record's written and read not same\nRead : %zu"
" Written : %zu\n", replThread.no_read, dataPump.no_records);
exit(1);
}
fprintf(stderr, "Successful!\n");

View File

@ -333,17 +333,14 @@ static bool isSSE42() {
}
typedef void (*Function)(uint64_t*, uint8_t const**);
static Function func = nullptr;
static inline Function Choose_CRC32() {
return isSSE42() ? Fast_CRC32 : Slow_CRC32;
}
static Function func = Choose_CRC32();
static inline void CRC32(uint64_t* l, uint8_t const **p) {
if (func != nullptr) {
return func(l, p);
}
func = Choose_CRC32();
func(l, p);
}

View File

@ -1429,9 +1429,6 @@ class PosixEnv : public Env {
nullptr,
&ThreadPool::BGThreadWrapper,
this));
fprintf(stdout,
"Created bg thread 0x%lx\n",
(unsigned long)t);
// Set the thread name to aid debugging
#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)

View File

@ -1003,7 +1003,7 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
opt.num_levels = old_levels_;
opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1);
// Disable size compaction
opt.max_bytes_for_level_base = 1UL << 50;
opt.max_bytes_for_level_base = 1ULL << 50;
opt.max_bytes_for_level_multiplier = 1;
opt.max_mem_compaction_level = 0;
return opt;

View File

@ -168,11 +168,11 @@ Options::Dump(Logger* log) const
Log(log," Options.num_levels: %d", num_levels);
Log(log," Options.disableDataSync: %d", disableDataSync);
Log(log," Options.use_fsync: %d", use_fsync);
Log(log," Options.max_log_file_size: %ld", max_log_file_size);
Log(log," Options.max_log_file_size: %zu", max_log_file_size);
Log(log,"Options.max_manifest_file_size: %lu",
(unsigned long)max_manifest_file_size);
Log(log," Options.log_file_time_to_roll: %ld", log_file_time_to_roll);
Log(log," Options.keep_log_file_num: %ld", keep_log_file_num);
Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
Log(log," Options.keep_log_file_num: %zu", keep_log_file_num);
Log(log," Options.db_stats_log_interval: %d",
db_stats_log_interval);
Log(log," Options.allow_os_buffer: %d", allow_os_buffer);
@ -228,7 +228,7 @@ Options::Dump(Logger* log) const
table_cache_numshardbits);
Log(log," Options.table_cache_remove_scan_count_limit: %d",
table_cache_remove_scan_count_limit);
Log(log," Options.arena_block_size: %ld",
Log(log," Options.arena_block_size: %zu",
arena_block_size);
Log(log," Options.delete_obsolete_files_period_micros: %lu",
(unsigned long)delete_obsolete_files_period_micros);
@ -248,7 +248,7 @@ Options::Dump(Logger* log) const
(unsigned long)WAL_ttl_seconds);
Log(log," Options.WAL_size_limit_MB: %lu",
(unsigned long)WAL_size_limit_MB);
Log(log," Options.manifest_preallocation_size: %ld",
Log(log," Options.manifest_preallocation_size: %zu",
manifest_preallocation_size);
Log(log," Options.purge_redundant_kvs_while_flush: %d",
purge_redundant_kvs_while_flush);