2013-10-16 23:59:46 +02:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_set.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2013-06-14 07:09:08 +02:00
|
|
|
#include <climits>
|
2011-03-18 23:37:00 +01:00
|
|
|
#include <stdio.h>
|
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
2013-12-03 03:34:05 +01:00
|
|
|
#include "db/merge_context.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/table_cache.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
2013-10-29 01:54:09 +01:00
|
|
|
#include "rocksdb/table.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/logging.h"
|
2013-06-05 20:06:21 +02:00
|
|
|
#include "util/stop_watch.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-07-17 22:56:24 +02:00
|
|
|
// Returns the combined file_size of the entries in "files".
// Accumulation stops at the first null entry, so any files listed after a
// null pointer are not counted.
static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t total_bytes = 0;
  for (const FileMetaData* meta : files) {
    if (!meta) {
      break;
    }
    total_bytes += meta->file_size;
  }
  return total_bytes;
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Unlinks this version from the doubly linked version list and gives up the
// reference it holds on each of its files. A file whose reference count
// reaches zero (or below) is appended to vset_->obsolete_files_.
// Must only run once refs_ has dropped to zero.
Version::~Version() {
  assert(refs_ == 0);

  // Splice this node out of the list.
  prev_->next_ = next_;
  next_->prev_ = prev_;

  // Release one reference per file, level by level.
  for (int level = 0; level < vset_->NumberLevels(); ++level) {
    for (FileMetaData* meta : files_[level]) {
      assert(meta->refs > 0);
      if (--meta->refs <= 0) {
        vset_->obsolete_files_.push_back(meta);
      }
    }
  }

  delete[] files_;
}
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
int FindFile(const InternalKeyComparator& icmp,
|
|
|
|
const std::vector<FileMetaData*>& files,
|
|
|
|
const Slice& key) {
|
|
|
|
uint32_t left = 0;
|
|
|
|
uint32_t right = files.size();
|
|
|
|
while (left < right) {
|
|
|
|
uint32_t mid = (left + right) / 2;
|
|
|
|
const FileMetaData* f = files[mid];
|
|
|
|
if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
|
|
|
|
// Key at "mid.largest" is < "target". Therefore all
|
|
|
|
// files at or before "mid" are uninteresting.
|
|
|
|
left = mid + 1;
|
|
|
|
} else {
|
|
|
|
// Key at "mid.largest" is >= "target". Therefore all files
|
|
|
|
// after "mid" are uninteresting.
|
|
|
|
right = mid;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return right;
|
|
|
|
}
|
|
|
|
|
2011-10-06 01:30:28 +02:00
|
|
|
static bool AfterFile(const Comparator* ucmp,
|
|
|
|
const Slice* user_key, const FileMetaData* f) {
|
2013-03-01 03:04:58 +01:00
|
|
|
// nullptr user_key occurs before all keys and is therefore never after *f
|
|
|
|
return (user_key != nullptr &&
|
2011-10-06 01:30:28 +02:00
|
|
|
ucmp->Compare(*user_key, f->largest.user_key()) > 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool BeforeFile(const Comparator* ucmp,
|
|
|
|
const Slice* user_key, const FileMetaData* f) {
|
2013-03-01 03:04:58 +01:00
|
|
|
// nullptr user_key occurs after all keys and is therefore never before *f
|
|
|
|
return (user_key != nullptr &&
|
2011-10-06 01:30:28 +02:00
|
|
|
ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
|
|
|
|
}
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
bool SomeFileOverlapsRange(
|
|
|
|
const InternalKeyComparator& icmp,
|
2011-10-06 01:30:28 +02:00
|
|
|
bool disjoint_sorted_files,
|
2011-06-22 04:36:45 +02:00
|
|
|
const std::vector<FileMetaData*>& files,
|
2011-10-06 01:30:28 +02:00
|
|
|
const Slice* smallest_user_key,
|
|
|
|
const Slice* largest_user_key) {
|
|
|
|
const Comparator* ucmp = icmp.user_comparator();
|
|
|
|
if (!disjoint_sorted_files) {
|
|
|
|
// Need to check against all files
|
2012-08-27 08:45:35 +02:00
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
2011-10-06 01:30:28 +02:00
|
|
|
const FileMetaData* f = files[i];
|
|
|
|
if (AfterFile(ucmp, smallest_user_key, f) ||
|
|
|
|
BeforeFile(ucmp, largest_user_key, f)) {
|
|
|
|
// No overlap
|
|
|
|
} else {
|
|
|
|
return true; // Overlap
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Binary search over file list
|
|
|
|
uint32_t index = 0;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (smallest_user_key != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// Find the earliest possible internal key for smallest_user_key
|
|
|
|
InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
|
|
|
|
index = FindFile(icmp, files, small.Encode());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (index >= files.size()) {
|
|
|
|
// beginning of range is after all files, so no overlap.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return !BeforeFile(ucmp, largest_user_key, files[index]);
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// An internal iterator. For a given version/level pair, yields
|
|
|
|
// information about the files in the level. For a given entry, key()
|
|
|
|
// is the largest key that occurs in the file, and value() is an
|
2011-03-28 22:43:44 +02:00
|
|
|
// 16-byte value containing the file number and file size, both
|
|
|
|
// encoded using EncodeFixed64.
|
2011-03-18 23:37:00 +01:00
|
|
|
class Version::LevelFileNumIterator : public Iterator {
|
|
|
|
public:
|
2011-05-21 04:17:43 +02:00
|
|
|
LevelFileNumIterator(const InternalKeyComparator& icmp,
|
2011-03-18 23:37:00 +01:00
|
|
|
const std::vector<FileMetaData*>* flist)
|
2011-05-21 04:17:43 +02:00
|
|
|
: icmp_(icmp),
|
2011-03-18 23:37:00 +01:00
|
|
|
flist_(flist),
|
|
|
|
index_(flist->size()) { // Marks as invalid
|
|
|
|
}
|
|
|
|
virtual bool Valid() const {
|
|
|
|
return index_ < flist_->size();
|
|
|
|
}
|
|
|
|
virtual void Seek(const Slice& target) {
|
2011-06-22 04:36:45 +02:00
|
|
|
index_ = FindFile(icmp_, *flist_, target);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
virtual void SeekToFirst() { index_ = 0; }
|
|
|
|
virtual void SeekToLast() {
|
|
|
|
index_ = flist_->empty() ? 0 : flist_->size() - 1;
|
|
|
|
}
|
|
|
|
virtual void Next() {
|
|
|
|
assert(Valid());
|
|
|
|
index_++;
|
|
|
|
}
|
|
|
|
virtual void Prev() {
|
|
|
|
assert(Valid());
|
|
|
|
if (index_ == 0) {
|
|
|
|
index_ = flist_->size(); // Marks as invalid
|
|
|
|
} else {
|
|
|
|
index_--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Slice key() const {
|
|
|
|
assert(Valid());
|
|
|
|
return (*flist_)[index_]->largest.Encode();
|
|
|
|
}
|
|
|
|
Slice value() const {
|
|
|
|
assert(Valid());
|
|
|
|
EncodeFixed64(value_buf_, (*flist_)[index_]->number);
|
2011-03-28 22:43:44 +02:00
|
|
|
EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
|
2011-03-18 23:37:00 +01:00
|
|
|
return Slice(value_buf_, sizeof(value_buf_));
|
|
|
|
}
|
|
|
|
virtual Status status() const { return Status::OK(); }
|
|
|
|
private:
|
|
|
|
const InternalKeyComparator icmp_;
|
|
|
|
const std::vector<FileMetaData*>* const flist_;
|
2011-04-21 00:48:11 +02:00
|
|
|
uint32_t index_;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2011-03-28 22:43:44 +02:00
|
|
|
// Backing store for value(). Holds the file number and size.
|
|
|
|
mutable char value_buf_[16];
|
2011-03-18 23:37:00 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
// Opens an iterator over a single table file.
//   arg         - the TableCache to open the file through (passed as void*).
//   file_value  - 16 bytes: fixed64 file number followed by fixed64 file
//                 size. Any other length yields an error iterator carrying a
//                 Corruption status.
// When options.prefix is set, the table iterator is opened with a copy of
// the options whose prefix has been cleared.
static Iterator* GetFileIterator(void* arg,
                                 const ReadOptions& options,
                                 const EnvOptions& soptions,
                                 const Slice& file_value,
                                 bool for_compaction) {
  TableCache* table_cache = reinterpret_cast<TableCache*>(arg);
  if (file_value.size() != 16) {
    return NewErrorIterator(
        Status::Corruption("FileReader invoked with unexpected value"));
  }

  ReadOptions unfiltered;
  if (options.prefix) {
    // suppress prefix filtering since we have already checked the
    // filters once at this point
    unfiltered = options;
    unfiltered.prefix = nullptr;
  }
  const ReadOptions& effective = options.prefix ? unfiltered : options;

  const char* encoded = file_value.data();
  return table_cache->NewIterator(effective,
                                  soptions,
                                  DecodeFixed64(encoded),
                                  DecodeFixed64(encoded + 8),
                                  nullptr /* don't need reference to table*/,
                                  for_compaction);
}
|
|
|
|
|
2013-08-23 23:49:57 +02:00
|
|
|
bool Version::PrefixMayMatch(const ReadOptions& options,
|
|
|
|
const EnvOptions& soptions,
|
|
|
|
const Slice& internal_prefix,
|
|
|
|
Iterator* level_iter) const {
|
|
|
|
bool may_match = true;
|
|
|
|
level_iter->Seek(internal_prefix);
|
|
|
|
if (!level_iter->Valid()) {
|
|
|
|
// we're past end of level
|
|
|
|
may_match = false;
|
|
|
|
} else if (ExtractUserKey(level_iter->key()).starts_with(
|
|
|
|
ExtractUserKey(internal_prefix))) {
|
|
|
|
// TODO(tylerharter): do we need this case? Or are we guaranteed
|
|
|
|
// key() will always be the biggest value for this SST?
|
|
|
|
may_match = true;
|
|
|
|
} else {
|
|
|
|
may_match = vset_->table_cache_->PrefixMayMatch(
|
|
|
|
options,
|
|
|
|
DecodeFixed64(level_iter->value().data()),
|
|
|
|
DecodeFixed64(level_iter->value().data() + 8),
|
|
|
|
internal_prefix, nullptr);
|
|
|
|
}
|
|
|
|
return may_match;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Builds a two-level iterator over the files of "level": the outer iterator
// walks the level's file list and GetFileIterator opens each table on
// demand. When options.prefix is set and PrefixMayMatch() rules the level
// out, an empty iterator is returned instead.
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
                                            const EnvOptions& soptions,
                                            int level) const {
  Iterator* file_index_iter =
      new LevelFileNumIterator(vset_->icmp_, &files_[level]);

  if (options.prefix) {
    InternalKey prefix_key(*options.prefix, 0, kTypeValue);
    const bool level_may_match = PrefixMayMatch(
        options, soptions, prefix_key.Encode(), file_index_iter);
    if (!level_may_match) {
      // nothing in this level can match the prefix
      delete file_index_iter;
      return NewEmptyIterator();
    }
  }

  return NewTwoLevelIterator(file_index_iter, &GetFileIterator,
                             vset_->table_cache_, options, soptions);
}
|
|
|
|
|
|
|
|
void Version::AddIterators(const ReadOptions& options,
|
2013-06-08 00:35:17 +02:00
|
|
|
const EnvOptions& soptions,
|
2011-03-18 23:37:00 +01:00
|
|
|
std::vector<Iterator*>* iters) {
|
|
|
|
// Merge all level zero files together since they may overlap
|
2013-08-21 07:58:16 +02:00
|
|
|
for (const FileMetaData* file : files_[0]) {
|
2011-03-18 23:37:00 +01:00
|
|
|
iters->push_back(
|
2011-03-28 22:43:44 +02:00
|
|
|
vset_->table_cache_->NewIterator(
|
2013-08-21 07:58:16 +02:00
|
|
|
options, soptions, file->number, file->file_size));
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// For levels > 0, we can use a concatenating iterator that sequentially
|
|
|
|
// walks through the non-overlapping files in the level, opening them
|
|
|
|
// lazily.
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 1; level < vset_->NumberLevels(); level++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!files_[level].empty()) {
|
2013-03-15 01:00:04 +01:00
|
|
|
iters->push_back(NewConcatenatingIterator(options, soptions, level));
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-04-17 17:36:46 +02:00
|
|
|
// Callback from TableCache::Get()
|
|
|
|
namespace {
|
|
|
|
enum SaverState {
|
|
|
|
kNotFound,
|
|
|
|
kFound,
|
|
|
|
kDeleted,
|
|
|
|
kCorrupt,
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
kMerge // saver contains the current merge result (the operands)
|
2012-04-17 17:36:46 +02:00
|
|
|
};
|
|
|
|
struct Saver {
|
|
|
|
SaverState state;
|
|
|
|
const Comparator* ucmp;
|
|
|
|
Slice user_key;
|
2013-07-26 21:57:01 +02:00
|
|
|
bool* value_found; // Is value set correctly? Used by KeyMayExist
|
2012-04-17 17:36:46 +02:00
|
|
|
std::string* value;
|
2013-03-21 23:59:47 +01:00
|
|
|
const MergeOperator* merge_operator;
|
2013-12-03 03:34:05 +01:00
|
|
|
// the merge operations encountered;
|
|
|
|
MergeContext* merge_context;
|
2013-03-21 23:59:47 +01:00
|
|
|
Logger* logger;
|
2012-09-27 10:05:38 +02:00
|
|
|
bool didIO; // did we do any disk io?
|
2013-11-22 23:14:05 +01:00
|
|
|
Statistics* statistics;
|
2012-04-17 17:36:46 +02:00
|
|
|
};
|
|
|
|
}
|
2013-07-06 03:49:18 +02:00
|
|
|
|
2013-10-29 01:54:09 +01:00
|
|
|
// Called from TableCache::Get and Table::Get when file/block in which
|
|
|
|
// key may exist are not there in TableCache/BlockCache respectively. In this
|
|
|
|
// case we can't guarantee that key does not exist and are not permitted to do
|
|
|
|
// IO to be certain.Set the status=kFound and value_found=false to let the
|
|
|
|
// caller know that key may exist but is not there in memory
|
2013-07-06 03:49:18 +02:00
|
|
|
static void MarkKeyMayExist(void* arg) {
|
|
|
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
|
|
|
s->state = kFound;
|
2013-07-26 21:57:01 +02:00
|
|
|
if (s->value_found != nullptr) {
|
|
|
|
*(s->value_found) = false;
|
|
|
|
}
|
2013-07-06 03:49:18 +02:00
|
|
|
}
|
|
|
|
|
2013-03-21 23:59:47 +01:00
|
|
|
static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){
|
2012-04-17 17:36:46 +02:00
|
|
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
2013-12-03 03:34:05 +01:00
|
|
|
MergeContext* merge_contex = s->merge_context;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
std::string merge_result; // temporary area for merge results later
|
|
|
|
|
2013-12-03 03:34:05 +01:00
|
|
|
assert(s != nullptr && merge_contex != nullptr);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
ParsedInternalKey parsed_key;
|
2013-03-21 23:59:47 +01:00
|
|
|
// TODO: didIO and Merge?
|
2012-09-27 10:05:38 +02:00
|
|
|
s->didIO = didIO;
|
2012-04-17 17:36:46 +02:00
|
|
|
if (!ParseInternalKey(ikey, &parsed_key)) {
|
2013-03-21 23:59:47 +01:00
|
|
|
// TODO: what about corrupt during Merge?
|
2012-04-17 17:36:46 +02:00
|
|
|
s->state = kCorrupt;
|
|
|
|
} else {
|
|
|
|
if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Key matches. Process it
|
2013-03-21 23:59:47 +01:00
|
|
|
switch (parsed_key.type) {
|
|
|
|
case kTypeValue:
|
|
|
|
if (kNotFound == s->state) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
s->state = kFound;
|
2013-03-21 23:59:47 +01:00
|
|
|
s->value->assign(v.data(), v.size());
|
|
|
|
} else if (kMerge == s->state) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
assert(s->merge_operator != nullptr);
|
|
|
|
s->state = kFound;
|
2013-12-03 03:34:05 +01:00
|
|
|
if (!s->merge_operator->FullMerge(s->user_key, &v,
|
|
|
|
merge_contex->GetOperands(),
|
2013-08-19 20:42:47 +02:00
|
|
|
s->value, s->logger)) {
|
2013-08-09 08:07:36 +02:00
|
|
|
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
s->state = kCorrupt;
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
|
|
|
|
case kTypeDeletion:
|
|
|
|
if (kNotFound == s->state) {
|
|
|
|
s->state = kDeleted;
|
|
|
|
} else if (kMerge == s->state) {
|
|
|
|
s->state = kFound;
|
2013-12-03 03:34:05 +01:00
|
|
|
if (!s->merge_operator->FullMerge(s->user_key, nullptr,
|
|
|
|
merge_contex->GetOperands(),
|
|
|
|
s->value, s->logger)) {
|
2013-08-09 08:07:36 +02:00
|
|
|
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
s->state = kCorrupt;
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
case kTypeMerge:
|
|
|
|
assert(s->state == kNotFound || s->state == kMerge);
|
|
|
|
s->state = kMerge;
|
2013-12-03 03:34:05 +01:00
|
|
|
merge_contex->PushOperand(v);
|
|
|
|
while (merge_contex->GetNumOperands() >= 2) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Attempt to merge operands together via user associative merge
|
|
|
|
if (s->merge_operator->PartialMerge(s->user_key,
|
2013-12-03 03:34:05 +01:00
|
|
|
merge_contex->GetOperand(0),
|
|
|
|
merge_contex->GetOperand(1),
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
&merge_result,
|
|
|
|
s->logger)) {
|
2013-12-03 03:34:05 +01:00
|
|
|
merge_contex->PushPartialMergeResult(merge_result);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
} else {
|
|
|
|
// Associative merge returns false ==> stack the operands
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2013-08-15 01:32:46 +02:00
|
|
|
|
|
|
|
case kTypeLogData:
|
|
|
|
assert(false);
|
|
|
|
break;
|
2012-04-17 17:36:46 +02:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
|
|
|
|
// s->state could be Corrupt, merge or notfound
|
|
|
|
|
|
|
|
return false;
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Ordering predicate over file metadata, keyed on the file number.
//
// Returns true when a->number is strictly greater than b->number, so a sort
// using this predicate places files with larger numbers first.
// NOTE(review): presumably larger file numbers belong to more recently
// created files (hence "newest") -- confirm against the file number allocator.
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
  const auto lhs_number = a->number;
  const auto rhs_number = b->number;
  return rhs_number < lhs_number;
}
|
2013-06-14 07:09:08 +02:00
|
|
|
// Ordering predicate over file metadata, keyed on sequence numbers.
//
// Returns true when a->smallest_seqno is strictly greater than
// b->smallest_seqno, and false otherwise (including when they are equal).
//
// In debug builds the largest_seqno fields are sanity-checked to agree with
// the decision: strictly greater when returning true, less-than-or-equal when
// returning false.
// NOTE(review): the asserts presumably encode that the sequence number ranges
// of the compared files do not overlap -- confirm with the callers.
static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
  const bool a_starts_later = a->smallest_seqno > b->smallest_seqno;
  if (!a_starts_later) {
    // `a` does not start after `b`, so it must not end after `b` either.
    assert(a->largest_seqno <= b->largest_seqno);
    return false;
  }
  // `a` starts after `b`, so it must also end after `b`.
  assert(a->largest_seqno > b->largest_seqno);
  return true;
}
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// Constructs an empty Version associated with `vset`.
//
// vset:           the owning VersionSet; its NumberLevels() determines the
//                 size of every per-level container initialized below.
// version_number: stored verbatim in version_number_.
//
// All per-level containers are sized to vset->NumberLevels() and left with
// default-constructed elements; no file is selected for compaction.
Version::Version(VersionSet* vset, uint64_t version_number)
    : vset_(vset),
      // Both links initially refer to this object itself.
      next_(this),
      prev_(this),
      refs_(0),
      // One vector of file metadata per level, allocated with new[].
      // NOTE(review): the matching delete[] is presumably in ~Version() --
      // confirm.
      files_(new std::vector<FileMetaData*>[vset->NumberLevels()]),
      files_by_size_(vset->NumberLevels()),
      next_file_to_compact_by_size_(vset->NumberLevels()),
      // No compaction candidate yet: null file and level -1.
      file_to_compact_(nullptr),
      file_to_compact_level_(-1),
      compaction_score_(vset->NumberLevels()),
      compaction_level_(vset->NumberLevels()),
      offset_manifest_file_(0),
      version_number_(version_number) {
}
|
|
|
|
|
2013-03-21 23:59:47 +01:00
|
|
|
void Version::Get(const ReadOptions& options,
|
|
|
|
const LookupKey& k,
|
|
|
|
std::string* value,
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
Status* status,
|
2013-12-03 03:34:05 +01:00
|
|
|
MergeContext* merge_context,
|
2013-03-21 23:59:47 +01:00
|
|
|
GetStats* stats,
|
2013-07-06 03:49:18 +02:00
|
|
|
const Options& db_options,
|
2013-07-26 21:57:01 +02:00
|
|
|
bool* value_found) {
|
2011-06-22 04:36:45 +02:00
|
|
|
Slice ikey = k.internal_key();
|
|
|
|
Slice user_key = k.user_key();
|
|
|
|
const Comparator* ucmp = vset_->icmp_.user_comparator();
|
2013-03-21 23:59:47 +01:00
|
|
|
|
2013-08-20 22:35:28 +02:00
|
|
|
auto merge_operator = db_options.merge_operator.get();
|
2013-03-21 23:59:47 +01:00
|
|
|
auto logger = db_options.info_log;
|
|
|
|
|
|
|
|
assert(status->ok() || status->IsMergeInProgress());
|
|
|
|
Saver saver;
|
|
|
|
saver.state = status->ok()? kNotFound : kMerge;
|
|
|
|
saver.ucmp = ucmp;
|
|
|
|
saver.user_key = user_key;
|
2013-07-26 21:57:01 +02:00
|
|
|
saver.value_found = value_found;
|
2013-03-21 23:59:47 +01:00
|
|
|
saver.value = value;
|
|
|
|
saver.merge_operator = merge_operator;
|
2013-12-03 03:34:05 +01:00
|
|
|
saver.merge_context = merge_context;
|
2013-03-21 23:59:47 +01:00
|
|
|
saver.logger = logger.get();
|
|
|
|
saver.didIO = false;
|
2013-11-22 23:14:05 +01:00
|
|
|
saver.statistics = db_options.statistics.get();
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2013-03-01 03:04:58 +01:00
|
|
|
stats->seek_file = nullptr;
|
2011-06-22 04:36:45 +02:00
|
|
|
stats->seek_file_level = -1;
|
2013-03-01 03:04:58 +01:00
|
|
|
FileMetaData* last_file_read = nullptr;
|
2011-09-01 21:08:02 +02:00
|
|
|
int last_file_read_level = -1;
|
2011-06-22 04:36:45 +02:00
|
|
|
|
|
|
|
// We can search level-by-level since entries never hop across
|
|
|
|
// levels. Therefore we are guaranteed that if we find data
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// in a smaller level, later levels are irrelevant (unless we
|
|
|
|
// are MergeInProgress).
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
2011-06-22 04:36:45 +02:00
|
|
|
size_t num_files = files_[level].size();
|
|
|
|
if (num_files == 0) continue;
|
|
|
|
|
|
|
|
// Get the list of files to search in this level
|
|
|
|
FileMetaData* const* files = &files_[level][0];
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
|
|
|
// Some files may overlap each other. We find
|
|
|
|
// all files that overlap user_key and process them in order from
|
|
|
|
// newest to oldest. In the context of merge-operator,
|
|
|
|
// this can occur at any level. Otherwise, it only occurs
|
|
|
|
// at Level-0 (since Put/Deletes are always compacted into a single entry).
|
|
|
|
uint32_t start_index;
|
2011-06-22 04:36:45 +02:00
|
|
|
if (level == 0) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// On Level-0, we read through all files to check for overlap.
|
|
|
|
start_index = 0;
|
2011-06-22 04:36:45 +02:00
|
|
|
} else {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// On Level-n (n>=1), files are sorted.
|
2011-06-22 04:36:45 +02:00
|
|
|
// Binary search to find earliest index whose largest key >= ikey.
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// We will also stop when the file no longer overlaps ikey
|
|
|
|
start_index = FindFile(vset_->icmp_, files_[level], ikey);
|
|
|
|
}
|
|
|
|
|
2013-12-09 23:28:26 +01:00
|
|
|
// Traverse each relevant file to find the desired key
|
|
|
|
#ifndef NDEBUG
|
|
|
|
FileMetaData* prev_file = nullptr;
|
|
|
|
#endif
|
|
|
|
for (uint32_t i = start_index; i < num_files; ++i) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
FileMetaData* f = files[i];
|
2013-12-09 23:28:26 +01:00
|
|
|
if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 ||
|
|
|
|
ucmp->Compare(user_key, f->largest.user_key()) > 0) {
|
|
|
|
// Only process overlapping files.
|
|
|
|
if (level > 0) {
|
|
|
|
// If on Level-n (n>=1) then the files are sorted.
|
|
|
|
// So we can stop looking when we are past the ikey.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// TODO: do we want to check file ranges for level0 files at all?
|
|
|
|
// For new SST format where Get() is fast, we might want to consider
|
|
|
|
// to avoid those two comparisons, if it can filter out too few files.
|
|
|
|
continue;
|
2013-06-14 07:09:08 +02:00
|
|
|
}
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
#ifndef NDEBUG
|
2013-12-09 23:28:26 +01:00
|
|
|
// Sanity check to make sure that the files are correctly sorted
|
|
|
|
if (prev_file) {
|
|
|
|
if (level != 0) {
|
|
|
|
int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest);
|
|
|
|
assert(comp_sign < 0);
|
|
|
|
} else {
|
|
|
|
// level == 0, the current file cannot be newer than the previous one.
|
|
|
|
if (vset_->options_->compaction_style == kCompactionStyleUniversal) {
|
|
|
|
assert(!NewestFirstBySeqNo(f, prev_file));
|
|
|
|
} else {
|
|
|
|
assert(!NewestFirst(f, prev_file));
|
|
|
|
}
|
|
|
|
}
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
}
|
2013-12-09 23:28:26 +01:00
|
|
|
prev_file = f;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
#endif
|
2012-09-27 10:05:38 +02:00
|
|
|
bool tableIO = false;
|
2013-03-21 23:59:47 +01:00
|
|
|
*status = vset_->table_cache_->Get(options, f->number, f->file_size,
|
2013-07-06 03:49:18 +02:00
|
|
|
ikey, &saver, SaveValue, &tableIO,
|
2013-08-25 07:48:51 +02:00
|
|
|
MarkKeyMayExist);
|
2013-03-21 23:59:47 +01:00
|
|
|
// TODO: examine the behavior for corrupted key
|
|
|
|
if (!status->ok()) {
|
|
|
|
return;
|
2012-04-17 17:36:46 +02:00
|
|
|
}
|
2012-09-27 10:05:38 +02:00
|
|
|
|
2013-03-01 03:04:58 +01:00
|
|
|
if (last_file_read != nullptr && stats->seek_file == nullptr) {
|
2012-09-27 10:05:38 +02:00
|
|
|
// We have had more than one seek for this read. Charge the 1st file.
|
|
|
|
stats->seek_file = last_file_read;
|
|
|
|
stats->seek_file_level = last_file_read_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we did any IO as part of the read, then we remember it because
|
|
|
|
// it is a possible candidate for seek-based compaction. saver.didIO
|
|
|
|
// is true if the block had to be read in from storage and was not
|
|
|
|
// pre-existing in the block cache. Also, if this file was not pre-
|
|
|
|
// existing in the table cache and had to be freshly opened that needed
|
|
|
|
// the index blocks to be read-in, then tableIO is true. One thing
|
|
|
|
// to note is that the index blocks are not part of the block cache.
|
|
|
|
if (saver.didIO || tableIO) {
|
|
|
|
last_file_read = f;
|
|
|
|
last_file_read_level = level;
|
|
|
|
}
|
|
|
|
|
2012-04-17 17:36:46 +02:00
|
|
|
switch (saver.state) {
|
|
|
|
case kNotFound:
|
|
|
|
break; // Keep searching in other files
|
|
|
|
case kFound:
|
2013-03-21 23:59:47 +01:00
|
|
|
return;
|
2012-04-17 17:36:46 +02:00
|
|
|
case kDeleted:
|
2013-12-26 22:49:04 +01:00
|
|
|
*status = Status::NotFound(); // Use empty error message for speed
|
2013-03-21 23:59:47 +01:00
|
|
|
return;
|
2012-04-17 17:36:46 +02:00
|
|
|
case kCorrupt:
|
2013-03-21 23:59:47 +01:00
|
|
|
*status = Status::Corruption("corrupted key for ", user_key);
|
|
|
|
return;
|
|
|
|
case kMerge:
|
|
|
|
break;
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-21 23:59:47 +01:00
|
|
|
|
|
|
|
if (kMerge == saver.state) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// merge_operands are in saver and we hit the beginning of the key history
|
|
|
|
// do a final merge of nullptr and operands;
|
2013-12-03 03:34:05 +01:00
|
|
|
if (merge_operator->FullMerge(user_key, nullptr,
|
|
|
|
saver.merge_context->GetOperands(),
|
2013-08-19 20:42:47 +02:00
|
|
|
value, logger.get())) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
*status = Status::OK();
|
|
|
|
} else {
|
2013-11-22 23:14:05 +01:00
|
|
|
RecordTick(db_options.statistics.get(), NUMBER_MERGE_FAILURES);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
*status = Status::Corruption("could not perform end-of-key merge for ",
|
|
|
|
user_key);
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
} else {
|
2013-12-26 22:49:04 +01:00
|
|
|
*status = Status::NotFound(); // Use an empty error message for speed
|
2013-03-21 23:59:47 +01:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Version::UpdateStats(const GetStats& stats) {
|
|
|
|
FileMetaData* f = stats.seek_file;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (f != nullptr) {
|
2011-06-22 04:36:45 +02:00
|
|
|
f->allowed_seeks--;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
|
2011-06-22 04:36:45 +02:00
|
|
|
file_to_compact_ = f;
|
|
|
|
file_to_compact_level_ = stats.seek_file_level;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Takes one additional reference on this Version.
void Version::Ref() {
  refs_ += 1;
}
|
|
|
|
|
|
|
|
void Version::Unref() {
|
2011-05-21 04:17:43 +02:00
|
|
|
assert(this != &vset_->dummy_versions_);
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(refs_ >= 1);
|
|
|
|
--refs_;
|
|
|
|
if (refs_ == 0) {
|
2011-05-21 04:17:43 +02:00
|
|
|
delete this;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
bool Version::OverlapInLevel(int level,
|
2011-10-06 01:30:28 +02:00
|
|
|
const Slice* smallest_user_key,
|
|
|
|
const Slice* largest_user_key) {
|
|
|
|
return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
|
|
|
|
smallest_user_key, largest_user_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
int Version::PickLevelForMemTableOutput(
|
|
|
|
const Slice& smallest_user_key,
|
|
|
|
const Slice& largest_user_key) {
|
|
|
|
int level = 0;
|
|
|
|
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
|
|
|
|
// Push to next level if there is no overlap in next level,
|
|
|
|
// and the #bytes overlapping in the level after that are limited.
|
|
|
|
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
|
|
|
|
std::vector<FileMetaData*> overlaps;
|
2012-06-23 04:30:03 +02:00
|
|
|
int max_mem_compact_level = vset_->options_->max_mem_compaction_level;
|
|
|
|
while (max_mem_compact_level > 0 && level < max_mem_compact_level) {
|
2011-10-06 01:30:28 +02:00
|
|
|
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
|
|
|
|
break;
|
|
|
|
}
|
2012-06-23 04:30:03 +02:00
|
|
|
if (level + 2 >= vset_->NumberLevels()) {
|
2012-10-31 19:47:18 +01:00
|
|
|
level++;
|
|
|
|
break;
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
|
2013-07-17 22:56:24 +02:00
|
|
|
const uint64_t sum = TotalFileSize(overlaps);
|
2012-06-23 04:30:03 +02:00
|
|
|
if (sum > vset_->MaxGrandParentOverlapBytes(level)) {
|
2011-10-06 01:30:28 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
level++;
|
|
|
|
}
|
|
|
|
}
|
2012-06-23 04:30:03 +02:00
|
|
|
|
2011-10-06 01:30:28 +02:00
|
|
|
return level;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
2012-11-29 01:42:36 +01:00
|
|
|
// If hint_index is specified, then it points to a file in the
|
2012-11-06 18:06:16 +01:00
|
|
|
// overlapping range.
|
|
|
|
// The file_index returns a pointer to any file in an overlapping range.
|
2011-10-06 01:30:28 +02:00
|
|
|
void Version::GetOverlappingInputs(
|
|
|
|
int level,
|
|
|
|
const InternalKey* begin,
|
|
|
|
const InternalKey* end,
|
2012-11-06 18:06:16 +01:00
|
|
|
std::vector<FileMetaData*>* inputs,
|
|
|
|
int hint_index,
|
|
|
|
int* file_index) {
|
2011-10-06 01:30:28 +02:00
|
|
|
inputs->clear();
|
|
|
|
Slice user_begin, user_end;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
user_begin = begin->user_key();
|
|
|
|
}
|
2013-03-01 03:04:58 +01:00
|
|
|
if (end != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
user_end = end->user_key();
|
|
|
|
}
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
if (file_index) {
|
|
|
|
*file_index = -1;
|
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
const Comparator* user_cmp = vset_->icmp_.user_comparator();
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && end != nullptr && level > 0) {
|
2012-11-06 18:06:16 +01:00
|
|
|
GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
|
|
|
|
hint_index, file_index);
|
2012-11-05 08:47:06 +01:00
|
|
|
return;
|
|
|
|
}
|
2011-10-31 18:22:06 +01:00
|
|
|
for (size_t i = 0; i < files_[level].size(); ) {
|
|
|
|
FileMetaData* f = files_[level][i++];
|
|
|
|
const Slice file_start = f->smallest.user_key();
|
|
|
|
const Slice file_limit = f->largest.user_key();
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// "f" is completely before specified range; skip it
|
2013-03-01 03:04:58 +01:00
|
|
|
} else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// "f" is completely after specified range; skip it
|
|
|
|
} else {
|
|
|
|
inputs->push_back(f);
|
2011-10-31 18:22:06 +01:00
|
|
|
if (level == 0) {
|
|
|
|
// Level-0 files may overlap each other. So check if the newly
|
|
|
|
// added file has expanded the range. If so, restart search.
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
|
2011-10-31 18:22:06 +01:00
|
|
|
user_begin = file_start;
|
|
|
|
inputs->clear();
|
|
|
|
i = 0;
|
2013-03-01 03:04:58 +01:00
|
|
|
} else if (end != nullptr
|
|
|
|
&& user_cmp->Compare(file_limit, user_end) > 0) {
|
2011-10-31 18:22:06 +01:00
|
|
|
user_end = file_limit;
|
|
|
|
inputs->clear();
|
|
|
|
i = 0;
|
|
|
|
}
|
2012-11-06 18:06:16 +01:00
|
|
|
} else if (file_index) {
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
*file_index = i-1;
|
2011-10-31 18:22:06 +01:00
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
|
|
|
// Employ binary search to find at least one file that overlaps the
|
|
|
|
// specified range. From that file, iterate backwards and
|
|
|
|
// forwards to find all overlapping files.
|
|
|
|
void Version::GetOverlappingInputsBinarySearch(
|
|
|
|
int level,
|
|
|
|
const Slice& user_begin,
|
|
|
|
const Slice& user_end,
|
2012-11-06 18:06:16 +01:00
|
|
|
std::vector<FileMetaData*>* inputs,
|
|
|
|
int hint_index,
|
|
|
|
int* file_index) {
|
2012-11-05 08:47:06 +01:00
|
|
|
assert(level > 0);
|
|
|
|
int min = 0;
|
|
|
|
int mid = 0;
|
|
|
|
int max = files_[level].size() -1;
|
|
|
|
bool foundOverlap = false;
|
|
|
|
const Comparator* user_cmp = vset_->icmp_.user_comparator();
|
2012-11-06 18:06:16 +01:00
|
|
|
|
|
|
|
// if the caller already knows the index of a file that has overlap,
|
|
|
|
// then we can skip the binary search.
|
|
|
|
if (hint_index != -1) {
|
|
|
|
mid = hint_index;
|
|
|
|
foundOverlap = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!foundOverlap && min <= max) {
|
2012-11-05 08:47:06 +01:00
|
|
|
mid = (min + max)/2;
|
|
|
|
FileMetaData* f = files_[level][mid];
|
|
|
|
const Slice file_start = f->smallest.user_key();
|
|
|
|
const Slice file_limit = f->largest.user_key();
|
|
|
|
if (user_cmp->Compare(file_limit, user_begin) < 0) {
|
|
|
|
min = mid + 1;
|
|
|
|
} else if (user_cmp->Compare(user_end, file_start) < 0) {
|
|
|
|
max = mid - 1;
|
|
|
|
} else {
|
|
|
|
foundOverlap = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// If there were no overlapping files, return immediately.
|
|
|
|
if (!foundOverlap) {
|
|
|
|
return;
|
|
|
|
}
|
2012-11-06 18:06:16 +01:00
|
|
|
// returns the index where an overlap is found
|
|
|
|
if (file_index) {
|
|
|
|
*file_index = mid;
|
|
|
|
}
|
2012-11-05 08:47:06 +01:00
|
|
|
ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid);
|
|
|
|
}
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
|
|
|
// The midIndex specifies the index of at least one file that
|
|
|
|
// overlaps the specified range. From that file, iterate backward
|
|
|
|
// and forward to find all overlapping files.
|
|
|
|
void Version::ExtendOverlappingInputs(
|
|
|
|
int level,
|
|
|
|
const Slice& user_begin,
|
|
|
|
const Slice& user_end,
|
|
|
|
std::vector<FileMetaData*>* inputs,
|
2013-03-15 02:32:01 +01:00
|
|
|
unsigned int midIndex) {
|
2012-11-05 08:47:06 +01:00
|
|
|
|
|
|
|
const Comparator* user_cmp = vset_->icmp_.user_comparator();
|
2012-11-06 18:06:16 +01:00
|
|
|
#ifndef NDEBUG
|
|
|
|
{
|
|
|
|
// assert that the file at midIndex overlaps with the range
|
|
|
|
assert(midIndex < files_[level].size());
|
|
|
|
FileMetaData* f = files_[level][midIndex];
|
|
|
|
const Slice fstart = f->smallest.user_key();
|
|
|
|
const Slice flimit = f->largest.user_key();
|
|
|
|
if (user_cmp->Compare(fstart, user_begin) >= 0) {
|
|
|
|
assert(user_cmp->Compare(fstart, user_end) <= 0);
|
|
|
|
} else {
|
|
|
|
assert(user_cmp->Compare(flimit, user_begin) >= 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2012-12-31 07:18:52 +01:00
|
|
|
int startIndex = midIndex + 1;
|
|
|
|
int endIndex = midIndex;
|
2013-01-14 21:39:24 +01:00
|
|
|
int count __attribute__((unused)) = 0;
|
2012-11-05 08:47:06 +01:00
|
|
|
|
|
|
|
// check backwards from 'mid' to lower indices
|
2012-12-31 07:18:52 +01:00
|
|
|
for (int i = midIndex; i >= 0 ; i--) {
|
2012-11-05 08:47:06 +01:00
|
|
|
FileMetaData* f = files_[level][i];
|
|
|
|
const Slice file_limit = f->largest.user_key();
|
|
|
|
if (user_cmp->Compare(file_limit, user_begin) >= 0) {
|
2012-12-31 07:18:52 +01:00
|
|
|
startIndex = i;
|
|
|
|
assert((count++, true));
|
2012-11-05 08:47:06 +01:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// check forward from 'mid+1' to higher indices
|
2012-12-31 07:18:52 +01:00
|
|
|
for (unsigned int i = midIndex+1; i < files_[level].size(); i++) {
|
2012-11-05 08:47:06 +01:00
|
|
|
FileMetaData* f = files_[level][i];
|
|
|
|
const Slice file_start = f->smallest.user_key();
|
|
|
|
if (user_cmp->Compare(file_start, user_end) <= 0) {
|
2012-12-31 07:18:52 +01:00
|
|
|
assert((count++, true));
|
|
|
|
endIndex = i;
|
2012-11-05 08:47:06 +01:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-12-31 07:18:52 +01:00
|
|
|
assert(count == endIndex - startIndex + 1);
|
|
|
|
|
|
|
|
// insert overlapping files into vector
|
|
|
|
for (int i = startIndex; i <= endIndex; i++) {
|
|
|
|
FileMetaData* f = files_[level][i];
|
2013-01-08 21:00:13 +01:00
|
|
|
inputs->push_back(f);
|
2012-12-31 07:18:52 +01:00
|
|
|
}
|
2012-11-05 08:47:06 +01:00
|
|
|
}
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help of gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Returns true iff the first or last file in inputs contains
|
|
|
|
// an overlapping user key to the file "just outside" of it (i.e.
|
|
|
|
// just after the last file, or just before the first file)
|
|
|
|
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
|
|
|
|
bool Version::HasOverlappingUserKey(
|
|
|
|
const std::vector<FileMetaData*>* inputs,
|
|
|
|
int level) {
|
|
|
|
|
|
|
|
// If inputs empty, there is no overlap.
|
|
|
|
// If level == 0, it is assumed that all needed files were already included.
|
|
|
|
if (inputs->empty() || level == 0){
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* user_cmp = vset_->icmp_.user_comparator();
|
|
|
|
const std::vector<FileMetaData*>& files = files_[level];
|
|
|
|
const size_t kNumFiles = files.size();
|
|
|
|
|
|
|
|
// Check the last file in inputs against the file after it
|
|
|
|
size_t last_file = FindFile(vset_->icmp_, files,
|
|
|
|
inputs->back()->largest.Encode());
|
|
|
|
assert(0 <= last_file && last_file < kNumFiles); // File should exist!
|
|
|
|
if (last_file < kNumFiles-1) { // If not the last file
|
|
|
|
const Slice last_key_in_input = files[last_file]->largest.user_key();
|
|
|
|
const Slice first_key_after = files[last_file+1]->smallest.user_key();
|
|
|
|
if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
|
|
|
|
// The last user key in input overlaps with the next file's first key
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check the first file in inputs against the file just before it
|
|
|
|
size_t first_file = FindFile(vset_->icmp_, files,
|
|
|
|
inputs->front()->smallest.Encode());
|
|
|
|
assert(0 <= first_file && first_file <= last_file); // File should exist!
|
|
|
|
if (first_file > 0) { // If not first file
|
|
|
|
const Slice& first_key_in_input = files[first_file]->smallest.user_key();
|
|
|
|
const Slice& last_key_before = files[first_file-1]->largest.user_key();
|
|
|
|
if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
|
|
|
|
// The first user key in input overlaps with the previous file's last key
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-16 03:28:36 +01:00
|
|
|
std::string Version::DebugString(bool hex) const {
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string r;
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
2011-06-22 04:36:45 +02:00
|
|
|
// E.g.,
|
|
|
|
// --- level 1 ---
|
|
|
|
// 17:123['a' .. 'd']
|
|
|
|
// 20:43['e' .. 'g']
|
|
|
|
r.append("--- level ");
|
2011-03-18 23:37:00 +01:00
|
|
|
AppendNumberTo(&r, level);
|
2012-10-19 23:00:53 +02:00
|
|
|
r.append(" --- version# ");
|
|
|
|
AppendNumberTo(&r, version_number_);
|
2011-06-22 04:36:45 +02:00
|
|
|
r.append(" ---\n");
|
2011-03-18 23:37:00 +01:00
|
|
|
const std::vector<FileMetaData*>& files = files_[level];
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
r.push_back(' ');
|
|
|
|
AppendNumberTo(&r, files[i]->number);
|
|
|
|
r.push_back(':');
|
|
|
|
AppendNumberTo(&r, files[i]->file_size);
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append("[");
|
2012-12-16 03:28:36 +01:00
|
|
|
r.append(files[i]->smallest.DebugString(hex));
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append(" .. ");
|
2012-12-16 03:28:36 +01:00
|
|
|
r.append(files[i]->largest.DebugString(hex));
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append("]\n");
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// this is used to batch writes to the manifest file
|
|
|
|
struct VersionSet::ManifestWriter {
|
|
|
|
Status status;
|
|
|
|
bool done;
|
|
|
|
port::CondVar cv;
|
|
|
|
VersionEdit* edit;
|
2012-11-29 01:42:36 +01:00
|
|
|
|
|
|
|
explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) :
|
2012-10-19 23:00:53 +02:00
|
|
|
done(false), cv(mu), edit(e) {}
|
|
|
|
};
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// A helper class so we can efficiently apply a whole sequence
|
|
|
|
// of edits to a particular state without creating intermediate
|
|
|
|
// Versions that contain full copies of the intermediate state.
|
|
|
|
class VersionSet::Builder {
|
|
|
|
private:
|
2011-05-21 04:17:43 +02:00
|
|
|
// Helper to sort by v->files_[file_number].smallest
|
|
|
|
struct BySmallestKey {
|
|
|
|
const InternalKeyComparator* internal_comparator;
|
|
|
|
|
|
|
|
bool operator()(FileMetaData* f1, FileMetaData* f2) const {
|
|
|
|
int r = internal_comparator->Compare(f1->smallest, f2->smallest);
|
|
|
|
if (r != 0) {
|
|
|
|
return (r < 0);
|
|
|
|
} else {
|
|
|
|
// Break ties by file number
|
|
|
|
return (f1->number < f2->number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef std::set<FileMetaData*, BySmallestKey> FileSet;
|
|
|
|
struct LevelState {
|
|
|
|
std::set<uint64_t> deleted_files;
|
|
|
|
FileSet* added_files;
|
|
|
|
};
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
VersionSet* vset_;
|
2011-05-21 04:17:43 +02:00
|
|
|
Version* base_;
|
2012-06-23 04:30:03 +02:00
|
|
|
LevelState* levels_;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
public:
|
|
|
|
// Initialize a builder with the files from *base and other info from *vset
|
|
|
|
Builder(VersionSet* vset, Version* base)
|
2011-05-21 04:17:43 +02:00
|
|
|
: vset_(vset),
|
|
|
|
base_(base) {
|
|
|
|
base_->Ref();
|
2012-06-23 04:30:03 +02:00
|
|
|
levels_ = new LevelState[vset_->NumberLevels()];
|
2011-05-21 04:17:43 +02:00
|
|
|
BySmallestKey cmp;
|
|
|
|
cmp.internal_comparator = &vset_->icmp_;
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
2011-05-21 04:17:43 +02:00
|
|
|
levels_[level].added_files = new FileSet(cmp);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~Builder() {
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
2011-07-20 01:36:47 +02:00
|
|
|
const FileSet* added = levels_[level].added_files;
|
|
|
|
std::vector<FileMetaData*> to_unref;
|
|
|
|
to_unref.reserve(added->size());
|
|
|
|
for (FileSet::const_iterator it = added->begin();
|
|
|
|
it != added->end(); ++it) {
|
|
|
|
to_unref.push_back(*it);
|
|
|
|
}
|
|
|
|
delete added;
|
2011-08-06 02:19:37 +02:00
|
|
|
for (uint32_t i = 0; i < to_unref.size(); i++) {
|
2011-05-21 04:17:43 +02:00
|
|
|
FileMetaData* f = to_unref[i];
|
2011-03-18 23:37:00 +01:00
|
|
|
f->refs--;
|
|
|
|
if (f->refs <= 0) {
|
|
|
|
delete f;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-06-23 04:30:03 +02:00
|
|
|
delete[] levels_;
|
2011-05-21 04:17:43 +02:00
|
|
|
base_->Unref();
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
void CheckConsistency(Version* v) {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
|
|
|
// Make sure there is no overlap in levels > 0
|
|
|
|
if (level > 0) {
|
|
|
|
for (uint32_t i = 1; i < v->files_[level].size(); i++) {
|
|
|
|
const InternalKey& prev_end = v->files_[level][i-1]->largest;
|
|
|
|
const InternalKey& this_begin = v->files_[level][i]->smallest;
|
|
|
|
if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
|
|
|
|
fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
|
|
|
|
prev_end.DebugString().c_str(),
|
|
|
|
this_begin.DebugString().c_str());
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2013-03-15 02:32:01 +01:00
|
|
|
void CheckConsistencyForDeletes(
|
|
|
|
VersionEdit* edit,
|
|
|
|
unsigned int number,
|
|
|
|
int level) {
|
2012-11-13 19:30:00 +01:00
|
|
|
#ifndef NDEBUG
|
|
|
|
// a file to be deleted better exist in the previous version
|
|
|
|
bool found = false;
|
2013-12-12 02:46:26 +01:00
|
|
|
for (int l = 0; !found && l < vset_->NumberLevels(); l++) {
|
2012-11-13 19:30:00 +01:00
|
|
|
const std::vector<FileMetaData*>& base_files = base_->files_[l];
|
2013-03-15 02:32:01 +01:00
|
|
|
for (unsigned int i = 0; i < base_files.size(); i++) {
|
2012-11-13 19:30:00 +01:00
|
|
|
FileMetaData* f = base_files[i];
|
|
|
|
if (f->number == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// if the file did not exist in the previous version, then it
|
|
|
|
// is possibly moved from lower level to higher level in current
|
|
|
|
// version
|
2013-12-12 02:46:26 +01:00
|
|
|
for (int l = level+1; !found && l < vset_->NumberLevels(); l++) {
|
2012-11-13 19:30:00 +01:00
|
|
|
const FileSet* added = levels_[l].added_files;
|
2012-11-19 23:51:22 +01:00
|
|
|
for (FileSet::const_iterator added_iter = added->begin();
|
|
|
|
added_iter != added->end(); ++added_iter) {
|
|
|
|
FileMetaData* f = *added_iter;
|
|
|
|
if (f->number == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// maybe this file was added in a previous edit that was Applied
|
|
|
|
if (!found) {
|
|
|
|
const FileSet* added = levels_[level].added_files;
|
2012-11-13 19:30:00 +01:00
|
|
|
for (FileSet::const_iterator added_iter = added->begin();
|
|
|
|
added_iter != added->end(); ++added_iter) {
|
|
|
|
FileMetaData* f = *added_iter;
|
|
|
|
if (f->number == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(found);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Apply all of the edits in *edit to the current state.
|
|
|
|
void Apply(VersionEdit* edit) {
|
2012-10-19 23:00:53 +02:00
|
|
|
CheckConsistency(base_);
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Update compaction pointers
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
const int level = edit->compact_pointers_[i].first;
|
|
|
|
vset_->compact_pointer_[level] =
|
|
|
|
edit->compact_pointers_[i].second.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete files
|
|
|
|
const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
|
|
|
|
for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
|
|
|
|
iter != del.end();
|
|
|
|
++iter) {
|
|
|
|
const int level = iter->first;
|
|
|
|
const uint64_t number = iter->second;
|
2011-05-21 04:17:43 +02:00
|
|
|
levels_[level].deleted_files.insert(number);
|
2012-11-13 19:30:00 +01:00
|
|
|
CheckConsistencyForDeletes(edit, number, level);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Add new files
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < edit->new_files_.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
const int level = edit->new_files_[i].first;
|
|
|
|
FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
|
|
|
|
f->refs = 1;
|
2011-06-22 04:36:45 +02:00
|
|
|
|
|
|
|
// We arrange to automatically compact this file after
|
|
|
|
// a certain number of seeks. Let's assume:
|
|
|
|
// (1) One seek costs 10ms
|
|
|
|
// (2) Writing or reading 1MB costs 10ms (100MB/s)
|
|
|
|
// (3) A compaction of 1MB does 25MB of IO:
|
|
|
|
// 1MB read from this level
|
|
|
|
// 10-12MB read from next level (boundaries may be misaligned)
|
|
|
|
// 10-12MB written to next level
|
|
|
|
// This implies that 25 seeks cost the same as the compaction
|
|
|
|
// of 1MB of data. I.e., one seek costs approximately the
|
|
|
|
// same as the compaction of 40KB of data. We are a little
|
|
|
|
// conservative and allow approximately one seek for every 16KB
|
|
|
|
// of data before triggering a compaction.
|
|
|
|
f->allowed_seeks = (f->file_size / 16384);
|
|
|
|
if (f->allowed_seeks < 100) f->allowed_seeks = 100;
|
|
|
|
|
2011-05-21 04:17:43 +02:00
|
|
|
levels_[level].deleted_files.erase(f->number);
|
|
|
|
levels_[level].added_files->insert(f);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Save the current state in *v.
|
|
|
|
void SaveTo(Version* v) {
|
2012-10-19 23:00:53 +02:00
|
|
|
CheckConsistency(base_);
|
|
|
|
CheckConsistency(v);
|
2011-05-21 04:17:43 +02:00
|
|
|
BySmallestKey cmp;
|
|
|
|
cmp.internal_comparator = &vset_->icmp_;
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < vset_->NumberLevels(); level++) {
|
2011-05-21 04:17:43 +02:00
|
|
|
// Merge the set of added files with the set of pre-existing files.
|
|
|
|
// Drop any deleted files. Store the result in *v.
|
|
|
|
const std::vector<FileMetaData*>& base_files = base_->files_[level];
|
|
|
|
std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
|
|
|
|
std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
|
|
|
|
const FileSet* added = levels_[level].added_files;
|
|
|
|
v->files_[level].reserve(base_files.size() + added->size());
|
|
|
|
for (FileSet::const_iterator added_iter = added->begin();
|
|
|
|
added_iter != added->end();
|
|
|
|
++added_iter) {
|
|
|
|
// Add all smaller files listed in base_
|
|
|
|
for (std::vector<FileMetaData*>::const_iterator bpos
|
|
|
|
= std::upper_bound(base_iter, base_end, *added_iter, cmp);
|
|
|
|
base_iter != bpos;
|
|
|
|
++base_iter) {
|
|
|
|
MaybeAddFile(v, level, *base_iter);
|
|
|
|
}
|
|
|
|
|
|
|
|
MaybeAddFile(v, level, *added_iter);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add remaining base files
|
|
|
|
for (; base_iter != base_end; ++base_iter) {
|
|
|
|
MaybeAddFile(v, level, *base_iter);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2011-05-21 04:17:43 +02:00
|
|
|
}
|
2013-12-09 23:28:26 +01:00
|
|
|
|
2012-10-26 03:21:54 +02:00
|
|
|
CheckConsistency(v);
|
2011-05-21 04:17:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
void MaybeAddFile(Version* v, int level, FileMetaData* f) {
|
|
|
|
if (levels_[level].deleted_files.count(f->number) > 0) {
|
|
|
|
// File is deleted: do nothing
|
|
|
|
} else {
|
2011-06-22 04:36:45 +02:00
|
|
|
std::vector<FileMetaData*>* files = &v->files_[level];
|
|
|
|
if (level > 0 && !files->empty()) {
|
|
|
|
// Must not overlap
|
|
|
|
assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
|
|
|
|
f->smallest) < 0);
|
|
|
|
}
|
2011-05-21 04:17:43 +02:00
|
|
|
f->refs++;
|
2011-06-22 04:36:45 +02:00
|
|
|
files->push_back(f);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Sets up an empty VersionSet for the database "dbname": allocates the
// per-level arrays and installs an initial Version as the current one.
// The comparator *cmp is copied; options and table_cache are kept as
// pointers.
VersionSet::VersionSet(const std::string& dbname,
                       const Options* options,
                       const EnvOptions& storage_options,
                       TableCache* table_cache,
                       const InternalKeyComparator* cmp)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      table_cache_(table_cache),
      icmp_(*cmp),
      next_file_number_(2),
      manifest_file_number_(0),  // Filled by Recover()
      last_sequence_(0),
      log_number_(0),
      prev_log_number_(0),
      num_levels_(options_->num_levels),
      dummy_versions_(this),
      current_(nullptr),
      compactions_in_progress_(options_->num_levels),
      current_version_number_(0),
      last_observed_manifest_size_(0),
      storage_options_(storage_options),
      storage_options_compactions_(storage_options_) {
  const int level_count = options_->num_levels;

  // One compaction pointer per level, plus the size tables filled by Init().
  compact_pointer_ = new std::string[level_count];
  Init(level_count);

  // Start out with a fresh Version as the current one.
  const auto first_version_number = current_version_number_++;
  AppendVersion(new Version(this, first_version_number));
}
|
|
|
|
|
|
|
|
// Releases the current version, deletes every entry of obsolete_files_
// and frees the per-level arrays.
VersionSet::~VersionSet() {
  current_->Unref();
  // After dropping the current version the version list must be empty.
  assert(dummy_versions_.next_ == &dummy_versions_);

  for (auto it = obsolete_files_.begin(); it != obsolete_files_.end(); ++it) {
    delete *it;
  }
  obsolete_files_.clear();

  delete[] compact_pointer_;
  delete[] max_file_size_;
  delete[] level_max_bytes_;
}
|
|
|
|
|
2012-10-31 19:47:18 +01:00
|
|
|
void VersionSet::Init(int num_levels) {
|
|
|
|
max_file_size_ = new uint64_t[num_levels];
|
|
|
|
level_max_bytes_ = new uint64_t[num_levels];
|
|
|
|
int target_file_size_multiplier = options_->target_file_size_multiplier;
|
|
|
|
int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
|
|
|
|
for (int i = 0; i < num_levels; i++) {
|
2013-07-24 23:20:54 +02:00
|
|
|
if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
|
|
|
|
max_file_size_[i] = ULLONG_MAX;
|
2013-06-14 07:09:08 +02:00
|
|
|
level_max_bytes_[i] = options_->max_bytes_for_level_base;
|
|
|
|
} else if (i > 1) {
|
2012-10-31 19:47:18 +01:00
|
|
|
max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier;
|
2013-05-21 20:37:06 +02:00
|
|
|
level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier *
|
|
|
|
options_->max_bytes_for_level_multiplier_additional[i-1];
|
2012-10-31 19:47:18 +01:00
|
|
|
} else {
|
|
|
|
max_file_size_[i] = options_->target_file_size_base;
|
|
|
|
level_max_bytes_[i] = options_->max_bytes_for_level_base;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-05-21 04:17:43 +02:00
|
|
|
// Installs "v" as the current version and links it at the tail of the
// circular version list anchored at dummy_versions_. "v" must be a fresh,
// unreferenced version; the previously current version (if any) loses the
// reference held on its behalf.
void VersionSet::AppendVersion(Version* v) {
  assert(v->refs_ == 0);
  assert(v != current_);

  // Drop the reference held for the outgoing current version.
  if (current_) {
    assert(current_->refs_ > 0);
    current_->Unref();
  }

  // Make "v" current.
  current_ = v;
  v->Ref();

  // Splice "v" in between the present tail and the dummy head.
  Version* const tail = dummy_versions_.prev_;
  v->prev_ = tail;
  v->next_ = &dummy_versions_;
  tail->next_ = v;
  dummy_versions_.prev_ = v;
}
|
|
|
|
|
2012-10-31 19:47:18 +01:00
|
|
|
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
|
|
|
|
bool new_descriptor_log) {
|
2012-10-19 23:00:53 +02:00
|
|
|
mu->AssertHeld();
|
2011-04-12 21:38:58 +02:00
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// queue our request
|
|
|
|
ManifestWriter w(mu, edit);
|
|
|
|
manifest_writers_.push_back(&w);
|
|
|
|
while (!w.done && &w != manifest_writers_.front()) {
|
|
|
|
w.cv.Wait();
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
if (w.done) {
|
|
|
|
return w.status;
|
|
|
|
}
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
std::vector<VersionEdit*> batch_edits;
|
|
|
|
Version* v = new Version(this, current_version_number_++);
|
|
|
|
Builder builder(this, current_);
|
2011-04-12 21:38:58 +02:00
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// process all requests in the queue
|
|
|
|
ManifestWriter* last_writer = &w;
|
|
|
|
assert(!manifest_writers_.empty());
|
2012-11-08 00:11:37 +01:00
|
|
|
assert(manifest_writers_.front() == &w);
|
2012-10-19 23:00:53 +02:00
|
|
|
std::deque<ManifestWriter*>::iterator iter = manifest_writers_.begin();
|
|
|
|
for (; iter != manifest_writers_.end(); ++iter) {
|
|
|
|
last_writer = *iter;
|
2012-11-29 01:42:36 +01:00
|
|
|
LogAndApplyHelper(&builder, v, last_writer->edit, mu);
|
2012-10-19 23:00:53 +02:00
|
|
|
batch_edits.push_back(last_writer->edit);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
builder.SaveTo(v);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Initialize new descriptor log file if necessary by creating
|
|
|
|
// a temporary file that contains a snapshot of the current version.
|
2011-05-21 04:17:43 +02:00
|
|
|
std::string new_manifest_file;
|
2012-09-24 23:01:01 +02:00
|
|
|
uint64_t new_manifest_file_size = 0;
|
2011-05-21 04:17:43 +02:00
|
|
|
Status s;
|
2013-11-09 00:23:46 +01:00
|
|
|
// we will need this if we are creating new manifest
|
|
|
|
uint64_t old_manifest_file_number = manifest_file_number_;
|
2013-01-11 02:18:50 +01:00
|
|
|
|
|
|
|
// No need to perform this check if a new Manifest is being created anyways.
|
2013-02-19 05:08:12 +01:00
|
|
|
if (!descriptor_log_ ||
|
|
|
|
last_observed_manifest_size_ > options_->max_manifest_file_size) {
|
2013-01-11 02:18:50 +01:00
|
|
|
new_descriptor_log = true;
|
|
|
|
manifest_file_number_ = NewFileNumber(); // Change manifest file no.
|
|
|
|
}
|
|
|
|
|
2013-11-09 00:23:46 +01:00
|
|
|
if (new_descriptor_log) {
|
2011-05-21 04:17:43 +02:00
|
|
|
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
|
|
|
|
edit->SetNextFile(next_file_number_);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// Unlock during expensive MANIFEST log write. New writes cannot get here
|
|
|
|
// because &w is ensuring that all new writes get queued.
|
2011-09-01 21:08:02 +02:00
|
|
|
{
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
// calculate the amount of data being compacted at every level
|
|
|
|
std::vector<uint64_t> size_being_compacted(NumberLevels()-1);
|
|
|
|
SizeBeingCompacted(size_being_compacted);
|
|
|
|
|
2011-09-01 21:08:02 +02:00
|
|
|
mu->Unlock();
|
2012-11-01 06:01:57 +01:00
|
|
|
|
2013-11-01 20:32:27 +01:00
|
|
|
// This is fine because everything inside of this block is serialized --
|
|
|
|
// only one thread can be here at the same time
|
|
|
|
if (!new_manifest_file.empty()) {
|
|
|
|
unique_ptr<WritableFile> descriptor_file;
|
|
|
|
s = env_->NewWritableFile(new_manifest_file, &descriptor_file,
|
|
|
|
storage_options_);
|
|
|
|
if (s.ok()) {
|
|
|
|
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
|
|
|
|
s = WriteSnapshot(descriptor_log_.get());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
// The calls to Finalize and UpdateFilesBySize are cpu-heavy
|
2012-11-01 06:01:57 +01:00
|
|
|
// and is best called outside the mutex.
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
Finalize(v, size_being_compacted);
|
2012-11-01 06:01:57 +01:00
|
|
|
UpdateFilesBySize(v);
|
2011-09-01 21:08:02 +02:00
|
|
|
|
|
|
|
// Write new record to MANIFEST log
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2011-09-01 21:08:02 +02:00
|
|
|
std::string record;
|
2012-10-19 23:00:53 +02:00
|
|
|
for (unsigned int i = 0; i < batch_edits.size(); i++) {
|
|
|
|
batch_edits[i]->EncodeTo(&record);
|
|
|
|
s = descriptor_log_->AddRecord(record);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2011-09-01 21:08:02 +02:00
|
|
|
if (s.ok()) {
|
2012-08-27 21:10:26 +02:00
|
|
|
if (options_->use_fsync) {
|
2013-11-22 23:14:05 +01:00
|
|
|
StopWatch sw(env_, options_->statistics.get(),
|
|
|
|
MANIFEST_FILE_SYNC_MICROS);
|
2013-01-20 11:07:13 +01:00
|
|
|
s = descriptor_log_->file()->Fsync();
|
2012-08-27 21:10:26 +02:00
|
|
|
} else {
|
2013-11-22 23:14:05 +01:00
|
|
|
StopWatch sw(env_, options_->statistics.get(),
|
|
|
|
MANIFEST_FILE_SYNC_MICROS);
|
2013-01-20 11:07:13 +01:00
|
|
|
s = descriptor_log_->file()->Sync();
|
2012-08-27 21:10:26 +02:00
|
|
|
}
|
2011-09-01 21:08:02 +02:00
|
|
|
}
|
2013-01-08 21:00:13 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
|
|
|
|
if (ManifestContains(record)) {
|
|
|
|
Log(options_->info_log,
|
|
|
|
"MANIFEST contains log record despite error; advancing to new "
|
2013-03-06 22:28:54 +01:00
|
|
|
"version to prevent mismatch between in-memory and logged state"
|
|
|
|
" If paranoid is set, then the db is now in readonly mode.");
|
2013-01-08 21:00:13 +01:00
|
|
|
s = Status::OK();
|
|
|
|
}
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2011-09-01 21:08:02 +02:00
|
|
|
// If we just created a new descriptor file, install it by writing a
|
|
|
|
// new CURRENT file that points to it.
|
|
|
|
if (s.ok() && !new_manifest_file.empty()) {
|
|
|
|
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
|
2013-11-09 00:23:46 +01:00
|
|
|
if (s.ok() && old_manifest_file_number < manifest_file_number_) {
|
|
|
|
// delete old manifest file
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Deleting manifest %lu current manifest %lu\n",
|
|
|
|
(unsigned long)old_manifest_file_number,
|
|
|
|
(unsigned long)manifest_file_number_);
|
2013-11-09 00:23:46 +01:00
|
|
|
// we don't care about an error here, PurgeObsoleteFiles will take care
|
|
|
|
// of it later
|
|
|
|
env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number));
|
|
|
|
}
|
2011-09-01 21:08:02 +02:00
|
|
|
}
|
|
|
|
|
2012-09-24 23:01:01 +02:00
|
|
|
// find offset in manifest file where this version is stored.
|
2013-01-20 11:07:13 +01:00
|
|
|
new_manifest_file_size = descriptor_log_->file()->GetFileSize();
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2013-11-07 20:31:56 +01:00
|
|
|
LogFlush(options_->info_log);
|
2011-09-01 21:08:02 +02:00
|
|
|
mu->Lock();
|
2013-01-11 02:18:50 +01:00
|
|
|
// cache the manifest_file_size so that it can be used to rollover in the
|
|
|
|
// next call to LogAndApply
|
|
|
|
last_observed_manifest_size_ = new_manifest_file_size;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Install the new version
|
|
|
|
if (s.ok()) {
|
2012-09-24 23:01:01 +02:00
|
|
|
v->offset_manifest_file_ = new_manifest_file_size;
|
2011-05-21 04:17:43 +02:00
|
|
|
AppendVersion(v);
|
2011-04-12 21:38:58 +02:00
|
|
|
log_number_ = edit->log_number_;
|
|
|
|
prev_log_number_ = edit->prev_log_number_;
|
2012-10-19 23:00:53 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
2013-11-13 06:02:03 +01:00
|
|
|
Log(options_->info_log, "Error in committing version %lu",
|
|
|
|
(unsigned long)v->GetVersionNumber());
|
2011-03-18 23:37:00 +01:00
|
|
|
delete v;
|
|
|
|
if (!new_manifest_file.empty()) {
|
2013-01-20 11:07:13 +01:00
|
|
|
descriptor_log_.reset();
|
2011-03-18 23:37:00 +01:00
|
|
|
env_->DeleteFile(new_manifest_file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// wake up all the waiting writers
|
|
|
|
while (true) {
|
|
|
|
ManifestWriter* ready = manifest_writers_.front();
|
|
|
|
manifest_writers_.pop_front();
|
|
|
|
if (ready != &w) {
|
|
|
|
ready->status = s;
|
|
|
|
ready->done = true;
|
|
|
|
ready->cv.Signal();
|
|
|
|
}
|
|
|
|
if (ready == last_writer) break;
|
|
|
|
}
|
|
|
|
// Notify new head of write queue
|
|
|
|
if (!manifest_writers_.empty()) {
|
|
|
|
manifest_writers_.front()->cv.Signal();
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// Completes *edit with the VersionSet's bookkeeping numbers and feeds it to
// *builder. REQUIRES: *mu is held. The "v" argument is not used here.
void VersionSet::LogAndApplyHelper(Builder* builder, Version* v,
                                   VersionEdit* edit, port::Mutex* mu) {
  mu->AssertHeld();

  // Log number: inherit ours when the edit carries none; otherwise the
  // edit's number must lie in [log_number_, next_file_number_).
  if (!edit->has_log_number_) {
    edit->SetLogNumber(log_number_);
  } else {
    assert(edit->log_number_ >= log_number_);
    assert(edit->log_number_ < next_file_number_);
  }

  // Previous log number: inherit ours when the edit carries none.
  if (!edit->has_prev_log_number_) {
    edit->SetPrevLogNumber(prev_log_number_);
  }

  // These two are always stamped with the current values.
  edit->SetNextFile(next_file_number_);
  edit->SetLastSequence(last_sequence_);

  builder->Apply(edit);
}
|
|
|
|
|
2011-04-12 21:38:58 +02:00
|
|
|
// Rebuilds this VersionSet's state by replaying the manifest that the
// CURRENT file points at.
//
// Every manifest record is decoded into a VersionEdit and applied to a
// Builder seeded with current_. Column-family additions and drops are
// mirrored into column_families_ / column_family_data_, and the latest log
// number, previous log number, next file number and last sequence carried by
// the edits are remembered. On success a new Version is built from the
// builder, finalized, appended, and the recovered counters are stored in the
// corresponding members.
//
// Returns the first error encountered (unreadable or malformed CURRENT,
// unreadable manifest, undecodable record, comparator mismatch, or a missing
// required metadata entry); OK otherwise.
Status VersionSet::Recover() {
  // Forwards corruption reports from the log reader into *status, keeping
  // only the first error.
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) {
      if (this->status->ok()) *this->status = s;
    }
  };

  // Read "CURRENT" file, which contains a pointer to the current manifest file
  std::string current;
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
  if (!s.ok()) {
    return s;
  }
  if (current.empty() || current[current.size()-1] != '\n') {
    return Status::Corruption("CURRENT file does not end with newline");
  }
  // Strip the trailing newline so that only the manifest file name remains.
  current.resize(current.size() - 1);

  Log(options_->info_log, "Recovering from manifest file:%s\n",
      current.c_str());

  std::string dscname = dbname_ + "/" + current;
  unique_ptr<SequentialFile> file;
  s = env_->NewSequentialFile(dscname, &file, storage_options_);
  if (!s.ok()) {
    return s;
  }
  uint64_t manifest_file_size;
  s = env_->GetFileSize(dscname, &manifest_file_size);
  if (!s.ok()) {
    return s;
  }

  bool have_log_number = false;
  bool have_prev_log_number = false;
  bool have_next_file = false;
  bool have_last_sequence = false;
  uint64_t next_file = 0;
  uint64_t last_sequence = 0;
  uint64_t log_number = 0;
  uint64_t prev_log_number = 0;
  Builder builder(this, current_);

  // add default column family
  column_families_.insert({default_column_family_name, 0});
  column_family_data_.insert(
      {0, ColumnFamilyData(default_column_family_name)});

  {
    LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(std::move(file), &reporter, true/*checksum*/,
                       0/*initial_offset*/);
    Slice record;
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit(NumberLevels());
      s = edit.DecodeFrom(record);
      if (s.ok()) {
        if (edit.has_comparator_ &&
            edit.comparator_ != icmp_.user_comparator()->Name()) {
          s = Status::InvalidArgument(icmp_.user_comparator()->Name(),
                                      "does not match existing comparator " +
                                      edit.comparator_);
        }
      }

      if (!s.ok()) {
        // A record that failed to decode or was rejected must not touch the
        // column-family maps or the recovered counters; stop replaying and
        // report the error.
        break;
      }

      builder.Apply(&edit);

      if (edit.is_column_family_add_) {
        assert(column_families_.find(edit.column_family_name_) ==
               column_families_.end());
        column_families_.insert(
            {edit.column_family_name_, edit.column_family_});
        column_family_data_.insert(
            {edit.column_family_, ColumnFamilyData(edit.column_family_name_)});
        max_column_family_ = std::max(max_column_family_, edit.column_family_);
      }

      if (edit.is_column_family_drop_) {
        auto cf = column_family_data_.find(edit.column_family_);
        assert(cf != column_family_data_.end());
        // Erase the name mapping first: it is looked up through the entry
        // that is removed on the next line.
        column_families_.erase(cf->second.name);
        column_family_data_.erase(cf);
      }

      if (edit.has_log_number_) {
        log_number = edit.log_number_;
        have_log_number = true;
      }

      if (edit.has_prev_log_number_) {
        prev_log_number = edit.prev_log_number_;
        have_prev_log_number = true;
      }

      if (edit.has_next_file_number_) {
        next_file = edit.next_file_number_;
        have_next_file = true;
      }

      if (edit.has_last_sequence_) {
        last_sequence = edit.last_sequence_;
        have_last_sequence = true;
      }
    }
  }
  file.reset();

  if (s.ok()) {
    // The manifest must have provided these three values at least once.
    if (!have_next_file) {
      s = Status::Corruption("no meta-nextfile entry in descriptor");
    } else if (!have_log_number) {
      s = Status::Corruption("no meta-lognumber entry in descriptor");
    } else if (!have_last_sequence) {
      s = Status::Corruption("no last-sequence-number entry in descriptor");
    }

    if (!have_prev_log_number) {
      prev_log_number = 0;
    }

    MarkFileNumberUsed(prev_log_number);
    MarkFileNumberUsed(log_number);
  }

  if (s.ok()) {
    Version* v = new Version(this, current_version_number_++);
    builder.SaveTo(v);

    // Install recovered version
    std::vector<uint64_t> size_being_compacted(NumberLevels()-1);
    SizeBeingCompacted(size_being_compacted);
    Finalize(v, size_being_compacted);

    v->offset_manifest_file_ = manifest_file_size;
    AppendVersion(v);
    manifest_file_number_ = next_file;
    next_file_number_ = next_file + 1;
    last_sequence_ = last_sequence;
    log_number_ = log_number;
    prev_log_number_ = prev_log_number;

    Log(options_->info_log, "Recovered from manifest file:%s succeeded,"
        "manifest_file_number is %lu, next_file_number is %lu, "
        "last_sequence is %lu, log_number is %lu,"
        "prev_log_number is %lu\n",
        current.c_str(),
        (unsigned long)manifest_file_number_,
        (unsigned long)next_file_number_,
        (unsigned long)last_sequence_,
        (unsigned long)log_number_,
        (unsigned long)prev_log_number_);
  }

  return s;
}
|
|
|
|
|
2012-11-19 20:54:13 +01:00
|
|
|
// Replays the manifest file `dscname` and prints what it contains to stdout.
//
// options: supplies the Env used to open the manifest.
// dscname: path of the manifest file to read.
// verbose: when true, every decoded edit is printed as it is read.
// hex:     forwarded to DebugString() of the edits and of the final version.
//
// Besides printing, a successful run builds a Version from the replayed
// edits, appends it, and overwrites the file-number / sequence / log-number
// members with the values found in the manifest. Returns the first error
// encountered, or OK.
Status VersionSet::DumpManifest(Options& options, std::string& dscname,
                                bool verbose, bool hex) {
  // Records the first corruption reported by the log reader.
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) {
      if (!this->status->ok()) {
        return;
      }
      *this->status = s;
    }
  };

  // Open the specified manifest file.
  unique_ptr<SequentialFile> manifest;
  Status s =
      options.env->NewSequentialFile(dscname, &manifest, storage_options_);
  if (!s.ok()) {
    return s;
  }

  // Latest values seen in the manifest, and whether each one was seen at all.
  uint64_t log_number = 0;
  uint64_t prev_log_number = 0;
  uint64_t next_file = 0;
  uint64_t last_sequence = 0;
  bool saw_log_number = false;
  bool saw_prev_log_number = false;
  bool saw_next_file = false;
  bool saw_last_sequence = false;

  int edit_index = 0;
  VersionSet::Builder builder(this, current_);

  {
    LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(std::move(manifest), &reporter, true/*checksum*/,
                       0/*initial_offset*/);
    Slice record;
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit(NumberLevels());
      s = edit.DecodeFrom(record);
      if (s.ok() && edit.has_comparator_ &&
          edit.comparator_ != icmp_.user_comparator()->Name()) {
        s = Status::InvalidArgument(icmp_.user_comparator()->Name(),
                                    "does not match existing comparator " +
                                    edit.comparator_);
      }

      // Write out each individual edit
      if (verbose) {
        printf("*************************Edit[%d] = %s\n",
               edit_index, edit.DebugString(hex).c_str());
      }
      ++edit_index;

      if (s.ok()) {
        builder.Apply(&edit);
      }

      if (edit.has_log_number_) {
        saw_log_number = true;
        log_number = edit.log_number_;
      }
      if (edit.has_prev_log_number_) {
        saw_prev_log_number = true;
        prev_log_number = edit.prev_log_number_;
      }
      if (edit.has_next_file_number_) {
        saw_next_file = true;
        next_file = edit.next_file_number_;
      }
      if (edit.has_last_sequence_) {
        saw_last_sequence = true;
        last_sequence = edit.last_sequence_;
      }
    }
  }
  manifest.reset();

  if (s.ok()) {
    // Report the first required entry that the manifest never supplied.
    if (!saw_next_file) {
      s = Status::Corruption("no meta-nextfile entry in descriptor");
      printf("no meta-nextfile entry in descriptor");
    } else if (!saw_log_number) {
      s = Status::Corruption("no meta-lognumber entry in descriptor");
      printf("no meta-lognumber entry in descriptor");
    } else if (!saw_last_sequence) {
      printf("no last-sequence-number entry in descriptor");
      s = Status::Corruption("no last-sequence-number entry in descriptor");
    }

    if (!saw_prev_log_number) {
      prev_log_number = 0;
    }

    MarkFileNumberUsed(prev_log_number);
    MarkFileNumberUsed(log_number);
  }

  if (!s.ok()) {
    return s;
  }

  Version* v = new Version(this, 0);
  builder.SaveTo(v);

  // Install recovered version
  std::vector<uint64_t> size_being_compacted(NumberLevels()-1);
  SizeBeingCompacted(size_being_compacted);
  Finalize(v, size_being_compacted);

  AppendVersion(v);
  manifest_file_number_ = next_file;
  next_file_number_ = next_file + 1;
  last_sequence_ = last_sequence;
  log_number_ = log_number;
  prev_log_number_ = prev_log_number;

  printf("manifest_file_number %lu next_file_number %lu last_sequence "
         "%lu log_number %lu  prev_log_number %lu\n",
         static_cast<unsigned long>(manifest_file_number_),
         static_cast<unsigned long>(next_file_number_),
         static_cast<unsigned long>(last_sequence),
         static_cast<unsigned long>(log_number),
         static_cast<unsigned long>(prev_log_number));
  printf("%s \n", v->DebugString(hex).c_str());

  return s;
}
|
|
|
|
|
2011-09-01 21:08:02 +02:00
|
|
|
void VersionSet::MarkFileNumberUsed(uint64_t number) {
|
|
|
|
if (next_file_number_ <= number) {
|
|
|
|
next_file_number_ = number + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
// Post-processes a newly built version: orders its level-0 files and fills in
// the per-level compaction scores.
//
// v: version to update in place.
// size_being_compacted: indexed by level; for every scored level >= 1 the
//   entry is subtracted from that level's total file size before scoring.
//
// After the call, compaction_score_ / compaction_level_ hold (score, level)
// pairs ordered by descending score, and max_compaction_score_ /
// max_compaction_score_level_ describe the best score among levels >= 1.
void VersionSet::Finalize(Version* v,
                          std::vector<uint64_t>& size_being_compacted) {
  const bool universal =
      (options_->compaction_style == kCompactionStyleUniversal);

  // Pre-sort level0 for Get()
  if (universal) {
    std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo);
  } else {
    std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst);
  }

  double best_score = 0;
  int best_score_level = 0;

  // Universal compaction only scores level 0; otherwise every level except
  // the last one is scored.
  const int levels_to_score = universal ? 1 : NumberLevels() - 1;

  for (int level = 0; level < levels_to_score; ++level) {
    double score;
    if (level > 0) {
      // Compute the ratio of current size to size limit.
      const uint64_t live_bytes =
          TotalFileSize(v->files_[level]) - size_being_compacted[level];
      score = static_cast<double>(live_bytes) / MaxBytesForLevel(level);
      if (best_score < score) {
        best_score = score;
        best_score_level = level;
      }
    } else {
      // We treat level-0 specially by bounding the number of files
      // instead of number of bytes for two reasons:
      //
      // (1) With larger write-buffer sizes, it is nice not to do too
      // many level-0 compactions.
      //
      // (2) The files in level-0 are merged on every read and
      // therefore we wish to avoid too many files when the individual
      // file size is small (perhaps because of a small write-buffer
      // setting, or very high compression ratios, or lots of
      // overwrites/deletions).
      int idle_files = 0;
      for (const auto& f : v->files_[level]) {
        if (!f->being_compacted) {
          ++idle_files;
        }
      }

      // If we are slowing down writes, then we better compact that first
      if (idle_files >= options_->level0_stop_writes_trigger) {
        score = 1000000;
      } else if (idle_files >= options_->level0_slowdown_writes_trigger) {
        score = 10000;
      } else {
        score = idle_files /
            static_cast<double>(options_->level0_file_num_compaction_trigger);
      }
    }
    v->compaction_level_[level] = level;
    v->compaction_score_[level] = score;
  }

  // update the max compaction score in levels 1 to n-1
  v->max_compaction_score_ = best_score;
  v->max_compaction_score_level_ = best_score_level;

  // sort all the levels based on their score. Higher scores get listed
  // first. Use bubble sort because the number of entries are small.
  const int num_entries = NumberLevels() - 1;
  for (int i = 0; i < num_entries - 1; ++i) {
    for (int j = i + 1; j < num_entries; ++j) {
      if (v->compaction_score_[i] < v->compaction_score_[j]) {
        // Keep each score paired with the level it belongs to.
        std::swap(v->compaction_score_[i], v->compaction_score_[j]);
        std::swap(v->compaction_level_[i], v->compaction_level_[j]);
      }
    }
  }
}
|
|
|
|
|
2013-06-14 07:09:08 +02:00
|
|
|
// A static compator used to sort files based on their size
|
|
|
|
// In normal mode: descending size
|
|
|
|
static bool compareSizeDescending(const VersionSet::Fsize& first,
|
2012-11-01 06:01:57 +01:00
|
|
|
const VersionSet::Fsize& second) {
|
|
|
|
return (first.file->file_size > second.file->file_size);
|
|
|
|
}
|
2013-06-14 07:09:08 +02:00
|
|
|
// A static compator used to sort files based on their seqno
|
2013-07-04 00:32:49 +02:00
|
|
|
// In universal style : descending seqno
|
2013-06-14 07:09:08 +02:00
|
|
|
static bool compareSeqnoDescending(const VersionSet::Fsize& first,
|
|
|
|
const VersionSet::Fsize& second) {
|
|
|
|
if (first.file->smallest_seqno > second.file->smallest_seqno) {
|
|
|
|
assert(first.file->largest_seqno > second.file->largest_seqno);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
assert(first.file->largest_seqno <= second.file->largest_seqno);
|
|
|
|
return false;
|
|
|
|
}
|
2012-11-01 06:01:57 +01:00
|
|
|
|
|
|
|
// sort all files in level1 to level(n-1) based on file size
|
|
|
|
void VersionSet::UpdateFilesBySize(Version* v) {
|
|
|
|
|
|
|
|
// No need to sort the highest level because it is never compacted.
|
2013-07-04 00:32:49 +02:00
|
|
|
int max_level = (options_->compaction_style == kCompactionStyleUniversal) ?
|
|
|
|
NumberLevels() : NumberLevels() - 1;
|
2013-06-14 07:09:08 +02:00
|
|
|
|
|
|
|
for (int level = 0; level < max_level; level++) {
|
2012-11-01 06:01:57 +01:00
|
|
|
|
|
|
|
const std::vector<FileMetaData*>& files = v->files_[level];
|
|
|
|
std::vector<int>& files_by_size = v->files_by_size_[level];
|
|
|
|
assert(files_by_size.size() == 0);
|
|
|
|
|
|
|
|
// populate a temp vector for sorting based on size
|
|
|
|
std::vector<Fsize> temp(files.size());
|
|
|
|
for (unsigned int i = 0; i < files.size(); i++) {
|
|
|
|
temp[i].index = i;
|
|
|
|
temp[i].file = files[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
// sort the top number_of_files_to_sort_ based on file size
|
2013-07-04 00:32:49 +02:00
|
|
|
if (options_->compaction_style == kCompactionStyleUniversal) {
|
2013-06-14 07:09:08 +02:00
|
|
|
int num = temp.size();
|
|
|
|
std::partial_sort(temp.begin(), temp.begin() + num,
|
|
|
|
temp.end(), compareSeqnoDescending);
|
|
|
|
} else {
|
|
|
|
int num = Version::number_of_files_to_sort_;
|
|
|
|
if (num > (int)temp.size()) {
|
|
|
|
num = temp.size();
|
|
|
|
}
|
|
|
|
std::partial_sort(temp.begin(), temp.begin() + num,
|
|
|
|
temp.end(), compareSizeDescending);
|
2012-11-01 06:01:57 +01:00
|
|
|
}
|
|
|
|
assert(temp.size() == files.size());
|
|
|
|
|
|
|
|
// initialize files_by_size_
|
|
|
|
for (unsigned int i = 0; i < temp.size(); i++) {
|
|
|
|
files_by_size.push_back(temp[i].index);
|
|
|
|
}
|
|
|
|
v->next_file_to_compact_by_size_[level] = 0;
|
|
|
|
assert(v->files_[level].size() == v->files_by_size_[level].size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Writes the current state to `log` as a sequence of records:
//   - one record per column family whose id is non-zero, carrying its name
//     and id;
//   - one final record carrying the comparator name, every non-empty compact
//     pointer, and every file of every level in current_.
//
// Returns the first error reported by log->AddRecord(), or the status of the
// final AddRecord() call.
Status VersionSet::WriteSnapshot(log::Writer* log) {
  // TODO: Break up into multiple records to reduce memory usage on recovery?

  // Save column families
  // Iterate by const reference: copying each entry would duplicate the
  // column family name string on every iteration.
  for (const auto& cf : column_families_) {
    if (cf.second == 0) {
      // default column family is always there,
      // no need to explicitly write it
      continue;
    }
    VersionEdit edit(0);
    edit.AddColumnFamily(cf.first);
    edit.SetColumnFamily(cf.second);
    std::string record;
    edit.EncodeTo(&record);
    Status s = log->AddRecord(record);
    if (!s.ok()) {
      return s;
    }
  }

  // Save metadata
  VersionEdit edit(NumberLevels());
  edit.SetComparatorName(icmp_.user_comparator()->Name());

  // Save compaction pointers
  for (int level = 0; level < NumberLevels(); level++) {
    if (!compact_pointer_[level].empty()) {
      InternalKey key;
      key.DecodeFrom(compact_pointer_[level]);
      edit.SetCompactPointer(level, key);
    }
  }

  // Save files
  for (int level = 0; level < NumberLevels(); level++) {
    const std::vector<FileMetaData*>& files = current_->files_[level];
    for (size_t i = 0; i < files.size(); i++) {
      const FileMetaData* f = files[i];
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest,
                   f->smallest_seqno, f->largest_seqno);
    }
  }

  std::string record;
  edit.EncodeTo(&record);
  return log->AddRecord(record);
}
|
|
|
|
|
|
|
|
int VersionSet::NumLevelFiles(int level) const {
|
|
|
|
assert(level >= 0);
|
2012-06-23 04:30:03 +02:00
|
|
|
assert(level < NumberLevels());
|
2011-03-18 23:37:00 +01:00
|
|
|
return current_->files_[level].size();
|
|
|
|
}
|
|
|
|
|
2011-05-21 04:17:43 +02:00
|
|
|
// Formats "files[<n0> <n1> ... ]" into scratch->buffer, where <ni> is the
// number of files of current_ at level i, and returns scratch->buffer.
// Levels that no longer fit into the buffer are dropped.
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
  char* const out = scratch->buffer;
  int used = snprintf(out, sizeof(scratch->buffer), "files[");
  for (int level = 0; level < NumberLevels(); ++level) {
    const int remaining = sizeof(scratch->buffer) - used;
    const int written =
        snprintf(out + used, remaining, "%d ",
                 static_cast<int>(current_->files_[level].size()));
    if (written < 0 || written >= remaining) {
      // Formatting failed or the entry was truncated: stop appending.
      break;
    }
    used += written;
  }
  // The closing bracket is written at the end of the last complete entry.
  snprintf(out + used, sizeof(scratch->buffer) - used, "]");
  return out;
}
|
|
|
|
|
2012-08-15 00:20:36 +02:00
|
|
|
// Formats "files_size[<b0> <b1> ... ]" into scratch->buffer, where <bi> is
// NumLevelBytes(i), and returns scratch->buffer. Levels that no longer fit
// into the buffer are dropped.
const char* VersionSet::LevelDataSizeSummary(
    LevelSummaryStorage* scratch) const {
  char* const out = scratch->buffer;
  int used = snprintf(out, sizeof(scratch->buffer), "files_size[");
  for (int level = 0; level < NumberLevels(); ++level) {
    const int remaining = sizeof(scratch->buffer) - used;
    const int written =
        snprintf(out + used, remaining, "%lu ",
                 static_cast<unsigned long>(NumLevelBytes(level)));
    if (written < 0 || written >= remaining) {
      // Formatting failed or the entry was truncated: stop appending.
      break;
    }
    used += written;
  }
  snprintf(out + used, sizeof(scratch->buffer) - used, "]");
  return out;
}
|
|
|
|
|
2013-06-14 07:09:08 +02:00
|
|
|
// Formats one "#<number>(seq=<smallest_seqno>,sz=<file_size>,<being_compacted>) "
// entry per file of current_ at "level" into scratch->buffer, wrapped in
// "files_size[" ... "]", and returns scratch->buffer. Files that no longer
// fit into the buffer are dropped.
const char* VersionSet::LevelFileSummary(
    FileSummaryStorage* scratch, int level) const {
  char* const out = scratch->buffer;
  int used = snprintf(out, sizeof(scratch->buffer), "files_size[");
  for (FileMetaData* meta : current_->files_[level]) {
    const int remaining = sizeof(scratch->buffer) - used;
    const int written = snprintf(out + used, remaining,
                                 "#%lu(seq=%lu,sz=%lu,%lu) ",
                                 static_cast<unsigned long>(meta->number),
                                 static_cast<unsigned long>(meta->smallest_seqno),
                                 static_cast<unsigned long>(meta->file_size),
                                 static_cast<unsigned long>(meta->being_compacted));
    if (written < 0 || written >= remaining) {
      // Formatting failed or the entry was truncated: stop appending.
      break;
    }
    used += written;
  }
  snprintf(out + used, sizeof(scratch->buffer) - used, "]");
  return out;
}
|
|
|
|
|
2013-01-08 21:00:13 +01:00
|
|
|
// Opens the mainfest file and reads all records
|
|
|
|
// till it finds the record we are looking for.
|
|
|
|
bool VersionSet::ManifestContains(const std::string& record) const {
|
|
|
|
std::string fname = DescriptorFileName(dbname_, manifest_file_number_);
|
|
|
|
Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
|
2013-01-20 11:07:13 +01:00
|
|
|
unique_ptr<SequentialFile> file;
|
2013-03-15 01:00:04 +01:00
|
|
|
Status s = env_->NewSequentialFile(fname, &file, storage_options_);
|
2013-01-08 21:00:13 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
|
2013-03-06 22:28:54 +01:00
|
|
|
Log(options_->info_log,
|
|
|
|
"ManifestContains: is unable to reopen the manifest file %s",
|
|
|
|
fname.c_str());
|
2013-01-08 21:00:13 +01:00
|
|
|
return false;
|
|
|
|
}
|
2013-03-01 03:04:58 +01:00
|
|
|
log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0);
|
2013-01-08 21:00:13 +01:00
|
|
|
Slice r;
|
|
|
|
std::string scratch;
|
|
|
|
bool result = false;
|
|
|
|
while (reader.ReadRecord(&r, &scratch)) {
|
|
|
|
if (r == Slice(record)) {
|
|
|
|
result = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Returns the approximate byte offset of "ikey" within version "v".
//
// For every level, files that lie entirely before "ikey" contribute their
// full size; for a file whose key range contains "ikey" the table reader's
// ApproximateOffsetOf() estimate is added instead.
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  uint64_t result = 0;
  for (int level = 0; level < NumberLevels(); level++) {
    const std::vector<FileMetaData*>& files = v->files_[level];
    for (const FileMetaData* file : files) {
      if (icmp_.Compare(file->largest, ikey) <= 0) {
        // Entire file is before "ikey", so just add the file size
        result += file->file_size;
      } else if (icmp_.Compare(file->smallest, ikey) > 0) {
        // Entire file is after "ikey", so ignore
        if (level > 0) {
          // Files other than level 0 are sorted by meta->smallest, so
          // no further files in this level will contain data for
          // "ikey".
          break;
        }
      } else {
        // "ikey" falls in the range for this table.  Add the
        // approximate offset of "ikey" within the table.
        // Start from nullptr so the check below is well defined even if
        // NewIterator() returns without writing to the out-parameter.
        TableReader* table_reader_ptr = nullptr;
        Iterator* iter = table_cache_->NewIterator(
            ReadOptions(), storage_options_, file->number,
            file->file_size, &table_reader_ptr);
        if (table_reader_ptr != nullptr) {
          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
        }
        delete iter;
      }
    }
  }
  return result;
}
|
|
|
|
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex protected, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std::unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_list) {
|
|
|
|
// pre-calculate space requirement
|
|
|
|
int64_t total_files = 0;
|
2011-05-21 04:17:43 +02:00
|
|
|
for (Version* v = dummy_versions_.next_;
|
|
|
|
v != &dummy_versions_;
|
|
|
|
v = v->next_) {
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 0; level < NumberLevels(); level++) {
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
total_files += v->files_[level].size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// just one time extension to the right size
|
|
|
|
live_list->reserve(live_list->size() + total_files);
|
|
|
|
|
|
|
|
for (Version* v = dummy_versions_.next_;
|
|
|
|
v != &dummy_versions_;
|
|
|
|
v = v->next_) {
|
|
|
|
for (int level = 0; level < NumberLevels(); level++) {
|
|
|
|
for (const auto& f : v->files_[level]) {
|
|
|
|
live_list->push_back(f->number);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-15 02:11:35 +02:00
|
|
|
void VersionSet::AddLiveFilesCurrentVersion(std::set<uint64_t>* live) {
|
|
|
|
Version* v = current_;
|
|
|
|
for (int level = 0; level < NumberLevels(); level++) {
|
|
|
|
const std::vector<FileMetaData*>& files = v->files_[level];
|
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
|
|
live->insert(files[i]->number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-12 21:38:58 +02:00
|
|
|
int64_t VersionSet::NumLevelBytes(int level) const {
|
|
|
|
assert(level >= 0);
|
2012-06-23 04:30:03 +02:00
|
|
|
assert(level < NumberLevels());
|
2013-03-18 20:04:38 +01:00
|
|
|
assert(current_);
|
|
|
|
return TotalFileSize(current_->files_[level]);
|
2011-03-23 00:24:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
|
2013-07-17 22:56:24 +02:00
|
|
|
uint64_t result = 0;
|
2011-03-22 19:32:49 +01:00
|
|
|
std::vector<FileMetaData*> overlaps;
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int level = 1; level < NumberLevels() - 1; level++) {
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < current_->files_[level].size(); i++) {
|
2011-03-22 19:32:49 +01:00
|
|
|
const FileMetaData* f = current_->files_[level][i];
|
2011-10-06 01:30:28 +02:00
|
|
|
current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
|
|
|
|
&overlaps);
|
2013-07-17 22:56:24 +02:00
|
|
|
const uint64_t sum = TotalFileSize(overlaps);
|
2011-03-22 19:32:49 +01:00
|
|
|
if (sum > result) {
|
|
|
|
result = sum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Stores the minimal range that covers all entries in inputs in
|
|
|
|
// *smallest, *largest.
|
|
|
|
// REQUIRES: inputs is not empty
|
|
|
|
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
|
|
|
|
InternalKey* smallest,
|
|
|
|
InternalKey* largest) {
|
|
|
|
assert(!inputs.empty());
|
|
|
|
smallest->Clear();
|
|
|
|
largest->Clear();
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
FileMetaData* f = inputs[i];
|
|
|
|
if (i == 0) {
|
|
|
|
*smallest = f->smallest;
|
|
|
|
*largest = f->largest;
|
|
|
|
} else {
|
|
|
|
if (icmp_.Compare(f->smallest, *smallest) < 0) {
|
|
|
|
*smallest = f->smallest;
|
|
|
|
}
|
|
|
|
if (icmp_.Compare(f->largest, *largest) > 0) {
|
|
|
|
*largest = f->largest;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
// Stores the minimal range that covers all entries in inputs1 and inputs2
|
|
|
|
// in *smallest, *largest.
|
|
|
|
// REQUIRES: inputs is not empty
|
|
|
|
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
|
|
|
|
const std::vector<FileMetaData*>& inputs2,
|
|
|
|
InternalKey* smallest,
|
|
|
|
InternalKey* largest) {
|
|
|
|
std::vector<FileMetaData*> all = inputs1;
|
|
|
|
all.insert(all.end(), inputs2.begin(), inputs2.end());
|
|
|
|
GetRange(all, smallest, largest);
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Builds a merging iterator over the input files of compaction "c".
//
// Checksums are verified iff options_->paranoid_checks is set, and blocks
// read through the iterator are not added to the cache (fill_cache = false).
// The caller owns the returned iterator.
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
  ReadOptions options;
  options.verify_checksums = options_->paranoid_checks;
  options.fill_cache = false;

  // Level-0 files have to be merged together.  For other levels,
  // we will make a concatenating iterator per level.
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);

  // The child list is scratch storage that is released right after
  // NewMergingIterator() returns; a vector frees it on every exit path.
  std::vector<Iterator*> list;
  list.reserve(space);
  for (int which = 0; which < 2; which++) {
    if (c->inputs_[which].empty()) {
      continue;
    }
    if (c->level() + which == 0) {
      // One iterator per level-0 file.
      const std::vector<FileMetaData*>& files = c->inputs_[which];
      for (const FileMetaData* file : files) {
        list.push_back(table_cache_->NewIterator(
            options, storage_options_compactions_,
            file->number, file->file_size, nullptr,
            true /* for compaction */));
      }
    } else {
      // Create concatenating iterator for the files from this level
      // NOTE(review): this branch passes storage_options_ while the level-0
      // branch passes storage_options_compactions_ -- confirm this is
      // intentional.
      list.push_back(NewTwoLevelIterator(
          new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
          &GetFileIterator, table_cache_, options, storage_options_,
          true /* for compaction */));
    }
  }
  assert(list.size() <= static_cast<size_t>(space));
  return NewMergingIterator(&icmp_, list.data(),
                            static_cast<int>(list.size()));
}
|
|
|
|
|
2012-06-23 04:30:03 +02:00
|
|
|
double VersionSet::MaxBytesForLevel(int level) {
|
2012-11-29 01:42:36 +01:00
|
|
|
// Note: the result for level zero is not really used since we set
|
|
|
|
// the level-0 compaction threshold based on number of files.
|
2012-06-23 04:30:03 +02:00
|
|
|
assert(level >= 0);
|
|
|
|
assert(level < NumberLevels());
|
2012-11-29 01:42:36 +01:00
|
|
|
return level_max_bytes_[level];
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t VersionSet::MaxFileSizeForLevel(int level) {
|
|
|
|
assert(level >= 0);
|
|
|
|
assert(level < NumberLevels());
|
2012-08-29 21:29:43 +02:00
|
|
|
return max_file_size_[level];
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
|
|
|
|
2013-07-17 22:56:24 +02:00
|
|
|
uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) {
|
2012-06-23 04:30:03 +02:00
|
|
|
uint64_t result = MaxFileSizeForLevel(level);
|
|
|
|
result *= options_->expanded_compaction_factor;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2013-07-17 22:56:24 +02:00
|
|
|
uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) {
|
2012-06-23 04:30:03 +02:00
|
|
|
uint64_t result = MaxFileSizeForLevel(level);
|
|
|
|
result *= options_->max_grandparent_overlap_factor;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2012-11-29 01:42:36 +01:00
|
|
|
// verify that the files listed in this compaction are present
|
2012-10-19 23:00:53 +02:00
|
|
|
// in the current version
|
|
|
|
bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
|
2013-03-06 22:28:54 +01:00
|
|
|
#ifndef NDEBUG
|
2012-10-19 23:00:53 +02:00
|
|
|
if (c->input_version_ != current_) {
|
|
|
|
Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch");
|
|
|
|
}
|
|
|
|
|
|
|
|
// verify files in level
|
|
|
|
int level = c->level();
|
|
|
|
for (int i = 0; i < c->num_input_files(0); i++) {
|
|
|
|
uint64_t number = c->input(0,i)->number;
|
|
|
|
|
|
|
|
// look for this file in the current version
|
|
|
|
bool found = false;
|
|
|
|
for (unsigned int j = 0; j < current_->files_[level].size(); j++) {
|
|
|
|
FileMetaData* f = current_->files_[level][j];
|
|
|
|
if (f->number == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
return false; // input files non existant in current version
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// verify level+1 files
|
|
|
|
level++;
|
|
|
|
for (int i = 0; i < c->num_input_files(1); i++) {
|
|
|
|
uint64_t number = c->input(1,i)->number;
|
|
|
|
|
|
|
|
// look for this file in the current version
|
|
|
|
bool found = false;
|
|
|
|
for (unsigned int j = 0; j < current_->files_[level].size(); j++) {
|
|
|
|
FileMetaData* f = current_->files_[level][j];
|
|
|
|
if (f->number == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
return false; // input files non existant in current version
|
|
|
|
}
|
|
|
|
}
|
2013-03-06 22:28:54 +01:00
|
|
|
#endif
|
2012-10-19 23:00:53 +02:00
|
|
|
return true; // everything good
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clear all files to indicate that they are not being compacted
|
|
|
|
// Delete this compaction from the list of running compactions.
|
2012-11-01 06:01:57 +01:00
|
|
|
void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) {
|
2012-10-19 23:00:53 +02:00
|
|
|
c->MarkFilesBeingCompacted(false);
|
|
|
|
compactions_in_progress_[c->level()].erase(c);
|
2012-11-01 06:01:57 +01:00
|
|
|
if (!status.ok()) {
|
|
|
|
c->ResetNextCompactionIndex();
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// The total size of files that are currently being compacted
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
// at at every level upto the penultimate level.
|
|
|
|
void VersionSet::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
|
|
|
|
for (int level = 0; level < NumberLevels()-1; level++) {
|
|
|
|
uint64_t total = 0;
|
|
|
|
for (std::set<Compaction*>::iterator it =
|
|
|
|
compactions_in_progress_[level].begin();
|
|
|
|
it != compactions_in_progress_[level].end();
|
|
|
|
++it) {
|
|
|
|
Compaction* c = (*it);
|
|
|
|
assert(c->level() == level);
|
|
|
|
for (int i = 0; i < c->num_input_files(0); i++) {
|
|
|
|
total += c->input(0,i)->file_size;
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
sizes[level] = total;
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
//
|
|
|
|
// Look at overall size amplification. If size amplification
|
|
|
|
// exceeeds the configured value, then do a compaction
|
|
|
|
// of the candidate files all the way upto the earliest
|
|
|
|
// base file (overrides configured values of file-size ratios,
|
|
|
|
// min_merge_width and max_merge_width).
|
|
|
|
//
|
|
|
|
Compaction* VersionSet::PickCompactionUniversalSizeAmp(
|
|
|
|
int level, double score) {
|
2013-06-14 07:09:08 +02:00
|
|
|
assert (level == 0);
|
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// percentage flexibilty while reducing size amplification
|
|
|
|
uint64_t ratio = options_->compaction_options_universal.
|
|
|
|
max_size_amplification_percent;
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// The files are sorted from newest first to oldest last.
|
|
|
|
std::vector<int>& file_by_time = current_->files_by_size_[level];
|
|
|
|
assert(file_by_time.size() == current_->files_[level].size());
|
|
|
|
|
|
|
|
unsigned int candidate_count = 0;
|
|
|
|
uint64_t candidate_size = 0;
|
|
|
|
unsigned int start_index = 0;
|
|
|
|
FileMetaData* f = nullptr;
|
|
|
|
|
|
|
|
// Skip files that are already being compacted
|
|
|
|
for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
|
|
|
|
int index = file_by_time[loop];
|
|
|
|
f = current_->files_[level][index];
|
|
|
|
if (!f->being_compacted) {
|
|
|
|
start_index = loop; // Consider this as the first candidate.
|
|
|
|
break;
|
|
|
|
}
|
2013-11-13 06:02:03 +01:00
|
|
|
Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
loop,
|
|
|
|
" cannot be a candidate to reduce size amp.\n");
|
2013-09-10 01:06:10 +02:00
|
|
|
f = nullptr;
|
|
|
|
}
|
|
|
|
if (f == nullptr) {
|
|
|
|
return nullptr; // no candidate files
|
|
|
|
}
|
|
|
|
|
2013-11-13 06:02:03 +01:00
|
|
|
Log(options_->info_log, "Universal: First candidate file %lu[%d] %s",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
start_index,
|
|
|
|
" to reduce size amp.\n");
|
2013-09-10 01:06:10 +02:00
|
|
|
|
|
|
|
// keep adding up all the remaining files
|
|
|
|
for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
|
|
|
|
loop++) {
|
|
|
|
int index = file_by_time[loop];
|
|
|
|
f = current_->files_[level][index];
|
|
|
|
if (f->being_compacted) {
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: Possible candidate file %lu[%d] %s.",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
loop,
|
2013-09-10 01:06:10 +02:00
|
|
|
" is already being compacted. No size amp reduction possible.\n");
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
candidate_size += f->file_size;
|
|
|
|
candidate_count++;
|
|
|
|
}
|
|
|
|
if (candidate_count == 0) {
|
2013-06-14 07:09:08 +02:00
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// size of earliest file
|
|
|
|
int index = file_by_time[file_by_time.size() - 1];
|
|
|
|
uint64_t earliest_file_size = current_->files_[level][index]->file_size;
|
|
|
|
|
|
|
|
// size amplification = percentage of additional size
|
|
|
|
if (candidate_size * 100 < ratio * earliest_file_size) {
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: size amp not needed. newer-files-total-size %lu "
|
|
|
|
"earliest-file-size %lu",
|
|
|
|
(unsigned long)candidate_size,
|
|
|
|
(unsigned long)earliest_file_size);
|
2013-09-10 01:06:10 +02:00
|
|
|
return nullptr;
|
|
|
|
} else {
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: size amp needed. newer-files-total-size %lu "
|
|
|
|
"earliest-file-size %lu",
|
|
|
|
(unsigned long)candidate_size,
|
|
|
|
(unsigned long)earliest_file_size);
|
2013-09-10 01:06:10 +02:00
|
|
|
}
|
|
|
|
assert(start_index >= 0 && start_index < file_by_time.size() - 1);
|
|
|
|
|
|
|
|
// create a compaction request
|
2013-10-17 22:33:39 +02:00
|
|
|
// We always compact all the files, so always compress.
|
2013-09-10 01:06:10 +02:00
|
|
|
Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level),
|
2013-10-17 22:33:39 +02:00
|
|
|
LLONG_MAX, NumberLevels(), false,
|
|
|
|
true);
|
2013-06-14 07:09:08 +02:00
|
|
|
c->score_ = score;
|
2013-09-10 01:06:10 +02:00
|
|
|
for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
|
|
|
|
int index = file_by_time[loop];
|
|
|
|
f = current_->files_[level][index];
|
|
|
|
c->inputs_[0].push_back(f);
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: size amp picking file %lu[%d] with size %lu",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
index,
|
|
|
|
(unsigned long)f->file_size);
|
2013-09-10 01:06:10 +02:00
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Consider compaction files based on their size differences with
|
|
|
|
// the next file in time order.
|
|
|
|
//
|
|
|
|
Compaction* VersionSet::PickCompactionUniversalReadAmp(
|
|
|
|
int level, double score, unsigned int ratio,
|
|
|
|
unsigned int max_number_of_files_to_compact) {
|
|
|
|
|
|
|
|
unsigned int min_merge_width =
|
|
|
|
options_->compaction_options_universal.min_merge_width;
|
|
|
|
unsigned int max_merge_width =
|
|
|
|
options_->compaction_options_universal.max_merge_width;
|
2013-06-14 07:09:08 +02:00
|
|
|
|
|
|
|
// The files are sorted from newest first to oldest last.
|
|
|
|
std::vector<int>& file_by_time = current_->files_by_size_[level];
|
|
|
|
FileMetaData* f = nullptr;
|
|
|
|
bool done = false;
|
2013-09-10 01:06:10 +02:00
|
|
|
int start_index = 0;
|
|
|
|
unsigned int candidate_count;
|
2013-06-14 07:09:08 +02:00
|
|
|
assert(file_by_time.size() == current_->files_[level].size());
|
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
unsigned int max_files_to_compact = std::min(max_merge_width,
|
|
|
|
max_number_of_files_to_compact);
|
|
|
|
min_merge_width = std::max(min_merge_width, 2U);
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// Considers a candidate file only if it is smaller than the
|
|
|
|
// total size accumulated so far.
|
|
|
|
for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
candidate_count = 0;
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// Skip files that are already being compacted
|
|
|
|
for (f = nullptr; loop < file_by_time.size(); loop++) {
|
|
|
|
int index = file_by_time[loop];
|
|
|
|
f = current_->files_[level][index];
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
if (!f->being_compacted) {
|
|
|
|
candidate_count = 1;
|
|
|
|
break;
|
2013-06-14 07:09:08 +02:00
|
|
|
}
|
2013-09-10 01:06:10 +02:00
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: file %lu[%d] being compacted, skipping",
|
|
|
|
(unsigned long)f->number, loop);
|
2013-09-10 01:06:10 +02:00
|
|
|
f = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// This file is not being compacted. Consider it as the
|
|
|
|
// first candidate to be compacted.
|
|
|
|
uint64_t candidate_size = f != nullptr? f->file_size : 0;
|
|
|
|
if (f != nullptr) {
|
2013-11-13 06:02:03 +01:00
|
|
|
Log(options_->info_log, "Universal: Possible candidate file %lu[%d].",
|
|
|
|
(unsigned long)f->number, loop);
|
2013-09-10 01:06:10 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the suceeding files need compaction.
|
|
|
|
for (unsigned int i = loop+1;
|
|
|
|
candidate_count < max_files_to_compact && i < file_by_time.size();
|
|
|
|
i++) {
|
|
|
|
int index = file_by_time[i];
|
|
|
|
FileMetaData* f = current_->files_[level][index];
|
|
|
|
if (f->being_compacted) {
|
2013-06-14 07:09:08 +02:00
|
|
|
break;
|
|
|
|
}
|
2013-09-10 01:06:10 +02:00
|
|
|
// pick files if the total candidate file size (increased by the
|
|
|
|
// specified ratio) is still larger than the next candidate file.
|
|
|
|
uint64_t sz = (candidate_size * (100L + ratio)) /100;
|
|
|
|
if (sz < f->file_size) {
|
|
|
|
break;
|
2013-06-14 07:09:08 +02:00
|
|
|
}
|
2013-09-10 01:06:10 +02:00
|
|
|
candidate_count++;
|
|
|
|
candidate_size += f->file_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Found a series of consecutive files that need compaction.
|
|
|
|
if (candidate_count >= (unsigned int)min_merge_width) {
|
|
|
|
start_index = loop;
|
|
|
|
done = true;
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
for (unsigned int i = loop;
|
|
|
|
i < loop + candidate_count && i < file_by_time.size(); i++) {
|
|
|
|
int index = file_by_time[i];
|
|
|
|
FileMetaData* f = current_->files_[level][index];
|
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Universal: Skipping file %lu[%d] with size %lu %d\n",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
i,
|
|
|
|
(unsigned long)f->file_size,
|
|
|
|
f->being_compacted);
|
2013-06-14 07:09:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-09-10 01:06:10 +02:00
|
|
|
if (!done || candidate_count <= 1) {
|
2013-06-14 07:09:08 +02:00
|
|
|
return nullptr;
|
|
|
|
}
|
2013-10-17 22:33:39 +02:00
|
|
|
unsigned int first_index_after = start_index + candidate_count;
|
|
|
|
// Compression is enabled if files compacted earlier already reached
|
|
|
|
// size ratio of compression.
|
|
|
|
bool enable_compression = true;
|
|
|
|
int ratio_to_compress =
|
|
|
|
options_->compaction_options_universal.compression_size_percent;
|
|
|
|
if (ratio_to_compress >= 0) {
|
|
|
|
uint64_t total_size = TotalFileSize(current_->files_[level]);
|
|
|
|
uint64_t older_file_size = 0;
|
|
|
|
for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
|
|
|
|
i--) {
|
|
|
|
older_file_size += current_->files_[level][file_by_time[i]]->file_size;
|
|
|
|
if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
|
|
|
|
enable_compression = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-09-10 01:06:10 +02:00
|
|
|
Compaction* c = new Compaction(level, level, MaxFileSizeForLevel(level),
|
2013-10-17 22:33:39 +02:00
|
|
|
LLONG_MAX, NumberLevels(), false,
|
|
|
|
enable_compression);
|
2013-09-10 01:06:10 +02:00
|
|
|
c->score_ = score;
|
|
|
|
|
2013-10-17 22:33:39 +02:00
|
|
|
for (unsigned int i = start_index; i < first_index_after; i++) {
|
2013-09-10 01:06:10 +02:00
|
|
|
int index = file_by_time[i];
|
|
|
|
FileMetaData* f = current_->files_[level][index];
|
|
|
|
c->inputs_[0].push_back(f);
|
2013-11-13 06:02:03 +01:00
|
|
|
Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n",
|
|
|
|
(unsigned long)f->number,
|
|
|
|
i,
|
|
|
|
(unsigned long)f->file_size);
|
2013-09-10 01:06:10 +02:00
|
|
|
}
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Universal style of compaction. Pick files that are contiguous in
|
|
|
|
// time-range to compact.
|
|
|
|
//
|
|
|
|
Compaction* VersionSet::PickCompactionUniversal(int level, double score) {
|
|
|
|
assert (level == 0);
|
|
|
|
|
2013-09-15 19:21:03 +02:00
|
|
|
if ((current_->files_[level].size() <
|
2013-09-10 01:06:10 +02:00
|
|
|
(unsigned int)options_->level0_file_num_compaction_trigger)) {
|
|
|
|
Log(options_->info_log, "Universal: nothing to do\n");
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
VersionSet::FileSummaryStorage tmp;
|
|
|
|
Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
|
|
|
|
current_->files_[level].size(),
|
|
|
|
LevelFileSummary(&tmp, 0));
|
|
|
|
|
|
|
|
// Check for size amplification first.
|
|
|
|
Compaction* c = PickCompactionUniversalSizeAmp(level, score);
|
|
|
|
if (c == nullptr) {
|
|
|
|
|
|
|
|
// Size amplification is within limits. Try reducing read
|
|
|
|
// amplification while maintaining file size ratios.
|
|
|
|
unsigned int ratio = options_->compaction_options_universal.size_ratio;
|
|
|
|
c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX);
|
|
|
|
|
|
|
|
// Size amplification and file size ratios are within configured limits.
|
|
|
|
// If max read amplification is exceeding configured limits, then force
|
|
|
|
// compaction without looking at filesize ratios and try to reduce
|
|
|
|
// the number of files to fewer than level0_file_num_compaction_trigger.
|
|
|
|
if (c == nullptr) {
|
|
|
|
unsigned int num_files = current_->files_[level].size() -
|
|
|
|
options_->level0_file_num_compaction_trigger;
|
|
|
|
c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (c == nullptr) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
assert(c->inputs_[0].size() > 1);
|
2013-06-14 07:09:08 +02:00
|
|
|
|
|
|
|
// validate that all the chosen files are non overlapping in time
|
|
|
|
FileMetaData* newerfile __attribute__((unused)) = nullptr;
|
|
|
|
for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
|
|
|
|
FileMetaData* f = c->inputs_[0][i];
|
|
|
|
assert (f->smallest_seqno <= f->largest_seqno);
|
|
|
|
assert(newerfile == nullptr ||
|
|
|
|
newerfile->smallest_seqno > f->largest_seqno);
|
|
|
|
newerfile = f;
|
|
|
|
}
|
|
|
|
|
2013-09-10 01:06:10 +02:00
|
|
|
// The files are sorted from newest first to oldest last.
|
|
|
|
std::vector<int>& file_by_time = current_->files_by_size_[level];
|
|
|
|
|
2013-08-08 00:25:00 +02:00
|
|
|
// Is the earliest file part of this compaction?
|
|
|
|
int last_index = file_by_time[file_by_time.size()-1];
|
|
|
|
FileMetaData* last_file = current_->files_[level][last_index];
|
|
|
|
if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
|
|
|
|
c->bottommost_level_ = true;
|
|
|
|
}
|
|
|
|
|
2013-06-14 07:09:08 +02:00
|
|
|
// update statistics
|
2013-07-10 01:17:00 +02:00
|
|
|
if (options_->statistics != nullptr) {
|
|
|
|
options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION,
|
|
|
|
c->inputs_[0].size());
|
|
|
|
}
|
2013-06-14 07:09:08 +02:00
|
|
|
|
|
|
|
c->input_version_ = current_;
|
|
|
|
c->input_version_->Ref();
|
|
|
|
|
|
|
|
// mark all the files that are being compacted
|
|
|
|
c->MarkFilesBeingCompacted(true);
|
|
|
|
|
|
|
|
// remember this currently undergoing compaction
|
|
|
|
compactions_in_progress_[level].insert(c);
|
|
|
|
|
2013-10-27 07:01:26 +01:00
|
|
|
// Record whether this compaction includes all sst files.
|
|
|
|
// For now, it is only relevant in universal compaction mode.
|
|
|
|
c->is_full_compaction_ = (c->inputs_[0].size() == current_->files_[0].size());
|
|
|
|
|
2013-06-14 07:09:08 +02:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2012-12-04 18:47:05 +01:00
|
|
|
// Picks a single file from "level" to compact into "level+1".
//
// Files are visited in the order given by current_->files_by_size_[level],
// starting at current_->next_file_to_compact_by_size_[level]. The first file
// that is neither being compacted itself nor has a parent range at level+1
// under compaction becomes the sole entry of inputs_[0].
//
// Returns nullptr if no such file exists, or if level == 0 and a level-0
// compaction is already in progress. On every path that scans the files,
// next_file_to_compact_by_size_[level] is updated to the position of the
// first file seen that was not being compacted (or -1 if there was none).
Compaction* VersionSet::PickCompactionBySize(int level, double score) {
  // Level-0 files overlap each other, so at most one compaction may run on
  // that level at a time. Looking at the key ranges already under compaction
  // at level 0 could relax this restriction.
  if (level == 0 && compactions_in_progress_[level].size() == 1) {
    return nullptr;
  }

  assert(level >= 0);
  assert(level+1 < NumberLevels());
  Compaction* picked = new Compaction(level, level+1,
                                      MaxFileSizeForLevel(level+1),
                                      MaxGrandParentOverlapBytes(level),
                                      NumberLevels());
  picked->score_ = score;

  // Positions into files_[level]; the largest not-yet-compacting file is
  // the one we want.
  std::vector<int>& by_size = current_->files_by_size_[level];

  // Position of the first file that is not being compacted; the next call
  // resumes its scan from there.
  int resume_at = -1;

  unsigned int pos = current_->next_file_to_compact_by_size_[level];
  while (pos < by_size.size()) {
    const int file_index = by_size[pos];
    FileMetaData* candidate = current_->files_[level][file_index];

    // Sanity check: files are arranged in descending size (only the leading
    // Version::number_of_files_to_sort_ entries are checked).
    assert((pos == by_size.size() - 1) ||
           (pos >= Version::number_of_files_to_sort_-1) ||
           (candidate->file_size >=
            current_->files_[level][by_size[pos+1]]->file_size));

    // Skip files that are already being compacted from level n-1.
    if (!candidate->being_compacted) {
      if (resume_at == -1) {
        resume_at = pos;
      }

      // Skip the file if its parents at level+1 are being compacted.
      // Maybe this work could be reused by SetupOtherInputs.
      int parent_index = -1;
      if (!ParentRangeInCompaction(&candidate->smallest, &candidate->largest,
                                   level, &parent_index)) {
        picked->inputs_[0].push_back(candidate);
        picked->base_index_ = file_index;
        picked->parent_index_ = parent_index;
        break;
      }
    }
    pos++;
  }

  // Nothing suitable was found: discard the empty compaction.
  if (picked->inputs_[0].empty()) {
    delete picked;
    picked = nullptr;
  }

  // Remember where the next call to PickCompaction should start iterating.
  current_->next_file_to_compact_by_size_[level] = resume_at;

  return picked;
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
Compaction* VersionSet::PickCompaction() {
|
2013-03-01 03:04:58 +01:00
|
|
|
Compaction* c = nullptr;
|
2012-11-08 00:11:37 +01:00
|
|
|
int level = -1;
|
2011-06-22 04:36:45 +02:00
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Compute the compactions needed. It is better to do it here
|
2012-10-19 23:00:53 +02:00
|
|
|
// and also in LogAndApply(), otherwise the values could be stale.
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
std::vector<uint64_t> size_being_compacted(NumberLevels()-1);
|
|
|
|
current_->vset_->SizeBeingCompacted(size_being_compacted);
|
|
|
|
Finalize(current_, size_being_compacted);
|
2012-10-19 23:00:53 +02:00
|
|
|
|
2013-07-04 00:32:49 +02:00
|
|
|
// In universal style of compaction, compact L0 files back into L0.
|
|
|
|
if (options_->compaction_style == kCompactionStyleUniversal) {
|
2013-06-14 07:09:08 +02:00
|
|
|
int level = 0;
|
2013-07-10 01:08:54 +02:00
|
|
|
c = PickCompactionUniversal(level, current_->compaction_score_[level]);
|
2013-06-14 07:09:08 +02:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
// We prefer compactions triggered by too much data in a level over
|
|
|
|
// the compactions triggered by seeks.
|
2012-10-19 23:00:53 +02:00
|
|
|
//
|
|
|
|
// Find the compactions by size on all levels.
|
|
|
|
for (int i = 0; i < NumberLevels()-1; i++) {
|
2012-11-20 18:08:11 +01:00
|
|
|
assert(i == 0 || current_->compaction_score_[i] <=
|
|
|
|
current_->compaction_score_[i-1]);
|
2012-10-19 23:00:53 +02:00
|
|
|
level = current_->compaction_level_[i];
|
|
|
|
if ((current_->compaction_score_[i] >= 1)) {
|
2012-12-04 18:47:05 +01:00
|
|
|
c = PickCompactionBySize(level, current_->compaction_score_[i]);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
ExpandWhileOverlapping(c);
|
2013-03-01 03:04:58 +01:00
|
|
|
if (c != nullptr) {
|
2011-06-22 04:36:45 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Find compactions needed by seeks
|
2013-06-03 21:57:23 +02:00
|
|
|
FileMetaData* f = current_->file_to_compact_;
|
|
|
|
if (c == nullptr && f != nullptr && !f->being_compacted) {
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
level = current_->file_to_compact_level_;
|
2013-06-03 21:57:23 +02:00
|
|
|
int parent_index = -1;
|
2013-01-15 21:43:09 +01:00
|
|
|
|
|
|
|
// Only allow one level 0 compaction at a time.
|
2013-06-03 21:57:23 +02:00
|
|
|
// Do not pick this file if its parents at level+1 are being compacted.
|
2013-01-15 21:43:09 +01:00
|
|
|
if (level != 0 || compactions_in_progress_[0].empty()) {
|
2013-06-03 21:57:23 +02:00
|
|
|
if(!ParentRangeInCompaction(&f->smallest, &f->largest, level,
|
|
|
|
&parent_index)) {
|
2013-08-02 20:46:47 +02:00
|
|
|
c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1),
|
2013-06-03 21:57:23 +02:00
|
|
|
MaxGrandParentOverlapBytes(level), NumberLevels(), true);
|
|
|
|
c->inputs_[0].push_back(f);
|
|
|
|
c->parent_index_ = parent_index;
|
|
|
|
current_->file_to_compact_ = nullptr;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
ExpandWhileOverlapping(c);
|
2013-06-03 21:57:23 +02:00
|
|
|
}
|
2013-01-15 21:43:09 +01:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
|
2013-03-01 03:04:58 +01:00
|
|
|
if (c == nullptr) {
|
|
|
|
return nullptr;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
c->input_version_ = current_;
|
|
|
|
c->input_version_->Ref();
|
|
|
|
|
2013-01-15 21:43:09 +01:00
|
|
|
// Two level 0 compaction won't run at the same time, so don't need to worry
|
|
|
|
// about files on level 0 being compacted.
|
2011-03-18 23:37:00 +01:00
|
|
|
if (level == 0) {
|
2013-01-15 21:43:09 +01:00
|
|
|
assert(compactions_in_progress_[0].empty());
|
2011-03-22 19:32:49 +01:00
|
|
|
InternalKey smallest, largest;
|
|
|
|
GetRange(c->inputs_[0], &smallest, &largest);
|
2011-03-18 23:37:00 +01:00
|
|
|
// Note that the next call will discard the file we placed in
|
|
|
|
// c->inputs_[0] earlier and replace it with an overlapping set
|
|
|
|
// which will include the picked file.
|
2012-10-19 23:00:53 +02:00
|
|
|
c->inputs_[0].clear();
|
2013-01-15 21:43:09 +01:00
|
|
|
current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
|
2013-05-08 19:19:24 +02:00
|
|
|
|
|
|
|
// If we include more L0 files in the same compaction run it can
|
|
|
|
// cause the 'smallest' and 'largest' key to get extended to a
|
|
|
|
// larger range. So, re-invoke GetRange to get the new key range
|
|
|
|
GetRange(c->inputs_[0], &smallest, &largest);
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
if (ParentRangeInCompaction(&smallest, &largest,
|
|
|
|
level, &c->parent_index_)) {
|
|
|
|
delete c;
|
2013-03-01 03:04:58 +01:00
|
|
|
return nullptr;
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(!c->inputs_[0].empty());
|
|
|
|
}
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Setup "level+1" files (inputs_[1])
|
2011-03-22 19:32:49 +01:00
|
|
|
SetupOtherInputs(c);
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// mark all the files that are being compacted
|
|
|
|
c->MarkFilesBeingCompacted(true);
|
|
|
|
|
2013-08-08 00:25:00 +02:00
|
|
|
// Is this compaction creating a file at the bottommost level
|
|
|
|
c->SetupBottomMostLevel(false);
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// remember this currently undergoing compaction
|
|
|
|
compactions_in_progress_[level].insert(c);
|
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// Returns true if any one of the parent files are being compacted
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest,
|
|
|
|
const InternalKey* largest, int level, int* parent_index) {
|
2012-10-19 23:00:53 +02:00
|
|
|
std::vector<FileMetaData*> inputs;
|
2012-11-29 01:42:36 +01:00
|
|
|
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
current_->GetOverlappingInputs(level+1, smallest, largest,
|
2012-11-08 00:11:37 +01:00
|
|
|
&inputs, *parent_index, parent_index);
|
2012-10-19 23:00:53 +02:00
|
|
|
return FilesInCompaction(inputs);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if any one of specified files are being compacted
|
|
|
|
bool VersionSet::FilesInCompaction(std::vector<FileMetaData*>& files) {
|
|
|
|
for (unsigned int i = 0; i < files.size(); i++) {
|
|
|
|
if (files[i]->being_compacted) {
|
|
|
|
return true;
|
2012-11-29 01:42:36 +01:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Add more files to the inputs on "level" to make sure that
|
|
|
|
// no newer version of a key is compacted to "level+1" while leaving an older
|
|
|
|
// version in a "level". Otherwise, any Get() will search "level" first,
|
|
|
|
// and will likely return an old/stale value for the key, since it always
|
|
|
|
// searches in increasing order of level to find the value. This could
|
|
|
|
// also scramble the order of merge operands. This function should be
|
|
|
|
// called any time a new Compaction is created, and its inputs_[0] are
|
|
|
|
// populated.
|
|
|
|
//
|
|
|
|
// Will set c to nullptr if it is impossible to apply this compaction.
|
|
|
|
void VersionSet::ExpandWhileOverlapping(Compaction* c) {
|
|
|
|
// If inputs are empty then there is nothing to expand.
|
|
|
|
if (!c || c->inputs_[0].empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetOverlappingInputs will always do the right thing for level-0.
|
|
|
|
// So we don't need to do any expansion if level == 0.
|
|
|
|
if (c->level() == 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
const int level = c->level();
|
|
|
|
InternalKey smallest, largest;
|
|
|
|
|
|
|
|
// Keep expanding c->inputs_[0] until we are sure that there is a
|
|
|
|
// "clean cut" boundary between the files in input and the surrounding files.
|
|
|
|
// This will ensure that no parts of a key are lost during compaction.
|
|
|
|
int hint_index = -1;
|
|
|
|
size_t old_size;
|
|
|
|
do {
|
|
|
|
old_size = c->inputs_[0].size();
|
|
|
|
GetRange(c->inputs_[0], &smallest, &largest);
|
|
|
|
c->inputs_[0].clear();
|
|
|
|
current_->GetOverlappingInputs(level, &smallest, &largest, &c->inputs_[0],
|
|
|
|
hint_index, &hint_index);
|
|
|
|
} while(c->inputs_[0].size() > old_size);
|
|
|
|
|
|
|
|
// Get the new range
|
|
|
|
GetRange(c->inputs_[0], &smallest, &largest);
|
|
|
|
|
|
|
|
// If, after the expansion, there are files that are already under
|
|
|
|
// compaction, then we must drop/cancel this compaction.
|
|
|
|
int parent_index = -1;
|
|
|
|
if (FilesInCompaction(c->inputs_[0]) ||
|
|
|
|
ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) {
|
|
|
|
c->inputs_[0].clear();
|
|
|
|
c->inputs_[1].clear();
|
|
|
|
delete c;
|
|
|
|
c = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Populates the set of inputs from "level+1" that overlap with "level".
|
|
|
|
// Will also attempt to expand "level" if that doesn't expand "level+1"
|
|
|
|
// or cause "level" to include a file for compaction that has an overlapping
|
|
|
|
// user-key with another file.
|
2011-03-22 19:32:49 +01:00
|
|
|
void VersionSet::SetupOtherInputs(Compaction* c) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// If inputs are empty, then there is nothing to expand.
|
|
|
|
if (c->inputs_[0].empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
const int level = c->level();
|
|
|
|
InternalKey smallest, largest;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
|
|
|
// Get the range one last time.
|
2011-03-22 19:32:49 +01:00
|
|
|
GetRange(c->inputs_[0], &smallest, &largest);
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Populate the set of next-level files (inputs_[1]) to include in compaction
|
2012-11-06 18:06:16 +01:00
|
|
|
current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1],
|
|
|
|
c->parent_index_, &c->parent_index_);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
// Get entire range covered by compaction
|
|
|
|
InternalKey all_start, all_limit;
|
|
|
|
GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// See if we can further grow the number of inputs in "level" without
|
|
|
|
// changing the number of "level+1" files we pick up. We also choose NOT
|
|
|
|
// to expand if this would cause "level" to include some entries for some
|
|
|
|
// user key, while excluding other entries for the same user key. This
|
|
|
|
// can happen when one user key spans multiple files.
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!c->inputs_[1].empty()) {
|
|
|
|
std::vector<FileMetaData*> expanded0;
|
2012-11-06 18:06:16 +01:00
|
|
|
current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0,
|
2013-03-01 03:04:58 +01:00
|
|
|
c->base_index_, nullptr);
|
2013-07-17 22:56:24 +02:00
|
|
|
const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
|
|
|
|
const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
|
|
|
|
const uint64_t expanded0_size = TotalFileSize(expanded0);
|
|
|
|
uint64_t limit = ExpandedCompactionByteSizeLimit(level);
|
2012-02-02 18:34:14 +01:00
|
|
|
if (expanded0.size() > c->inputs_[0].size() &&
|
2012-10-19 23:00:53 +02:00
|
|
|
inputs1_size + expanded0_size < limit &&
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
!FilesInCompaction(expanded0) &&
|
|
|
|
!current_->HasOverlappingUserKey(&expanded0, level)) {
|
2011-03-18 23:37:00 +01:00
|
|
|
InternalKey new_start, new_limit;
|
|
|
|
GetRange(expanded0, &new_start, &new_limit);
|
|
|
|
std::vector<FileMetaData*> expanded1;
|
2011-10-06 01:30:28 +02:00
|
|
|
current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
|
2012-11-06 18:06:16 +01:00
|
|
|
&expanded1, c->parent_index_,
|
|
|
|
&c->parent_index_);
|
2012-11-29 01:42:36 +01:00
|
|
|
if (expanded1.size() == c->inputs_[1].size() &&
|
2012-10-19 23:00:53 +02:00
|
|
|
!FilesInCompaction(expanded1)) {
|
2011-07-21 04:40:18 +02:00
|
|
|
Log(options_->info_log,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
|
|
|
|
"\n",
|
|
|
|
(unsigned long)level,
|
|
|
|
(unsigned long)(c->inputs_[0].size()),
|
|
|
|
(unsigned long)(c->inputs_[1].size()),
|
|
|
|
(unsigned long)inputs0_size,
|
|
|
|
(unsigned long)inputs1_size,
|
|
|
|
(unsigned long)(expanded0.size()),
|
|
|
|
(unsigned long)(expanded1.size()),
|
|
|
|
(unsigned long)expanded0_size,
|
|
|
|
(unsigned long)inputs1_size);
|
2011-03-18 23:37:00 +01:00
|
|
|
smallest = new_start;
|
|
|
|
largest = new_limit;
|
|
|
|
c->inputs_[0] = expanded0;
|
|
|
|
c->inputs_[1] = expanded1;
|
2011-03-22 19:32:49 +01:00
|
|
|
GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
// Compute the set of grandparent files that overlap this compaction
|
|
|
|
// (parent == level+1; grandparent == level+2)
|
2012-06-23 04:30:03 +02:00
|
|
|
if (level + 2 < NumberLevels()) {
|
2011-10-06 01:30:28 +02:00
|
|
|
current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
|
|
|
|
&c->grandparents_);
|
2011-03-22 19:32:49 +01:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (false) {
|
2011-07-21 04:40:18 +02:00
|
|
|
Log(options_->info_log, "Compacting %d '%s' .. '%s'",
|
2011-03-18 23:37:00 +01:00
|
|
|
level,
|
2011-10-06 01:30:28 +02:00
|
|
|
smallest.DebugString().c_str(),
|
|
|
|
largest.DebugString().c_str());
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update the place where we will do the next compaction for this level.
|
|
|
|
// We update this immediately instead of waiting for the VersionEdit
|
|
|
|
// to be applied so that if the compaction fails, we will try a different
|
|
|
|
// key range next time.
|
|
|
|
compact_pointer_[level] = largest.Encode().ToString();
|
2012-06-23 04:30:03 +02:00
|
|
|
c->edit_->SetCompactPointer(level, largest);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2013-08-22 23:32:53 +02:00
|
|
|
// Looks up the file whose number equals "number" in the current version,
// scanning the levels in increasing order. On a match, copies the file's
// metadata into *meta, stores the level it was found on in *filelevel and
// returns Status::OK(). Returns Status::NotFound if no level holds the file;
// in that case *meta and *filelevel are left untouched.
Status VersionSet::GetMetadataForFile(
    uint64_t number,
    int* filelevel,
    FileMetaData* meta) {
  for (int lvl = 0; lvl < NumberLevels(); ++lvl) {
    for (FileMetaData* candidate : current_->files_[lvl]) {
      if (candidate->number != number) {
        continue;
      }
      *meta = *candidate;
      *filelevel = lvl;
      return Status::OK();
    }
  }
  return Status::NotFound("File not present in any level");
}
|
|
|
|
|
|
|
|
void VersionSet::GetLiveFilesMetaData(
|
2013-09-01 10:52:32 +02:00
|
|
|
std::vector<LiveFileMetaData> * metadata) {
|
2013-08-22 23:32:53 +02:00
|
|
|
for (int level = 0; level < NumberLevels(); level++) {
|
|
|
|
const std::vector<FileMetaData*>& files = current_->files_[level];
|
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
|
|
LiveFileMetaData filemetadata;
|
|
|
|
filemetadata.name = TableFileName("", files[i]->number);
|
|
|
|
filemetadata.level = level;
|
|
|
|
filemetadata.size = files[i]->file_size;
|
|
|
|
filemetadata.smallestkey = files[i]->smallest.user_key().ToString();
|
|
|
|
filemetadata.largestkey = files[i]->largest.user_key().ToString();
|
2013-08-27 23:54:06 +02:00
|
|
|
filemetadata.smallest_seqno = files[i]->smallest_seqno;
|
|
|
|
filemetadata.largest_seqno = files[i]->largest_seqno;
|
2013-08-22 23:32:53 +02:00
|
|
|
metadata->push_back(filemetadata);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-12 20:53:26 +01:00
|
|
|
// Hands the accumulated obsolete files over to the caller: every entry of
// obsolete_files_ is appended to *files (existing contents are kept), after
// which obsolete_files_ is emptied.
void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
  files->reserve(files->size() + obsolete_files_.size());
  for (FileMetaData* obsolete : obsolete_files_) {
    files->push_back(obsolete);
  }
  obsolete_files_.clear();
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Builds a manual compaction covering the files on "level" that overlap
// [begin, end] (a nullptr bound is passed through to GetOverlappingInputs
// unchanged). Returns nullptr when no file overlaps the range or when the
// input set cannot be expanded to a clean boundary; otherwise returns a
// heap-allocated Compaction that holds a reference on the current version and
// whose input files have been marked as being compacted. The caller owns the
// returned object.
Compaction* VersionSet::CompactRange(
    int level,
    const InternalKey* begin,
    const InternalKey* end) {
  std::vector<FileMetaData*> inputs;

  // All files are 'overlapping' in universal style compaction.
  // We have to compact the entire range in one shot.
  if (options_->compaction_style == kCompactionStyleUniversal) {
    begin = nullptr;
    end = nullptr;
  }
  current_->GetOverlappingInputs(level, begin, end, &inputs);
  if (inputs.empty()) {
    return nullptr;
  }

  // Avoid compacting too much in one shot in case the range is large.
  // But we cannot do this for level-0 since level-0 files can overlap
  // and we must not pick one file and drop another older file if the
  // two files overlap.
  if (level > 0) {
    const uint64_t limit = MaxFileSizeForLevel(level) *
                           options_->source_compaction_factor;
    uint64_t total = 0;
    for (size_t i = 0; i < inputs.size(); ++i) {
      uint64_t s = inputs[i]->file_size;
      total += s;
      if (total >= limit) {
        // Keep the file that crossed the limit and drop everything after it.
        inputs.resize(i + 1);
        break;
      }
    }
  }
  int out_level = (options_->compaction_style == kCompactionStyleUniversal) ?
                  level : level+1;

  Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level),
                                 MaxGrandParentOverlapBytes(level), NumberLevels());

  c->inputs_[0] = inputs;
  ExpandWhileOverlapping(c);
  // ExpandWhileOverlapping takes the pointer by value, so the local "c" can
  // never be reset to nullptr by that call. A cancelled expansion is
  // recognised by its emptied input list instead; inputs_[0] was non-empty
  // before the call and a successful expansion never shrinks it to zero.
  if (c == nullptr || c->inputs_[0].empty()) {
    Log(options_->info_log, "Could not compact due to expansion failure.\n");
    delete c;
    return nullptr;
  }

  c->input_version_ = current_;
  c->input_version_->Ref();
  SetupOtherInputs(c);

  // These files that are to be manually compacted do not trample
  // upon other files because manual compactions are processed when
  // the system has a max of 1 background compaction thread.
  c->MarkFilesBeingCompacted(true);

  // Is this compaction creating a file at the bottommost level
  c->SetupBottomMostLevel(true);
  return c;
}
|
|
|
|
|
2013-06-14 07:09:08 +02:00
|
|
|
// Creates a compaction from "level" into "out_level". The input file lists
// start out empty and no input version is attached (input_version_ is
// nullptr); base_index_ and parent_index_ start at -1. A fresh VersionEdit is
// allocated for the compaction and one zeroed cursor per level is set up in
// level_ptrs_.
Compaction::Compaction(int level, int out_level, uint64_t target_file_size,
                       uint64_t max_grandparent_overlap_bytes,
                       int number_levels, bool seek_compaction,
                       bool enable_compression)
    : level_(level),
      out_level_(out_level),
      max_output_file_size_(target_file_size),
      maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes),
      input_version_(nullptr),
      number_levels_(number_levels),
      seek_compaction_(seek_compaction),
      enable_compression_(enable_compression),
      grandparent_index_(0),
      seen_key_(false),
      overlapped_bytes_(0),
      base_index_(-1),
      parent_index_(-1),
      score_(0),
      bottommost_level_(false),
      is_full_compaction_(false),
      level_ptrs_(std::vector<size_t>(number_levels)) {
  edit_ = new VersionEdit(number_levels_);
  // Every per-level cursor starts at the first file of its level.
  std::fill(level_ptrs_.begin(), level_ptrs_.end(), 0);
}
|
|
|
|
|
|
|
|
// Frees the VersionEdit owned by this compaction and, if an input version is
// still attached, drops the reference held on it.
Compaction::~Compaction() {
  delete edit_;
  if (input_version_ == nullptr) {
    return;
  }
  input_version_->Unref();
}
|
|
|
|
|
2011-03-22 19:32:49 +01:00
|
|
|
bool Compaction::IsTrivialMove() const {
|
2011-03-23 00:24:02 +01:00
|
|
|
// Avoid a move if there is lots of overlapping grandparent data.
|
2011-03-22 19:32:49 +01:00
|
|
|
// Otherwise, the move could create a parent file that will require
|
|
|
|
// a very expensive merge later on.
|
2011-03-23 00:24:02 +01:00
|
|
|
return (num_input_files(0) == 1 &&
|
|
|
|
num_input_files(1) == 0 &&
|
2012-06-23 04:30:03 +02:00
|
|
|
TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_);
|
2011-03-22 19:32:49 +01:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Records in "edit" the deletion of every input file of this compaction:
// files in inputs_[0] are deleted from level_ and files in inputs_[1] from
// level_ + 1.
void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int which = 0; which < 2; ++which) {
    const int input_level = level_ + which;
    for (const FileMetaData* f : inputs_[which]) {
      edit->DeleteFile(input_level, f->number);
    }
  }
}
|
|
|
|
|
|
|
|
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
|
2013-08-08 00:25:00 +02:00
|
|
|
if (input_version_->vset_->options_->compaction_style ==
|
|
|
|
kCompactionStyleUniversal) {
|
|
|
|
return bottommost_level_;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
// Maybe use binary search to find right entry instead of linear search?
|
|
|
|
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
|
2012-06-23 04:30:03 +02:00
|
|
|
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
|
|
|
|
for (; level_ptrs_[lvl] < files.size(); ) {
|
|
|
|
FileMetaData* f = files[level_ptrs_[lvl]];
|
|
|
|
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
|
|
|
|
// We've advanced far enough
|
|
|
|
if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
|
|
|
|
// Key falls in this file's range, so definitely not base level
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
level_ptrs_[lvl]++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-05-21 04:17:43 +02:00
|
|
|
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
|
2011-03-22 19:32:49 +01:00
|
|
|
// Scan to find earliest grandparent file that contains key.
|
|
|
|
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
|
|
|
|
while (grandparent_index_ < grandparents_.size() &&
|
2011-05-21 04:17:43 +02:00
|
|
|
icmp->Compare(internal_key,
|
|
|
|
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
|
2011-03-23 00:24:02 +01:00
|
|
|
if (seen_key_) {
|
|
|
|
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
|
|
|
|
}
|
2012-11-20 18:08:11 +01:00
|
|
|
assert(grandparent_index_ + 1 >= grandparents_.size() ||
|
|
|
|
icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
|
|
|
|
grandparents_[grandparent_index_+1]->smallest.Encode())
|
|
|
|
< 0);
|
2011-03-22 19:32:49 +01:00
|
|
|
grandparent_index_++;
|
|
|
|
}
|
2011-03-23 00:24:02 +01:00
|
|
|
seen_key_ = true;
|
2011-03-22 19:32:49 +01:00
|
|
|
|
2012-06-23 04:30:03 +02:00
|
|
|
if (overlapped_bytes_ > maxGrandParentOverlapBytes_) {
|
2011-03-23 00:24:02 +01:00
|
|
|
// Too much overlap for current output; start new output
|
|
|
|
overlapped_bytes_ = 0;
|
2011-03-22 19:32:49 +01:00
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// Mark (or clear) each file that is being compacted
|
|
|
|
void Compaction::MarkFilesBeingCompacted(bool value) {
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
std::vector<FileMetaData*> v = inputs_[i];
|
|
|
|
for (unsigned int j = 0; j < inputs_[i].size(); j++) {
|
|
|
|
assert(value ? !inputs_[i][j]->being_compacted :
|
|
|
|
inputs_[i][j]->being_compacted);
|
|
|
|
inputs_[i][j]->being_compacted = value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-08 00:25:00 +02:00
|
|
|
// Is this compaction producing files at the bottommost level?
|
|
|
|
void Compaction::SetupBottomMostLevel(bool isManual) {
|
|
|
|
if (input_version_->vset_->options_->compaction_style ==
|
|
|
|
kCompactionStyleUniversal) {
|
|
|
|
// If universal compaction style is used and manual
|
|
|
|
// compaction is occuring, then we are guaranteed that
|
|
|
|
// all files will be picked in a single compaction
|
|
|
|
// run. We can safely set bottommost_level_ = true.
|
|
|
|
// If it is not manual compaction, then bottommost_level_
|
|
|
|
// is already set when the Compaction was created.
|
|
|
|
if (isManual) {
|
|
|
|
bottommost_level_ = true;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
bottommost_level_ = true;
|
|
|
|
int num_levels = input_version_->vset_->NumberLevels();
|
|
|
|
for (int i = level() + 2; i < num_levels; i++) {
|
|
|
|
if (input_version_->vset_->NumLevelFiles(i) > 0) {
|
|
|
|
bottommost_level_ = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
void Compaction::ReleaseInputs() {
|
2013-03-01 03:04:58 +01:00
|
|
|
if (input_version_ != nullptr) {
|
2011-03-18 23:37:00 +01:00
|
|
|
input_version_->Unref();
|
2013-03-01 03:04:58 +01:00
|
|
|
input_version_ = nullptr;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-01 06:01:57 +01:00
|
|
|
void Compaction::ResetNextCompactionIndex() {
|
2012-11-29 01:42:36 +01:00
|
|
|
input_version_->ResetNextCompactionIndex(level_);
|
2012-11-01 06:01:57 +01:00
|
|
|
}
|
|
|
|
|
2012-08-18 03:10:09 +02:00
|
|
|
// Formats one "<number>(<file_size>) " entry per file in `files` into
// `output`, a caller-supplied buffer of `len` bytes.
//
// Guarantees:
//  - If `output` is non-null and `len` > 0, `output` is NUL-terminated on
//    return, even when `files` is empty, so it is always safe to print with
//    "%s".
//  - Nothing is written when `output` is null or `len` <= 0.
//
// Formatting stops at the first entry that does not fit completely; that last
// entry may appear cut short at the end of the buffer.
static void InputSummary(std::vector<FileMetaData*>& files,
                         char* output,
                         int len) {
  if (output == nullptr || len <= 0) {
    // A non-positive length would turn into a huge size_t inside snprintf.
    return;
  }
  // Ensure a valid (empty) string even if the loop below never runs.
  output[0] = '\0';

  int write = 0;
  for (const FileMetaData* file : files) {
    // Invariant: write < len, hence sz >= 1.
    const int sz = len - write;
    // unsigned long long is at least 64 bits wide, so the values are not
    // truncated on platforms where unsigned long is only 32 bits.
    const int ret = snprintf(output + write, static_cast<size_t>(sz),
                             "%llu(%llu) ",
                             static_cast<unsigned long long>(file->number),
                             static_cast<unsigned long long>(file->file_size));
    if (ret < 0) {
      // Encoding error: buffer contents are unspecified, so re-terminate at
      // the end of the last complete entry.
      output[write] = '\0';
      break;
    }
    if (ret >= sz) {
      // Truncated; snprintf has already NUL-terminated the buffer.
      break;
    }
    write += ret;
  }
}
|
|
|
|
|
|
|
|
void Compaction::Summary(char* output, int len) {
|
2012-10-20 01:41:34 +02:00
|
|
|
int write = snprintf(output, len,
|
2013-11-13 06:02:03 +01:00
|
|
|
"Base version %lu Base level %d, seek compaction:%d, inputs:",
|
|
|
|
(unsigned long)input_version_->GetVersionNumber(),
|
|
|
|
level_,
|
|
|
|
seek_compaction_);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
if (write < 0 || write > len) {
|
2012-08-18 03:10:09 +02:00
|
|
|
return;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
}
|
2012-08-18 03:10:09 +02:00
|
|
|
|
|
|
|
char level_low_summary[100];
|
2012-09-21 19:47:08 +02:00
|
|
|
InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary));
|
2012-08-18 03:10:09 +02:00
|
|
|
char level_up_summary[100];
|
|
|
|
if (inputs_[1].size()) {
|
2012-09-21 19:47:08 +02:00
|
|
|
InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary));
|
2012-08-18 03:10:09 +02:00
|
|
|
} else {
|
|
|
|
level_up_summary[0] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(output + write, len - write, "[%s],[%s]",
|
|
|
|
level_low_summary, level_up_summary);
|
|
|
|
}
|
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
} // namespace rocksdb
|