2013-10-16 23:59:46 +02:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_set.h"
|
2014-02-26 23:16:23 +01:00
|
|
|
|
2014-09-05 08:14:37 +02:00
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
2014-05-14 21:13:50 +02:00
|
|
|
#define __STDC_FORMAT_MACROS
|
2014-09-05 08:14:37 +02:00
|
|
|
#endif
|
|
|
|
|
2014-02-26 23:16:23 +01:00
|
|
|
#include <inttypes.h>
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
#include <stdio.h>
|
2011-03-18 23:37:00 +01:00
|
|
|
#include <algorithm>
|
2014-01-22 20:44:53 +01:00
|
|
|
#include <map>
|
2014-01-30 00:26:43 +01:00
|
|
|
#include <set>
|
2013-06-14 07:09:08 +02:00
|
|
|
#include <climits>
|
2014-02-28 01:18:23 +01:00
|
|
|
#include <unordered_map>
|
2014-07-02 18:54:20 +02:00
|
|
|
#include <vector>
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
#include <string>
|
2014-01-28 06:58:46 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
2013-12-03 03:34:05 +01:00
|
|
|
#include "db/merge_context.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/table_cache.h"
|
2014-01-16 01:22:34 +01:00
|
|
|
#include "db/compaction.h"
|
2014-10-31 16:48:19 +01:00
|
|
|
#include "db/version_builder.h"
|
2014-12-02 21:09:20 +01:00
|
|
|
#include "db/writebuffer.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
2014-01-28 06:58:46 +01:00
|
|
|
#include "table/table_reader.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
2014-02-14 01:28:21 +01:00
|
|
|
#include "table/format.h"
|
2014-04-25 21:23:07 +02:00
|
|
|
#include "table/plain_table_factory.h"
|
2014-02-14 01:28:21 +01:00
|
|
|
#include "table/meta_blocks.h"
|
2014-09-29 20:09:09 +02:00
|
|
|
#include "table/get_context.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/logging.h"
|
2013-06-05 20:06:21 +02:00
|
|
|
#include "util/stop_watch.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-07-16 22:33:02 +02:00
|
|
|
namespace {
|
|
|
|
|
2014-10-28 18:03:13 +01:00
|
|
|
// Find File in LevelFilesBrief data structure
|
2014-07-16 22:33:02 +02:00
|
|
|
// Within an index range defined by left and right
|
|
|
|
int FindFileInRange(const InternalKeyComparator& icmp,
|
2014-10-28 18:03:13 +01:00
|
|
|
const LevelFilesBrief& file_level,
|
2014-07-16 22:33:02 +02:00
|
|
|
const Slice& key,
|
|
|
|
uint32_t left,
|
|
|
|
uint32_t right) {
|
|
|
|
while (left < right) {
|
|
|
|
uint32_t mid = (left + right) / 2;
|
|
|
|
const FdWithKeyRange& f = file_level.files[mid];
|
|
|
|
if (icmp.InternalKeyComparator::Compare(f.largest_key, key) < 0) {
|
|
|
|
// Key at "mid.largest" is < "target". Therefore all
|
|
|
|
// files at or before "mid" are uninteresting.
|
|
|
|
left = mid + 1;
|
|
|
|
} else {
|
|
|
|
// Key at "mid.largest" is >= "target". Therefore all files
|
|
|
|
// after "mid" are uninteresting.
|
|
|
|
right = mid;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return right;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Class to help choose the next file to search for the particular key.
|
|
|
|
// Searches and returns files level by level.
|
|
|
|
// We can search level-by-level since entries never hop across
|
|
|
|
// levels. Therefore we are guaranteed that if we find data
|
|
|
|
// in a smaller level, later levels are irrelevant (unless we
|
|
|
|
// are MergeInProgress).
|
|
|
|
class FilePicker {
|
|
|
|
public:
|
|
|
|
FilePicker(
|
|
|
|
std::vector<FileMetaData*>* files,
|
|
|
|
const Slice& user_key,
|
|
|
|
const Slice& ikey,
|
2014-10-28 18:03:13 +01:00
|
|
|
autovector<LevelFilesBrief>* file_levels,
|
2014-07-16 22:33:02 +02:00
|
|
|
unsigned int num_levels,
|
|
|
|
FileIndexer* file_indexer,
|
|
|
|
const Comparator* user_comparator,
|
|
|
|
const InternalKeyComparator* internal_comparator)
|
|
|
|
: num_levels_(num_levels),
|
|
|
|
curr_level_(-1),
|
2015-02-09 23:53:58 +01:00
|
|
|
hit_file_level_(-1),
|
2014-07-16 22:33:02 +02:00
|
|
|
search_left_bound_(0),
|
|
|
|
search_right_bound_(FileIndexer::kLevelMaxIndex),
|
2014-07-18 00:07:05 +02:00
|
|
|
#ifndef NDEBUG
|
2014-07-16 22:33:02 +02:00
|
|
|
files_(files),
|
2014-07-18 00:07:05 +02:00
|
|
|
#endif
|
2014-10-28 18:03:13 +01:00
|
|
|
level_files_brief_(file_levels),
|
2014-07-16 22:33:02 +02:00
|
|
|
user_key_(user_key),
|
|
|
|
ikey_(ikey),
|
|
|
|
file_indexer_(file_indexer),
|
|
|
|
user_comparator_(user_comparator),
|
|
|
|
internal_comparator_(internal_comparator) {
|
|
|
|
// Setup member variables to search first level.
|
|
|
|
search_ended_ = !PrepareNextLevel();
|
|
|
|
if (!search_ended_) {
|
|
|
|
// Prefetch Level 0 table data to avoid cache miss if possible.
|
2014-10-28 18:03:13 +01:00
|
|
|
for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
|
|
|
|
auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
|
2014-07-16 22:33:02 +02:00
|
|
|
if (r) {
|
|
|
|
r->Prepare(ikey);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
FdWithKeyRange* GetNextFile() {
|
|
|
|
while (!search_ended_) { // Loops over different levels.
|
|
|
|
while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
|
|
|
|
// Loops over all files in current level.
|
|
|
|
FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
|
2015-02-09 23:53:58 +01:00
|
|
|
hit_file_level_ = curr_level_;
|
2014-07-16 22:33:02 +02:00
|
|
|
int cmp_largest = -1;
|
|
|
|
|
|
|
|
// Do key range filtering of files or/and fractional cascading if:
|
|
|
|
// (1) not all the files are in level 0, or
|
|
|
|
// (2) there are more than 3 Level 0 files
|
|
|
|
// If there are only 3 or less level 0 files in the system, we skip
|
|
|
|
// the key range filtering. In this case, more likely, the system is
|
|
|
|
// highly tuned to minimize number of tables queried by each query,
|
|
|
|
// so it is unlikely that key range filtering is more efficient than
|
|
|
|
// querying the files.
|
|
|
|
if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
|
|
|
|
// Check if key is within a file's range. If search left bound and
|
|
|
|
// right bound point to the same find, we are sure key falls in
|
|
|
|
// range.
|
|
|
|
assert(
|
|
|
|
curr_level_ == 0 ||
|
|
|
|
curr_index_in_curr_level_ == start_index_in_curr_level_ ||
|
|
|
|
user_comparator_->Compare(user_key_,
|
|
|
|
ExtractUserKey(f->smallest_key)) <= 0);
|
|
|
|
|
|
|
|
int cmp_smallest = user_comparator_->Compare(user_key_,
|
|
|
|
ExtractUserKey(f->smallest_key));
|
|
|
|
if (cmp_smallest >= 0) {
|
|
|
|
cmp_largest = user_comparator_->Compare(user_key_,
|
|
|
|
ExtractUserKey(f->largest_key));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup file search bound for the next level based on the
|
|
|
|
// comparison results
|
|
|
|
if (curr_level_ > 0) {
|
|
|
|
file_indexer_->GetNextLevelIndex(curr_level_,
|
|
|
|
curr_index_in_curr_level_,
|
|
|
|
cmp_smallest, cmp_largest,
|
|
|
|
&search_left_bound_,
|
|
|
|
&search_right_bound_);
|
|
|
|
}
|
|
|
|
// Key falls out of current file's range
|
|
|
|
if (cmp_smallest < 0 || cmp_largest > 0) {
|
|
|
|
if (curr_level_ == 0) {
|
|
|
|
++curr_index_in_curr_level_;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
// Search next level.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
|
|
// Sanity check to make sure that the files are correctly sorted
|
|
|
|
if (prev_file_) {
|
|
|
|
if (curr_level_ != 0) {
|
|
|
|
int comp_sign = internal_comparator_->Compare(
|
|
|
|
prev_file_->largest_key, f->smallest_key);
|
|
|
|
assert(comp_sign < 0);
|
|
|
|
} else {
|
|
|
|
// level == 0, the current file cannot be newer than the previous
|
|
|
|
// one. Use compressed data structure, has no attribute seqNo
|
|
|
|
assert(curr_index_in_curr_level_ > 0);
|
|
|
|
assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
|
|
|
|
files_[0][curr_index_in_curr_level_-1]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prev_file_ = f;
|
|
|
|
#endif
|
|
|
|
if (curr_level_ > 0 && cmp_largest < 0) {
|
|
|
|
// No more files to search in this level.
|
|
|
|
search_ended_ = !PrepareNextLevel();
|
|
|
|
} else {
|
|
|
|
++curr_index_in_curr_level_;
|
|
|
|
}
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
// Start searching next level.
|
|
|
|
search_ended_ = !PrepareNextLevel();
|
|
|
|
}
|
|
|
|
// Search ended.
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2015-02-09 23:53:58 +01:00
|
|
|
// getter for current file level
|
|
|
|
// for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
|
|
|
|
unsigned int GetHitFileLevel() { return hit_file_level_; }
|
|
|
|
|
2014-07-16 22:33:02 +02:00
|
|
|
private:
|
|
|
|
unsigned int num_levels_;
|
|
|
|
unsigned int curr_level_;
|
2015-02-09 23:53:58 +01:00
|
|
|
unsigned int hit_file_level_;
|
2014-11-11 22:47:22 +01:00
|
|
|
int32_t search_left_bound_;
|
|
|
|
int32_t search_right_bound_;
|
2014-07-18 00:07:05 +02:00
|
|
|
#ifndef NDEBUG
|
2014-07-16 22:33:02 +02:00
|
|
|
std::vector<FileMetaData*>* files_;
|
2014-07-18 00:07:05 +02:00
|
|
|
#endif
|
2014-10-28 18:03:13 +01:00
|
|
|
autovector<LevelFilesBrief>* level_files_brief_;
|
2014-07-16 22:33:02 +02:00
|
|
|
bool search_ended_;
|
2014-10-28 18:03:13 +01:00
|
|
|
LevelFilesBrief* curr_file_level_;
|
2014-07-16 22:33:02 +02:00
|
|
|
unsigned int curr_index_in_curr_level_;
|
|
|
|
unsigned int start_index_in_curr_level_;
|
|
|
|
Slice user_key_;
|
|
|
|
Slice ikey_;
|
|
|
|
FileIndexer* file_indexer_;
|
|
|
|
const Comparator* user_comparator_;
|
|
|
|
const InternalKeyComparator* internal_comparator_;
|
|
|
|
#ifndef NDEBUG
|
|
|
|
FdWithKeyRange* prev_file_;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Setup local variables to search next level.
|
|
|
|
// Returns false if there are no more levels to search.
|
|
|
|
bool PrepareNextLevel() {
|
|
|
|
curr_level_++;
|
|
|
|
while (curr_level_ < num_levels_) {
|
2014-10-28 18:03:13 +01:00
|
|
|
curr_file_level_ = &(*level_files_brief_)[curr_level_];
|
2014-07-16 22:33:02 +02:00
|
|
|
if (curr_file_level_->num_files == 0) {
|
|
|
|
// When current level is empty, the search bound generated from upper
|
|
|
|
// level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
|
|
|
|
// also empty.
|
|
|
|
assert(search_left_bound_ == 0);
|
|
|
|
assert(search_right_bound_ == -1 ||
|
|
|
|
search_right_bound_ == FileIndexer::kLevelMaxIndex);
|
|
|
|
// Since current level is empty, it will need to search all files in
|
|
|
|
// the next level
|
|
|
|
search_left_bound_ = 0;
|
|
|
|
search_right_bound_ = FileIndexer::kLevelMaxIndex;
|
|
|
|
curr_level_++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Some files may overlap each other. We find
|
|
|
|
// all files that overlap user_key and process them in order from
|
|
|
|
// newest to oldest. In the context of merge-operator, this can occur at
|
|
|
|
// any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
|
|
|
|
// are always compacted into a single entry).
|
|
|
|
int32_t start_index;
|
|
|
|
if (curr_level_ == 0) {
|
|
|
|
// On Level-0, we read through all files to check for overlap.
|
|
|
|
start_index = 0;
|
|
|
|
} else {
|
|
|
|
// On Level-n (n>=1), files are sorted. Binary search to find the
|
|
|
|
// earliest file whose largest key >= ikey. Search left bound and
|
|
|
|
// right bound are used to narrow the range.
|
|
|
|
if (search_left_bound_ == search_right_bound_) {
|
|
|
|
start_index = search_left_bound_;
|
|
|
|
} else if (search_left_bound_ < search_right_bound_) {
|
|
|
|
if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
|
2014-11-11 22:47:22 +01:00
|
|
|
search_right_bound_ =
|
|
|
|
static_cast<int32_t>(curr_file_level_->num_files) - 1;
|
2014-07-16 22:33:02 +02:00
|
|
|
}
|
2014-11-11 22:47:22 +01:00
|
|
|
start_index =
|
|
|
|
FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
|
|
|
|
static_cast<uint32_t>(search_left_bound_),
|
|
|
|
static_cast<uint32_t>(search_right_bound_));
|
2014-07-16 22:33:02 +02:00
|
|
|
} else {
|
|
|
|
// search_left_bound > search_right_bound, key does not exist in
|
|
|
|
// this level. Since no comparision is done in this level, it will
|
|
|
|
// need to search all files in the next level.
|
|
|
|
search_left_bound_ = 0;
|
|
|
|
search_right_bound_ = FileIndexer::kLevelMaxIndex;
|
|
|
|
curr_level_++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
start_index_in_curr_level_ = start_index;
|
|
|
|
curr_index_in_curr_level_ = start_index;
|
|
|
|
#ifndef NDEBUG
|
|
|
|
prev_file_ = nullptr;
|
|
|
|
#endif
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// curr_level_ = num_levels_. So, no more levels to search.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
} // anonymous namespace
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// Releases the array that files_ points to.
VersionStorageInfo::~VersionStorageInfo() {
  delete[] files_;
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Tears down a version that nobody references any more: unlinks it from the
// version list and drops the reference it holds on each of its files. Files
// whose reference count reaches zero have their table reader handle released
// and are queued on the version set's obsolete-file list.
Version::~Version() {
  assert(refs_ == 0);

  // Unlink this node from its neighbours in the linked list.
  prev_->next_ = next_;
  next_->prev_ = prev_;

  // Give up this version's reference on every file, level by level.
  for (int level = 0; level < storage_info_.num_levels_; ++level) {
    auto& level_files = storage_info_.files_[level];
    for (size_t idx = 0; idx < level_files.size(); ++idx) {
      FileMetaData* meta = level_files[idx];
      assert(meta->refs > 0);
      --meta->refs;
      if (meta->refs > 0) {
        // Some other version still uses this file.
        continue;
      }
      if (meta->table_reader_handle) {
        cfd_->table_cache()->ReleaseHandle(meta->table_reader_handle);
        meta->table_reader_handle = nullptr;
      }
      vset_->obsolete_files_.push_back(meta);
    }
  }
}
|
|
|
|
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
int FindFile(const InternalKeyComparator& icmp,
|
2014-10-28 18:03:13 +01:00
|
|
|
const LevelFilesBrief& file_level,
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice& key) {
|
2014-11-11 22:47:22 +01:00
|
|
|
return FindFileInRange(icmp, file_level, key, 0,
|
|
|
|
static_cast<uint32_t>(file_level.num_files));
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
}
|
|
|
|
|
2014-10-28 18:03:13 +01:00
|
|
|
void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
|
2014-07-11 21:52:41 +02:00
|
|
|
const std::vector<FileMetaData*>& files,
|
|
|
|
Arena* arena) {
|
|
|
|
assert(file_level);
|
|
|
|
assert(arena);
|
|
|
|
|
|
|
|
size_t num = files.size();
|
|
|
|
file_level->num_files = num;
|
|
|
|
char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
|
|
|
|
file_level->files = new (mem)FdWithKeyRange[num];
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num; i++) {
|
|
|
|
Slice smallest_key = files[i]->smallest.Encode();
|
|
|
|
Slice largest_key = files[i]->largest.Encode();
|
|
|
|
|
|
|
|
// Copy key slice to sequential memory
|
|
|
|
size_t smallest_size = smallest_key.size();
|
|
|
|
size_t largest_size = largest_key.size();
|
|
|
|
mem = arena->AllocateAligned(smallest_size + largest_size);
|
|
|
|
memcpy(mem, smallest_key.data(), smallest_size);
|
|
|
|
memcpy(mem + smallest_size, largest_key.data(), largest_size);
|
|
|
|
|
|
|
|
FdWithKeyRange& f = file_level->files[i];
|
|
|
|
f.fd = files[i]->fd;
|
|
|
|
f.smallest_key = Slice(mem, smallest_size);
|
|
|
|
f.largest_key = Slice(mem + smallest_size, largest_size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-06 01:30:28 +02:00
|
|
|
static bool AfterFile(const Comparator* ucmp,
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice* user_key, const FdWithKeyRange* f) {
|
2013-03-01 03:04:58 +01:00
|
|
|
// nullptr user_key occurs before all keys and is therefore never after *f
|
|
|
|
return (user_key != nullptr &&
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0);
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool BeforeFile(const Comparator* ucmp,
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice* user_key, const FdWithKeyRange* f) {
|
2013-03-01 03:04:58 +01:00
|
|
|
// nullptr user_key occurs after all keys and is therefore never before *f
|
|
|
|
return (user_key != nullptr &&
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0);
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
bool SomeFileOverlapsRange(
|
|
|
|
const InternalKeyComparator& icmp,
|
2011-10-06 01:30:28 +02:00
|
|
|
bool disjoint_sorted_files,
|
2014-10-28 18:03:13 +01:00
|
|
|
const LevelFilesBrief& file_level,
|
2011-10-06 01:30:28 +02:00
|
|
|
const Slice* smallest_user_key,
|
|
|
|
const Slice* largest_user_key) {
|
|
|
|
const Comparator* ucmp = icmp.user_comparator();
|
|
|
|
if (!disjoint_sorted_files) {
|
|
|
|
// Need to check against all files
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
for (size_t i = 0; i < file_level.num_files; i++) {
|
|
|
|
const FdWithKeyRange* f = &(file_level.files[i]);
|
2011-10-06 01:30:28 +02:00
|
|
|
if (AfterFile(ucmp, smallest_user_key, f) ||
|
|
|
|
BeforeFile(ucmp, largest_user_key, f)) {
|
|
|
|
// No overlap
|
|
|
|
} else {
|
|
|
|
return true; // Overlap
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Binary search over file list
|
|
|
|
uint32_t index = 0;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (smallest_user_key != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// Find the earliest possible internal key for smallest_user_key
|
|
|
|
InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
index = FindFile(icmp, file_level, small.Encode());
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
if (index >= file_level.num_files) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// beginning of range is after all files, so no overlap.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2014-10-28 19:42:22 +01:00
|
|
|
namespace {
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// An internal iterator. For a given version/level pair, yields
|
|
|
|
// information about the files in the level. For a given entry, key()
|
|
|
|
// is the largest key that occurs in the file, and value() is an
|
2011-03-28 22:43:44 +02:00
|
|
|
// 16-byte value containing the file number and file size, both
|
|
|
|
// encoded using EncodeFixed64.
|
2014-10-28 19:42:22 +01:00
|
|
|
class LevelFileNumIterator : public Iterator {
|
2011-03-18 23:37:00 +01:00
|
|
|
public:
|
2011-05-21 04:17:43 +02:00
|
|
|
LevelFileNumIterator(const InternalKeyComparator& icmp,
|
2014-10-28 18:03:13 +01:00
|
|
|
const LevelFilesBrief* flevel)
|
2011-05-21 04:17:43 +02:00
|
|
|
: icmp_(icmp),
|
2014-07-11 21:52:41 +02:00
|
|
|
flevel_(flevel),
|
2014-11-11 22:47:22 +01:00
|
|
|
index_(static_cast<uint32_t>(flevel->num_files)),
|
2014-07-02 18:54:20 +02:00
|
|
|
current_value_(0, 0, 0) { // Marks as invalid
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual bool Valid() const override { return index_ < flevel_->num_files; }
|
|
|
|
virtual void Seek(const Slice& target) override {
|
2014-07-11 21:52:41 +02:00
|
|
|
index_ = FindFile(icmp_, *flevel_, target);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual void SeekToFirst() override { index_ = 0; }
|
|
|
|
virtual void SeekToLast() override {
|
2014-11-11 22:47:22 +01:00
|
|
|
index_ = (flevel_->num_files == 0)
|
|
|
|
? 0
|
|
|
|
: static_cast<uint32_t>(flevel_->num_files) - 1;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual void Next() override {
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(Valid());
|
|
|
|
index_++;
|
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual void Prev() override {
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(Valid());
|
|
|
|
if (index_ == 0) {
|
2014-11-11 22:47:22 +01:00
|
|
|
index_ = static_cast<uint32_t>(flevel_->num_files); // Marks as invalid
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
|
|
|
index_--;
|
|
|
|
}
|
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
Slice key() const override {
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(Valid());
|
2014-07-11 21:52:41 +02:00
|
|
|
return flevel_->files[index_].largest_key;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
Slice value() const override {
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(Valid());
|
2014-07-11 21:52:41 +02:00
|
|
|
|
|
|
|
auto file_meta = flevel_->files[index_];
|
|
|
|
current_value_ = file_meta.fd;
|
2014-04-02 03:36:18 +02:00
|
|
|
return Slice(reinterpret_cast<const char*>(¤t_value_),
|
2014-06-14 00:54:19 +02:00
|
|
|
sizeof(FileDescriptor));
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual Status status() const override { return Status::OK(); }
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
private:
|
|
|
|
const InternalKeyComparator icmp_;
|
2014-10-28 18:03:13 +01:00
|
|
|
const LevelFilesBrief* flevel_;
|
2011-04-21 00:48:11 +02:00
|
|
|
uint32_t index_;
|
2014-06-14 00:54:19 +02:00
|
|
|
mutable FileDescriptor current_value_;
|
2011-03-18 23:37:00 +01:00
|
|
|
};
|
|
|
|
|
2014-10-28 19:42:22 +01:00
|
|
|
class LevelFileIteratorState : public TwoLevelIteratorState {
|
2014-04-25 21:22:23 +02:00
|
|
|
public:
|
|
|
|
LevelFileIteratorState(TableCache* table_cache,
|
|
|
|
const ReadOptions& read_options, const EnvOptions& env_options,
|
|
|
|
const InternalKeyComparator& icomparator, bool for_compaction,
|
|
|
|
bool prefix_enabled)
|
|
|
|
: TwoLevelIteratorState(prefix_enabled),
|
|
|
|
table_cache_(table_cache), read_options_(read_options),
|
|
|
|
env_options_(env_options), icomparator_(icomparator),
|
|
|
|
for_compaction_(for_compaction) {}
|
|
|
|
|
|
|
|
Iterator* NewSecondaryIterator(const Slice& meta_handle) override {
|
2014-06-14 00:54:19 +02:00
|
|
|
if (meta_handle.size() != sizeof(FileDescriptor)) {
|
2014-04-25 21:22:23 +02:00
|
|
|
return NewErrorIterator(
|
|
|
|
Status::Corruption("FileReader invoked with unexpected value"));
|
|
|
|
} else {
|
2014-06-14 00:54:19 +02:00
|
|
|
const FileDescriptor* fd =
|
|
|
|
reinterpret_cast<const FileDescriptor*>(meta_handle.data());
|
|
|
|
return table_cache_->NewIterator(
|
|
|
|
read_options_, env_options_, icomparator_, *fd,
|
|
|
|
nullptr /* don't need reference to table*/, for_compaction_);
|
2014-04-25 21:22:23 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-04-25 21:22:23 +02:00
|
|
|
bool PrefixMayMatch(const Slice& internal_key) override {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
TableCache* table_cache_;
|
|
|
|
const ReadOptions read_options_;
|
|
|
|
const EnvOptions& env_options_;
|
|
|
|
const InternalKeyComparator& icomparator_;
|
|
|
|
bool for_compaction_;
|
|
|
|
};
|
2013-08-23 23:49:57 +02:00
|
|
|
|
2014-10-31 16:48:19 +01:00
|
|
|
// A wrapper of version builder which references the current version in
|
|
|
|
// constructor and unref it in the destructor.
|
2014-10-31 20:16:35 +01:00
|
|
|
// Both of the constructor and destructor need to be called inside DB Mutex.
|
2014-10-31 16:48:19 +01:00
|
|
|
class BaseReferencedVersionBuilder {
|
|
|
|
public:
|
|
|
|
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
|
2014-10-31 20:16:35 +01:00
|
|
|
: version_builder_(new VersionBuilder(
|
2014-11-04 02:45:55 +01:00
|
|
|
cfd->current()->version_set()->env_options(), cfd->table_cache(),
|
2014-10-31 20:16:35 +01:00
|
|
|
cfd->current()->storage_info())),
|
2014-10-31 16:48:19 +01:00
|
|
|
version_(cfd->current()) {
|
|
|
|
version_->Ref();
|
|
|
|
}
|
2014-10-31 20:16:35 +01:00
|
|
|
~BaseReferencedVersionBuilder() {
|
|
|
|
delete version_builder_;
|
|
|
|
version_->Unref();
|
|
|
|
}
|
2014-11-04 02:45:55 +01:00
|
|
|
VersionBuilder* version_builder() { return version_builder_; }
|
2014-10-31 16:48:19 +01:00
|
|
|
|
|
|
|
private:
|
2014-10-31 20:16:35 +01:00
|
|
|
VersionBuilder* version_builder_;
|
2014-10-31 16:48:19 +01:00
|
|
|
Version* version_;
|
|
|
|
};
|
2014-10-28 19:42:22 +01:00
|
|
|
} // anonymous namespace
|
|
|
|
|
2014-06-25 00:37:06 +02:00
|
|
|
// Loads the table properties of the file described by "file_meta" into *tp.
//
// The table cache is consulted first with I/O disabled.  Only when that
// lookup reports Incomplete is the file opened and its properties block read
// directly.  "fname", when non-null, is used as the path of the file;
// otherwise the path is derived from the file number and path id.
//
// Returns OK on success; any other status is the first failure encountered.
Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
                                   const FileMetaData* file_meta,
                                   const std::string* fname) {
  auto cache = cfd_->table_cache();
  auto cf_ioptions = cfd_->ioptions();

  // 1. Ask the table cache, without performing any I/O.
  Status s = cache->GetTableProperties(vset_->env_options_,
                                       cfd_->internal_comparator(),
                                       file_meta->fd, tp, true /* no io */);
  // Success is final.  Of the failures, only `Incomplete` is tolerated, since
  // by design the no-I/O lookup cannot serve a table that is absent from the
  // table cache.
  if (s.ok() || !s.IsIncomplete()) {
    return s;
  }

  // 2. The table is not in the table cache: read the properties straight
  // from the properties block of the file.
  std::unique_ptr<RandomAccessFile> file;
  if (fname == nullptr) {
    s = cf_ioptions->env->NewRandomAccessFile(
        TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
                      file_meta->fd.GetPathId()),
        &file, vset_->env_options_);
  } else {
    s = cf_ioptions->env->NewRandomAccessFile(*fname, &file,
                                              vset_->env_options_);
  }
  if (!s.ok()) {
    return s;
  }

  TableProperties* raw_table_properties;
  // Passing kInvalidTableMagicNumber bypasses the magic number check in the
  // footer.
  s = ReadTableProperties(file.get(), file_meta->fd.GetFileSize(),
                          Footer::kInvalidTableMagicNumber,
                          vset_->env_, cf_ioptions->info_log,
                          &raw_table_properties);
  if (!s.ok()) {
    return s;
  }
  RecordTick(cf_ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);

  // Hand ownership of the freshly read properties to the caller.
  *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
  return s;
}
|
|
|
|
|
|
|
|
// Collects the table properties of every file in every level of this version
// into *props, keyed by the file's path.  Stops at the first file whose
// properties cannot be loaded and returns that status; entries inserted
// before the failure are left in *props.
Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
  for (int lvl = 0; lvl < storage_info_.num_levels_; lvl++) {
    for (const auto& meta : storage_info_.files_[lvl]) {
      auto path = TableFileName(vset_->db_options_->db_paths,
                                meta->fd.GetNumber(), meta->fd.GetPathId());

      // GetTableProperties() tries the table cache first and falls back to
      // reading the file at "path".
      std::shared_ptr<const TableProperties> loaded;
      Status s = GetTableProperties(&loaded, meta, &path);
      if (!s.ok()) {
        return s;
      }
      props->insert({path, loaded});
    }
  }

  return Status::OK();
}
|
|
|
|
|
2014-08-05 20:27:34 +02:00
|
|
|
size_t Version::GetMemoryUsageByTableReaders() {
|
|
|
|
size_t total_usage = 0;
|
2014-10-31 16:48:19 +01:00
|
|
|
for (auto& file_level : storage_info_.level_files_brief_) {
|
2014-08-05 20:27:34 +02:00
|
|
|
for (size_t i = 0; i < file_level.num_files; i++) {
|
|
|
|
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
|
2014-09-09 00:25:01 +02:00
|
|
|
vset_->env_options_, cfd_->internal_comparator(),
|
2014-08-05 20:27:34 +02:00
|
|
|
file_level.files[i].fd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return total_usage;
|
|
|
|
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
// Fills *cf_meta with a description of this version's column family: its
// name, total size, total file count, and one entry per level listing the
// metadata of every file in that level.  Any previous contents of the
// fields written here are overwritten.
//
// cf_meta must be non-null.
void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
  assert(cf_meta);
  assert(cfd_);

  cf_meta->name = cfd_->GetName();
  cf_meta->size = 0;
  cf_meta->file_count = 0;
  cf_meta->levels.clear();

  auto* ioptions = cfd_->ioptions();
  auto* vstorage = storage_info();

  for (int level = 0; level < cfd_->NumberLevels(); level++) {
    uint64_t level_size = 0;
    // Fetch the level's file list once and reuse it below.
    const auto& level_files = vstorage->LevelFiles(level);
    cf_meta->file_count += level_files.size();
    std::vector<SstFileMetaData> files;
    // The final size is known up front, so avoid incremental regrowth.
    files.reserve(level_files.size());
    for (const auto& file : level_files) {
      uint32_t path_id = file->fd.GetPathId();
      std::string file_path;
      if (path_id < ioptions->db_paths.size()) {
        file_path = ioptions->db_paths[path_id].path;
      } else {
        // An out-of-range path id falls back to the last configured path.
        assert(!ioptions->db_paths.empty());
        file_path = ioptions->db_paths.back().path;
      }
      files.emplace_back(
          MakeTableFileName("", file->fd.GetNumber()),
          file_path,
          file->fd.GetFileSize(),
          file->smallest_seqno,
          file->largest_seqno,
          file->smallest.user_key().ToString(),
          file->largest.user_key().ToString(),
          file->being_compacted);
      level_size += file->fd.GetFileSize();
    }
    cf_meta->levels.emplace_back(
        level, level_size, std::move(files));
    cf_meta->size += level_size;
  }
}
|
|
|
|
|
|
|
|
|
2014-10-31 16:48:19 +01:00
|
|
|
uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
|
avoid returning a number-of-active-keys estimate of nearly 2^64
Summary:
If accumulated_num_non_deletions_ were ever smaller than
accumulated_num_deletions_, the computation of
"accumulated_num_non_deletions_ - accumulated_num_deletions_"
would result in a logically "negative" value, but since
the two operands are unsigned (uint64_t), the result corresponding
to e.g., -1 would 2^64-1.
Instead, return 0 in that case.
Test Plan:
- ensure "make check" still passes
- temporarily add an "abort();" call in the new "if"-block, and
observe that it fails in some test cases. However, note that
this case is triggered only when the two numbers are equal.
Thus, no test case triggers the erroneous behavior this
change is designed to avoid. If anyone can construct a
scenario in which that bug would be triggered, I'll be
happy to add a test case.
Reviewers: ljin, igor, rven, igor.sugak, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D36489
2015-04-03 23:46:35 +02:00
|
|
|
// Estimation will be inaccurate when:
|
|
|
|
// (1) there exist merge keys
|
2014-07-28 23:50:16 +02:00
|
|
|
// (2) keys are directly overwritten
|
|
|
|
// (3) deletion on non-existing keys
|
2014-10-17 23:58:30 +02:00
|
|
|
// (4) low number of samples
|
|
|
|
if (num_samples_ == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
avoid returning a number-of-active-keys estimate of nearly 2^64
Summary:
If accumulated_num_non_deletions_ were ever smaller than
accumulated_num_deletions_, the computation of
"accumulated_num_non_deletions_ - accumulated_num_deletions_"
would result in a logically "negative" value, but since
the two operands are unsigned (uint64_t), the result corresponding
to e.g., -1 would 2^64-1.
Instead, return 0 in that case.
Test Plan:
- ensure "make check" still passes
- temporarily add an "abort();" call in the new "if"-block, and
observe that it fails in some test cases. However, note that
this case is triggered only when the two numbers are equal.
Thus, no test case triggers the erroneous behavior this
change is designed to avoid. If anyone can construct a
scenario in which that bug would be triggered, I'll be
happy to add a test case.
Reviewers: ljin, igor, rven, igor.sugak, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D36489
2015-04-03 23:46:35 +02:00
|
|
|
if (accumulated_num_non_deletions_ <= accumulated_num_deletions_) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t est = accumulated_num_non_deletions_ - accumulated_num_deletions_;
|
|
|
|
|
2014-11-11 23:28:18 +01:00
|
|
|
uint64_t file_count = 0;
|
|
|
|
for (int level = 0; level < num_levels_; ++level) {
|
|
|
|
file_count += files_[level].size();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (num_samples_ < file_count) {
|
2014-10-17 23:58:30 +02:00
|
|
|
// casting to avoid overflowing
|
avoid returning a number-of-active-keys estimate of nearly 2^64
Summary:
If accumulated_num_non_deletions_ were ever smaller than
accumulated_num_deletions_, the computation of
"accumulated_num_non_deletions_ - accumulated_num_deletions_"
would result in a logically "negative" value, but since
the two operands are unsigned (uint64_t), the result corresponding
to e.g., -1 would 2^64-1.
Instead, return 0 in that case.
Test Plan:
- ensure "make check" still passes
- temporarily add an "abort();" call in the new "if"-block, and
observe that it fails in some test cases. However, note that
this case is triggered only when the two numbers are equal.
Thus, no test case triggers the erroneous behavior this
change is designed to avoid. If anyone can construct a
scenario in which that bug would be triggered, I'll be
happy to add a test case.
Reviewers: ljin, igor, rven, igor.sugak, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D36489
2015-04-03 23:46:35 +02:00
|
|
|
return (est * static_cast<double>(file_count) / num_samples_);
|
2014-10-17 23:58:30 +02:00
|
|
|
} else {
|
avoid returning a number-of-active-keys estimate of nearly 2^64
Summary:
If accumulated_num_non_deletions_ were ever smaller than
accumulated_num_deletions_, the computation of
"accumulated_num_non_deletions_ - accumulated_num_deletions_"
would result in a logically "negative" value, but since
the two operands are unsigned (uint64_t), the result corresponding
to e.g., -1 would 2^64-1.
Instead, return 0 in that case.
Test Plan:
- ensure "make check" still passes
- temporarily add an "abort();" call in the new "if"-block, and
observe that it fails in some test cases. However, note that
this case is triggered only when the two numbers are equal.
Thus, no test case triggers the erroneous behavior this
change is designed to avoid. If anyone can construct a
scenario in which that bug would be triggered, I'll be
happy to add a test case.
Reviewers: ljin, igor, rven, igor.sugak, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D36489
2015-04-03 23:46:35 +02:00
|
|
|
return est;
|
2014-10-17 23:58:30 +02:00
|
|
|
}
|
2014-07-28 23:50:16 +02:00
|
|
|
}
|
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
void Version::AddIterators(const ReadOptions& read_options,
|
|
|
|
const EnvOptions& soptions,
|
|
|
|
MergeIteratorBuilder* merge_iter_builder) {
|
2014-10-31 16:48:19 +01:00
|
|
|
assert(storage_info_.finalized_);
|
2014-10-28 17:59:56 +01:00
|
|
|
|
2014-11-12 23:19:33 +01:00
|
|
|
if (storage_info_.num_non_empty_levels() == 0) {
|
|
|
|
// No file in the Version.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
// Merge all level zero files together since they may overlap
|
2014-10-31 16:48:19 +01:00
|
|
|
for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
|
|
|
|
const auto& file = storage_info_.LevelFilesBrief(0).files[i];
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
|
2014-07-11 21:52:41 +02:00
|
|
|
read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr,
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
false, merge_iter_builder->GetArena()));
|
|
|
|
}
|
|
|
|
|
|
|
|
// For levels > 0, we can use a concatenating iterator that sequentially
|
|
|
|
// walks through the non-overlapping files in the level, opening them
|
|
|
|
// lazily.
|
2014-11-12 23:19:33 +01:00
|
|
|
for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) {
|
|
|
|
if (storage_info_.LevelFilesBrief(level).num_files != 0) {
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
merge_iter_builder->AddIterator(NewTwoLevelIterator(
|
|
|
|
new LevelFileIteratorState(
|
|
|
|
cfd_->table_cache(), read_options, soptions,
|
|
|
|
cfd_->internal_comparator(), false /* for_compaction */,
|
2014-09-09 00:04:34 +02:00
|
|
|
cfd_->ioptions()->prefix_extractor != nullptr),
|
2014-07-11 21:52:41 +02:00
|
|
|
new LevelFileNumIterator(cfd_->internal_comparator(),
|
2014-10-31 16:48:19 +01:00
|
|
|
&storage_info_.LevelFilesBrief(level)),
|
2014-10-27 23:49:46 +01:00
|
|
|
merge_iter_builder->GetArena()));
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// Sets up the file bookkeeping for a tree with `levels` levels.
//
// The per-level containers (files_, files_by_size_,
// next_file_to_compact_by_size_, compaction_score_, compaction_level_) are
// all sized to `levels`. base_level_ starts at -1 for a single-level tree and
// at 1 otherwise. The accumulated statistics start at zero; when
// `ref_vstorage` is non-null they are then copied from it so the new object
// continues from the reference's totals.
//
// The object starts out not finalized (finalized_ == false).
VersionStorageInfo::VersionStorageInfo(
    const InternalKeyComparator* internal_comparator,
    const Comparator* user_comparator, int levels,
    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage)
    : internal_comparator_(internal_comparator),
      user_comparator_(user_comparator),
      num_levels_(levels),
      num_non_empty_levels_(0),
      file_indexer_(user_comparator),
      compaction_style_(compaction_style),
      files_(new std::vector<FileMetaData*>[num_levels_]),
      base_level_(num_levels_ == 1 ? -1 : 1),
      files_by_size_(num_levels_),
      next_file_to_compact_by_size_(num_levels_),
      compaction_score_(num_levels_),
      compaction_level_(num_levels_),
      l0_delay_trigger_count_(0),
      accumulated_file_size_(0),
      accumulated_raw_key_size_(0),
      accumulated_raw_value_size_(0),
      accumulated_num_non_deletions_(0),
      accumulated_num_deletions_(0),
      num_samples_(0),
      finalized_(false) {
  if (ref_vstorage == nullptr) {
    // Nothing to inherit; keep the zero-initialized statistics.
    return;
  }

  // Carry the running statistics over from the reference storage info.
  const VersionStorageInfo& ref = *ref_vstorage;
  accumulated_file_size_ = ref.accumulated_file_size_;
  accumulated_raw_key_size_ = ref.accumulated_raw_key_size_;
  accumulated_raw_value_size_ = ref.accumulated_raw_value_size_;
  accumulated_num_non_deletions_ = ref.accumulated_num_non_deletions_;
  accumulated_num_deletions_ = ref.accumulated_num_deletions_;
  num_samples_ = ref.num_samples_;
}
|
2012-06-23 04:30:03 +02:00
|
|
|
|
2014-11-06 20:14:28 +01:00
|
|
|
// Creates a Version for `column_family_data` that belongs to `vset`.
//
// `column_family_data` may be nullptr. In that case the logger, statistics,
// table cache and merge operator are all null, and the storage info is built
// with null comparators, zero levels and kCompactionStyleLevel.
// When the column family already has a current Version, that Version's
// storage info is handed to the new storage info as its reference.
//
// `vset` is dereferenced unconditionally (for env_), so it must be non-null.
// The new Version starts with no references and linked only to itself
// (next_ == prev_ == this).
Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
                 uint64_t version_number)
    : env_(vset->env_),
      cfd_(column_family_data),
      info_log_(cfd_ != nullptr ? cfd_->ioptions()->info_log : nullptr),
      db_statistics_(cfd_ != nullptr ? cfd_->ioptions()->statistics
                                     : nullptr),
      table_cache_(cfd_ != nullptr ? cfd_->table_cache() : nullptr),
      merge_operator_(cfd_ != nullptr ? cfd_->ioptions()->merge_operator
                                      : nullptr),
      storage_info_(
          cfd_ != nullptr ? &cfd_->internal_comparator() : nullptr,
          cfd_ != nullptr ? cfd_->user_comparator() : nullptr,
          cfd_ != nullptr ? cfd_->NumberLevels() : 0,
          cfd_ != nullptr ? cfd_->ioptions()->compaction_style
                          : kCompactionStyleLevel,
          (cfd_ != nullptr && cfd_->current() != nullptr)
              ? cfd_->current()->storage_info()
              : nullptr),
      vset_(vset),
      next_(this),
      prev_(this),
      refs_(0),
      version_number_(version_number) {}
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
void Version::Get(const ReadOptions& read_options,
|
2013-03-21 23:59:47 +01:00
|
|
|
const LookupKey& k,
|
|
|
|
std::string* value,
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
Status* status,
|
2013-12-03 03:34:05 +01:00
|
|
|
MergeContext* merge_context,
|
2013-07-26 21:57:01 +02:00
|
|
|
bool* value_found) {
|
2011-06-22 04:36:45 +02:00
|
|
|
Slice ikey = k.internal_key();
|
|
|
|
Slice user_key = k.user_key();
|
2013-03-21 23:59:47 +01:00
|
|
|
|
|
|
|
assert(status->ok() || status->IsMergeInProgress());
|
2014-09-29 20:09:09 +02:00
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
GetContext get_context(
|
2014-10-31 16:48:19 +01:00
|
|
|
user_comparator(), merge_operator_, info_log_, db_statistics_,
|
2014-10-27 23:49:46 +01:00
|
|
|
status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
|
2015-03-03 19:59:36 +01:00
|
|
|
value, value_found, merge_context, this->env_);
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2014-10-31 16:48:19 +01:00
|
|
|
FilePicker fp(
|
|
|
|
storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
|
|
|
|
storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
|
|
|
|
user_comparator(), internal_comparator());
|
2014-07-16 22:33:02 +02:00
|
|
|
FdWithKeyRange* f = fp.GetNextFile();
|
|
|
|
while (f != nullptr) {
|
2014-10-31 16:48:19 +01:00
|
|
|
*status = table_cache_->Get(read_options, *internal_comparator(), f->fd,
|
2014-10-02 01:19:16 +02:00
|
|
|
ikey, &get_context);
|
2014-07-16 22:33:02 +02:00
|
|
|
// TODO: examine the behavior for corrupted key
|
|
|
|
if (!status->ok()) {
|
|
|
|
return;
|
hints for narrowing down FindFile range and avoiding checking unrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparision that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending one the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some inital results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 18:10:12 +02:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2014-09-29 20:09:09 +02:00
|
|
|
switch (get_context.State()) {
|
|
|
|
case GetContext::kNotFound:
|
|
|
|
// Keep searching in other files
|
|
|
|
break;
|
|
|
|
case GetContext::kFound:
|
2015-02-09 23:53:58 +01:00
|
|
|
if (fp.GetHitFileLevel() == 0) {
|
|
|
|
RecordTick(db_statistics_, GET_HIT_L0);
|
|
|
|
} else if (fp.GetHitFileLevel() == 1) {
|
|
|
|
RecordTick(db_statistics_, GET_HIT_L1);
|
|
|
|
} else if (fp.GetHitFileLevel() >= 2) {
|
|
|
|
RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
return;
|
2014-09-29 20:09:09 +02:00
|
|
|
case GetContext::kDeleted:
|
|
|
|
// Use empty error message for speed
|
|
|
|
*status = Status::NotFound();
|
2014-07-16 22:33:02 +02:00
|
|
|
return;
|
2014-09-29 20:09:09 +02:00
|
|
|
case GetContext::kCorrupt:
|
2014-07-16 22:33:02 +02:00
|
|
|
*status = Status::Corruption("corrupted key for ", user_key);
|
|
|
|
return;
|
2014-09-29 20:09:09 +02:00
|
|
|
case GetContext::kMerge:
|
hints for narrowing down FindFile range and avoiding checking unrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparision that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending one the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some inital results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 18:10:12 +02:00
|
|
|
break;
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
2014-07-16 22:33:02 +02:00
|
|
|
f = fp.GetNextFile();
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2014-09-29 20:09:09 +02:00
|
|
|
if (GetContext::kMerge == get_context.State()) {
|
2014-07-31 02:24:36 +02:00
|
|
|
if (!merge_operator_) {
|
|
|
|
*status = Status::InvalidArgument(
|
|
|
|
"merge_operator is not properly initialized.");
|
|
|
|
return;
|
|
|
|
}
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// merge_operands are in saver and we hit the beginning of the key history
|
|
|
|
// do a final merge of nullptr and operands;
|
2014-04-17 23:07:05 +02:00
|
|
|
if (merge_operator_->FullMerge(user_key, nullptr,
|
2014-09-29 20:09:09 +02:00
|
|
|
merge_context->GetOperands(), value,
|
2014-04-17 23:07:05 +02:00
|
|
|
info_log_)) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
*status = Status::OK();
|
|
|
|
} else {
|
2014-04-17 23:07:05 +02:00
|
|
|
RecordTick(db_statistics_, NUMBER_MERGE_FAILURES);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
*status = Status::Corruption("could not perform end-of-key merge for ",
|
|
|
|
user_key);
|
|
|
|
}
|
2013-03-21 23:59:47 +01:00
|
|
|
} else {
|
2013-12-26 22:49:04 +01:00
|
|
|
*status = Status::NotFound(); // Use an empty error message for speed
|
2013-03-21 23:59:47 +01:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
void VersionStorageInfo::GenerateLevelFilesBrief() {
|
2014-10-28 18:03:13 +01:00
|
|
|
level_files_brief_.resize(num_non_empty_levels_);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
for (int level = 0; level < num_non_empty_levels_; level++) {
|
2014-10-28 18:03:13 +01:00
|
|
|
DoGenerateLevelFilesBrief(
|
|
|
|
&level_files_brief_[level], files_[level], &arena_);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
With a fixed max_bytes_for_level_base, the ratio between the size of the largest level and that of the second-largest level can range from 0 to the multiplier. This makes the LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce the option options.level_compaction_dynamic_level_bytes. When it is turned on, RocksDB is free to pick a level base in the range (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
|
2014-10-17 23:58:30 +02:00
|
|
|
UpdateAccumulatedStats();
|
2014-10-31 16:48:19 +01:00
|
|
|
storage_info_.UpdateNumNonEmptyLevels();
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
|
|
|
|
storage_info_.UpdateFilesBySize();
|
2014-10-31 16:48:19 +01:00
|
|
|
storage_info_.GenerateFileIndexer();
|
|
|
|
storage_info_.GenerateLevelFilesBrief();
|
2014-06-14 00:06:10 +02:00
|
|
|
}
|
|
|
|
|
2014-06-25 00:37:06 +02:00
|
|
|
// Tries to fill in the statistics fields of `file_meta` from the file's
// table properties.
//
// Returns true only when the properties were loaded and copied into
// `file_meta`. Returns false when:
//   - the file was already attempted (init_stats_from_file is set) or it
//     already has a non-zero compensated_file_size;
//   - GetTableProperties() fails (the failure is logged at ERROR level);
//   - GetTableProperties() succeeds but yields no properties object.
//
// init_stats_from_file is set as soon as a load has been attempted, whatever
// the outcome, so a given file is attempted at most once.
bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
  const bool nothing_to_do = file_meta->init_stats_from_file ||
                             file_meta->compensated_file_size > 0;
  if (nothing_to_do) {
    return false;
  }

  std::shared_ptr<const TableProperties> tp;
  Status s = GetTableProperties(&tp, file_meta);
  // Mark the attempt before looking at the result.
  file_meta->init_stats_from_file = true;

  if (!s.ok()) {
    Log(InfoLogLevel::ERROR_LEVEL, vset_->db_options_->info_log,
        "Unable to load table properties for file %" PRIu64 " --- %s\n",
        file_meta->fd.GetNumber(), s.ToString().c_str());
    return false;
  }
  if (!tp) {
    return false;
  }

  file_meta->num_entries = tp->num_entries;
  file_meta->num_deletions = GetDeletedKeys(tp->user_collected_properties);
  file_meta->raw_value_size = tp->raw_value_size;
  file_meta->raw_key_size = tp->raw_key_size;
  return true;
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// Folds the statistics of one file into the running totals and counts it as
// one more sample. The file must already have gone through the
// init-from-file step (asserted).
void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
  assert(file_meta->init_stats_from_file);

  const auto deletions = file_meta->num_deletions;

  accumulated_file_size_ += file_meta->fd.GetFileSize();
  accumulated_raw_key_size_ += file_meta->raw_key_size;
  accumulated_raw_value_size_ += file_meta->raw_value_size;
  // Entries that are not deletions.
  accumulated_num_non_deletions_ += file_meta->num_entries - deletions;
  accumulated_num_deletions_ += deletions;
  ++num_samples_;
}
|
|
|
|
|
|
|
|
void Version::UpdateAccumulatedStats() {
|
|
|
|
// maximum number of table properties loaded from files.
|
|
|
|
const int kMaxInitCount = 20;
|
2014-06-25 00:37:06 +02:00
|
|
|
int init_count = 0;
|
2014-10-17 23:58:30 +02:00
|
|
|
// here only the first kMaxInitCount files which haven't been
|
|
|
|
// initialized from file will be updated with num_deletions.
|
|
|
|
// The motivation here is to cap the maximum I/O per Version creation.
|
|
|
|
// The reason for choosing files from lower-level instead of higher-level
|
|
|
|
// is that such design is able to propagate the initialization from
|
|
|
|
// lower-level to higher-level: When the num_deletions of lower-level
|
|
|
|
// files are updated, it will make the lower-level files have accurate
|
|
|
|
// compensated_file_size, making lower-level to higher-level compaction
|
|
|
|
// will be triggered, which creates higher-level files whose num_deletions
|
|
|
|
// will be updated here.
|
|
|
|
for (int level = 0;
|
2014-10-31 16:48:19 +01:00
|
|
|
level < storage_info_.num_levels_ && init_count < kMaxInitCount;
|
|
|
|
++level) {
|
|
|
|
for (auto* file_meta : storage_info_.files_[level]) {
|
2014-06-25 00:37:06 +02:00
|
|
|
if (MaybeInitializeFileMetaData(file_meta)) {
|
|
|
|
// each FileMeta will be initialized only once.
|
2014-10-31 16:48:19 +01:00
|
|
|
storage_info_.UpdateAccumulatedStats(file_meta);
|
2014-10-17 23:58:30 +02:00
|
|
|
if (++init_count >= kMaxInitCount) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// In case all sampled-files contain only deletion entries, then we
|
|
|
|
// load the table-property of a file in higher-level to initialize
|
|
|
|
// that value.
|
2014-10-31 16:48:19 +01:00
|
|
|
for (int level = storage_info_.num_levels_ - 1;
|
|
|
|
storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
|
|
|
|
for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
|
|
|
|
storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
|
|
|
|
if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
|
|
|
|
storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
|
2014-06-25 00:37:06 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-31 16:48:19 +01:00
|
|
|
storage_info_.ComputeCompensatedSizes();
|
2014-10-27 23:49:46 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void VersionStorageInfo::ComputeCompensatedSizes() {
|
|
|
|
static const int kDeletionWeightOnCompaction = 2;
|
2014-06-25 00:37:06 +02:00
|
|
|
uint64_t average_value_size = GetAverageValueSize();
|
|
|
|
|
|
|
|
// compute the compensated size
|
|
|
|
for (int level = 0; level < num_levels_; level++) {
|
|
|
|
for (auto* file_meta : files_[level]) {
|
2014-07-09 21:46:08 +02:00
|
|
|
// Here we only compute compensated_file_size for those file_meta
|
2015-02-05 01:04:51 +01:00
|
|
|
// which compensated_file_size is uninitialized (== 0). This is true only
|
|
|
|
// for files that have been created right now and no other thread has
|
|
|
|
// access to them. That's why we can safely mutate compensated_file_size.
|
2014-07-09 21:46:08 +02:00
|
|
|
if (file_meta->compensated_file_size == 0) {
|
|
|
|
file_meta->compensated_file_size = file_meta->fd.GetFileSize() +
|
|
|
|
file_meta->num_deletions * average_value_size *
|
|
|
|
kDeletionWeightOnCompaction;
|
|
|
|
}
|
2014-06-25 00:37:06 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
int VersionStorageInfo::MaxInputLevel() const {
|
|
|
|
if (compaction_style_ == kCompactionStyleLevel) {
|
2014-11-04 02:45:55 +01:00
|
|
|
return num_levels() - 2;
|
2014-10-27 23:49:46 +01:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void VersionStorageInfo::ComputeCompactionScore(
|
2014-10-02 01:19:16 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options,
|
2015-02-05 01:04:51 +01:00
|
|
|
const CompactionOptionsFIFO& compaction_options_fifo) {
|
2014-01-16 01:23:36 +01:00
|
|
|
double max_score = 0;
|
|
|
|
int max_score_level = 0;
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
for (int level = 0; level <= MaxInputLevel(); level++) {
|
2014-01-16 01:23:36 +01:00
|
|
|
double score;
|
|
|
|
if (level == 0) {
|
|
|
|
// We treat level-0 specially by bounding the number of files
|
|
|
|
// instead of number of bytes for two reasons:
|
|
|
|
//
|
|
|
|
// (1) With larger write-buffer sizes, it is nice not to do too
|
|
|
|
// many level-0 compactions.
|
|
|
|
//
|
|
|
|
// (2) The files in level-0 are merged on every read and
|
|
|
|
// therefore we wish to avoid too many files when the individual
|
|
|
|
// file size is small (perhaps because of a small write-buffer
|
|
|
|
// setting, or very high compression ratios, or lots of
|
|
|
|
// overwrites/deletions).
|
2015-03-30 23:04:21 +02:00
|
|
|
int num_sorted_runs = 0;
|
2014-05-21 20:43:35 +02:00
|
|
|
uint64_t total_size = 0;
|
Add experimental API MarkForCompaction()
Summary:
Some Mongo+Rocks datasets in Parse's environment are not doing compactions very frequently. During the quiet period (with no IO), we'd like to schedule compactions so that our reads become faster. Also, aggressively compacting during quiet periods helps when write bursts happen. In addition, we also want to compact files that are containing deleted key ranges (like old oplog keys).
All of this is currently not possible with CompactRange() because it's single-threaded and blocks all other compactions from happening. Running CompactRange() risks an issue of blocking writes because we generate too much Level 0 files before the compaction is over. Stopping writes is very dangerous because they hold transaction locks. We tried running manual compaction once on Mongo+Rocks and everything fell apart.
MarkForCompaction() solves all of those problems. This is very light-weight manual compaction. It is lower priority than automatic compactions, which means it shouldn't interfere with background process keeping the LSM tree clean. However, if no automatic compactions need to be run (or we have extra background threads available), we will start compacting files that are marked for compaction.
Test Plan: added a new unit test
Reviewers: yhchiang, rven, MarkCallaghan, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37083
2015-04-18 01:44:45 +02:00
|
|
|
for (auto* f : files_[level]) {
|
|
|
|
if (!f->being_compacted) {
|
|
|
|
total_size += f->compensated_file_size;
|
2015-03-30 23:04:21 +02:00
|
|
|
num_sorted_runs++;
|
2014-01-16 01:23:36 +01:00
|
|
|
}
|
|
|
|
}
|
2015-03-30 23:04:21 +02:00
|
|
|
if (compaction_style_ == kCompactionStyleUniversal) {
|
|
|
|
// For universal compaction, we use level0 score to indicate
|
|
|
|
// compaction score for the whole DB. Adding other levels as if
|
|
|
|
// they are L0 files.
|
|
|
|
for (int i = 1; i < num_levels(); i++) {
|
|
|
|
if (!files_[i].empty() && !files_[i][0]->being_compacted) {
|
|
|
|
num_sorted_runs++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
if (compaction_style_ == kCompactionStyleFIFO) {
|
2014-05-21 20:43:35 +02:00
|
|
|
score = static_cast<double>(total_size) /
|
2014-10-27 23:49:46 +01:00
|
|
|
compaction_options_fifo.max_table_files_size;
|
2015-03-30 23:04:21 +02:00
|
|
|
} else if (num_sorted_runs >=
|
|
|
|
mutable_cf_options.level0_stop_writes_trigger) {
|
2014-05-21 20:43:35 +02:00
|
|
|
// If we are slowing down writes, then we better compact that first
|
2014-01-16 01:23:36 +01:00
|
|
|
score = 1000000;
|
2015-03-30 23:04:21 +02:00
|
|
|
} else if (num_sorted_runs >=
|
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger) {
|
2014-01-16 01:23:36 +01:00
|
|
|
score = 10000;
|
|
|
|
} else {
|
2015-03-30 23:04:21 +02:00
|
|
|
score = static_cast<double>(num_sorted_runs) /
|
2014-10-02 01:19:16 +02:00
|
|
|
mutable_cf_options.level0_file_num_compaction_trigger;
|
2014-01-16 01:23:36 +01:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Compute the ratio of current size to size limit.
|
2015-02-05 01:04:51 +01:00
|
|
|
uint64_t level_bytes_no_compacting = 0;
|
|
|
|
for (auto f : files_[level]) {
|
Add experimental API MarkForCompaction()
Summary:
Some Mongo+Rocks datasets in Parse's environment are not doing compactions very frequently. During the quiet period (with no IO), we'd like to schedule compactions so that our reads become faster. Also, aggressively compacting during quiet periods helps when write bursts happen. In addition, we also want to compact files that are containing deleted key ranges (like old oplog keys).
All of this is currently not possible with CompactRange() because it's single-threaded and blocks all other compactions from happening. Running CompactRange() risks an issue of blocking writes because we generate too much Level 0 files before the compaction is over. Stopping writes is very dangerous because they hold transaction locks. We tried running manual compaction once on Mongo+Rocks and everything fell apart.
MarkForCompaction() solves all of those problems. This is very light-weight manual compaction. It is lower priority than automatic compactions, which means it shouldn't interfere with background process keeping the LSM tree clean. However, if no automatic compactions need to be run (or we have extra background threads available), we will start compacting files that are marked for compaction.
Test Plan: added a new unit test
Reviewers: yhchiang, rven, MarkCallaghan, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37083
2015-04-18 01:44:45 +02:00
|
|
|
if (!f->being_compacted) {
|
2015-02-05 01:04:51 +01:00
|
|
|
level_bytes_no_compacting += f->compensated_file_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
score = static_cast<double>(level_bytes_no_compacting) /
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
MaxBytesForLevel(level);
|
2014-01-16 01:23:36 +01:00
|
|
|
if (max_score < score) {
|
|
|
|
max_score = score;
|
|
|
|
max_score_level = level;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
compaction_level_[level] = level;
|
|
|
|
compaction_score_[level] = score;
|
|
|
|
}
|
|
|
|
|
|
|
|
// update the max compaction score in levels 1 to n-1
|
|
|
|
max_compaction_score_ = max_score;
|
|
|
|
max_compaction_score_level_ = max_score_level;
|
|
|
|
|
|
|
|
// sort all the levels based on their score. Higher scores get listed
|
|
|
|
// first. Use bubble sort because the number of entries are small.
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int i = 0; i < num_levels() - 2; i++) {
|
|
|
|
for (int j = i + 1; j < num_levels() - 1; j++) {
|
2014-01-16 01:23:36 +01:00
|
|
|
if (compaction_score_[i] < compaction_score_[j]) {
|
|
|
|
double score = compaction_score_[i];
|
|
|
|
int level = compaction_level_[i];
|
|
|
|
compaction_score_[i] = compaction_score_[j];
|
|
|
|
compaction_level_[i] = compaction_level_[j];
|
|
|
|
compaction_score_[j] = score;
|
|
|
|
compaction_level_[j] = level;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
Add experimental API MarkForCompaction()
Summary:
Some Mongo+Rocks datasets in Parse's environment are not doing compactions very frequently. During the quiet period (with no IO), we'd like to schedule compactions so that our reads become faster. Also, aggressively compacting during quiet periods helps when write bursts happen. In addition, we also want to compact files that are containing deleted key ranges (like old oplog keys).
All of this is currently not possible with CompactRange() because it's single-threaded and blocks all other compactions from happening. Running CompactRange() risks an issue of blocking writes because we generate too much Level 0 files before the compaction is over. Stopping writes is very dangerous because they hold transaction locks. We tried running manual compaction once on Mongo+Rocks and everything fell apart.
MarkForCompaction() solves all of those problems. This is very light-weight manual compaction. It is lower priority than automatic compactions, which means it shouldn't interfere with background process keeping the LSM tree clean. However, if no automatic compactions need to be run (or we have extra background threads available), we will start compacting files that are marked for compaction.
Test Plan: added a new unit test
Reviewers: yhchiang, rven, MarkCallaghan, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37083
2015-04-18 01:44:45 +02:00
|
|
|
ComputeFilesMarkedForCompaction();
|
|
|
|
}
|
|
|
|
|
|
|
|
void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
|
|
|
|
files_marked_for_compaction_.clear();
|
|
|
|
for (int level = 0; level <= MaxInputLevel(); level++) {
|
|
|
|
for (auto* f : files_[level]) {
|
|
|
|
if (!f->being_compacted && f->marked_for_compaction) {
|
|
|
|
files_marked_for_compaction_.emplace_back(level, f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-01-16 01:23:36 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
2014-10-28 17:59:56 +01:00
|
|
|
|
|
|
|
// used to sort files by size
|
|
|
|
struct Fsize {
|
|
|
|
int index;
|
|
|
|
FileMetaData* file;
|
|
|
|
};
|
|
|
|
|
2014-01-16 01:23:36 +01:00
|
|
|
// Compator that is used to sort files based on their size
|
|
|
|
// In normal mode: descending size
|
2014-10-28 17:59:56 +01:00
|
|
|
bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
|
2014-06-25 00:37:06 +02:00
|
|
|
return (first.file->compensated_file_size >
|
|
|
|
second.file->compensated_file_size);
|
2014-01-16 01:23:36 +01:00
|
|
|
}
|
2014-10-28 17:59:56 +01:00
|
|
|
|
2014-01-16 08:12:31 +01:00
|
|
|
} // anonymous namespace
|
2014-01-16 01:23:36 +01:00
|
|
|
|
2014-11-13 22:41:43 +01:00
|
|
|
void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
|
2014-11-04 02:45:55 +01:00
|
|
|
assert(level < num_levels());
|
2014-10-31 16:48:19 +01:00
|
|
|
auto* level_files = &files_[level];
|
|
|
|
// Must not overlap
|
|
|
|
assert(level <= 0 || level_files->empty() ||
|
|
|
|
internal_comparator_->Compare(
|
|
|
|
(*level_files)[level_files->size() - 1]->largest, f->smallest) <
|
|
|
|
0);
|
|
|
|
f->refs++;
|
|
|
|
level_files->push_back(f);
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
// Version::PrepareApply() need to be called before calling the function, or
|
|
|
|
// following functions called:
|
|
|
|
// 1. UpdateNumNonEmptyLevels();
|
|
|
|
// 2. CalculateBaseBytes();
|
|
|
|
// 3. UpdateFilesBySize();
|
|
|
|
// 4. GenerateFileIndexer();
|
|
|
|
// 5. GenerateLevelFilesBrief();
|
|
|
|
void VersionStorageInfo::SetFinalized() {
|
|
|
|
finalized_ = true;
|
|
|
|
#ifndef NDEBUG
|
2015-03-30 23:04:21 +02:00
|
|
|
if (compaction_style_ != kCompactionStyleLevel) {
|
|
|
|
// Not level based compaction.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
assert(base_level_ < 0 || num_levels() == 1 ||
|
|
|
|
(base_level_ >= 1 && base_level_ < num_levels()));
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
// Verify all levels newer than base_level are empty except L0
|
|
|
|
for (int level = 1; level < base_level(); level++) {
|
|
|
|
assert(NumLevelBytes(level) == 0);
|
|
|
|
}
|
|
|
|
uint64_t max_bytes_prev_level = 0;
|
|
|
|
for (int level = base_level(); level < num_levels() - 1; level++) {
|
|
|
|
if (LevelFiles(level).size() == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
|
|
|
|
max_bytes_prev_level = MaxBytesForLevel(level);
|
|
|
|
}
|
|
|
|
int num_empty_non_l0_level = 0;
|
|
|
|
for (int level = 0; level < num_levels(); level++) {
|
|
|
|
assert(LevelFiles(level).size() == 0 ||
|
|
|
|
LevelFiles(level).size() == LevelFilesBrief(level).num_files);
|
|
|
|
if (level > 0 && NumLevelBytes(level) > 0) {
|
|
|
|
num_empty_non_l0_level++;
|
|
|
|
}
|
|
|
|
if (LevelFiles(level).size() > 0) {
|
|
|
|
assert(level < num_non_empty_levels());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(compaction_level_.size() > 0);
|
|
|
|
assert(compaction_level_.size() == compaction_score_.size());
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
void VersionStorageInfo::UpdateNumNonEmptyLevels() {
|
2014-06-14 00:06:10 +02:00
|
|
|
num_non_empty_levels_ = num_levels_;
|
|
|
|
for (int i = num_levels_ - 1; i >= 0; i--) {
|
|
|
|
if (files_[i].size() != 0) {
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
num_non_empty_levels_ = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
void VersionStorageInfo::UpdateFilesBySize() {
|
|
|
|
if (compaction_style_ == kCompactionStyleFIFO ||
|
|
|
|
compaction_style_ == kCompactionStyleUniversal) {
|
2014-05-21 20:43:35 +02:00
|
|
|
// don't need this
|
|
|
|
return;
|
|
|
|
}
|
2014-01-16 01:23:36 +01:00
|
|
|
// No need to sort the highest level because it is never compacted.
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int level = 0; level < num_levels() - 1; level++) {
|
2014-01-16 01:23:36 +01:00
|
|
|
const std::vector<FileMetaData*>& files = files_[level];
|
2014-07-01 08:55:04 +02:00
|
|
|
auto& files_by_size = files_by_size_[level];
|
2014-01-16 01:23:36 +01:00
|
|
|
assert(files_by_size.size() == 0);
|
|
|
|
|
|
|
|
// populate a temp vector for sorting based on size
|
|
|
|
std::vector<Fsize> temp(files.size());
|
|
|
|
for (unsigned int i = 0; i < files.size(); i++) {
|
|
|
|
temp[i].index = i;
|
|
|
|
temp[i].file = files[i];
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// sort the top number_of_files_to_sort_ based on file size
|
|
|
|
size_t num = VersionStorageInfo::kNumberFilesToSort;
|
2014-07-01 08:55:04 +02:00
|
|
|
if (num > temp.size()) {
|
|
|
|
num = temp.size();
|
2014-01-16 01:23:36 +01:00
|
|
|
}
|
2014-07-01 08:55:04 +02:00
|
|
|
std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
|
|
|
|
CompareCompensatedSizeDescending);
|
2014-01-16 01:23:36 +01:00
|
|
|
assert(temp.size() == files.size());
|
|
|
|
|
|
|
|
// initialize files_by_size_
|
|
|
|
for (unsigned int i = 0; i < temp.size(); i++) {
|
|
|
|
files_by_size.push_back(temp[i].index);
|
|
|
|
}
|
|
|
|
next_file_to_compact_by_size_[level] = 0;
|
|
|
|
assert(files_[level].size() == files_by_size_[level].size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Takes one more reference on this version.
void Version::Ref() { refs_++; }
|
|
|
|
|
2013-12-11 20:56:36 +01:00
|
|
|
bool Version::Unref() {
|
2011-03-18 23:37:00 +01:00
|
|
|
assert(refs_ >= 1);
|
|
|
|
--refs_;
|
|
|
|
if (refs_ == 0) {
|
2011-05-21 04:17:43 +02:00
|
|
|
delete this;
|
2013-12-11 20:56:36 +01:00
|
|
|
return true;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2013-12-11 20:56:36 +01:00
|
|
|
return false;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
bool VersionStorageInfo::OverlapInLevel(int level,
|
|
|
|
const Slice* smallest_user_key,
|
|
|
|
const Slice* largest_user_key) {
|
|
|
|
return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
|
2014-10-28 18:03:13 +01:00
|
|
|
level_files_brief_[level], smallest_user_key,
|
2014-02-01 00:30:27 +01:00
|
|
|
largest_user_key);
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
int VersionStorageInfo::PickLevelForMemTableOutput(
|
|
|
|
const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key,
|
2011-10-06 01:30:28 +02:00
|
|
|
const Slice& largest_user_key) {
|
|
|
|
int level = 0;
|
|
|
|
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
|
|
|
|
// Push to next level if there is no overlap in next level,
|
|
|
|
// and the #bytes overlapping in the level after that are limited.
|
|
|
|
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
|
|
|
|
std::vector<FileMetaData*> overlaps;
|
2014-10-24 00:37:14 +02:00
|
|
|
while (mutable_cf_options.max_mem_compaction_level > 0 &&
|
|
|
|
level < mutable_cf_options.max_mem_compaction_level) {
|
2011-10-06 01:30:28 +02:00
|
|
|
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
|
|
|
|
break;
|
|
|
|
}
|
2014-01-16 01:15:43 +01:00
|
|
|
if (level + 2 >= num_levels_) {
|
2012-10-31 19:47:18 +01:00
|
|
|
level++;
|
|
|
|
break;
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
|
2013-07-17 22:56:24 +02:00
|
|
|
const uint64_t sum = TotalFileSize(overlaps);
|
2014-10-02 01:19:16 +02:00
|
|
|
if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) {
|
2011-10-06 01:30:28 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
level++;
|
|
|
|
}
|
|
|
|
}
|
2012-06-23 04:30:03 +02:00
|
|
|
|
2011-10-06 01:30:28 +02:00
|
|
|
return level;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
2012-11-29 01:42:36 +01:00
|
|
|
// If hint_index is specified, then it points to a file in the
|
2012-11-06 18:06:16 +01:00
|
|
|
// overlapping range.
|
|
|
|
// The file_index returns a pointer to any file in an overlapping range.
|
2014-10-27 23:49:46 +01:00
|
|
|
void VersionStorageInfo::GetOverlappingInputs(
|
|
|
|
int level, const InternalKey* begin, const InternalKey* end,
|
|
|
|
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) {
|
2011-10-06 01:30:28 +02:00
|
|
|
inputs->clear();
|
|
|
|
Slice user_begin, user_end;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
user_begin = begin->user_key();
|
|
|
|
}
|
2013-03-01 03:04:58 +01:00
|
|
|
if (end != nullptr) {
|
2011-10-06 01:30:28 +02:00
|
|
|
user_end = end->user_key();
|
|
|
|
}
|
Assertion failure while running with unit tests with OPT=-g
Summary:
When we expand the range of keys for a level 0 compaction, we
need to invoke ParentFilesInCompaction() only once for the
entire range of keys that is being compacted. We were invoking
it for each file that was being compacted, but this triggers
an assertion because each file's range were contiguous but
non-overlapping.
I renamed ParentFilesInCompaction to ParentRangeInCompaction
to adequately represent that it is the range-of-keys and
not individual files that we compact in a single compaction run.
Here is the assertion that is fixed by this patch.
db_test: db/version_set.cc:585: void leveldb::Version::ExtendOverlappingInputs(int, const leveldb::Slice&, const leveldb::Slice&, std::vector<leveldb::FileMetaData*, std::allocator<leveldb::FileMetaData*> >*, int): Assertion `user_cmp->Compare(flimit, user_begin) >= 0' failed.
Test Plan: make clean check OPT=-g
Reviewers: sheki
Reviewed By: sheki
CC: MarkCallaghan, emayanke, leveldb
Differential Revision: https://reviews.facebook.net/D6963
2012-11-26 10:49:50 +01:00
|
|
|
if (file_index) {
|
|
|
|
*file_index = -1;
|
|
|
|
}
|
2014-10-27 23:49:46 +01:00
|
|
|
const Comparator* user_cmp = user_comparator_;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && end != nullptr && level > 0) {
|
2012-11-06 18:06:16 +01:00
|
|
|
GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
|
|
|
|
hint_index, file_index);
|
2012-11-05 08:47:06 +01:00
|
|
|
return;
|
|
|
|
}
|
2014-10-28 18:03:13 +01:00
|
|
|
for (size_t i = 0; i < level_files_brief_[level].num_files; ) {
|
|
|
|
FdWithKeyRange* f = &(level_files_brief_[level].files[i++]);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice file_start = ExtractUserKey(f->smallest_key);
|
|
|
|
const Slice file_limit = ExtractUserKey(f->largest_key);
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// "f" is completely before specified range; skip it
|
2013-03-01 03:04:58 +01:00
|
|
|
} else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
|
2011-10-06 01:30:28 +02:00
|
|
|
// "f" is completely after specified range; skip it
|
|
|
|
} else {
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
inputs->push_back(files_[level][i-1]);
|
2011-10-31 18:22:06 +01:00
|
|
|
if (level == 0) {
|
|
|
|
// Level-0 files may overlap each other. So check if the newly
|
|
|
|
// added file has expanded the range. If so, restart search.
|
2013-03-01 03:04:58 +01:00
|
|
|
if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
|
2011-10-31 18:22:06 +01:00
|
|
|
user_begin = file_start;
|
|
|
|
inputs->clear();
|
|
|
|
i = 0;
|
2013-03-01 03:04:58 +01:00
|
|
|
} else if (end != nullptr
|
|
|
|
&& user_cmp->Compare(file_limit, user_end) > 0) {
|
2011-10-31 18:22:06 +01:00
|
|
|
user_end = file_limit;
|
|
|
|
inputs->clear();
|
|
|
|
i = 0;
|
|
|
|
}
|
2012-11-06 18:06:16 +01:00
|
|
|
} else if (file_index) {
|
2014-11-11 22:47:22 +01:00
|
|
|
*file_index = static_cast<int>(i) - 1;
|
2011-10-31 18:22:06 +01:00
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
|
|
|
// Employ binary search to find at least one file that overlaps the
|
|
|
|
// specified range. From that file, iterate backwards and
|
|
|
|
// forwards to find all overlapping files.
|
2014-10-27 23:49:46 +01:00
|
|
|
void VersionStorageInfo::GetOverlappingInputsBinarySearch(
|
|
|
|
int level, const Slice& user_begin, const Slice& user_end,
|
|
|
|
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) {
|
2012-11-05 08:47:06 +01:00
|
|
|
assert(level > 0);
|
|
|
|
int min = 0;
|
|
|
|
int mid = 0;
|
2014-11-11 22:47:22 +01:00
|
|
|
int max = static_cast<int>(files_[level].size()) - 1;
|
2012-11-05 08:47:06 +01:00
|
|
|
bool foundOverlap = false;
|
2014-10-27 23:49:46 +01:00
|
|
|
const Comparator* user_cmp = user_comparator_;
|
2012-11-06 18:06:16 +01:00
|
|
|
|
|
|
|
// if the caller already knows the index of a file that has overlap,
|
|
|
|
// then we can skip the binary search.
|
|
|
|
if (hint_index != -1) {
|
|
|
|
mid = hint_index;
|
|
|
|
foundOverlap = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!foundOverlap && min <= max) {
|
2012-11-05 08:47:06 +01:00
|
|
|
mid = (min + max)/2;
|
2014-10-28 18:03:13 +01:00
|
|
|
FdWithKeyRange* f = &(level_files_brief_[level].files[mid]);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice file_start = ExtractUserKey(f->smallest_key);
|
|
|
|
const Slice file_limit = ExtractUserKey(f->largest_key);
|
2012-11-05 08:47:06 +01:00
|
|
|
if (user_cmp->Compare(file_limit, user_begin) < 0) {
|
|
|
|
min = mid + 1;
|
|
|
|
} else if (user_cmp->Compare(user_end, file_start) < 0) {
|
|
|
|
max = mid - 1;
|
|
|
|
} else {
|
|
|
|
foundOverlap = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// If there were no overlapping files, return immediately.
|
|
|
|
if (!foundOverlap) {
|
|
|
|
return;
|
|
|
|
}
|
2012-11-06 18:06:16 +01:00
|
|
|
// returns the index where an overlap is found
|
|
|
|
if (file_index) {
|
|
|
|
*file_index = mid;
|
|
|
|
}
|
2012-11-05 08:47:06 +01:00
|
|
|
ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid);
|
|
|
|
}
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2012-11-05 08:47:06 +01:00
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
|
|
|
// The midIndex specifies the index of at least one file that
|
|
|
|
// overlaps the specified range. From that file, iterate backward
|
|
|
|
// and forward to find all overlapping files.
|
2014-10-27 23:49:46 +01:00
|
|
|
// Use FileLevel in searching, make it faster
|
|
|
|
void VersionStorageInfo::ExtendOverlappingInputs(
|
|
|
|
int level, const Slice& user_begin, const Slice& user_end,
|
|
|
|
std::vector<FileMetaData*>* inputs, unsigned int midIndex) {
|
|
|
|
|
|
|
|
const Comparator* user_cmp = user_comparator_;
|
2014-10-28 18:03:13 +01:00
|
|
|
const FdWithKeyRange* files = level_files_brief_[level].files;
|
2012-11-06 18:06:16 +01:00
|
|
|
#ifndef NDEBUG
|
|
|
|
{
|
|
|
|
// assert that the file at midIndex overlaps with the range
|
2014-10-28 18:03:13 +01:00
|
|
|
assert(midIndex < level_files_brief_[level].num_files);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const FdWithKeyRange* f = &files[midIndex];
|
|
|
|
const Slice fstart = ExtractUserKey(f->smallest_key);
|
|
|
|
const Slice flimit = ExtractUserKey(f->largest_key);
|
2012-11-06 18:06:16 +01:00
|
|
|
if (user_cmp->Compare(fstart, user_begin) >= 0) {
|
|
|
|
assert(user_cmp->Compare(fstart, user_end) <= 0);
|
|
|
|
} else {
|
|
|
|
assert(user_cmp->Compare(flimit, user_begin) >= 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2012-12-31 07:18:52 +01:00
|
|
|
int startIndex = midIndex + 1;
|
|
|
|
int endIndex = midIndex;
|
2013-01-14 21:39:24 +01:00
|
|
|
int count __attribute__((unused)) = 0;
|
2012-11-05 08:47:06 +01:00
|
|
|
|
|
|
|
// check backwards from 'mid' to lower indices
|
2012-12-31 07:18:52 +01:00
|
|
|
for (int i = midIndex; i >= 0 ; i--) {
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const FdWithKeyRange* f = &files[i];
|
|
|
|
const Slice file_limit = ExtractUserKey(f->largest_key);
|
2012-11-05 08:47:06 +01:00
|
|
|
if (user_cmp->Compare(file_limit, user_begin) >= 0) {
|
2012-12-31 07:18:52 +01:00
|
|
|
startIndex = i;
|
|
|
|
assert((count++, true));
|
2012-11-05 08:47:06 +01:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// check forward from 'mid+1' to higher indices
|
2014-10-28 18:03:13 +01:00
|
|
|
for (unsigned int i = midIndex+1;
|
|
|
|
i < level_files_brief_[level].num_files; i++) {
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const FdWithKeyRange* f = &files[i];
|
|
|
|
const Slice file_start = ExtractUserKey(f->smallest_key);
|
2012-11-05 08:47:06 +01:00
|
|
|
if (user_cmp->Compare(file_start, user_end) <= 0) {
|
2012-12-31 07:18:52 +01:00
|
|
|
assert((count++, true));
|
|
|
|
endIndex = i;
|
2012-11-05 08:47:06 +01:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-12-31 07:18:52 +01:00
|
|
|
assert(count == endIndex - startIndex + 1);
|
|
|
|
|
|
|
|
// insert overlapping files into vector
|
|
|
|
for (int i = startIndex; i <= endIndex; i++) {
|
|
|
|
FileMetaData* f = files_[level][i];
|
2013-01-08 21:00:13 +01:00
|
|
|
inputs->push_back(f);
|
2012-12-31 07:18:52 +01:00
|
|
|
}
|
2012-11-05 08:47:06 +01:00
|
|
|
}
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Returns true iff the first or last file in inputs contains
|
|
|
|
// an overlapping user key to the file "just outside" of it (i.e.
|
|
|
|
// just after the last file, or just before the first file)
|
|
|
|
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
|
2014-10-27 23:49:46 +01:00
|
|
|
bool VersionStorageInfo::HasOverlappingUserKey(
|
|
|
|
const std::vector<FileMetaData*>* inputs, int level) {
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
|
|
|
// If inputs empty, there is no overlap.
|
|
|
|
// If level == 0, it is assumed that all needed files were already included.
|
|
|
|
if (inputs->empty() || level == 0){
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
const Comparator* user_cmp = user_comparator_;
|
|
|
|
const rocksdb::LevelFilesBrief& file_level = level_files_brief_[level];
|
2014-10-28 18:03:13 +01:00
|
|
|
const FdWithKeyRange* files = level_files_brief_[level].files;
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const size_t kNumFiles = file_level.num_files;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
|
|
|
// Check the last file in inputs against the file after it
|
2014-10-27 23:49:46 +01:00
|
|
|
size_t last_file = FindFile(*internal_comparator_, file_level,
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
inputs->back()->largest.Encode());
|
2014-10-01 11:09:22 +02:00
|
|
|
assert(last_file < kNumFiles); // File should exist!
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
if (last_file < kNumFiles-1) { // If not the last file
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice last_key_in_input = ExtractUserKey(
|
|
|
|
files[last_file].largest_key);
|
|
|
|
const Slice first_key_after = ExtractUserKey(
|
|
|
|
files[last_file+1].smallest_key);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help of gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
|
|
|
|
// The last user key in input overlaps with the next file's first key
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check the first file in inputs against the file just before it
|
2014-10-27 23:49:46 +01:00
|
|
|
size_t first_file = FindFile(*internal_comparator_, file_level,
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help of gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
inputs->front()->smallest.Encode());
|
2014-10-01 11:09:22 +02:00
|
|
|
assert(first_file <= last_file); // File should exist!
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help of gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
if (first_file > 0) { // If not first file
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 07:14:39 +02:00
|
|
|
const Slice& first_key_in_input = ExtractUserKey(
|
|
|
|
files[first_file].smallest_key);
|
|
|
|
const Slice& last_key_before = ExtractUserKey(
|
|
|
|
files[first_file-1].largest_key);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help of gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
|
|
|
|
// The first user key in input overlaps with the previous file's last key
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
|
2014-01-16 01:18:04 +01:00
|
|
|
assert(level >= 0);
|
2014-11-04 02:45:55 +01:00
|
|
|
assert(level < num_levels());
|
2014-01-16 01:18:04 +01:00
|
|
|
return TotalFileSize(files_[level]);
|
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// Writes a one-line, human-readable summary of the per-level file counts into
// scratch->buffer and returns a pointer to that buffer. Example shape:
//   "base level 1 max bytes base 10485760 files[4 0 2]"
//
// The "base level ..." prefix is emitted only for level-style compaction with
// more than one level. The output is always NUL-terminated and is silently
// truncated if it does not fit in scratch->buffer.
const char* VersionStorageInfo::LevelSummary(
    LevelSummaryStorage* scratch) const {
  char* const buf = scratch->buffer;
  const int capacity = static_cast<int>(sizeof(scratch->buffer));
  int len = 0;

  if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
    assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
    const int ret =
        snprintf(buf, capacity, "base level %d max bytes base %" PRIu64 " ",
                 base_level_, level_max_bytes_[base_level_]);
    // snprintf reports the length it *wanted* to write (or a negative value
    // on error); clamp so `len` never points past the terminating NUL.
    if (ret > 0) {
      len = (ret < capacity) ? ret : capacity - 1;
    }
  }

  {
    const int room = capacity - len;
    const int ret = snprintf(buf + len, room, "files[");
    if (ret > 0) {
      len += (ret < room) ? ret : room - 1;
    }
  }

  // Append "<count> " for every level, stopping at the first entry that does
  // not fit completely.
  bool wrote_count = false;
  for (int i = 0; i < num_levels(); i++) {
    const int sz = capacity - len;
    const int ret =
        snprintf(buf + len, sz, "%d ", static_cast<int>(files_[i].size()));
    if (ret < 0 || ret >= sz) break;
    len += ret;
    wrote_count = true;
  }

  if (wrote_count) {
    // overwrite the last space; skipped when no count was appended so that
    // the opening '[' is not erased
    --len;
  }
  snprintf(buf + len, capacity - len, "]");
  return scratch->buffer;
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
// Writes a one-line description of every file in `level` into
// scratch->buffer and returns a pointer to that buffer. Each file is rendered
// as "#<number>(seq=<smallest_seqno>,sz=<human size>,<being_compacted>)",
// entries are space-separated and the list is wrapped in "files_size[...]".
// Formatting stops at the first entry that does not fit in the buffer.
const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
                                                 int level) const {
  char* const out = scratch->buffer;
  const auto& level_files = files_[level];

  int written = snprintf(out, sizeof(scratch->buffer), "files_size[");
  for (auto it = level_files.begin(); it != level_files.end(); ++it) {
    const auto& meta = *it;
    int remaining = sizeof(scratch->buffer) - written;

    // Human-readable rendering of the file size (e.g. with a unit suffix);
    // the exact format is decided by AppendHumanBytes.
    char human_size[16];
    AppendHumanBytes(meta->fd.GetFileSize(), human_size, sizeof(human_size));

    const int n = snprintf(out + written, remaining,
                           "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
                           meta->fd.GetNumber(), meta->smallest_seqno,
                           human_size,
                           static_cast<int>(meta->being_compacted));
    if (n < 0 || n >= remaining) {
      break;  // entry did not fit; keep what has been written so far
    }
    written += n;
  }

  // Drop the trailing separator, but only when the level actually has files.
  if (!level_files.empty() && written > 0) {
    --written;
  }
  snprintf(out + written, sizeof(scratch->buffer) - written, "]");
  return scratch->buffer;
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
|
2014-01-16 01:18:04 +01:00
|
|
|
uint64_t result = 0;
|
|
|
|
std::vector<FileMetaData*> overlaps;
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int level = 1; level < num_levels() - 1; level++) {
|
2014-01-16 01:18:04 +01:00
|
|
|
for (const auto& f : files_[level]) {
|
|
|
|
GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
|
|
|
|
const uint64_t sum = TotalFileSize(overlaps);
|
|
|
|
if (sum > result) {
|
|
|
|
result = sum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
With a fixed max_bytes_for_level_base, the size ratio between the largest level and the second-largest one can range from 0 to the multiplier. This frequently makes the LSM tree irregular and unpredictable, and it can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce the option options.level_compaction_dynamic_level_bytes. When it is turned on, RocksDB is free to pick a level base in the range (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that the real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
// Returns the entry of level_max_bytes_ for `level`.
//
// The value stored for level 0 is not really meaningful: the level-0
// compaction threshold is driven by the number of files, not by bytes.
// `level` must be a valid index into level_max_bytes_ (checked by asserts).
uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
  assert(level >= 0);
  assert(level < static_cast<int>(level_max_bytes_.size()));
  const uint64_t limit = level_max_bytes_[level];
  return limit;
}
|
|
|
|
|
|
|
|
void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
|
|
|
|
const MutableCFOptions& options) {
|
2015-03-30 23:04:21 +02:00
|
|
|
// Special logic to set number of sorted runs.
|
|
|
|
// It is to match the previous behavior when all files are in L0.
|
|
|
|
int num_l0_count = static_cast<int>(files_[0].size());
|
|
|
|
if (compaction_style_ == kCompactionStyleUniversal) {
|
|
|
|
// For universal compaction, we use level0 score to indicate
|
|
|
|
// compaction score for the whole DB. Adding other levels as if
|
|
|
|
// they are L0 files.
|
|
|
|
for (int i = 1; i < num_levels(); i++) {
|
|
|
|
if (!files_[i].empty()) {
|
|
|
|
num_l0_count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
set_l0_delay_trigger_count(num_l0_count);
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
level_max_bytes_.resize(ioptions.num_levels);
|
|
|
|
if (!ioptions.level_compaction_dynamic_level_bytes) {
|
2015-03-30 23:04:21 +02:00
|
|
|
base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
|
|
|
|
// Calculate for static bytes base case
|
|
|
|
for (int i = 0; i < ioptions.num_levels; ++i) {
|
|
|
|
if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
level_max_bytes_[i] = options.max_bytes_for_level_base;
|
|
|
|
} else if (i > 1) {
|
|
|
|
level_max_bytes_[i] = MultiplyCheckOverflow(
|
|
|
|
MultiplyCheckOverflow(level_max_bytes_[i - 1],
|
|
|
|
options.max_bytes_for_level_multiplier),
|
2015-03-30 23:04:21 +02:00
|
|
|
options.MaxBytesMultiplerAdditional(i - 1));
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
} else {
|
|
|
|
level_max_bytes_[i] = options.max_bytes_for_level_base;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
uint64_t max_level_size = 0;
|
|
|
|
|
|
|
|
int first_non_empty_level = -1;
|
|
|
|
// Find size of non-L0 level of most data.
|
|
|
|
// Cannot use the size of the last level because it can be empty or less
|
|
|
|
// than previous levels after compaction.
|
|
|
|
for (int i = 1; i < num_levels_; i++) {
|
|
|
|
uint64_t total_size = 0;
|
|
|
|
for (const auto& f : files_[i]) {
|
|
|
|
total_size += f->fd.GetFileSize();
|
|
|
|
}
|
|
|
|
if (total_size > 0 && first_non_empty_level == -1) {
|
|
|
|
first_non_empty_level = i;
|
|
|
|
}
|
|
|
|
if (total_size > max_level_size) {
|
|
|
|
max_level_size = total_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Prefill every level's max bytes to disallow compaction from there.
|
|
|
|
for (int i = 0; i < num_levels_; i++) {
|
|
|
|
level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (max_level_size == 0) {
|
|
|
|
// No data for L1 and up. L0 compacts to last level directly.
|
|
|
|
// No compaction from L1+ needs to be scheduled.
|
|
|
|
base_level_ = num_levels_ - 1;
|
|
|
|
} else {
|
|
|
|
uint64_t base_bytes_max = options.max_bytes_for_level_base;
|
|
|
|
uint64_t base_bytes_min =
|
|
|
|
base_bytes_max / options.max_bytes_for_level_multiplier;
|
|
|
|
|
|
|
|
// Try whether we can make last level's target size to be max_level_size
|
|
|
|
uint64_t cur_level_size = max_level_size;
|
|
|
|
for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
|
|
|
|
// Round up after dividing
|
|
|
|
cur_level_size /= options.max_bytes_for_level_multiplier;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Calculate base level and its size.
|
2015-04-03 07:24:50 +02:00
|
|
|
uint64_t base_level_size;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
if (cur_level_size <= base_bytes_min) {
|
|
|
|
// Case 1. If we make target size of last level to be max_level_size,
|
|
|
|
// target size of the first non-empty level would be smaller than
|
|
|
|
// base_bytes_min. We set it be base_bytes_min.
|
2015-04-03 07:24:50 +02:00
|
|
|
base_level_size = base_bytes_min + 1U;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
base_level_ = first_non_empty_level;
|
|
|
|
Warn(ioptions.info_log,
|
|
|
|
"More existing levels in DB than needed. "
|
|
|
|
"max_bytes_for_level_multiplier may not be guaranteed.");
|
|
|
|
} else {
|
|
|
|
// Find base level (where L0 data is compacted to).
|
|
|
|
base_level_ = first_non_empty_level;
|
|
|
|
while (base_level_ > 1 && cur_level_size > base_bytes_max) {
|
|
|
|
--base_level_;
|
|
|
|
cur_level_size =
|
|
|
|
cur_level_size / options.max_bytes_for_level_multiplier;
|
|
|
|
}
|
|
|
|
if (cur_level_size > base_bytes_max) {
|
|
|
|
// Even L1 will be too large
|
|
|
|
assert(base_level_ == 1);
|
2015-04-03 07:24:50 +02:00
|
|
|
base_level_size = base_bytes_max;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
} else {
|
2015-04-03 07:24:50 +02:00
|
|
|
base_level_size = cur_level_size;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-04-03 07:24:50 +02:00
|
|
|
uint64_t level_size = base_level_size;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
for (int i = base_level_; i < num_levels_; i++) {
|
|
|
|
if (i > base_level_) {
|
2015-04-03 07:24:50 +02:00
|
|
|
level_size = MultiplyCheckOverflow(
|
|
|
|
level_size, options.max_bytes_for_level_multiplier);
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
}
|
|
|
|
level_max_bytes_[i] = level_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int level = 0; level < storage_info_.num_levels(); level++) {
|
2014-10-31 16:48:19 +01:00
|
|
|
const std::vector<FileMetaData*>& files = storage_info_.files_[level];
|
2014-01-16 01:18:04 +01:00
|
|
|
for (const auto& file : files) {
|
2014-07-02 18:54:20 +02:00
|
|
|
live->push_back(file->fd);
|
2014-01-16 01:18:04 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-12-16 03:28:36 +01:00
|
|
|
std::string Version::DebugString(bool hex) const {
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string r;
|
2014-10-31 16:48:19 +01:00
|
|
|
for (int level = 0; level < storage_info_.num_levels_; level++) {
|
2011-06-22 04:36:45 +02:00
|
|
|
// E.g.,
|
|
|
|
// --- level 1 ---
|
|
|
|
// 17:123['a' .. 'd']
|
|
|
|
// 20:43['e' .. 'g']
|
|
|
|
r.append("--- level ");
|
2011-03-18 23:37:00 +01:00
|
|
|
AppendNumberTo(&r, level);
|
2012-10-19 23:00:53 +02:00
|
|
|
r.append(" --- version# ");
|
|
|
|
AppendNumberTo(&r, version_number_);
|
2011-06-22 04:36:45 +02:00
|
|
|
r.append(" ---\n");
|
2014-10-31 16:48:19 +01:00
|
|
|
const std::vector<FileMetaData*>& files = storage_info_.files_[level];
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
r.push_back(' ');
|
2014-06-14 00:54:19 +02:00
|
|
|
AppendNumberTo(&r, files[i]->fd.GetNumber());
|
2011-03-18 23:37:00 +01:00
|
|
|
r.push_back(':');
|
2014-06-14 00:54:19 +02:00
|
|
|
AppendNumberTo(&r, files[i]->fd.GetFileSize());
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append("[");
|
2012-12-16 03:28:36 +01:00
|
|
|
r.append(files[i]->smallest.DebugString(hex));
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append(" .. ");
|
2012-12-16 03:28:36 +01:00
|
|
|
r.append(files[i]->largest.DebugString(hex));
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append("]\n");
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2012-10-19 23:00:53 +02:00
|
|
|
// this is used to batch writes to the manifest file
|
|
|
|
struct VersionSet::ManifestWriter {
|
|
|
|
Status status;
|
|
|
|
bool done;
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedCondVar cv;
|
2014-01-31 02:48:42 +01:00
|
|
|
ColumnFamilyData* cfd;
|
2012-10-19 23:00:53 +02:00
|
|
|
VersionEdit* edit;
|
2012-11-29 01:42:36 +01:00
|
|
|
|
2015-02-05 06:39:45 +01:00
|
|
|
explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
|
2014-01-31 02:48:42 +01:00
|
|
|
VersionEdit* e)
|
2014-10-31 19:59:54 +01:00
|
|
|
: done(false), cv(mu), cfd(_cfd), edit(e) {}
|
2012-10-19 23:00:53 +02:00
|
|
|
};
|
|
|
|
|
2014-09-09 00:25:01 +02:00
|
|
|
// Builds an empty VersionSet for the database `dbname`.
//
// The ColumnFamilySet is created eagerly from the supplied options, cache,
// write buffer and write controller. All counters start at zero, except the
// next file number, which starts at 2. The manifest file number stays 0
// until Recover() fills it in.
VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options,
                       const EnvOptions& storage_options, Cache* table_cache,
                       WriteBuffer* write_buffer,
                       WriteController* write_controller)
    : column_family_set_(
          new ColumnFamilySet(dbname, db_options, storage_options, table_cache,
                              write_buffer, write_controller)),
      env_(db_options->env),
      dbname_(dbname),
      db_options_(db_options),
      next_file_number_(2),
      manifest_file_number_(0),  // Filled by Recover()
      pending_manifest_file_number_(0),
      last_sequence_(0),
      prev_log_number_(0),
      current_version_number_(0),
      manifest_file_size_(0),
      env_options_(storage_options),
      // Starts out as a copy of the general env options.
      env_options_compactions_(env_options_) {
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Releases the column family set and every file record still parked in
// obsolete_files_.
VersionSet::~VersionSet() {
  // The column family set must go first: its destructor depends on this
  // VersionSet.
  column_family_set_.reset();

  for (auto* stale_file : obsolete_files_) {
    delete stale_file;
  }
  obsolete_files_.clear();
}
|
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
// Installs `v` as the current version of `column_family_data`.
//
// Steps, in order: compute v's compaction score, mark its storage info as
// finalized, drop the reference held on the previous current version (if
// any), make v current and take a reference on it, then link v into the
// column family's version list immediately before the dummy node.
// v must be unreferenced and must not already be the current version.
void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
                               Version* v) {
  // compute new compaction score
  v->storage_info()->ComputeCompactionScore(
      *column_family_data->GetLatestMutableCFOptions(),
      column_family_data->ioptions()->compaction_options_fifo);

  // From here on v is considered finalized.
  v->storage_info_.SetFinalized();

  // Swap the "current" pointer over to v.
  assert(v->refs_ == 0);
  Version* const outgoing = column_family_data->current();
  assert(v != outgoing);
  if (outgoing != nullptr) {
    assert(outgoing->refs_ > 0);
    outgoing->Unref();
  }
  column_family_data->SetCurrent(v);
  v->Ref();

  // Splice v in between the dummy node's predecessor and the dummy node.
  Version* const list_head = column_family_data->dummy_versions();
  Version* const list_tail = list_head->prev_;
  v->prev_ = list_tail;
  v->next_ = list_head;
  list_tail->next_ = v;
  list_head->prev_ = v;
}
|
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
// Writes `edit` to the MANIFEST and, on success, installs the resulting state.
//
// The caller must hold *mu. The request is queued in manifest_writers_; the
// writer at the front of the queue does the work on behalf of the following
// writers that target the same column family (column family add/drop edits
// are never batched). *mu is released while the MANIFEST is written and
// re-acquired before the result is installed.
//
// column_family_data: target column family; may be nullptr only when `edit`
//     is a column family add, in which case new_cf_options must be given.
// mutable_cf_options: forwarded to Version::PrepareApply for ordinary edits.
// db_directory: passed to SetCurrentFile unless disableDataSync is set.
// new_descriptor_log: request a fresh MANIFEST file; also forced when no
//     descriptor log exists yet or the current one exceeds
//     max_manifest_file_size.
//
// Returns the status of the MANIFEST write (or OK without writing anything if
// the column family was dropped while this request was queued).
Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
                               const MutableCFOptions& mutable_cf_options,
                               VersionEdit* edit, InstrumentedMutex* mu,
                               Directory* db_directory, bool new_descriptor_log,
                               const ColumnFamilyOptions* new_cf_options) {
  mu->AssertHeld();

  // column_family_data can be nullptr only if this is column_family_add.
  // in that case, we also need to specify ColumnFamilyOptions
  if (column_family_data == nullptr) {
    assert(edit->is_column_family_add_);
    assert(new_cf_options != nullptr);
  }

  // queue our request
  ManifestWriter w(mu, column_family_data, edit);
  manifest_writers_.push_back(&w);
  while (!w.done && &w != manifest_writers_.front()) {
    w.cv.Wait();
  }
  if (w.done) {
    // Another writer committed our edit as part of its batch.
    return w.status;
  }
  if (column_family_data != nullptr && column_family_data->IsDropped()) {
    // if column family is dropped by the time we get here, no need to write
    // anything to the manifest
    manifest_writers_.pop_front();
    // Notify new head of write queue
    if (!manifest_writers_.empty()) {
      manifest_writers_.front()->cv.Signal();
    }
    return Status::OK();
  }

  std::vector<VersionEdit*> batch_edits;
  Version* v = nullptr;
  std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(nullptr);

  // process all requests in the queue
  ManifestWriter* last_writer = &w;
  assert(!manifest_writers_.empty());
  assert(manifest_writers_.front() == &w);
  if (edit->IsColumnFamilyManipulation()) {
    // no group commits for column family add or drop
    LogAndApplyCFHelper(edit);
    batch_edits.push_back(edit);
  } else {
    v = new Version(column_family_data, this, current_version_number_++);
    builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data));
    auto* builder = builder_guard->version_builder();
    for (const auto& writer : manifest_writers_) {
      if (writer->edit->IsColumnFamilyManipulation() ||
          writer->cfd->GetID() != column_family_data->GetID()) {
        // no group commits for column family add or drop
        // also, group commits across column families are not supported
        break;
      }
      last_writer = writer;
      LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu);
      batch_edits.push_back(last_writer->edit);
    }
    builder->SaveTo(v->storage_info());
  }

  // Initialize new descriptor log file if necessary by creating
  // a temporary file that contains a snapshot of the current version.
  uint64_t new_manifest_file_size = 0;
  Status s;

  assert(pending_manifest_file_number_ == 0);
  if (!descriptor_log_ ||
      manifest_file_size_ > db_options_->max_manifest_file_size) {
    pending_manifest_file_number_ = NewFileNumber();
    batch_edits.back()->SetNextFile(next_file_number_.load());
    new_descriptor_log = true;
  } else {
    pending_manifest_file_number_ = manifest_file_number_;
  }

  if (new_descriptor_log) {
    // if we're writing out new snapshot make sure to persist max column family
    if (column_family_set_->GetMaxColumnFamily() > 0) {
      edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
    }
  }

  // Unlock during expensive operations. New writes cannot get here
  // because &w is ensuring that all new writes get queued.
  {
    mu->Unlock();

    if (!edit->IsColumnFamilyManipulation() &&
        db_options_->max_open_files == -1) {
      // unlimited table cache. Pre-load table handle now.
      // Need to do it out of the mutex.
      builder_guard->version_builder()->LoadTableHandlers();
    }

    // This is fine because everything inside of this block is serialized --
    // only one thread can be here at the same time
    if (new_descriptor_log) {
      // create manifest file
      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
          "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_);
      unique_ptr<WritableFile> descriptor_file;
      s = env_->NewWritableFile(
          DescriptorFileName(dbname_, pending_manifest_file_number_),
          &descriptor_file, env_->OptimizeForManifestWrite(env_options_));
      if (s.ok()) {
        descriptor_file->SetPreallocationBlockSize(
            db_options_->manifest_preallocation_size);
        descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
        s = WriteSnapshot(descriptor_log_.get());
      }
    }

    if (!edit->IsColumnFamilyManipulation()) {
      // This is cpu-heavy operations, which should be called outside mutex.
      v->PrepareApply(mutable_cf_options);
    }

    // Write new record to MANIFEST log
    if (s.ok()) {
      for (auto& e : batch_edits) {
        std::string record;
        if (!e->EncodeTo(&record)) {
          s = Status::Corruption(
              "Unable to Encode VersionEdit:" + e->DebugString(true));
          break;
        }
        s = descriptor_log_->AddRecord(record);
        if (!s.ok()) {
          break;
        }
      }
      if (s.ok()) {
        s = SyncManifest(env_, db_options_, descriptor_log_->file());
      }
      if (!s.ok()) {
        Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log,
            "MANIFEST write: %s\n", s.ToString().c_str());
        // The write reported an error, but the records may still have reached
        // the file. Check whether every record of the batch is present.
        bool all_records_in = true;
        for (auto& e : batch_edits) {
          std::string record;
          if (!e->EncodeTo(&record)) {
            s = Status::Corruption(
                "Unable to Encode VersionEdit:" + e->DebugString(true));
            all_records_in = false;
            break;
          }
          if (!ManifestContains(pending_manifest_file_number_, record)) {
            all_records_in = false;
            break;
          }
        }
        if (all_records_in) {
          Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log,
              "MANIFEST contains log record despite error; advancing to new "
              "version to prevent mismatch between in-memory and logged state"
              " If paranoid is set, then the db is now in readonly mode.");
          s = Status::OK();
        }
      }
    }

    // If we just created a new descriptor file, install it by writing a
    // new CURRENT file that points to it.
    if (s.ok() && new_descriptor_log) {
      s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
                         db_options_->disableDataSync ? nullptr : db_directory);
      if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
        // delete old manifest file
        Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
            "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
            manifest_file_number_, pending_manifest_file_number_);
        // we don't care about an error here, PurgeObsoleteFiles will take care
        // of it later
        env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_));
      }
    }

    if (s.ok()) {
      // find offset in manifest file where this version is stored.
      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
    }

    LogFlush(db_options_->info_log);
    mu->Lock();
  }

  // Install the new version
  if (s.ok()) {
    if (edit->is_column_family_add_) {
      // no group commit on column family add
      assert(batch_edits.size() == 1);
      assert(new_cf_options != nullptr);
      CreateColumnFamily(*new_cf_options, edit);
    } else if (edit->is_column_family_drop_) {
      assert(batch_edits.size() == 1);
      column_family_data->SetDropped();
      if (column_family_data->Unref()) {
        delete column_family_data;
      }
    } else {
      uint64_t max_log_number_in_batch = 0;
      for (auto& e : batch_edits) {
        if (e->has_log_number_) {
          max_log_number_in_batch =
              std::max(max_log_number_in_batch, e->log_number_);
        }
      }
      if (max_log_number_in_batch != 0) {
        assert(column_family_data->GetLogNumber() <= max_log_number_in_batch);
        column_family_data->SetLogNumber(max_log_number_in_batch);
      }
      AppendVersion(column_family_data, v);
    }

    manifest_file_number_ = pending_manifest_file_number_;
    manifest_file_size_ = new_manifest_file_size;
    prev_log_number_ = edit->prev_log_number_;
  } else {
    // v is only allocated for ordinary edits. For a column family add/drop it
    // is still nullptr here, and for a column family add column_family_data is
    // nullptr as well, so neither may be dereferenced unconditionally.
    if (v != nullptr) {
      Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log,
          "Error in committing version %lu to [%s]",
          (unsigned long)v->GetVersionNumber(),
          column_family_data->GetName().c_str());
    } else {
      Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log,
          "Error in committing column family edit: %s",
          s.ToString().c_str());
    }
    delete v;
    if (new_descriptor_log) {
      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
          "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
          manifest_file_number_, pending_manifest_file_number_);
      descriptor_log_.reset();
      env_->DeleteFile(
          DescriptorFileName(dbname_, pending_manifest_file_number_));
    }
  }
  pending_manifest_file_number_ = 0;

  // wake up all the waiting writers
  while (true) {
    ManifestWriter* ready = manifest_writers_.front();
    manifest_writers_.pop_front();
    if (ready != &w) {
      ready->status = s;
      ready->done = true;
      ready->cv.Signal();
    }
    if (ready == last_writer) break;
  }
  // Notify new head of write queue
  if (!manifest_writers_.empty()) {
    manifest_writers_.front()->cv.Signal();
  }
  return s;
}
|
|
|
|
|
2014-03-13 02:09:03 +01:00
|
|
|
// Stamps a column family add/drop edit with the current next-file number and
// last sequence before it is logged. A drop edit additionally records the
// maximum column family ID.
void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
  assert(edit->IsColumnFamilyManipulation());

  edit->SetNextFile(next_file_number_.load());
  edit->SetLastSequence(last_sequence_);

  if (!edit->is_column_family_drop_) {
    return;
  }
  // if we drop column family, we have to make sure to save max column family,
  // so that we don't reuse existing ID
  edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
}
|
|
|
|
|
2014-10-31 16:48:19 +01:00
|
|
|
void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
|
|
|
|
VersionBuilder* builder, Version* v,
|
2015-02-05 06:39:45 +01:00
|
|
|
VersionEdit* edit, InstrumentedMutex* mu) {
|
2012-10-19 23:00:53 +02:00
|
|
|
mu->AssertHeld();
|
2014-03-13 02:09:03 +01:00
|
|
|
assert(!edit->IsColumnFamilyManipulation());
|
2012-10-19 23:00:53 +02:00
|
|
|
|
2014-02-28 23:05:11 +01:00
|
|
|
if (edit->has_log_number_) {
|
|
|
|
assert(edit->log_number_ >= cfd->GetLogNumber());
|
2014-11-08 00:44:12 +01:00
|
|
|
assert(edit->log_number_ < next_file_number_.load());
|
2014-02-28 19:29:37 +01:00
|
|
|
}
|
2014-02-28 23:05:11 +01:00
|
|
|
|
2014-03-13 02:09:03 +01:00
|
|
|
if (!edit->has_prev_log_number_) {
|
|
|
|
edit->SetPrevLogNumber(prev_log_number_);
|
|
|
|
}
|
2014-11-08 00:44:12 +01:00
|
|
|
edit->SetNextFile(next_file_number_.load());
|
2014-03-13 02:09:03 +01:00
|
|
|
edit->SetLastSequence(last_sequence_);
|
|
|
|
|
2014-02-28 23:05:11 +01:00
|
|
|
builder->Apply(edit);
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
Status VersionSet::Recover(
|
2014-04-09 18:56:17 +02:00
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
bool read_only) {
|
2014-01-22 20:44:53 +01:00
|
|
|
std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
cf_name_to_options.insert({cf.name, cf.options});
|
|
|
|
}
|
|
|
|
// keeps track of column families in manifest that were not found in
|
|
|
|
// column families parameters. if those column families are not dropped
|
|
|
|
// by subsequent manifest records, Recover() will return failure status
|
2014-04-09 19:38:05 +02:00
|
|
|
std::unordered_map<int, std::string> column_families_not_found;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Read "CURRENT" file, which contains a pointer to the current manifest file
|
2013-12-31 03:33:57 +01:00
|
|
|
std::string manifest_filename;
|
|
|
|
Status s = ReadFileToString(
|
|
|
|
env_, CurrentFileName(dbname_), &manifest_filename
|
|
|
|
);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2013-12-31 03:33:57 +01:00
|
|
|
if (manifest_filename.empty() ||
|
|
|
|
manifest_filename.back() != '\n') {
|
2011-03-18 23:37:00 +01:00
|
|
|
return Status::Corruption("CURRENT file does not end with newline");
|
|
|
|
}
|
2013-12-31 03:33:57 +01:00
|
|
|
// remove the trailing '\n'
|
|
|
|
manifest_filename.resize(manifest_filename.size() - 1);
|
2014-03-12 18:52:32 +01:00
|
|
|
FileType type;
|
|
|
|
bool parse_ok =
|
|
|
|
ParseFileName(manifest_filename, &manifest_file_number_, &type);
|
|
|
|
if (!parse_ok || type != kDescriptorFile) {
|
|
|
|
return Status::Corruption("CURRENT file corrupted");
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
|
|
|
"Recovering from manifest file: %s\n",
|
2013-12-31 03:33:57 +01:00
|
|
|
manifest_filename.c_str());
|
2012-08-23 04:15:06 +02:00
|
|
|
|
2013-12-31 03:33:57 +01:00
|
|
|
manifest_filename = dbname_ + "/" + manifest_filename;
|
|
|
|
unique_ptr<SequentialFile> manifest_file;
|
2014-03-12 18:52:32 +01:00
|
|
|
s = env_->NewSequentialFile(manifest_filename, &manifest_file,
|
2014-09-09 00:25:01 +02:00
|
|
|
env_options_);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2014-11-06 20:14:28 +01:00
|
|
|
uint64_t current_manifest_file_size;
|
|
|
|
s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size);
|
2012-09-24 23:01:01 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
bool have_log_number = false;
|
2011-04-12 21:38:58 +02:00
|
|
|
bool have_prev_log_number = false;
|
2011-03-18 23:37:00 +01:00
|
|
|
bool have_next_file = false;
|
|
|
|
bool have_last_sequence = false;
|
|
|
|
uint64_t next_file = 0;
|
2011-04-12 21:38:58 +02:00
|
|
|
uint64_t last_sequence = 0;
|
|
|
|
uint64_t log_number = 0;
|
2014-11-06 20:14:28 +01:00
|
|
|
uint64_t previous_log_number = 0;
|
2014-03-05 21:13:44 +01:00
|
|
|
uint32_t max_column_family = 0;
|
2014-10-31 16:48:19 +01:00
|
|
|
std::unordered_map<uint32_t, BaseReferencedVersionBuilder*> builders;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-01-06 22:31:06 +01:00
|
|
|
// add default column family
|
2014-04-09 18:56:17 +02:00
|
|
|
auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
|
2014-01-22 20:44:53 +01:00
|
|
|
if (default_cf_iter == cf_name_to_options.end()) {
|
2014-03-13 02:09:03 +01:00
|
|
|
return Status::InvalidArgument("Default column family not specified");
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
2014-03-13 02:09:03 +01:00
|
|
|
VersionEdit default_cf_edit;
|
2014-04-09 18:56:17 +02:00
|
|
|
default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
|
2014-03-13 02:09:03 +01:00
|
|
|
default_cf_edit.SetColumnFamily(0);
|
|
|
|
ColumnFamilyData* default_cfd =
|
|
|
|
CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
|
2014-10-31 16:48:19 +01:00
|
|
|
builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)});
|
2014-01-06 22:31:06 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
{
|
2014-01-22 20:44:53 +01:00
|
|
|
VersionSet::LogReporter reporter;
|
2011-03-18 23:37:00 +01:00
|
|
|
reporter.status = &s;
|
2013-12-31 03:33:57 +01:00
|
|
|
log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/,
|
|
|
|
0 /*initial_offset*/);
|
2011-03-18 23:37:00 +01:00
|
|
|
Slice record;
|
|
|
|
std::string scratch;
|
|
|
|
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
2014-01-15 00:27:09 +01:00
|
|
|
VersionEdit edit;
|
2011-03-18 23:37:00 +01:00
|
|
|
s = edit.DecodeFrom(record);
|
2014-01-11 00:12:34 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-02-03 21:08:33 +01:00
|
|
|
// Not found means that user didn't supply that column
|
|
|
|
// family option AND we encountered column family add
|
|
|
|
// record. Once we encounter column family drop record,
|
|
|
|
// we will delete the column family from
|
|
|
|
// column_families_not_found.
|
2014-01-22 20:44:53 +01:00
|
|
|
bool cf_in_not_found =
|
|
|
|
column_families_not_found.find(edit.column_family_) !=
|
|
|
|
column_families_not_found.end();
|
2014-02-03 21:08:33 +01:00
|
|
|
// in builders means that user supplied that column family
|
|
|
|
// option AND that we encountered column family add record
|
2014-01-22 20:44:53 +01:00
|
|
|
bool cf_in_builders =
|
|
|
|
builders.find(edit.column_family_) != builders.end();
|
|
|
|
|
|
|
|
// they can't both be true
|
|
|
|
assert(!(cf_in_not_found && cf_in_builders));
|
|
|
|
|
2014-02-28 20:25:38 +01:00
|
|
|
ColumnFamilyData* cfd = nullptr;
|
|
|
|
|
2014-01-02 18:08:12 +01:00
|
|
|
if (edit.is_column_family_add_) {
|
2014-01-22 20:44:53 +01:00
|
|
|
if (cf_in_builders || cf_in_not_found) {
|
|
|
|
s = Status::Corruption(
|
|
|
|
"Manifest adding the same column family twice");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
auto cf_options = cf_name_to_options.find(edit.column_family_name_);
|
|
|
|
if (cf_options == cf_name_to_options.end()) {
|
2014-04-09 19:38:05 +02:00
|
|
|
column_families_not_found.insert(
|
|
|
|
{edit.column_family_, edit.column_family_name_});
|
2014-01-22 20:44:53 +01:00
|
|
|
} else {
|
2014-02-28 20:25:38 +01:00
|
|
|
cfd = CreateColumnFamily(cf_options->second, &edit);
|
2014-10-31 16:48:19 +01:00
|
|
|
builders.insert(
|
|
|
|
{edit.column_family_, new BaseReferencedVersionBuilder(cfd)});
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
2014-01-11 00:12:34 +01:00
|
|
|
} else if (edit.is_column_family_drop_) {
|
2014-01-22 20:44:53 +01:00
|
|
|
if (cf_in_builders) {
|
|
|
|
auto builder = builders.find(edit.column_family_);
|
|
|
|
assert(builder != builders.end());
|
|
|
|
delete builder->second;
|
|
|
|
builders.erase(builder);
|
2014-02-28 20:25:38 +01:00
|
|
|
cfd = column_family_set_->GetColumnFamily(edit.column_family_);
|
2014-02-11 02:04:44 +01:00
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
2014-02-28 20:25:38 +01:00
|
|
|
cfd = nullptr;
|
2014-02-11 02:04:44 +01:00
|
|
|
} else {
|
|
|
|
// who else can have reference to cfd!?
|
|
|
|
assert(false);
|
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
} else if (cf_in_not_found) {
|
|
|
|
column_families_not_found.erase(edit.column_family_);
|
|
|
|
} else {
|
|
|
|
s = Status::Corruption(
|
|
|
|
"Manifest - dropping non-existing column family");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else if (!cf_in_not_found) {
|
|
|
|
if (!cf_in_builders) {
|
|
|
|
s = Status::Corruption(
|
|
|
|
"Manifest record referencing unknown column family");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-28 20:25:38 +01:00
|
|
|
cfd = column_family_set_->GetColumnFamily(edit.column_family_);
|
2014-01-22 20:44:53 +01:00
|
|
|
// this should never happen since cf_in_builders is true
|
|
|
|
assert(cfd != nullptr);
|
2014-11-04 02:45:55 +01:00
|
|
|
if (edit.max_level_ >= cfd->current()->storage_info()->num_levels()) {
|
2014-01-22 02:01:52 +01:00
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"db has more levels than options.num_levels");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-28 20:25:38 +01:00
|
|
|
// if it is not column family add or column family drop,
|
|
|
|
// then it's a file add/delete, which should be forwarded
|
|
|
|
// to builder
|
|
|
|
auto builder = builders.find(edit.column_family_);
|
|
|
|
assert(builder != builders.end());
|
2014-11-04 02:45:55 +01:00
|
|
|
builder->second->version_builder()->Apply(&edit);
|
2014-02-28 20:25:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cfd != nullptr) {
|
2014-01-28 20:05:04 +01:00
|
|
|
if (edit.has_log_number_) {
|
2014-03-14 21:11:41 +01:00
|
|
|
if (cfd->GetLogNumber() > edit.log_number_) {
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log,
|
2014-03-31 21:44:54 +02:00
|
|
|
"MANIFEST corruption detected, but ignored - Log numbers in "
|
|
|
|
"records NOT monotonically increasing");
|
2014-03-18 21:24:27 +01:00
|
|
|
} else {
|
|
|
|
cfd->SetLogNumber(edit.log_number_);
|
|
|
|
have_log_number = true;
|
2014-03-14 21:11:41 +01:00
|
|
|
}
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
2014-02-03 21:08:33 +01:00
|
|
|
if (edit.has_comparator_ &&
|
|
|
|
edit.comparator_ != cfd->user_comparator()->Name()) {
|
|
|
|
s = Status::InvalidArgument(
|
|
|
|
cfd->user_comparator()->Name(),
|
|
|
|
"does not match existing comparator " + edit.comparator_);
|
|
|
|
break;
|
|
|
|
}
|
2014-01-02 18:08:12 +01:00
|
|
|
}
|
|
|
|
|
2011-04-12 21:38:58 +02:00
|
|
|
if (edit.has_prev_log_number_) {
|
2014-11-06 20:14:28 +01:00
|
|
|
previous_log_number = edit.prev_log_number_;
|
2011-04-12 21:38:58 +02:00
|
|
|
have_prev_log_number = true;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (edit.has_next_file_number_) {
|
|
|
|
next_file = edit.next_file_number_;
|
|
|
|
have_next_file = true;
|
|
|
|
}
|
|
|
|
|
2014-03-05 21:13:44 +01:00
|
|
|
if (edit.has_max_column_family_) {
|
|
|
|
max_column_family = edit.max_column_family_;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (edit.has_last_sequence_) {
|
2011-04-12 21:38:58 +02:00
|
|
|
last_sequence = edit.last_sequence_;
|
2011-03-18 23:37:00 +01:00
|
|
|
have_last_sequence = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
if (!have_next_file) {
|
|
|
|
s = Status::Corruption("no meta-nextfile entry in descriptor");
|
|
|
|
} else if (!have_log_number) {
|
|
|
|
s = Status::Corruption("no meta-lognumber entry in descriptor");
|
|
|
|
} else if (!have_last_sequence) {
|
|
|
|
s = Status::Corruption("no last-sequence-number entry in descriptor");
|
|
|
|
}
|
2011-04-12 21:38:58 +02:00
|
|
|
|
|
|
|
if (!have_prev_log_number) {
|
2014-11-06 20:14:28 +01:00
|
|
|
previous_log_number = 0;
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
2011-09-01 21:08:02 +02:00
|
|
|
|
2014-03-05 21:13:44 +01:00
|
|
|
column_family_set_->UpdateMaxColumnFamily(max_column_family);
|
|
|
|
|
2014-11-08 00:44:12 +01:00
|
|
|
MarkFileNumberUsedDuringRecovery(previous_log_number);
|
|
|
|
MarkFileNumberUsedDuringRecovery(log_number);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
// there were some column families in the MANIFEST that weren't specified
|
2014-04-09 18:56:17 +02:00
|
|
|
// in the argument. This is OK in read_only mode
|
2014-09-26 18:04:44 +02:00
|
|
|
if (read_only == false && !column_families_not_found.empty()) {
|
2014-04-09 18:56:17 +02:00
|
|
|
std::string list_of_not_found;
|
2014-04-09 19:38:05 +02:00
|
|
|
for (const auto& cf : column_families_not_found) {
|
|
|
|
list_of_not_found += ", " + cf.second;
|
2014-04-09 18:56:17 +02:00
|
|
|
}
|
|
|
|
list_of_not_found = list_of_not_found.substr(2);
|
2014-01-22 20:44:53 +01:00
|
|
|
s = Status::InvalidArgument(
|
2014-04-09 19:38:05 +02:00
|
|
|
"You have to open all column families. Column families not opened: " +
|
|
|
|
list_of_not_found);
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2014-01-22 20:44:53 +01:00
|
|
|
for (auto cfd : *column_family_set_) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-02-12 23:01:30 +01:00
|
|
|
auto builders_iter = builders.find(cfd->GetID());
|
|
|
|
assert(builders_iter != builders.end());
|
2014-11-04 02:45:55 +01:00
|
|
|
auto* builder = builders_iter->second->version_builder();
|
2014-02-12 23:01:30 +01:00
|
|
|
|
2014-09-09 00:25:01 +02:00
|
|
|
if (db_options_->max_open_files == -1) {
|
2014-02-12 19:43:27 +01:00
|
|
|
// unlimited table cache. Pre-load table handle now.
|
|
|
|
// Need to do it out of the mutex.
|
2014-02-12 23:01:30 +01:00
|
|
|
builder->LoadTableHandlers();
|
|
|
|
}
|
2014-02-12 19:43:27 +01:00
|
|
|
|
2014-02-01 00:30:27 +01:00
|
|
|
Version* v = new Version(cfd, this, current_version_number_++);
|
2014-10-31 16:48:19 +01:00
|
|
|
builder->SaveTo(v->storage_info());
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
// Install recovered version
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
v->PrepareApply(*cfd->GetLatestMutableCFOptions());
|
2014-01-22 20:44:53 +01:00
|
|
|
AppendVersion(cfd, v);
|
2014-01-11 00:12:34 +01:00
|
|
|
}
|
Prevent segfault because SizeUnderCompaction was called without any locks.
Summary:
SizeBeingCompacted was called without any lock protection. This causes
crashes, especially when running db_bench with value_size=128K.
The fix is to compute SizeUnderCompaction while holding the mutex and
passing in these values into the call to Finalize.
(gdb) where
#4 leveldb::VersionSet::SizeBeingCompacted (this=this@entry=0x7f0b490931c0, level=level@entry=4) at db/version_set.cc:1827
#5 0x000000000043a3c8 in leveldb::VersionSet::Finalize (this=this@entry=0x7f0b490931c0, v=v@entry=0x7f0b3b86b480) at db/version_set.cc:1420
#6 0x00000000004418d1 in leveldb::VersionSet::LogAndApply (this=0x7f0b490931c0, edit=0x7f0b3dc8c200, mu=0x7f0b490835b0, new_descriptor_log=<optimized out>) at db/version_set.cc:1016
#7 0x00000000004222b2 in leveldb::DBImpl::InstallCompactionResults (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1473
#8 0x0000000000426027 in leveldb::DBImpl::DoCompactionWork (this=this@entry=0x7f0b49083400, compact=compact@entry=0x7f0b2b8330f0) at db/db_impl.cc:1757
#9 0x0000000000426690 in leveldb::DBImpl::BackgroundCompaction (this=this@entry=0x7f0b49083400, madeProgress=madeProgress@entry=0x7f0b41bf2d1e, deletion_state=...) at db/db_impl.cc:1268
#10 0x0000000000428f42 in leveldb::DBImpl::BackgroundCall (this=0x7f0b49083400) at db/db_impl.cc:1170
#11 0x000000000045348e in BGThread (this=0x7f0b49023100) at util/env_posix.cc:941
#12 leveldb::(anonymous namespace)::PosixEnv::BGThreadWrapper (arg=0x7f0b49023100) at util/env_posix.cc:874
#13 0x00007f0b4a7cf10d in start_thread (arg=0x7f0b41bf3700) at pthread_create.c:301
#14 0x00007f0b49b4b11d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Test Plan:
make check
I am running db_bench with a value size of 128K to see if the segfault is fixed.
Reviewers: MarkCallaghan, sheki, emayanke
Reviewed By: sheki
CC: leveldb
Differential Revision: https://reviews.facebook.net/D9279
2013-03-11 17:47:48 +01:00
|
|
|
|
2014-11-06 20:14:28 +01:00
|
|
|
manifest_file_size_ = current_manifest_file_size;
|
2014-11-08 00:44:12 +01:00
|
|
|
next_file_number_.store(next_file + 1);
|
2011-05-21 04:17:43 +02:00
|
|
|
last_sequence_ = last_sequence;
|
2014-11-06 20:14:28 +01:00
|
|
|
prev_log_number_ = previous_log_number;
|
2012-08-23 04:15:06 +02:00
|
|
|
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
2014-07-02 18:54:20 +02:00
|
|
|
"Recovered from manifest file:%s succeeded,"
|
2013-11-13 06:02:03 +01:00
|
|
|
"manifest_file_number is %lu, next_file_number is %lu, "
|
|
|
|
"last_sequence is %lu, log_number is %lu,"
|
2014-03-05 21:13:44 +01:00
|
|
|
"prev_log_number is %lu,"
|
|
|
|
"max_column_family is %u\n",
|
2014-07-02 18:54:20 +02:00
|
|
|
manifest_filename.c_str(), (unsigned long)manifest_file_number_,
|
2014-11-08 00:44:12 +01:00
|
|
|
(unsigned long)next_file_number_.load(), (unsigned long)last_sequence_,
|
2014-07-02 18:54:20 +02:00
|
|
|
(unsigned long)log_number, (unsigned long)prev_log_number_,
|
2014-03-05 21:13:44 +01:00
|
|
|
column_family_set_->GetMaxColumnFamily());
|
2014-01-28 20:05:04 +01:00
|
|
|
|
|
|
|
for (auto cfd : *column_family_set_) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
2014-04-25 15:51:16 +02:00
|
|
|
"Column family [%s] (ID %u), log number is %" PRIu64 "\n",
|
|
|
|
cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
for (auto builder : builders) {
|
|
|
|
delete builder.second;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
// Collects the names of the column families recorded in the manifest of the
// database located at `dbname`.
//
// The manifest is located through the "CURRENT" file and then replayed record
// by record: every column-family-add edit registers a name under its ID and
// every column-family-drop edit removes it again. The default column family
// (ID 0) is always part of the result.
//
// column_families: output. Once the manifest has been opened it is cleared
//                  and, on success, filled with the surviving names ordered
//                  by ascending column family ID. If CURRENT cannot be read,
//                  is malformed, or the manifest cannot be opened, the
//                  function returns early and the vector is left untouched.
// dbname:          directory of the database.
// env:             environment used for all file access.
//
// Returns OK on success; otherwise the first error hit while reading CURRENT,
// opening the manifest, or decoding/validating its records (Corruption for a
// CURRENT file without a trailing newline, a duplicate column family add, or
// a drop of an unknown column family).
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
                                      const std::string& dbname, Env* env) {
  // these are just for performance reasons, not correctness,
  // so we're fine using the defaults
  EnvOptions soptions;

  // Read "CURRENT" file, which contains a pointer to the current manifest file
  std::string current;
  Status s = ReadFileToString(env, CurrentFileName(dbname), &current);
  if (!s.ok()) {
    return s;
  }
  if (current.empty() || current.back() != '\n') {
    return Status::Corruption("CURRENT file does not end with newline");
  }
  // Strip the trailing newline; what remains is the manifest file name.
  current.pop_back();

  const std::string dscname = dbname + "/" + current;
  std::unique_ptr<SequentialFile> file;
  s = env->NewSequentialFile(dscname, &file, soptions);
  if (!s.ok()) {
    return s;
  }

  // Column family ID -> name; std::map keeps the result ordered by ID.
  std::map<uint32_t, std::string> column_family_names;
  // default column family is always implicitly there
  column_family_names.insert({0, kDefaultColumnFamilyName});

  // NOTE(review): the reporter is handed a pointer to `s`, presumably so that
  // read errors detected by the log reader surface in `s` -- the loop below
  // re-checks s.ok() after every record for that reason. Confirm against
  // LogReporter.
  VersionSet::LogReporter reporter;
  reporter.status = &s;
  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
                     0 /*initial_offset*/);
  Slice record;
  std::string scratch;
  while (reader.ReadRecord(&record, &scratch) && s.ok()) {
    VersionEdit edit;
    s = edit.DecodeFrom(record);
    if (!s.ok()) {
      break;
    }
    if (edit.is_column_family_add_) {
      // insert() leaves the map unchanged and reports failure when the ID is
      // already present, i.e. the manifest adds the same family twice.
      const bool inserted =
          column_family_names
              .insert({edit.column_family_, edit.column_family_name_})
              .second;
      if (!inserted) {
        s = Status::Corruption("Manifest adding the same column family twice");
        break;
      }
    } else if (edit.is_column_family_drop_) {
      // erase() returns the number of removed entries; zero means the
      // manifest drops a column family it never added.
      if (column_family_names.erase(edit.column_family_) == 0) {
        s = Status::Corruption(
            "Manifest - dropping non-existing column family");
        break;
      }
    }
  }

  // The output is reset even on failure so that callers never observe a
  // partially replayed list.
  column_families->clear();
  if (s.ok()) {
    for (const auto& id_and_name : column_family_names) {
      column_families->push_back(id_and_name.second);
    }
  }

  return s;
}
|
2012-08-17 19:48:40 +02:00
|
|
|
|
2014-04-15 22:39:26 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
|
|
|
|
const Options* options,
|
2014-09-09 00:25:01 +02:00
|
|
|
const EnvOptions& env_options,
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
int new_levels) {
|
|
|
|
if (new_levels <= 1) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Number of levels needs to be bigger than 1");
|
|
|
|
}
|
|
|
|
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
ColumnFamilyOptions cf_options(*options);
|
2015-03-17 23:04:37 +01:00
|
|
|
std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
|
|
|
|
options->table_cache_numshardbits));
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
WriteController wc;
|
2014-12-02 21:09:20 +01:00
|
|
|
WriteBuffer wb(options->db_write_buffer_size);
|
|
|
|
VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
Status status;
|
|
|
|
|
2014-01-25 00:03:54 +01:00
|
|
|
std::vector<ColumnFamilyDescriptor> dummy;
|
2014-04-09 18:56:17 +02:00
|
|
|
ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
|
2014-02-26 19:03:34 +01:00
|
|
|
ColumnFamilyOptions(*options));
|
2014-02-01 04:44:48 +01:00
|
|
|
dummy.push_back(dummy_descriptor);
|
2014-01-25 00:03:54 +01:00
|
|
|
status = versions.Recover(dummy);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2014-01-27 23:33:50 +01:00
|
|
|
Version* current_version =
|
2014-01-29 22:28:50 +01:00
|
|
|
versions.GetColumnFamilySet()->GetDefault()->current();
|
2014-10-31 16:48:19 +01:00
|
|
|
auto* vstorage = current_version->storage_info();
|
2014-11-04 02:45:55 +01:00
|
|
|
int current_levels = vstorage->num_levels();
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
|
|
|
|
if (current_levels <= new_levels) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure there are file only on one level from
|
|
|
|
// (new_levels-1) to (current_levels-1)
|
|
|
|
int first_nonempty_level = -1;
|
|
|
|
int first_nonempty_level_filenum = 0;
|
|
|
|
for (int i = new_levels - 1; i < current_levels; i++) {
|
2014-10-27 23:49:46 +01:00
|
|
|
int file_num = vstorage->NumLevelFiles(i);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
if (file_num != 0) {
|
|
|
|
if (first_nonempty_level < 0) {
|
|
|
|
first_nonempty_level = i;
|
|
|
|
first_nonempty_level_filenum = file_num;
|
|
|
|
} else {
|
|
|
|
char msg[255];
|
|
|
|
snprintf(msg, sizeof(msg),
|
|
|
|
"Found at least two levels containing files: "
|
|
|
|
"[%d:%d],[%d:%d].\n",
|
|
|
|
first_nonempty_level, first_nonempty_level_filenum, i,
|
|
|
|
file_num);
|
|
|
|
return Status::InvalidArgument(msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-25 03:30:00 +01:00
|
|
|
// we need to allocate an array with the old number of levels size to
|
|
|
|
// avoid SIGSEGV in WriteSnapshot()
|
|
|
|
// however, all levels bigger or equal to new_levels will be empty
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
std::vector<FileMetaData*>* new_files_list =
|
2014-01-25 03:30:00 +01:00
|
|
|
new std::vector<FileMetaData*>[current_levels];
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
for (int i = 0; i < new_levels - 1; i++) {
|
2014-10-31 16:48:19 +01:00
|
|
|
new_files_list[i] = vstorage->LevelFiles(i);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (first_nonempty_level > 0) {
|
2014-10-31 16:48:19 +01:00
|
|
|
new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
}
|
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
delete[] vstorage -> files_;
|
|
|
|
vstorage->files_ = new_files_list;
|
|
|
|
vstorage->num_levels_ = new_levels;
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options));
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
VersionEdit ve;
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutex dummy_mutex;
|
|
|
|
InstrumentedMutexLock l(&dummy_mutex);
|
2014-10-02 01:19:16 +02:00
|
|
|
return versions.LogAndApply(
|
|
|
|
versions.GetColumnFamilySet()->GetDefault(),
|
|
|
|
mutable_cf_options, &ve, &dummy_mutex, nullptr, true);
|
Make VersionSet::ReduceNumberOfLevels() static
Summary:
A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method.
Also, I removed the method from a separate file since it breaks filename completition. version_se<TAB> now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file.
Test Plan: reduce_levels_test
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15303
2014-01-24 23:57:04 +01:00
|
|
|
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
// Reads the manifest file `dscname`, replays every VersionEdit it contains and
// prints a human-readable summary to stdout.
//
//   options  - supplies the Env used to open the file and the column family
//              options used for every column family created during replay.
//   dscname  - path of the manifest (descriptor) file to dump.
//   verbose  - when true, each decoded edit is printed as it is read.
//   hex      - forwarded to the DebugString() calls that format the output.
//
// Returns the status of opening the file, of decoding a record, or a
// Corruption status when the manifest is inconsistent (duplicate column family
// add, drop/reference of an unknown column family, missing next-file or
// last-sequence entry). On success next_file_number_, last_sequence_ and
// prev_log_number_ are updated from the manifest contents.
Status VersionSet::DumpManifest(Options& options, std::string& dscname,
                                bool verbose, bool hex) {
  // Open the specified manifest file.
  unique_ptr<SequentialFile> file;
  Status s = options.env->NewSequentialFile(dscname, &file, env_options_);
  if (!s.ok()) {
    return s;
  }

  bool have_prev_log_number = false;
  bool have_next_file = false;
  bool have_last_sequence = false;
  uint64_t next_file = 0;
  uint64_t last_sequence = 0;
  uint64_t previous_log_number = 0;
  int count = 0;
  // Both maps are keyed by column family ID.
  std::unordered_map<uint32_t, std::string> comparators;
  std::unordered_map<uint32_t, BaseReferencedVersionBuilder*> builders;

  // add default column family
  VersionEdit default_cf_edit;
  default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
  default_cf_edit.SetColumnFamily(0);
  ColumnFamilyData* default_cfd =
      CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit);
  builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)});

  {
    VersionSet::LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(std::move(file), &reporter, true /*checksum*/,
                       0 /*initial_offset*/);
    Slice record;
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit;
      s = edit.DecodeFrom(record);
      if (!s.ok()) {
        break;
      }

      // Write out each individual edit
      if (verbose) {
        printf("*************************Edit[%d] = %s\n",
               count, edit.DebugString(hex).c_str());
      }
      count++;

      bool cf_in_builders =
          builders.find(edit.column_family_) != builders.end();

      if (edit.has_comparator_) {
        comparators.insert({edit.column_family_, edit.comparator_});
      }

      ColumnFamilyData* cfd = nullptr;

      if (edit.is_column_family_add_) {
        if (cf_in_builders) {
          s = Status::Corruption(
              "Manifest adding the same column family twice");
          break;
        }
        cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit);
        builders.insert(
            {edit.column_family_, new BaseReferencedVersionBuilder(cfd)});
      } else if (edit.is_column_family_drop_) {
        if (!cf_in_builders) {
          s = Status::Corruption(
              "Manifest - dropping non-existing column family");
          break;
        }
        auto builder_iter = builders.find(edit.column_family_);
        delete builder_iter->second;
        builders.erase(builder_iter);
        comparators.erase(edit.column_family_);
        cfd = column_family_set_->GetColumnFamily(edit.column_family_);
        assert(cfd != nullptr);
        cfd->Unref();
        delete cfd;
        // cfd is gone; make sure the log-number update below skips it.
        cfd = nullptr;
      } else {
        if (!cf_in_builders) {
          s = Status::Corruption(
              "Manifest record referencing unknown column family");
          break;
        }

        cfd = column_family_set_->GetColumnFamily(edit.column_family_);
        // this should never happen since cf_in_builders is true
        assert(cfd != nullptr);

        // if it is not column family add or column family drop,
        // then it's a file add/delete, which should be forwarded
        // to builder
        auto builder = builders.find(edit.column_family_);
        assert(builder != builders.end());
        builder->second->version_builder()->Apply(&edit);
      }

      if (cfd != nullptr && edit.has_log_number_) {
        cfd->SetLogNumber(edit.log_number_);
      }

      if (edit.has_prev_log_number_) {
        previous_log_number = edit.prev_log_number_;
        have_prev_log_number = true;
      }

      if (edit.has_next_file_number_) {
        next_file = edit.next_file_number_;
        have_next_file = true;
      }

      if (edit.has_last_sequence_) {
        last_sequence = edit.last_sequence_;
        have_last_sequence = true;
      }

      if (edit.has_max_column_family_) {
        column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_);
      }
    }
  }
  file.reset();

  if (s.ok()) {
    if (!have_next_file) {
      s = Status::Corruption("no meta-nextfile entry in descriptor");
      printf("no meta-nextfile entry in descriptor");
    } else if (!have_last_sequence) {
      printf("no last-sequence-number entry in descriptor");
      s = Status::Corruption("no last-sequence-number entry in descriptor");
    }

    if (!have_prev_log_number) {
      previous_log_number = 0;
    }
  }

  if (s.ok()) {
    for (auto cfd : *column_family_set_) {
      if (cfd->IsDropped()) {
        continue;
      }
      auto builders_iter = builders.find(cfd->GetID());
      assert(builders_iter != builders.end());
      auto builder = builders_iter->second->version_builder();

      // Materialize a throw-away Version purely for printing.
      Version* v = new Version(cfd, this, current_version_number_++);
      builder->SaveTo(v->storage_info());
      v->PrepareApply(*cfd->GetLatestMutableCFOptions());

      printf("--------------- Column family \"%s\" (ID %u) --------------\n",
             cfd->GetName().c_str(), static_cast<unsigned int>(cfd->GetID()));
      // PRIu64 instead of %lu + (unsigned long): the latter truncates on
      // platforms where unsigned long is 32 bits wide.
      printf("log number: %" PRIu64 "\n",
             static_cast<uint64_t>(cfd->GetLogNumber()));
      auto comparator = comparators.find(cfd->GetID());
      if (comparator != comparators.end()) {
        printf("comparator: %s\n", comparator->second.c_str());
      } else {
        printf("comparator: <NO COMPARATOR>\n");
      }
      printf("%s \n", v->DebugString(hex).c_str());
      delete v;
    }

    next_file_number_.store(next_file + 1);
    last_sequence_ = last_sequence;
    prev_log_number_ = previous_log_number;

    printf(
        "next_file_number %" PRIu64 " last_sequence "
        "%" PRIu64 " prev_log_number %" PRIu64 " max_column_family %u\n",
        static_cast<uint64_t>(next_file_number_.load()),
        static_cast<uint64_t>(last_sequence),
        static_cast<uint64_t>(previous_log_number),
        column_family_set_->GetMaxColumnFamily());
  }

  // Free builders. This runs on every exit path past this point, not only on
  // success, so a corrupt or truncated manifest no longer leaks them.
  for (auto& builder : builders) {
    delete builder.second;
  }

  return s;
}
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|
2012-08-17 19:48:40 +02:00
|
|
|
|
2014-11-08 00:44:12 +01:00
|
|
|
void VersionSet::MarkFileNumberUsedDuringRecovery(uint64_t number) {
|
|
|
|
// only called during recovery which is single threaded, so this works because
|
|
|
|
// there can't be concurrent calls
|
|
|
|
if (next_file_number_.load(std::memory_order_relaxed) <= number) {
|
|
|
|
next_file_number_.store(number + 1, std::memory_order_relaxed);
|
2011-09-01 21:08:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Writes the current state of every live (non-dropped) column family to `log`
// as two records per column family: one carrying the column family identity
// and comparator name, one carrying its files and log number.
//
// Returns the first failing status, Corruption if an edit cannot be encoded,
// and OK otherwise.
Status VersionSet::WriteSnapshot(log::Writer* log) {
  // TODO: Break up into multiple records to reduce memory usage on recovery?

  // WARNING: This method doesn't hold a mutex!!

  // It runs without the DB mutex, but only from within single-threaded
  // LogAndApply. Column family manipulations also happen only there (on that
  // same thread), so iterating the set here is safe.

  // Encodes one edit and appends it to the log as a single record.
  auto append_edit = [log](VersionEdit& pending) -> Status {
    std::string encoded;
    if (!pending.EncodeTo(&encoded)) {
      return Status::Corruption(
          "Unable to Encode VersionEdit:" + pending.DebugString(true));
    }
    return log->AddRecord(encoded);
  };

  for (auto cfd : *column_family_set_) {
    if (cfd->IsDropped()) {
      continue;
    }

    // Record 1: column family info.
    VersionEdit cf_info;
    if (cfd->GetID() != 0) {
      // The default column family always exists, so only non-default ones
      // need an explicit "add" entry.
      cf_info.AddColumnFamily(cfd->GetName());
      cf_info.SetColumnFamily(cfd->GetID());
    }
    cf_info.SetComparatorName(
        cfd->internal_comparator().user_comparator()->Name());
    Status info_status = append_edit(cf_info);
    if (!info_status.ok()) {
      return info_status;
    }

    // Record 2: the files of every level plus the log number.
    VersionEdit cf_files;
    cf_files.SetColumnFamily(cfd->GetID());
    for (int lvl = 0; lvl < cfd->NumberLevels(); ++lvl) {
      const auto& level_files =
          cfd->current()->storage_info()->LevelFiles(lvl);
      for (const auto& meta : level_files) {
        cf_files.AddFile(lvl, meta->fd.GetNumber(), meta->fd.GetPathId(),
                         meta->fd.GetFileSize(), meta->smallest, meta->largest,
                         meta->smallest_seqno, meta->largest_seqno);
      }
    }
    cf_files.SetLogNumber(cfd->GetLogNumber());
    Status files_status = append_edit(cf_files);
    if (!files_status.ok()) {
      return files_status;
    }
  }

  return Status::OK();
}
|
|
|
|
|
2013-01-08 21:00:13 +01:00
|
|
|
// Opens the mainfest file and reads all records
|
|
|
|
// till it finds the record we are looking for.
|
2014-11-06 20:14:28 +01:00
|
|
|
bool VersionSet::ManifestContains(uint64_t manifest_file_num,
|
2014-03-18 05:50:15 +01:00
|
|
|
const std::string& record) const {
|
2014-11-06 20:14:28 +01:00
|
|
|
std::string fname = DescriptorFileName(dbname_, manifest_file_num);
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
|
|
|
"ManifestContains: checking %s\n", fname.c_str());
|
2013-01-20 11:07:13 +01:00
|
|
|
unique_ptr<SequentialFile> file;
|
2014-09-09 00:25:01 +02:00
|
|
|
Status s = env_->NewSequentialFile(fname, &file, env_options_);
|
2013-01-08 21:00:13 +01:00
|
|
|
if (!s.ok()) {
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
|
|
|
"ManifestContains: %s\n", s.ToString().c_str());
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
2013-03-06 22:28:54 +01:00
|
|
|
"ManifestContains: is unable to reopen the manifest file %s",
|
|
|
|
fname.c_str());
|
2013-01-08 21:00:13 +01:00
|
|
|
return false;
|
|
|
|
}
|
2013-03-01 03:04:58 +01:00
|
|
|
log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0);
|
2013-01-08 21:00:13 +01:00
|
|
|
Slice r;
|
|
|
|
std::string scratch;
|
|
|
|
bool result = false;
|
|
|
|
while (reader.ReadRecord(&r, &scratch)) {
|
|
|
|
if (r == Slice(record)) {
|
|
|
|
result = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2014-11-04 19:34:33 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
|
|
|
"ManifestContains: result = %d\n", result ? 1 : 0);
|
2013-01-08 21:00:13 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Estimates the byte offset of `ikey` within version `v` by summing, across
// all levels, the sizes of files that lie entirely before `ikey` plus the
// approximate in-file offset for files whose key range contains `ikey`.
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  uint64_t result = 0;
  const auto* vstorage = v->storage_info();
  // Loop-invariant: the comparator is the same for every file of this version.
  const auto& icmp = v->cfd_->internal_comparator();

  for (int level = 0; level < vstorage->num_levels(); level++) {
    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
    for (const auto* f : files) {
      if (icmp.Compare(f->largest, ikey) <= 0) {
        // Entire file is before "ikey", so just add the file size
        result += f->fd.GetFileSize();
      } else if (icmp.Compare(f->smallest, ikey) > 0) {
        // Entire file is after "ikey", so ignore
        if (level > 0) {
          // Files other than level 0 are sorted by meta->smallest, so
          // no further files in this level will contain data for
          // "ikey".
          break;
        }
      } else {
        // "ikey" falls in the range for this table.  Add the
        // approximate offset of "ikey" within the table.
        //
        // Initialized to nullptr so the check below never reads an
        // indeterminate pointer if NewIterator() does not assign it.
        TableReader* table_reader_ptr = nullptr;
        Iterator* iter = v->cfd_->table_cache()->NewIterator(
            ReadOptions(), env_options_, icmp, f->fd, &table_reader_ptr);
        if (table_reader_ptr != nullptr) {
          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
        }
        delete iter;
      }
    }
  }
  return result;
}
|
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
// pre-calculate space requirement
|
|
|
|
int64_t total_files = 0;
|
2014-01-22 20:44:53 +01:00
|
|
|
for (auto cfd : *column_family_set_) {
|
2014-01-29 22:28:50 +01:00
|
|
|
Version* dummy_versions = cfd->dummy_versions();
|
|
|
|
for (Version* v = dummy_versions->next_; v != dummy_versions;
|
2014-01-22 20:44:53 +01:00
|
|
|
v = v->next_) {
|
2014-10-31 16:48:19 +01:00
|
|
|
const auto* vstorage = v->storage_info();
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int level = 0; level < vstorage->num_levels(); level++) {
|
2014-10-27 23:49:46 +01:00
|
|
|
total_files += vstorage->LevelFiles(level).size();
|
2014-01-11 00:12:34 +01:00
|
|
|
}
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// just one time extension to the right size
|
2014-11-13 20:39:30 +01:00
|
|
|
live_list->reserve(live_list->size() + static_cast<size_t>(total_files));
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex protected, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std::unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
for (auto cfd : *column_family_set_) {
|
2015-01-07 19:43:29 +01:00
|
|
|
auto* current = cfd->current();
|
|
|
|
bool found_current = false;
|
2014-01-29 22:28:50 +01:00
|
|
|
Version* dummy_versions = cfd->dummy_versions();
|
|
|
|
for (Version* v = dummy_versions->next_; v != dummy_versions;
|
2014-01-22 20:44:53 +01:00
|
|
|
v = v->next_) {
|
2015-01-07 19:43:29 +01:00
|
|
|
v->AddLiveFiles(live_list);
|
|
|
|
if (v == current) {
|
|
|
|
found_current = true;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
2015-01-07 19:43:29 +01:00
|
|
|
if (!found_current && current != nullptr) {
|
|
|
|
// Should never happen unless it is a bug.
|
|
|
|
assert(false);
|
|
|
|
current->AddLiveFiles(live_list);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Builds the iterator that a compaction reads its input through.
//
// Each file of a level-0 input contributes its own table iterator; every
// other non-empty input level contributes a single concatenating (two-level)
// iterator. All of those children are then combined with
// NewMergingIterator() using the column family's internal comparator.
//
// Reads issued through the returned iterator do not fill the block cache,
// and checksum verification follows the compaction's
// verify_checksums_in_compaction setting.
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
  auto cfd = c->column_family_data();
  ReadOptions read_options;
  read_options.verify_checksums =
      c->mutable_cf_options()->verify_checksums_in_compaction;
  read_options.fill_cache = false;

  // Level-0 files have to be merged together.  For other levels,
  // we will make a concatenating iterator per level.
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
  const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
                                              c->num_input_levels() - 1
                                        : c->num_input_levels());

  // Scratch array of child iterators. It is only needed until
  // NewMergingIterator() returns, so let the vector release it.
  std::vector<Iterator*> list;
  list.reserve(space);

  for (size_t which = 0; which < c->num_input_levels(); which++) {
    const LevelFilesBrief* flevel = c->input_levels(which);
    if (flevel->num_files == 0) {
      continue;  // nothing to read from this input level
    }
    if (c->level(which) == 0) {
      // One iterator per level-0 file.
      for (size_t i = 0; i < flevel->num_files; i++) {
        list.push_back(cfd->table_cache()->NewIterator(
            read_options, env_options_compactions_,
            cfd->internal_comparator(), flevel->files[i].fd, nullptr,
            true /* for compaction */));
      }
    } else {
      // Create concatenating iterator for the files from this level.
      // Uses the same compaction-specific env options as the level-0 branch
      // so that every compaction input is opened with identical settings.
      list.push_back(NewTwoLevelIterator(
          new LevelFileIteratorState(
              cfd->table_cache(), read_options, env_options_compactions_,
              cfd->internal_comparator(), true /* for_compaction */,
              false /* prefix enabled */),
          new LevelFileNumIterator(cfd->internal_comparator(), flevel)));
    }
  }
  assert(list.size() <= space);
  return NewMergingIterator(&cfd->internal_comparator(), list.data(),
                            static_cast<int>(list.size()));
}
|
|
|
|
|
2012-11-29 01:42:36 +01:00
|
|
|
// verify that the files listed in this compaction are present
|
2012-10-19 23:00:53 +02:00
|
|
|
// in the current version
|
|
|
|
bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
|
2013-03-06 22:28:54 +01:00
|
|
|
#ifndef NDEBUG
|
2014-02-01 01:45:20 +01:00
|
|
|
Version* version = c->column_family_data()->current();
|
2014-10-31 16:48:19 +01:00
|
|
|
const VersionStorageInfo* vstorage = version->storage_info();
|
2014-01-22 19:59:07 +01:00
|
|
|
if (c->input_version() != version) {
|
2015-01-14 01:30:31 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
|
|
|
|
"[%s] compaction output being applied to a different base version from"
|
|
|
|
" input version",
|
2014-04-25 15:51:16 +02:00
|
|
|
c->column_family_data()->GetName().c_str());
|
2015-03-30 23:04:21 +02:00
|
|
|
|
|
|
|
if (vstorage->compaction_style_ == kCompactionStyleLevel &&
|
|
|
|
c->start_level() == 0 && c->num_input_levels() > 2U) {
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
// We are doing a L0->base_level compaction. The assumption is if
|
|
|
|
// base level is not L1, levels from L1 to base_level - 1 is empty.
|
|
|
|
// This is ensured by having one compaction from L0 going on at the
|
|
|
|
// same time in level-based compaction. So that during the time, no
|
|
|
|
// compaction/flush can put files to those levels.
|
|
|
|
for (int l = c->start_level() + 1; l < c->output_level(); l++) {
|
|
|
|
if (vstorage->NumLevelFiles(l) != 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
|
2014-11-11 22:47:22 +01:00
|
|
|
for (size_t input = 0; input < c->num_input_levels(); ++input) {
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
int level = c->level(input);
|
2014-11-11 22:47:22 +01:00
|
|
|
for (size_t i = 0; i < c->num_input_files(input); ++i) {
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
uint64_t number = c->input(input, i)->fd.GetNumber();
|
|
|
|
bool found = false;
|
|
|
|
for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) {
|
|
|
|
FileMetaData* f = vstorage->files_[level][j];
|
|
|
|
if (f->fd.GetNumber() == number) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
if (!found) {
|
|
|
|
return false; // input files non existent in current version
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-03-06 22:28:54 +01:00
|
|
|
#endif
|
2012-10-19 23:00:53 +02:00
|
|
|
return true; // everything good
|
|
|
|
}
|
|
|
|
|
2014-01-16 01:15:43 +01:00
|
|
|
// Looks up the file with the given number in the current version of every
// column family.
//
// On the first match, stores the file's metadata in *meta, its level in
// *filelevel and the owning column family in *cfd, and returns OK.
// Returns NotFound (leaving the output parameters untouched) when no level
// of any column family lists the file.
Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
                                      FileMetaData** meta,
                                      ColumnFamilyData** cfd) {
  for (auto candidate_cfd : *column_family_set_) {
    const auto* storage = candidate_cfd->current()->storage_info();
    for (int lvl = 0; lvl < storage->num_levels(); ++lvl) {
      for (const auto& f : storage->LevelFiles(lvl)) {
        if (f->fd.GetNumber() != number) {
          continue;
        }
        *cfd = candidate_cfd;
        *filelevel = lvl;
        *meta = f;
        return Status::OK();
      }
    }
  }
  return Status::NotFound("File not present in any level");
}
|
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
2014-01-22 20:44:53 +01:00
|
|
|
for (auto cfd : *column_family_set_) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-02-03 21:08:33 +01:00
|
|
|
for (int level = 0; level < cfd->NumberLevels(); level++) {
|
2014-10-31 16:48:19 +01:00
|
|
|
for (const auto& file :
|
|
|
|
cfd->current()->storage_info()->LevelFiles(level)) {
|
2014-01-11 00:12:34 +01:00
|
|
|
LiveFileMetaData filemetadata;
|
2014-04-30 22:24:52 +02:00
|
|
|
filemetadata.column_family_name = cfd->GetName();
|
2014-07-02 18:54:20 +02:00
|
|
|
uint32_t path_id = file->fd.GetPathId();
|
2014-09-09 00:25:01 +02:00
|
|
|
if (path_id < db_options_->db_paths.size()) {
|
|
|
|
filemetadata.db_path = db_options_->db_paths[path_id].path;
|
2014-07-02 18:54:20 +02:00
|
|
|
} else {
|
2014-09-09 00:25:01 +02:00
|
|
|
assert(!db_options_->db_paths.empty());
|
|
|
|
filemetadata.db_path = db_options_->db_paths.back().path;
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
|
|
|
filemetadata.name = MakeTableFileName("", file->fd.GetNumber());
|
2014-01-11 00:12:34 +01:00
|
|
|
filemetadata.level = level;
|
2014-06-14 00:54:19 +02:00
|
|
|
filemetadata.size = file->fd.GetFileSize();
|
2014-01-22 20:44:53 +01:00
|
|
|
filemetadata.smallestkey = file->smallest.user_key().ToString();
|
|
|
|
filemetadata.largestkey = file->largest.user_key().ToString();
|
|
|
|
filemetadata.smallest_seqno = file->smallest_seqno;
|
|
|
|
filemetadata.largest_seqno = file->largest_seqno;
|
2014-01-11 00:12:34 +01:00
|
|
|
metadata->push_back(filemetadata);
|
|
|
|
}
|
2013-08-22 23:32:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-10 02:38:32 +01:00
|
|
|
// Hands over the obsolete files whose file number is below
// min_pending_output by appending them to *files. Files with a number at or
// above that threshold stay in obsolete_files_, in their original relative
// order.
void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files,
                                  uint64_t min_pending_output) {
  std::vector<FileMetaData*> retained;
  for (FileMetaData* candidate : obsolete_files_) {
    const bool hand_over = candidate->fd.GetNumber() < min_pending_output;
    std::vector<FileMetaData*>& destination = hand_over ? *files : retained;
    destination.push_back(candidate);
  }
  // Keep only the files that were not handed over.
  obsolete_files_.swap(retained);
}
|
|
|
|
|
2014-01-11 00:12:34 +01:00
|
|
|
// Creates the column family described by "edit" (which must be a
// column-family-add edit) with the given options, installs an initial
// version and a fresh memtable for it, records the edit's log number, and
// returns the new ColumnFamilyData.
ColumnFamilyData* VersionSet::CreateColumnFamily(
    const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
  assert(edit->is_column_family_add_);

  Version* list_head = new Version(nullptr, this);
  // Ref() dummy version once so that later we can call Unref() to delete it
  // by avoiding calling "delete" explicitly (~Version is private)
  list_head->Ref();

  auto created = column_family_set_->CreateColumnFamily(
      edit->column_family_name_, edit->column_family_, list_head, cf_options);

  // First real version of the new column family; consumes one version
  // number.
  Version* first_version =
      new Version(created, this, current_version_number_++);

  // Fill level target base information.
  first_version->storage_info()->CalculateBaseBytes(
      *created->ioptions(), *created->GetLatestMutableCFOptions());
  AppendVersion(created, first_version);

  // GetLatestMutableCFOptions() is safe here without mutex since the
  // cfd is not available to client
  created->CreateNewMemtable(*created->GetLatestMutableCFOptions());
  created->SetLogNumber(edit->log_number_);
  return created;
}
|
|
|
|
|
2015-02-12 02:10:43 +01:00
|
|
|
// Returns how many versions are linked into the circular list headed by
// "dummy_versions". The head node itself is not counted.
uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
  uint64_t live = 0;
  Version* cursor = dummy_versions->next_;
  while (cursor != dummy_versions) {
    ++live;
    cursor = cursor->next_;
  }
  return live;
}
|
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
} // namespace rocksdb
|