2014-10-28 19:54:33 +01:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/flush_job.h"
|
|
|
|
|
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/builder.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "db/merge_context.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/likely.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/block.h"
|
|
|
|
#include "table/block_based_table_factory.h"
|
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/coding.h"
|
2014-12-16 06:48:16 +01:00
|
|
|
#include "util/file_util.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "util/logging.h"
|
|
|
|
#include "util/log_buffer.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/perf_context_imp.h"
|
|
|
|
#include "util/iostats_context_imp.h"
|
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/sync_point.h"
|
2015-02-17 19:13:52 +01:00
|
|
|
#include "util/thread_status_util.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Builds a flush job for the column family `cfd`.
//
// The constructor performs no work: every argument is simply stored in the
// member of the same name (with a trailing underscore) for later use by Run()
// and WriteLevel0Table().
// NOTE(review): pointer arguments appear to be borrowed rather than owned
// (nothing here takes ownership) -- confirm against the class declaration.
FlushJob::FlushJob(const std::string& dbname,
                   ColumnFamilyData* cfd,
                   const DBOptions& db_options,
                   const MutableCFOptions& mutable_cf_options,
                   const EnvOptions& env_options,
                   VersionSet* versions,
                   InstrumentedMutex* db_mutex,
                   std::atomic<bool>* shutting_down,
                   SequenceNumber newest_snapshot,
                   JobContext* job_context,
                   LogBuffer* log_buffer,
                   Directory* db_directory,
                   Directory* output_file_directory,
                   CompressionType output_compression,
                   Statistics* stats)
    : dbname_(dbname),
      cfd_(cfd),
      db_options_(db_options),
      mutable_cf_options_(mutable_cf_options),
      env_options_(env_options),
      versions_(versions),
      db_mutex_(db_mutex),
      shutting_down_(shutting_down),
      newest_snapshot_(newest_snapshot),
      job_context_(job_context),
      log_buffer_(log_buffer),
      db_directory_(db_directory),
      output_file_directory_(output_file_directory),
      output_compression_(output_compression),
      stats_(stats) {
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
// Flushes the immutable memtables picked for this column family into a new
// table file, then either installs the result or rolls the flush back.
//
// file_number: optional out-parameter. When non-null and the flush succeeds,
//              receives the number of the file that was written. It is left
//              untouched on failure and when there is nothing to flush.
//
// Returns OK when there was nothing to flush or the flush was installed,
// ShutdownInProgress when the table was written but the database is shutting
// down or the column family was dropped, and otherwise the error reported by
// WriteLevel0Table() / InstallMemtableFlushResults().
//
// WriteLevel0Table() asserts that db_mutex_ is held, so callers are expected
// to hold it when calling Run().
Status FlushJob::Run(uint64_t* file_number) {
  // Save the contents of the earliest memtable(s) as a new Table.
  autovector<MemTable*> picked;
  cfd_->imm()->PickMemtablesToFlush(&picked);
  if (picked.empty()) {
    LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
                cfd_->GetName().c_str());
    return Status::OK();
  }

  // Update the thread status to indicate flush.
  ThreadStatusUtil::SetColumnFamily(cfd_);
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
  TEST_SYNC_POINT("FlushJob::Run:Start");

  // The picked memtables are (implicitly) sorted in ascending order of
  // creation time. The first memtable's edit carries the meta info for the
  // whole flush.
  MemTable* const oldest = picked[0];
  VersionEdit* const edit = oldest->GetEdits();
  edit->SetPrevLogNumber(0);
  // SetLogNumber(log_num) indicates that logs with a number smaller than
  // log_num will no longer be picked up for recovery.
  edit->SetLogNumber(picked.back()->GetNextLogNumber());
  edit->SetColumnFamily(cfd_->GetID());

  // Assigned by WriteLevel0Table() before it does anything else.
  uint64_t written_file_number;

  // This will release and re-acquire the mutex.
  Status s = WriteLevel0Table(picked, edit, &written_file_number);

  if (s.ok()) {
    // The table was written, but it must not be installed if the DB is going
    // away or the column family was dropped in the meantime.
    if (shutting_down_->load(std::memory_order_acquire) ||
        cfd_->IsDropped()) {
      s = Status::ShutdownInProgress(
          "Database shutdown or Column family drop during flush");
    }
  }

  if (s.ok()) {
    // Replace immutable memtable with the generated Table.
    s = cfd_->imm()->InstallMemtableFlushResults(
        cfd_, mutable_cf_options_, picked, versions_, db_mutex_,
        written_file_number, &job_context_->memtables_to_free, db_directory_,
        log_buffer_);
  } else {
    cfd_->imm()->RollbackMemtableFlush(picked, written_file_number);
  }

  if (file_number != nullptr && s.ok()) {
    *file_number = written_file_number;
  }

  TEST_SYNC_POINT("FlushJob::Run:End");
  ThreadStatusUtil::ResetThreadStatus();
  return s;
}
|
|
|
|
|
|
|
|
// Writes the contents of `mems` into a single new table file and records the
// file in `edit`.
//
// mems:       memtables to flush; must be non-empty (mems[0] is dereferenced).
// edit:       version edit that receives the AddFile() record when a
//             non-empty file was produced.
// filenumber: out-parameter; set to the newly allocated file number before
//             any I/O happens, so it is valid even when an error is returned.
//
// Locking: db_mutex_ must be held on entry (asserted). It is released while
// the table is built and the output directory is synced, and re-acquired
// before the function returns.
//
// Returns the status of BuildTable(), or -- if the table was built
// successfully -- the status of syncing the output directory.
Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
                                  VersionEdit* edit, uint64_t* filenumber) {
  db_mutex_->AssertHeld();
  const uint64_t start_micros = db_options_.env->NowMicros();
  FileMetaData meta;
  // path 0 for level 0 file.
  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
  *filenumber = meta.fd.GetNumber();

  const SequenceNumber earliest_seqno_in_memtable =
      mems[0]->GetFirstSequenceNumber();
  Version* base = cfd_->current();
  base->Ref();  // it is likely that we do not need this reference
  Status s;
  {
    // Everything in this scope runs without the DB mutex.
    db_mutex_->Unlock();
    if (log_buffer_) {
      log_buffer_->FlushBufferToLog();
    }
    std::vector<Iterator*> memtables;
    ReadOptions ro;
    ro.total_order_seek = true;
    Arena arena;
    for (MemTable* m : mems) {
      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
      memtables.push_back(m->NewIterator(ro, &arena));
    }
    {
      // Inner scope: the merging iterator is destroyed before the final log
      // line and the directory sync below.
      ScopedArenaIterator iter(
          NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
                             static_cast<int>(memtables.size()), &arena));
      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
          "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
          cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber());

      s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
                     cfd_->table_cache(), iter.get(), &meta,
                     cfd_->internal_comparator(), newest_snapshot_,
                     earliest_seqno_in_memtable, output_compression_,
                     cfd_->ioptions()->compression_opts, Env::IO_HIGH);
      LogFlush(db_options_.info_log);
    }
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s",
        cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber(),
        meta.fd.GetFileSize(), s.ToString().c_str());
    // Sync the output directory only after a successful build, and propagate
    // a sync failure: if the directory entry could not be synced, the file
    // must not be recorded in the manifest as if it were durable. The caller
    // rolls the flush back on any non-OK status.
    if (s.ok() && !db_options_.disableDataSync &&
        output_file_directory_ != nullptr) {
      s = output_file_directory_->Fsync();
    }
    db_mutex_->Lock();
  }
  base->Unref();

  // re-acquire the most current version
  base = cfd_->current();

  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  int level = 0;
  if (s.ok() && meta.fd.GetFileSize() > 0) {
    const Slice min_user_key = meta.smallest.user_key();
    const Slice max_user_key = meta.largest.user_key();
    // if we have more than 1 background thread, then we cannot
    // insert files directly into higher levels because some other
    // threads could be concurrently producing compacted files for
    // that key range.
    if (base != nullptr && db_options_.max_background_compactions <= 1 &&
        db_options_.max_background_flushes == 0 &&
        cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
      level = base->storage_info()->PickLevelForMemTableOutput(
          mutable_cf_options_, min_user_key, max_user_key);
      // The file was created with path id 0 above. If the picked level maps
      // to a different path id, reset the level back to 0.
      uint32_t fdpath = LevelCompactionPicker::GetPathId(
          *cfd_->ioptions(), mutable_cf_options_, level);
      if (fdpath != 0) {
        level = 0;
      }
    }
    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
                  meta.smallest_seqno, meta.largest_seqno);
  }

  // Statistics are recorded regardless of the outcome, using whatever file
  // size `meta` ended up with.
  InternalStats::CompactionStats stats(1);
  stats.micros = db_options_.env->NowMicros() - start_micros;
  stats.bytes_written = meta.fd.GetFileSize();
  cfd_->internal_stats()->AddCompactionStats(level, stats);
  cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
                                     meta.fd.GetFileSize());
  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
  return s;
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|