2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2014-10-28 19:54:33 +01:00
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/flush_job.h"
|
|
|
|
|
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
2015-09-02 22:58:22 +02:00
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/builder.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/dbformat.h"
|
2015-05-28 22:37:47 +02:00
|
|
|
#include "db/event_helpers.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "db/merge_context.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "port/likely.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "port/port.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/block.h"
|
|
|
|
#include "table/block_based_table_factory.h"
|
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/coding.h"
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 18:15:54 +01:00
|
|
|
#include "util/event_logger.h"
|
2014-12-16 06:48:16 +01:00
|
|
|
#include "util/file_util.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "util/iostats_context_imp.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "util/log_buffer.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "util/logging.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/perf_context_imp.h"
|
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/sync_point.h"
|
2015-02-17 19:13:52 +01:00
|
|
|
#include "util/thread_status_util.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
|
2016-09-24 01:34:04 +02:00
|
|
|
const ImmutableDBOptions& db_options,
|
2014-10-28 19:54:33 +01:00
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
const EnvOptions& env_options, VersionSet* versions,
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutex* db_mutex,
|
|
|
|
std::atomic<bool>* shutting_down,
|
2015-08-24 20:11:12 +02:00
|
|
|
std::vector<SequenceNumber> existing_snapshots,
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
SequenceNumber earliest_write_conflict_snapshot,
|
2015-08-24 20:11:12 +02:00
|
|
|
JobContext* job_context, LogBuffer* log_buffer,
|
|
|
|
Directory* db_directory, Directory* output_file_directory,
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 18:15:54 +01:00
|
|
|
CompressionType output_compression, Statistics* stats,
|
2016-04-14 22:56:29 +02:00
|
|
|
EventLogger* event_logger, bool measure_io_stats)
|
2014-10-28 19:54:33 +01:00
|
|
|
: dbname_(dbname),
|
|
|
|
cfd_(cfd),
|
|
|
|
db_options_(db_options),
|
|
|
|
mutable_cf_options_(mutable_cf_options),
|
|
|
|
env_options_(env_options),
|
|
|
|
versions_(versions),
|
|
|
|
db_mutex_(db_mutex),
|
|
|
|
shutting_down_(shutting_down),
|
2015-08-24 20:11:12 +02:00
|
|
|
existing_snapshots_(std::move(existing_snapshots)),
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
|
2014-10-28 19:54:33 +01:00
|
|
|
job_context_(job_context),
|
|
|
|
log_buffer_(log_buffer),
|
|
|
|
db_directory_(db_directory),
|
2015-01-26 22:59:38 +01:00
|
|
|
output_file_directory_(output_file_directory),
|
2014-10-28 19:54:33 +01:00
|
|
|
output_compression_(output_compression),
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 18:15:54 +01:00
|
|
|
stats_(stats),
|
2016-04-14 22:56:29 +02:00
|
|
|
event_logger_(event_logger),
|
2016-07-20 00:12:46 +02:00
|
|
|
measure_io_stats_(measure_io_stats),
|
|
|
|
pick_memtable_called(false) {
|
2015-03-13 18:45:40 +01:00
|
|
|
// Update the thread status to indicate flush.
|
2015-05-16 08:22:22 +02:00
|
|
|
ReportStartedFlush();
|
2015-03-13 18:45:40 +01:00
|
|
|
TEST_SYNC_POINT("FlushJob::FlushJob()");
|
|
|
|
}
|
|
|
|
|
|
|
|
FlushJob::~FlushJob() {
|
|
|
|
ThreadStatusUtil::ResetThreadStatus();
|
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2015-05-16 08:22:22 +02:00
|
|
|
void FlushJob::ReportStartedFlush() {
|
2016-01-26 01:26:53 +01:00
|
|
|
ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
|
2016-09-17 00:09:14 +02:00
|
|
|
db_options_.enable_thread_tracking);
|
2015-05-16 08:22:22 +02:00
|
|
|
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
|
|
|
|
ThreadStatusUtil::SetThreadOperationProperty(
|
|
|
|
ThreadStatus::COMPACTION_JOB_ID,
|
|
|
|
job_context_->job_id);
|
|
|
|
IOSTATS_RESET(bytes_written);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
|
|
|
|
uint64_t input_size = 0;
|
|
|
|
for (auto* mem : mems) {
|
|
|
|
input_size += mem->ApproximateMemoryUsage();
|
|
|
|
}
|
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
|
|
|
ThreadStatus::FLUSH_BYTES_MEMTABLES,
|
|
|
|
input_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FlushJob::RecordFlushIOStats() {
|
2016-04-25 21:01:01 +02:00
|
|
|
RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
|
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
2015-05-16 08:22:22 +02:00
|
|
|
ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
|
2016-04-25 21:01:01 +02:00
|
|
|
IOSTATS_RESET(bytes_written);
|
2015-05-16 08:22:22 +02:00
|
|
|
}
|
|
|
|
|
2016-07-20 00:12:46 +02:00
|
|
|
void FlushJob::PickMemTable() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
assert(!pick_memtable_called);
|
|
|
|
pick_memtable_called = true;
|
|
|
|
// Save the contents of the earliest memtable as a new Table
|
|
|
|
cfd_->imm()->PickMemtablesToFlush(&mems_);
|
|
|
|
if (mems_.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ReportFlushInputSize(mems_);
|
|
|
|
|
|
|
|
// entries mems are (implicitly) sorted in ascending order by their created
|
|
|
|
// time. We will use the first memtable's `edit` to keep the meta info for
|
|
|
|
// this flush.
|
|
|
|
MemTable* m = mems_[0];
|
|
|
|
edit_ = m->GetEdits();
|
|
|
|
edit_->SetPrevLogNumber(0);
|
|
|
|
// SetLogNumber(log_num) indicates logs with number smaller than log_num
|
|
|
|
// will no longer be picked up for recovery.
|
|
|
|
edit_->SetLogNumber(mems_.back()->GetNextLogNumber());
|
|
|
|
edit_->SetColumnFamily(cfd_->GetID());
|
|
|
|
|
|
|
|
// path 0 for level 0 file.
|
|
|
|
meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
|
|
|
|
|
|
|
|
base_ = cfd_->current();
|
|
|
|
base_->Ref(); // it is likely that we do not need this reference
|
|
|
|
}
|
|
|
|
|
2015-06-12 00:22:22 +02:00
|
|
|
Status FlushJob::Run(FileMetaData* file_meta) {
|
2016-07-20 00:12:46 +02:00
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
assert(pick_memtable_called);
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_run(
|
|
|
|
ThreadStatus::STAGE_FLUSH_RUN);
|
2016-07-20 00:12:46 +02:00
|
|
|
if (mems_.empty()) {
|
|
|
|
LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
|
|
|
|
cfd_->GetName().c_str());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2016-04-14 22:56:29 +02:00
|
|
|
// I/O measurement variables
|
|
|
|
PerfLevel prev_perf_level = PerfLevel::kEnableTime;
|
|
|
|
uint64_t prev_write_nanos = 0;
|
|
|
|
uint64_t prev_fsync_nanos = 0;
|
|
|
|
uint64_t prev_range_sync_nanos = 0;
|
|
|
|
uint64_t prev_prepare_write_nanos = 0;
|
|
|
|
if (measure_io_stats_) {
|
|
|
|
prev_perf_level = GetPerfLevel();
|
|
|
|
SetPerfLevel(PerfLevel::kEnableTime);
|
|
|
|
prev_write_nanos = IOSTATS(write_nanos);
|
|
|
|
prev_fsync_nanos = IOSTATS(fsync_nanos);
|
|
|
|
prev_range_sync_nanos = IOSTATS(range_sync_nanos);
|
|
|
|
prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
|
|
|
|
}
|
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
// This will release and re-acquire the mutex.
|
2016-07-20 00:12:46 +02:00
|
|
|
Status s = WriteLevel0Table();
|
2014-10-28 19:54:33 +01:00
|
|
|
|
|
|
|
if (s.ok() &&
|
|
|
|
(shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
|
|
|
|
s = Status::ShutdownInProgress(
|
|
|
|
"Database shutdown or Column family drop during flush");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
2016-07-20 00:12:46 +02:00
|
|
|
cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
|
2014-10-28 19:54:33 +01:00
|
|
|
} else {
|
LogAndApply() should fail if the column family has been dropped
Summary:
This patch finally fixes the ColumnFamilyTest.ReadDroppedColumnFamily test. The test has been failing very sporadically and it was hard to repro. However, I managed to write a new tests that reproes the failure deterministically.
Here's what happens:
1. We start the flush for the column family
2. We check if the column family was dropped here: https://github.com/facebook/rocksdb/blob/a3fc49bfddcdb1ff29409aacd06c04df56c7a1d7/db/flush_job.cc#L149
3. This check goes through, ends up in InstallMemtableFlushResults() and it goes into LogAndApply()
4. At about this time, we start dropping the column family. Dropping the column family process gets to LogAndApply() at about the same time as LogAndApply() from flush process
5. Drop column family goes through LogAndApply() first, marking the column family as dropped.
6. Flush process gets woken up and gets a chance to write to the MANIFEST. However, this is where it gets stuck: https://github.com/facebook/rocksdb/blob/a3fc49bfddcdb1ff29409aacd06c04df56c7a1d7/db/version_set.cc#L1975
7. We see that the column family was dropped, so there is no need to write to the MANIFEST. We return OK.
8. Flush gets OK back from LogAndApply() and it deletes the memtable, thinking that the data is now safely persisted to sst file.
The fix is pretty simple. Instead of OK, we return ShutdownInProgress. This is not really true, but we have been using this status code to also mean "this operation was canceled because the column family has been dropped".
The fix is only one LOC. All other code is related to tests. I added a new test that reproes the failure. I also moved SleepingBackgroundTask to util/testutil.h (because I needed it in column_family_test for my new test). There's plenty of other places where we reimplement SleepingBackgroundTask, but I'll address that in a separate commit.
Test Plan:
1. new test
2. make check
3. Make sure the ColumnFamilyTest.ReadDroppedColumnFamily doesn't fail on Travis: https://travis-ci.org/facebook/rocksdb/jobs/79952386
Reviewers: yhchiang, anthony, IslamAbdelRahman, kradhakrishnan, rven, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D46773
2015-09-15 20:28:44 +02:00
|
|
|
TEST_SYNC_POINT("FlushJob::InstallResults");
|
2014-10-28 19:54:33 +01:00
|
|
|
// Replace immutable memtable with the generated Table
|
|
|
|
s = cfd_->imm()->InstallMemtableFlushResults(
|
2016-07-20 00:12:46 +02:00
|
|
|
cfd_, mutable_cf_options_, mems_, versions_, db_mutex_,
|
|
|
|
meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
|
2015-06-12 00:22:22 +02:00
|
|
|
log_buffer_);
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
|
|
|
|
2015-06-12 00:22:22 +02:00
|
|
|
if (s.ok() && file_meta != nullptr) {
|
2016-07-20 00:12:46 +02:00
|
|
|
*file_meta = meta_;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
2015-05-16 08:22:22 +02:00
|
|
|
RecordFlushIOStats();
|
2015-02-17 19:13:52 +01:00
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
auto stream = event_logger_->LogToBuffer(log_buffer_);
|
|
|
|
stream << "job" << job_context_->job_id << "event"
|
|
|
|
<< "flush_finished";
|
|
|
|
stream << "lsm_state";
|
|
|
|
stream.StartArray();
|
|
|
|
auto vstorage = cfd_->current()->storage_info();
|
|
|
|
for (int level = 0; level < vstorage->num_levels(); ++level) {
|
|
|
|
stream << vstorage->NumLevelFiles(level);
|
|
|
|
}
|
|
|
|
stream.EndArray();
|
2016-06-01 20:11:33 +02:00
|
|
|
stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
|
2016-04-14 22:56:29 +02:00
|
|
|
if (measure_io_stats_) {
|
|
|
|
if (prev_perf_level != PerfLevel::kEnableTime) {
|
|
|
|
SetPerfLevel(prev_perf_level);
|
|
|
|
}
|
|
|
|
stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
|
|
|
|
stream << "file_range_sync_nanos"
|
|
|
|
<< (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
|
|
|
|
stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
|
|
|
|
stream << "file_prepare_write_nanos"
|
|
|
|
<< (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
|
|
|
|
}
|
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2016-07-20 00:12:46 +02:00
|
|
|
Status FlushJob::WriteLevel0Table() {
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_updater(
|
|
|
|
ThreadStatus::STAGE_FLUSH_WRITE_L0);
|
2014-10-28 19:54:33 +01:00
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
const uint64_t start_micros = db_options_.env->NowMicros();
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
db_mutex_->Unlock();
|
|
|
|
if (log_buffer_) {
|
|
|
|
log_buffer_->FlushBufferToLog();
|
|
|
|
}
|
2016-11-01 04:35:54 +01:00
|
|
|
// memtables and range_del_iters store internal iterators over each data
|
|
|
|
// memtable and its associated range deletion memtable, respectively, at
|
|
|
|
// corresponding indexes.
|
2015-10-13 00:06:38 +02:00
|
|
|
std::vector<InternalIterator*> memtables;
|
2016-11-01 04:35:54 +01:00
|
|
|
std::vector<InternalIterator*> range_del_iters;
|
2014-10-28 19:54:33 +01:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Arena arena;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
uint64_t total_num_entries = 0, total_num_deletes = 0;
|
|
|
|
size_t total_memory_usage = 0;
|
2016-07-20 00:12:46 +02:00
|
|
|
for (MemTable* m : mems_) {
|
2014-10-29 23:12:10 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2015-02-12 18:54:48 +01:00
|
|
|
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
|
2014-10-28 19:54:33 +01:00
|
|
|
memtables.push_back(m->NewIterator(ro, &arena));
|
2016-11-01 04:35:54 +01:00
|
|
|
range_del_iters.push_back(m->NewRangeTombstoneIterator(ro, &arena));
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
total_num_entries += m->num_entries();
|
|
|
|
total_num_deletes += m->num_deletes();
|
|
|
|
total_memory_usage += m->ApproximateMemoryUsage();
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
2016-11-01 04:35:54 +01:00
|
|
|
assert(memtables.size() == range_del_iters.size());
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
|
|
|
|
event_logger_->Log() << "job" << job_context_->job_id << "event"
|
|
|
|
<< "flush_started"
|
2016-07-20 00:12:46 +02:00
|
|
|
<< "num_memtables" << mems_.size() << "num_entries"
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
<< total_num_entries << "num_deletes"
|
|
|
|
<< total_num_deletes << "memory_usage"
|
|
|
|
<< total_memory_usage;
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
{
|
2014-11-11 22:47:22 +01:00
|
|
|
ScopedArenaIterator iter(
|
|
|
|
NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
|
|
|
|
static_cast<int>(memtables.size()), &arena));
|
2016-11-01 04:35:54 +01:00
|
|
|
ScopedArenaIterator range_del_iter(NewMergingIterator(
|
|
|
|
&cfd_->internal_comparator(), &range_del_iters[0],
|
|
|
|
static_cast<int>(range_del_iters.size()), &arena));
|
2014-10-29 23:12:10 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2015-02-12 18:54:48 +01:00
|
|
|
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
|
2016-07-20 00:12:46 +02:00
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, meta_.fd.GetNumber());
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2015-04-15 01:53:19 +02:00
|
|
|
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
|
|
|
|
&output_compression_);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
s = BuildTable(
|
2016-05-17 22:11:56 +02:00
|
|
|
dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_,
|
2016-11-01 04:35:54 +01:00
|
|
|
env_options_, cfd_->table_cache(), iter.get(),
|
|
|
|
std::move(range_del_iter), &meta_, cfd_->internal_comparator(),
|
|
|
|
cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
|
|
|
|
cfd_->GetName(), existing_snapshots_,
|
2016-04-07 08:10:32 +02:00
|
|
|
earliest_write_conflict_snapshot_, output_compression_,
|
|
|
|
cfd_->ioptions()->compression_opts,
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
|
Added EventListener::OnTableFileCreationStarted() callback
Summary: Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
Test Plan: unit test.
Reviewers: dhruba, yhchiang, ott, sdong
Reviewed By: sdong
Subscribers: sdong, kradhakrishnan, IslamAbdelRahman, andrewkr, yhchiang, leveldb, ott, dhruba
Differential Revision: https://reviews.facebook.net/D56337
2016-04-29 20:35:00 +02:00
|
|
|
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
Env::IO_HIGH, &table_properties_, 0 /* level */);
|
2014-10-28 19:54:33 +01:00
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
}
|
2014-10-29 23:12:10 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
Print info message about files need compaction for debuging purpose
Summary:
When there are files marked for compaction after compactions, print extra messages to help debugging. Example:
2015/06/08-23:12:55.212855 7ff5013ff700 [default] [JOB 121] Generated table #75: 54 keys, 4807 bytes (need compaction)
2015/06/08-23:12:55.556194 7ff5013ff700 (Original Log Time 2015/06/08-23:12:55.556160) [default] compacted to: base level 1 max bytes base
10240 files[0 1 9 32 12 0 0 0] max score 0.96 (2 files need compaction), MB/sec: 0.0 rd, 0.1 wr, level 2, files in(1, 3) out(5) MB in(0.0,
0.0) out(0.0), read-write-amplify(11.3) write-amplify(5.7) OK, records in: 40, records dropped: 0
Test Plan:
Run test and see LOG files.
valgrind test DBTest.TablePropertiesNeedCompactTest
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, igor
Reviewed By: igor
Subscribers: yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D39771
2015-06-09 08:14:13 +02:00
|
|
|
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
|
|
|
|
" bytes %s"
|
|
|
|
"%s",
|
2016-07-20 00:12:46 +02:00
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, meta_.fd.GetNumber(),
|
|
|
|
meta_.fd.GetFileSize(), s.ToString().c_str(),
|
|
|
|
meta_.marked_for_compaction ? " (needs compaction)" : "");
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-13 00:53:55 +02:00
|
|
|
|
2016-09-24 01:34:04 +02:00
|
|
|
if (!db_options_.disable_data_sync && output_file_directory_ != nullptr) {
|
2015-01-26 22:59:38 +01:00
|
|
|
output_file_directory_->Fsync();
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
2016-01-26 18:12:20 +01:00
|
|
|
TEST_SYNC_POINT("FlushJob::WriteLevel0Table");
|
2014-10-28 19:54:33 +01:00
|
|
|
db_mutex_->Lock();
|
|
|
|
}
|
2016-07-20 00:12:46 +02:00
|
|
|
base_->Unref();
|
2014-10-28 19:54:33 +01:00
|
|
|
|
|
|
|
// Note that if file_size is zero, the file has been deleted and
|
|
|
|
// should not be added to the manifest.
|
2016-07-20 00:12:46 +02:00
|
|
|
if (s.ok() && meta_.fd.GetFileSize() > 0) {
|
2014-10-28 19:54:33 +01:00
|
|
|
// if we have more than 1 background thread, then we cannot
|
|
|
|
// insert files directly into higher levels because some other
|
|
|
|
// threads could be concurrently producing compacted files for
|
|
|
|
// that key range.
|
2015-07-17 21:02:52 +02:00
|
|
|
// Add file to L0
|
2016-07-20 00:12:46 +02:00
|
|
|
edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
|
|
|
|
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
|
|
|
|
meta_.smallest_seqno, meta_.largest_seqno,
|
|
|
|
meta_.marked_for_compaction);
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
|
|
|
|
2016-04-25 21:01:01 +02:00
|
|
|
// Note that here we treat flush as level 0 compaction in internal stats
|
2014-10-28 19:54:33 +01:00
|
|
|
InternalStats::CompactionStats stats(1);
|
|
|
|
stats.micros = db_options_.env->NowMicros() - start_micros;
|
2016-07-20 00:12:46 +02:00
|
|
|
stats.bytes_written = meta_.fd.GetFileSize();
|
2015-07-17 21:02:52 +02:00
|
|
|
cfd_->internal_stats()->AddCompactionStats(0 /* level */, stats);
|
2014-10-28 19:54:33 +01:00
|
|
|
cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
|
2016-07-20 00:12:46 +02:00
|
|
|
meta_.fd.GetFileSize());
|
2016-04-25 21:01:01 +02:00
|
|
|
RecordFlushIOStats();
|
2014-10-28 19:54:33 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|