2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2014-09-25 20:14:01 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2021-03-23 21:47:56 +01:00
|
|
|
#include "db/db_impl/compacted_db_impl.h"
|
2020-07-03 04:24:25 +02:00
|
|
|
|
2019-05-31 20:52:59 +02:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2014-09-25 20:14:01 +02:00
|
|
|
#include "db/version_set.h"
|
2021-09-29 13:01:57 +02:00
|
|
|
#include "logging/logging.h"
|
2014-09-29 20:09:09 +02:00
|
|
|
#include "table/get_context.h"
|
2020-07-03 04:24:25 +02:00
|
|
|
#include "util/cast_util.h"
|
2014-09-25 20:14:01 +02:00
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2014-09-25 20:14:01 +02:00
|
|
|
|
|
|
|
extern void MarkKeyMayExist(void* arg);
|
|
|
|
extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
|
2014-09-29 20:09:09 +02:00
|
|
|
const Slice& v, bool hit_and_return);
|
2014-09-25 20:14:01 +02:00
|
|
|
|
Make backups openable as read-only DBs (#8142)
Summary:
A current limitation of backups is that you don't know the
exact database state of when the backup was taken. With this new
feature, you can at least inspect the backup's DB state without
restoring it by opening it as a read-only DB.
Rather than add something like OpenAsReadOnlyDB to the BackupEngine API,
which would inhibit opening stackable DB implementations read-only
(if/when their APIs support it), we instead provide a DB name and Env
that can be used to open as a read-only DB.
Possible follow-up work:
* Add a version of GetBackupInfo for a single backup.
* Let CreateNewBackup return the BackupID of the newly-created backup.
Implementation details:
Refactored ChrootFileSystem to split off new base class RemapFileSystem,
which allows more general remapping of files. We use this base class to
implement BackupEngineImpl::RemapSharedFileSystem.
To minimize API impact, I decided to just add these fields `name_for_open`
and `env_for_open` to those set by GetBackupInfo when
include_file_details=true. Creating the RemapSharedFileSystem adds a bit
to the memory consumption, perhaps unnecessarily in some cases, but this
has been mitigated by (a) only initializing the RemapSharedFileSystem
lazily when GetBackupInfo with include_file_details=true is called, and
(b) using the existing `shared_ptr<FileInfo>` objects to hold most of the
mapping data.
To enhance API safety, RemapSharedFileSystem is wrapped by new
ReadOnlyFileSystem which rejects any attempts to write. This uncovered a
couple of places in which DB::OpenForReadOnly would write to the
filesystem, so I fixed these. Added a release note because this affects
logging.
Additional minor refactoring in backupable_db.cc to support the new
functionality.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8142
Test Plan:
new test (run with ASAN and UBSAN), added to stress test and
ran it for a while with amplified backup_one_in
Reviewed By: ajkr
Differential Revision: D27535408
Pulled By: pdillinger
fbshipit-source-id: 04666d310aa0261ef6b2385c43ca793ce1dfd148
2021-04-06 23:36:45 +02:00
|
|
|
// Constructs the object by forwarding to DBImpl with seq_per_batch=false,
// batch_per_txn=true and read_only=true. The cached column family, version
// and user comparator pointers start out null; Init() fills them in.
CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
                                 const std::string& dbname)
    : DBImpl(options, dbname, /*seq_per_batch*/ false,
             /*batch_per_txn*/ true, /*read_only*/ true),
      cfd_(nullptr),
      version_(nullptr),
      user_comparator_(nullptr) {}
|
2014-09-25 20:14:01 +02:00
|
|
|
|
|
|
|
// Nothing to release here beyond what the base class and members handle.
CompactedDBImpl::~CompactedDBImpl() = default;
|
|
|
|
|
2014-09-25 22:34:51 +02:00
|
|
|
size_t CompactedDBImpl::FindFile(const Slice& key) {
|
2014-09-25 20:14:01 +02:00
|
|
|
size_t right = files_.num_files - 1;
|
2018-09-27 19:33:04 +02:00
|
|
|
auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
|
|
|
|
return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
|
|
|
|
};
|
|
|
|
return static_cast<size_t>(std::lower_bound(files_.files,
|
|
|
|
files_.files + right, key, cmp) - files_.files);
|
2014-09-25 22:34:51 +02:00
|
|
|
}
|
|
|
|
|
2017-03-13 19:44:50 +01:00
|
|
|
Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
|
|
|
|
const Slice& key, PinnableSlice* value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 07:17:46 +01:00
|
|
|
assert(user_comparator_);
|
|
|
|
if (options.timestamp || user_comparator_->timestamp_size()) {
|
|
|
|
// TODO: support timestamp
|
|
|
|
return Status::NotSupported();
|
|
|
|
}
|
2014-09-29 20:09:09 +02:00
|
|
|
GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
|
2017-01-08 23:08:51 +01:00
|
|
|
GetContext::kNotFound, key, value, nullptr, nullptr,
|
2020-03-03 00:58:32 +01:00
|
|
|
nullptr, true, nullptr, nullptr);
|
2014-09-25 20:14:01 +02:00
|
|
|
LookupKey lkey(key, kMaxSequenceNumber);
|
2020-09-29 18:47:33 +02:00
|
|
|
Status s = files_.files[FindFile(key)].fd.table_reader->Get(
|
|
|
|
options, lkey.internal_key(), &get_context, nullptr);
|
|
|
|
if (!s.ok() && !s.IsNotFound()) {
|
|
|
|
return s;
|
|
|
|
}
|
2014-09-29 20:09:09 +02:00
|
|
|
if (get_context.State() == GetContext::kFound) {
|
2014-09-25 20:14:01 +02:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
return Status::NotFound();
|
|
|
|
}
|
|
|
|
|
2014-09-25 22:34:51 +02:00
|
|
|
std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>&,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 07:17:46 +01:00
|
|
|
assert(user_comparator_);
|
|
|
|
if (user_comparator_->timestamp_size() || options.timestamp) {
|
|
|
|
// TODO: support timestamp
|
|
|
|
return std::vector<Status>(keys.size(), Status::NotSupported());
|
|
|
|
}
|
2014-09-25 22:34:51 +02:00
|
|
|
autovector<TableReader*, 16> reader_list;
|
|
|
|
for (const auto& key : keys) {
|
|
|
|
const FdWithKeyRange& f = files_.files[FindFile(key)];
|
|
|
|
if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
|
|
|
|
reader_list.push_back(nullptr);
|
|
|
|
} else {
|
|
|
|
LookupKey lkey(key, kMaxSequenceNumber);
|
|
|
|
f.fd.table_reader->Prepare(lkey.internal_key());
|
|
|
|
reader_list.push_back(f.fd.table_reader);
|
|
|
|
}
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 07:17:46 +01:00
|
|
|
|
2014-09-25 22:34:51 +02:00
|
|
|
std::vector<Status> statuses(keys.size(), Status::NotFound());
|
|
|
|
values->resize(keys.size());
|
|
|
|
int idx = 0;
|
|
|
|
for (auto* r : reader_list) {
|
|
|
|
if (r != nullptr) {
|
2017-03-13 19:44:50 +01:00
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
std::string& value = (*values)[idx];
|
2014-09-29 20:09:09 +02:00
|
|
|
GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
|
2017-03-13 19:44:50 +01:00
|
|
|
GetContext::kNotFound, keys[idx], &pinnable_val,
|
2020-03-03 00:58:32 +01:00
|
|
|
nullptr, nullptr, nullptr, true, nullptr, nullptr);
|
2014-09-25 22:34:51 +02:00
|
|
|
LookupKey lkey(keys[idx], kMaxSequenceNumber);
|
2020-09-29 18:47:33 +02:00
|
|
|
Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
|
2021-03-23 21:47:56 +01:00
|
|
|
assert(static_cast<size_t>(idx) < statuses.size());
|
2020-09-29 18:47:33 +02:00
|
|
|
if (!s.ok() && !s.IsNotFound()) {
|
|
|
|
statuses[idx] = s;
|
|
|
|
} else {
|
|
|
|
value.assign(pinnable_val.data(), pinnable_val.size());
|
|
|
|
if (get_context.State() == GetContext::kFound) {
|
|
|
|
statuses[idx] = Status::OK();
|
|
|
|
}
|
2014-09-25 22:34:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
++idx;
|
|
|
|
}
|
|
|
|
return statuses;
|
|
|
|
}
|
|
|
|
|
2014-09-25 20:14:01 +02:00
|
|
|
// Recovers the default column family in read-only mode and caches the
// pointers (cfd_, version_, user_comparator_, files_) used by the read path.
//
// Succeeds only when all files sit in one level:
//   - exactly one file in L0 and no other non-empty level, or
//   - no file in L0 and files only in the last non-empty level.
// Any other layout yields NotSupported; a Recover() failure is returned
// unchanged.
Status CompactedDBImpl::Init(const Options& options) {
  SuperVersionContext sv_context(/* create_superversion */ true);

  // Recover() and InstallSuperVersion() run with mutex_ held; the
  // SuperVersionContext is cleaned only after the mutex is released.
  mutex_.Lock();
  ColumnFamilyDescriptor default_cf(kDefaultColumnFamilyName,
                                    ColumnFamilyOptions(options));
  Status s = Recover({default_cf}, true /* read only */, false, true);
  if (s.ok()) {
    cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
               ->cfd();
    cfd_->InstallSuperVersion(&sv_context, &mutex_);
  }
  mutex_.Unlock();
  sv_context.Clean();
  if (!s.ok()) {
    return s;
  }

  NewThreadStatusCfInfo(cfd_);
  version_ = cfd_->GetSuperVersion()->current;
  user_comparator_ = cfd_->user_comparator();

  auto* vstorage = version_->storage_info();
  if (vstorage->num_non_empty_levels() == 0) {
    return Status::NotSupported("no file exists");
  }

  // L0 may hold at most one file, and only if every other level is empty.
  const LevelFilesBrief& level0 = vstorage->LevelFilesBrief(0);
  if (level0.num_files > 1) {
    return Status::NotSupported("L0 contain more than 1 file");
  }
  if (level0.num_files == 1) {
    if (vstorage->num_non_empty_levels() > 1) {
      return Status::NotSupported("Both L0 and other level contain files");
    }
    files_ = level0;
    return Status::OK();
  }

  // L0 is empty: every level strictly between L0 and the last non-empty
  // level must be empty as well.
  const int last_level = vstorage->num_non_empty_levels() - 1;
  for (int lvl = 1; lvl < last_level; ++lvl) {
    if (vstorage->LevelFilesBrief(lvl).num_files > 0) {
      return Status::NotSupported("Other levels also contain files");
    }
  }

  if (vstorage->LevelFilesBrief(last_level).num_files > 0) {
    files_ = vstorage->LevelFilesBrief(last_level);
    return Status::OK();
  }
  return Status::NotSupported("no file exists");
}
|
|
|
|
|
|
|
|
// Creates a CompactedDBImpl for `dbname` and hands it back through `dbptr`.
//
// `*dbptr` is set to nullptr up front and is only overwritten on success.
// Returns InvalidArgument unless options.max_open_files == -1 and no merge
// operator is configured; otherwise returns the first failure from Init() or
// StartPeriodicWorkScheduler(), or OK.
Status CompactedDBImpl::Open(const Options& options,
                             const std::string& dbname, DB** dbptr) {
  *dbptr = nullptr;

  if (options.max_open_files != -1) {
    return Status::InvalidArgument("require max_open_files = -1");
  }
  if (options.merge_operator.get() != nullptr) {
    return Status::InvalidArgument("merge operator is not supported");
  }

  DBOptions db_options(options);
  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));

  Status s = db->Init(options);
  if (s.ok()) {
    s = db->StartPeriodicWorkScheduler();
  }
  if (!s.ok()) {
    // `db` is destroyed on the way out; the caller keeps the nullptr.
    return s;
  }

  ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
                 "Opened the db as fully compacted mode");
  LogFlush(db->immutable_db_options_.info_log);
  // Ownership passes to the caller.
  *dbptr = db.release();
  return s;
}
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2014-09-25 20:14:01 +02:00
|
|
|
#endif // ROCKSDB_LITE
|