diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 12b880ca8..6a18cee8e 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1308,6 +1308,7 @@ class DBImpl : public DB { struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1507,6 +1508,12 @@ class DBImpl : public DB { Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Get the size of a log file and, if truncate is true, truncate the + // log file to its actual size, thereby freeing preallocated space. + // Return success even if truncate fails + Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log); + // Restore alive_log_files_ and total_log_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index c417ecba8..6524952b9 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1229,8 +1229,16 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, } } - if (status.ok() && data_seen && !flushed) { - status = RestoreAliveLogFiles(wal_numbers); + if (status.ok()) { + if (data_seen && !flushed) { + status = RestoreAliveLogFiles(wal_numbers); + } else { + // If there's no data in the WAL, or we flushed all the data, still + // truncate the log file. If the process goes into a crash loop before + // the file is deleted, the preallocated space will never get freed. + GetLogSizeAndMaybeTruncate(wal_numbers.back(), true, nullptr) + .PermitUncheckedError(); + } } event_logger_.Log() << "job" << job_id << "event" @@ -1239,6 +1247,40 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, return status; } +Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log_ptr) { + LogFileNumberSize log(wal_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); + Status s; + // This gets the appear size of the wals, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (s.ok() && truncate) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. + if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", wal_number, + truncate_status.ToString().c_str()); + } + } + if (log_ptr) { + *log_ptr = log; + } + return s; +} + Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { if (wal_numbers.empty()) { return Status::OK(); @@ -1254,39 +1296,17 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { total_log_size_ = 0; log_empty_ = false; for (auto wal_number : wal_numbers) { - LogFileNumberSize log(wal_number); - std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); - // This gets the appear size of the wals, not including preallocated space. - s = env_->GetFileSize(fname, &log.size); + // We preallocate space for wals, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + LogFileNumberSize log; + s = GetLogSizeAndMaybeTruncate( + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } total_log_size_ += log.size; alive_log_files_.push_back(log); - // We preallocate space for wals, but then after a crash and restart, those - // preallocated space are not needed anymore. It is likely only the last - // log has such preallocated space, so we only truncate for the last log. - if (wal_number == wal_numbers.back()) { - std::unique_ptr last_log; - Status truncate_status = fs_->ReopenWritableFile( - fname, - fs_->OptimizeForLogWrite( - file_options_, - BuildDBOptions(immutable_db_options_, mutable_db_options_)), - &last_log, nullptr); - if (truncate_status.ok()) { - truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); - } - if (truncate_status.ok()) { - truncate_status = last_log->Close(IOOptions(), nullptr); - } - // Not a critical error if fail to truncate. - if (!truncate_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to truncate log #%" PRIu64 ": %s", wal_number, - truncate_status.ToString().c_str()); - } - } } if (two_write_queues_) { log_write_mutex_.Unlock(); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index cc8c46335..69c42ff52 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -24,13 +24,37 @@ class DBWALTestBase : public DBTestBase { #if defined(ROCKSDB_PLATFORM_POSIX) public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of running file system. + // Skip this test if fallocate is not supported. + std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + uint64_t GetAllocatedFileSize(std::string file_name) { struct stat sbuf; int err = stat(file_name.c_str(), &sbuf); assert(err == 0); return sbuf.st_blocks * 512; } -#endif +#endif // ROCKSDB_PLATFORM_POSIX }; class DBWALTest : public DBWALTestBase { @@ -1849,24 +1873,9 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); return; } - // Test fallocate support of running file system. - // Skip this test if fallocate is not supported. - std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; - int fd = -1; - do { - fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - } while (fd < 0 && errno == EINTR); - ASSERT_GT(fd, 0); - int alloc_status = fallocate(fd, 0, 0, 1); - int err_number = errno; - close(fd); - ASSERT_OK(options.env->DeleteFile(fname_test_fallocate)); - if (err_number == ENOSYS || err_number == EOPNOTSUPP) { - fprintf(stderr, "Skipped preallocated space check: %s\n", - errnoStr(err_number).c_str()); + if (!IsFallocateSupported()) { return; } - ASSERT_EQ(0, alloc_status); DestroyAndReopen(options); size_t preallocated_size = @@ -1889,6 +1898,120 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), preallocated_size); } +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + options.avoid_flush_during_shutdown = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + // The log file has preallocated space. + Close(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // After the flush during Open, the log file should get deleted. However, + // if the process is in a crash loop, the log file may not get + // deleted and thte preallocated space will keep accumulating. So we need + // to ensure it gets trtuncated. + EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + Close(); + std::vector filenames; + std::string last_log; + uint64_t last_log_num = 0; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type, nullptr)) { + if (type == kWalFile && number > last_log_num) { + last_log = fname; + } + } + } + ASSERT_NE(last_log, ""); + last_log = dbname_ + '/' + last_log; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Preallocate space for the empty log file. This could happen if WAL data + // was buffered in memory and the process crashed. + std::unique_ptr log_file; + ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions())); + log_file->SetPreallocationBlockSize(preallocated_size); + log_file->PrepareWrite(0, 4096); + log_file.reset(); + + ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size); + + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // The preallocated space should be truncated. + EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX diff --git a/env/io_posix.cc b/env/io_posix.cc index 297b56794..d48f04723 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -1213,6 +1213,7 @@ IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, size_t block_size; size_t last_allocated_block; GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, diff --git a/env/mock_env.cc b/env/mock_env.cc index 539f20e93..3733371fc 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -724,11 +724,12 @@ IOStatus MockFileSystem::ReopenWritableFile( MemFile* file = nullptr; if (file_map_.find(fn) == file_map_.end()) { file = new MemFile(env_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); file_map_[fn] = file; } else { file = file_map_[fn]; } - file->Ref(); if (file_opts.use_direct_writes && !supports_direct_io_) { return IOStatus::NotSupported("Direct I/O Not Supported"); } else {