Fix block cache ID uniqueness for Windows builds (#5844)

Summary:
Since we do not evict a file's blocks from block cache before that file
is deleted, we require a file's cache ID prefix is both unique and
non-reusable. However, the Windows functionality we were relying on only
guaranteed uniqueness. That meant a newly created file could be assigned
the same cache ID prefix as a deleted file. If the newly created file
had block offsets matching the deleted file, full cache keys could be
exactly the same, resulting in obsolete data blocks returned from cache
when trying to read from the new file.

We noticed this when running on FAT32 where compaction was writing out
of order keys due to reading obsolete blocks from its input files. The
functionality is documented as behaving the same on NTFS, although I
wasn't able to repro it there.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5844

Test Plan:
we had a reliable repro of out-of-order keys on FAT32 that
was fixed by this change

Differential Revision: D17752442

fbshipit-source-id: 95d983f9196cf415f269e19293b97341edbf7e00
This commit is contained in:
Andrew Kryczka 2019-10-11 18:18:14 -07:00 committed by Facebook Github Bot
parent bc8b05cb77
commit b00761eea6
5 changed files with 33 additions and 64 deletions

View File

@ -1942,7 +1942,10 @@ TEST_F(DBTest2, TestPerfContextIterCpuTime) {
}
#endif // OS_LINUX
#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache
// breaks when that function is not implemented and no regular block cache is
// provided.
#if !defined(OS_SOLARIS) && !defined(OS_WIN)
TEST_F(DBTest2, PersistentCache) {
int num_iter = 80;
@ -2006,7 +2009,7 @@ TEST_F(DBTest2, PersistentCache) {
}
}
}
#endif // !OS_SOLARIS
#endif // !defined(OS_SOLARIS) && !defined(OS_WIN)
namespace {
void CountSyncPoint() {

11
env/env_test.cc vendored
View File

@ -883,7 +883,9 @@ TEST_F(EnvPosixTest, PositionedAppend) {
}
#endif // !ROCKSDB_LITE
// Only works in linux platforms
// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
// handle a return value of zero but this test case cannot.
#ifndef OS_WIN
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
// Create file.
if (env_ == Env::Default()) {
@ -926,6 +928,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
env_->DeleteFile(fname);
}
}
#endif // !defined(OS_WIN)
// only works in linux platforms
#ifdef ROCKSDB_FALLOCATE_PRESENT
@ -1016,7 +1019,9 @@ bool HasPrefix(const std::unordered_set<std::string>& ss) {
return false;
}
// Only works in linux and WIN platforms
// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
// handle a return value of zero but this test case cannot.
#ifndef OS_WIN
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
if (env_ == Env::Default()) {
// Check whether a bunch of concurrently existing files have unique IDs.
@ -1058,7 +1063,6 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
}
}
// Only works in linux and WIN platforms
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) {
if (env_ == Env::Default()) {
EnvOptions soptions;
@ -1098,6 +1102,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) {
ASSERT_TRUE(!HasPrefix(ids));
}
}
#endif // !defined(OS_WIN)
TEST_P(EnvPosixTestWithParam, MultiRead) {
EnvOptions soptions;

View File

@ -175,60 +175,18 @@ Status ftruncate(const std::string& filename, HANDLE hFile,
return status;
}
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length * 3) {
return 0;
}
#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
// MINGGW as defined by CMake file.
// yuslepukhin: I hate the guts of the above macros.
// This impl does not guarantee uniqueness everywhere
// is reasonably good
BY_HANDLE_FILE_INFORMATION FileInfo;
BOOL result = GetFileInformationByHandle(hFile, &FileInfo);
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
if (!result) {
return 0;
}
char* rid = id;
rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber));
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh));
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow));
assert(rid >= id);
return static_cast<size_t>(rid - id);
#else
FILE_ID_INFO FileInfo;
BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo,
sizeof(FileInfo));
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
if (!result) {
return 0;
}
static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber),
"Wrong sizeof expectations");
// FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t
static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier),
"Wrong sizeof expectations");
char* rid = id;
rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber));
uint64_t* file_id = reinterpret_cast<uint64_t*>(&FileInfo.FileId.Identifier[0]);
rid = EncodeVarint64(rid, *file_id);
++file_id;
rid = EncodeVarint64(rid, *file_id);
assert(rid >= id);
return static_cast<size_t>(rid - id);
#endif
size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
size_t /*max_size*/) {
// Returning 0 is safe as it causes the table reader to generate a unique ID.
// This is suboptimal for performance as it prevents multiple table readers
// for the same file from sharing cached blocks. For example, if users have
// a low value for `max_open_files`, there can be many table readers opened
// for the same file.
//
// TODO: this is a temporarily solution as it is safe but not optimal for
// performance. For more details see discussion in
// https://github.com/facebook/rocksdb/pull/5844.
return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -981,7 +981,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
// If the prefix wasn't generated or was too long,
// create one from the cache.
if (cc && *size == 0) {
if (cc != nullptr && *size == 0) {
char* end = EncodeVarint64(buffer, cc->NewId());
*size = static_cast<size_t>(end - buffer);
}
@ -994,7 +994,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
// If the prefix wasn't generated or was too long,
// create one from the cache.
if (*size == 0) {
if (cc != nullptr && *size == 0) {
char* end = EncodeVarint64(buffer, cc->NewId());
*size = static_cast<size_t>(end - buffer);
}

View File

@ -6,7 +6,10 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
// GetUniqueIdFromFile is not implemented on Windows. Persistent cache
// breaks when that function is not implemented
#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
#include "utilities/persistent_cache/persistent_cache_test.h"
@ -466,6 +469,6 @@ int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#else
#else // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
int main() { return 0; }
#endif
#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)