Hui Xiao fc61e98ae6 Attempt to deflake DBLogicalBlockSizeCacheTest.CreateColumnFamilies (#9516)
Summary:
**Context:**
`DBLogicalBlockSizeCacheTest.CreateColumnFamilies` is flaky on a rare occurrence of assertion failure below
```
db/db_logical_block_size_cache_test.cc:210
Expected equality of these values:
  1
  cache_->GetRefCount(cf_path_0_)
    Which is: 2
```

Root-cause: `ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));` in the test may not successfully decrease the ref count of `cf_path_0_` since the decreasing only happens in the clean-up of `ColumnFamilyData` when `ColumnFamilyData` has no referencing to it, which may not be true when `db->DestroyColumnFamilyHandle(cfs[0])` is called since background work such as `DumpStats()` can hold reference to that `ColumnFamilyData` (suggested and repro-d by ajkr ). Similar case `ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));`.

See following for a deterministic repro:
```
 diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 196b428a3..4e7a834c4 100644
 --- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -956,10 +956,16 @@ void DBImpl::DumpStats() {
         // near-atomically.
         // Get a ref before unlocking
         cfd->Ref();
+        if (cfd->GetName() == "cf1" || cfd->GetName() == "cf2") {
+          TEST_SYNC_POINT("DBImpl::DumpStats:PostCFDRef");
+        }
         {
           InstrumentedMutexUnlock u(&mutex_);
           cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
         }
+        if (cfd->GetName() == "cf1" || cfd->GetName() == "cf2") {
+          TEST_SYNC_POINT("DBImpl::DumpStats::PreCFDUnrefAndTryDelete");
+        }
         cfd->UnrefAndTryDelete();
       }
     }
 diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc
index 1057871c9..c3872c036 100644
 --- a/db/db_logical_block_size_cache_test.cc
+++ b/db/db_logical_block_size_cache_test.cc
@@ -9,6 +9,7 @@
 #include "env/io_posix.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "test_util/sync_point.h"

 namespace ROCKSDB_NAMESPACE {
 class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper {
@@ -183,6 +184,15 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));

   std::vector<ColumnFamilyHandle*> cfs;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PostSetupTwoCFH",
+        "DBImpl::DumpStats:StartRunning"},
+       {"DBImpl::DumpStats:PostCFDRef",
+        "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PreDeleteTwoCFH"},
+       {"DBLogicalBlockSizeCacheTest::CreateColumnFamilies::"
+        "PostFinishCheckingRef",
+        "DBImpl::DumpStats::PreCFDUnrefAndTryDelete"}});
   ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs));
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -190,7 +200,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
   }

    // Delete one handle will not drop cache because another handle is still
   // referencing cf_path_0_.
+  TEST_SYNC_POINT(
+      "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PostSetupTwoCFH");
+  TEST_SYNC_POINT(
+      "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::PreDeleteTwoCFH");
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -209,16 +221,20 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
    // Will fail
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));

   // Delete the last handle will drop cache.
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
   // Will fail
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));

+  TEST_SYNC_POINT(
+      "DBLogicalBlockSizeCacheTest::CreateColumnFamilies::"
+      "PostFinishCheckingRef");
   delete db;
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(dbname_, options,
       {{"cf1", cf_options}, {"cf2", cf_options}}));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
```

**Summary**
- Removed the flaky assertion
- Clarified the comments for the test

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9516

Test Plan:
- CI
- Monitor for future flakiness

Reviewed By: ajkr

Differential Revision: D34055232

Pulled By: hx235

fbshipit-source-id: 9bf83ae5fa88bf6fc829876494d4692082e4c357
2022-03-04 11:35:28 -08:00
..
2022-02-18 14:23:07 -08:00
2020-06-15 10:47:02 -07:00
2022-02-03 15:15:23 -08:00
2020-12-09 16:02:12 -08:00