From 2ee20a669d3e676c8bdb05f51ac3531c778b1435 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka <andrewkr@fb.com>
Date: Tue, 28 Dec 2021 11:45:27 -0800
Subject: [PATCH 1/6] Extend trace filtering to more operation types (#9335)

Summary:
- Extended trace filtering to cover `MultiGet()`, `Seek()`, and `SeekForPrev()`. Now all user ops that can be traced support filtering.
- Enabled the new filter masks in `db_stress` since it only cares to trace writes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9335

Test Plan:
- trace-heavy `db_stress` command reduced 30% elapsed time  (79.21 -> 55.47 seconds)

Benchmark command:
```
$ /usr/bin/time ./db_stress -ops_per_thread=100000 -sync_fault_injection=1 --db=/dev/shm/rocksdb_stress_db/ --expected_values_dir=/dev/shm/rocksdb_stress_expected/ --clear_column_family_one_in=0
```

- replay-heavy `db_stress` command reduced 12.4% elapsed time (23.69 -> 20.75 seconds)

Setup command:
```
$  ./db_stress -ops_per_thread=100000000 -sync_fault_injection=1 -db=/dev/shm/rocksdb_stress_db/ -expected_values_dir=/dev/shm/rocksdb_stress_expected --clear_column_family_one_in=0 & sleep 120; pkill -9 db_stress
```

Benchmark command:
```
$ /usr/bin/time ./db_stress -ops_per_thread=1 -reopen=0 -expected_values_dir=/dev/shm/rocksdb_stress_expected/ -db=/dev/shm/rocksdb_stress_db/ --clear_column_family_one_in=0 --destroy_db_initially=0
```

Reviewed By: zhichao-cao

Differential Revision: D33304580

Pulled By: ajkr

fbshipit-source-id: 0df10f87c1fc506e9484b6b42cea2ef96c7ecd65
---
 HISTORY.md                       |  4 ++++
 db_stress_tool/expected_state.cc |  3 +++
 include/rocksdb/options.h        |  8 ++++++-
 trace_replay/trace_replay.cc     | 41 +++++++++++++++++++++++++++++---
 4 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index e20b00a1a..8b0bd1b70 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Public API change
+* Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named.
+
 ## 6.28.0 (2021-12-17)
 ### New Features
 * Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps.
diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc
index 86bfdb95e..6c2c51abc 100644
--- a/db_stress_tool/expected_state.cc
+++ b/db_stress_tool/expected_state.cc
@@ -298,6 +298,9 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
   if (s.ok()) {
     TraceOptions trace_opts;
     trace_opts.filter |= kTraceFilterGet;
+    trace_opts.filter |= kTraceFilterMultiGet;
+    trace_opts.filter |= kTraceFilterIteratorSeek;
+    trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
     s = db->StartTrace(trace_opts, std::move(trace_writer));
   }
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index f593e1243..2f18b132e 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1851,7 +1851,13 @@ enum TraceFilterType : uint64_t {
   // Do not trace the get operations
   kTraceFilterGet = 0x1 << 0,
   // Do not trace the write operations
-  kTraceFilterWrite = 0x1 << 1
+  kTraceFilterWrite = 0x1 << 1,
+  // Do not trace the `Iterator::Seek()` operations
+  kTraceFilterIteratorSeek = 0x1 << 2,
+  // Do not trace the `Iterator::SeekForPrev()` operations
+  kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+  // Do not trace the `MultiGet()` operations
+  kTraceFilterMultiGet = 0x1 << 4,
 };
 
 // TraceOptions is used for StartTrace
diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc
index 89bea7870..37b95852b 100644
--- a/trace_replay/trace_replay.cc
+++ b/trace_replay/trace_replay.cc
@@ -532,11 +532,46 @@ bool Tracer::ShouldSkipTrace(const TraceType& trace_type) {
   if (IsTraceFileOverMax()) {
     return true;
   }
-  if ((trace_options_.filter & kTraceFilterGet && trace_type == kTraceGet) ||
-      (trace_options_.filter & kTraceFilterWrite &&
-       trace_type == kTraceWrite)) {
+
+  TraceFilterType filter_mask = kTraceFilterNone;
+  switch (trace_type) {
+    case kTraceNone:
+    case kTraceBegin:
+    case kTraceEnd:
+      filter_mask = kTraceFilterNone;
+      break;
+    case kTraceWrite:
+      filter_mask = kTraceFilterWrite;
+      break;
+    case kTraceGet:
+      filter_mask = kTraceFilterGet;
+      break;
+    case kTraceIteratorSeek:
+      filter_mask = kTraceFilterIteratorSeek;
+      break;
+    case kTraceIteratorSeekForPrev:
+      filter_mask = kTraceFilterIteratorSeekForPrev;
+      break;
+    case kBlockTraceIndexBlock:
+    case kBlockTraceFilterBlock:
+    case kBlockTraceDataBlock:
+    case kBlockTraceUncompressionDictBlock:
+    case kBlockTraceRangeDeletionBlock:
+    case kIOTracer:
+      filter_mask = kTraceFilterNone;
+      break;
+    case kTraceMultiGet:
+      filter_mask = kTraceFilterMultiGet;
+      break;
+    case kTraceMax:
+      assert(false);
+      filter_mask = kTraceFilterNone;
+      break;
+  }
+  if (filter_mask != kTraceFilterNone && trace_options_.filter & filter_mask) {
     return true;
   }
+
   ++trace_request_count_;
   if (trace_request_count_ < trace_options_.sampling_frequency) {
     return true;

From aa2b3bf675a9d32122432aa497972920c7f07df5 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka <andrewkr@fb.com>
Date: Tue, 28 Dec 2021 15:03:03 -0800
Subject: [PATCH 2/6] Added `TraceOptions::preserve_write_order` (#9334)

Summary:
This option causes trace records to be written in the serialized write thread. That way, the write records in the trace must follow the same order as writes that are logged to WAL and writes that are applied to the DB.

By default I left it disabled to match existing behavior. I enabled it in `db_stress`, though, as that use case requires order of write records in trace matches the order in WAL.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9334

Test Plan:
- See if below unsynced data loss crash test can run  for 24h straight. It used to crash after a few hours when reaching an unlucky trace ordering.

```
DEBUG_LEVEL=0 TEST_TMPDIR=/dev/shm /usr/local/bin/python3 -u tools/db_crashtest.py blackbox --interval=10 --max_key=100000 --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --value_size_mult=33 --sync_fault_injection=1 --test_batches_snapshots=0 --duration=86400
```

Reviewed By: zhichao-cao

Differential Revision: D33301990

Pulled By: ajkr

fbshipit-source-id: 82d97559727adb4462a7af69758449c8725b22d3
---
 HISTORY.md                       |  1 +
 db/db_impl/db_impl_write.cc      | 40 +++++++++++++++++++++++++++++++-
 db_stress_tool/expected_state.cc |  1 +
 include/rocksdb/options.h        |  7 ++++++
 trace_replay/trace_replay.h      |  4 ++++
 5 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 8b0bd1b70..6e0c54e53 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### Public API change
 * Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named.
+* Added `TraceOptions::preserve_write_order`. When enabled it  guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression.
 
 ## 6.28.0 (2021-12-17)
 ### New Features
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index b3d57643d..72a47d83a 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -75,9 +75,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   if (my_batch == nullptr) {
     return Status::Corruption("Batch is nullptr!");
   }
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
   if (tracer_) {
     InstrumentedMutexLock lock(&trace_mutex_);
-    if (tracer_) {
+    if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
+      // We don't have to preserve write order so can trace anywhere. It's more
+      // efficient to trace here than to add latency to a phase of the log/apply
+      // pipeline.
       // TODO: maybe handle the tracing status?
       tracer_->Write(my_batch).PermitUncheckedError();
     }
@@ -249,6 +254,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   IOStatus io_s;
   Status pre_release_cb_status;
   if (status.ok()) {
+    // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+    // grabs but does not seem thread-safe.
+    if (tracer_) {
+      InstrumentedMutexLock lock(&trace_mutex_);
+      if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+        for (auto* writer : write_group) {
+          // TODO: maybe handle the tracing status?
+          tracer_->Write(writer->batch).PermitUncheckedError();
+        }
+      }
+    }
     // Rules for when we can update the memtable concurrently
     // 1. supported by memtable
     // 2. Puts are not okay if inplace_update_support
@@ -498,6 +514,17 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     size_t total_byte_size = 0;
 
     if (w.status.ok()) {
+      // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+      // grabs but does not seem thread-safe.
+      if (tracer_) {
+        InstrumentedMutexLock lock(&trace_mutex_);
+        if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+          for (auto* writer : wal_write_group) {
+            // TODO: maybe handle the tracing status?
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
+        }
+      }
       SequenceNumber next_sequence = current_sequence;
       for (auto writer : wal_write_group) {
         if (writer->CheckCallback(this)) {
@@ -722,6 +749,17 @@ Status DBImpl::WriteImplWALOnly(
   write_thread->EnterAsBatchGroupLeader(&w, &write_group);
   // Note: no need to update last_batch_group_size_ here since the batch writes
   // to WAL only
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+      for (auto* writer : write_group) {
+        // TODO: maybe handle the tracing status?
+        tracer_->Write(writer->batch).PermitUncheckedError();
+      }
+    }
+  }
 
   size_t pre_release_callback_cnt = 0;
   size_t total_byte_size = 0;
diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc
index 6c2c51abc..c787ff757 100644
--- a/db_stress_tool/expected_state.cc
+++ b/db_stress_tool/expected_state.cc
@@ -301,6 +301,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
     trace_opts.filter |= kTraceFilterMultiGet;
     trace_opts.filter |= kTraceFilterIteratorSeek;
     trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
+    trace_opts.preserve_write_order = true;
     s = db->StartTrace(trace_opts, std::move(trace_writer));
   }
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 2f18b132e..a8123328c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1870,6 +1870,13 @@ struct TraceOptions {
   uint64_t sampling_frequency = 1;
   // Note: The filtering happens before sampling.
   uint64_t filter = kTraceFilterNone;
+  // When true, the order of write records in the trace will match the order of
+  // the corresponding write records in the WAL and applied to the DB. There may
+  // be a performance penalty associated with preserving this ordering.
+  //
+  // Default: false. This means write records in the trace may be in an order
+  // different from the WAL's order.
+  bool preserve_write_order = false;
 };
 
 // ImportColumnFamilyOptions is used by ImportColumnFamily()
diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h
index 4990accef..9aba5ceb7 100644
--- a/trace_replay/trace_replay.h
+++ b/trace_replay/trace_replay.h
@@ -150,6 +150,10 @@ class Tracer {
   // False otherwise.
   bool IsTraceFileOverMax();
 
+  // Returns true if the order of write trace records must match the order of
+  // the corresponding records logged to WAL and applied to the DB.
+  bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; }
+
   // Writes a trace footer at the end of the tracing
   Status Close();
 

From 26a238f5b75683ff59d4d2e361ef68c2c242deac Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@fb.com>
Date: Tue, 28 Dec 2021 21:53:26 -0800
Subject: [PATCH 3/6] New blog post for Ribbon filter (#8992)

Summary:
new blog post for Ribbon filter

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8992

Test Plan: markdown render in GitHub, Pages on my fork

Reviewed By: jay-zhuang

Differential Revision: D33342496

Pulled By: pdillinger

fbshipit-source-id: a0a7c19100abdf8755f8a618eb4dead755dfddae
---
 docs/_posts/2021-12-29-ribbon-filter.markdown | 281 ++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 docs/_posts/2021-12-29-ribbon-filter.markdown

diff --git a/docs/_posts/2021-12-29-ribbon-filter.markdown b/docs/_posts/2021-12-29-ribbon-filter.markdown
new file mode 100644
index 000000000..c6a52ce84
--- /dev/null
+++ b/docs/_posts/2021-12-29-ribbon-filter.markdown
@@ -0,0 +1,281 @@
+---
+title: Ribbon Filter
+layout: post
+author: pdillinger
+category: blog
+---
+
+## Summary
+Since version 6.15 last year, RocksDB supports Ribbon filters, a new
+alternative to Bloom filters that save space, especially memory, at
+the cost of more CPU usage, mostly in constructing the filters in the
+background. Most applications with long-lived data (many hours or
+longer) will likely benefit from adopting a Ribbon+Bloom hybrid filter
+policy. Here we explain why and how.
+
+[Ribbon filter on RocksDB wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter)
+
+[Ribbon filter paper](https://arxiv.org/abs/2103.02515)
+
+## Problem & background
+Bloom filters play a critical role in optimizing point queries and
+some range queries in LSM-tree storage systems like RocksDB. Very
+large DBs can use 10% or more of their RAM memory for (Bloom) filters,
+so that (average case) read performance can be very good despite high
+(worst case) read amplification, [which is useful for lowering write
+and/or space
+amplification](http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html).
+Although the `format_version=5` Bloom filter in RocksDB is extremely
+fast, all Bloom filters use around 50% more space than is
+theoretically possible for a hashed structure configured for the same
+false positive (FP) rate and number of keys added. What would it take
+to save that significant share of “wasted” filter memory, and when
+does it make sense to use such a Bloom alternative?
+
+A number of alternatives to Bloom filters were known, especially for
+static filters (not modified after construction), but all the
+previously known structures were unsatisfying for SSTs because of some
+combination of
+* Not enough space savings for CPU increase. For example, [Xor
+  filters](https://arxiv.org/abs/1912.08258) use 3-4x more CPU than
+  Bloom but only save 15-20% of
+  space. [GOV](https://arxiv.org/pdf/1603.04330.pdf) can save around
+  30% space but requires around 10x more CPU than Bloom.
+* Inconsistent space savings. [Cuckoo
+  filters](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf)
+  and Xor+ filters offer significant space savings for very low FP
+  rates (high bits per key) but little or no savings for higher FP
+  rates (low bits per key). ([Higher FP rates are considered best for
+  largest levels of
+  LSM.](https://stratos.seas.harvard.edu/files/stratos/files/monkeykeyvaluestore.pdf))
+  [Spatially-coupled Xor
+  filters](https://arxiv.org/pdf/2001.10500.pdf) require very large
+  number of keys per filter for large space savings.
+* Inflexible configuration. No published alternatives offered the same
+  continuous configurability of Bloom filters, where any FP rate and
+  any fractional bits per key could be chosen. This flexibility
+  improves memory efficiency with the `optimize_filters_for_memory`
+  option that minimizes internal fragmentation on filters.
+
+## Ribbon filter development and implementation
+The Ribbon filter came about when I developed a faster, simpler, and
+more adaptable algorithm for constructing a little-known [Xor-based
+structure from Dietzfelbinger and
+Walzer](https://arxiv.org/pdf/1907.04750.pdf). It has very good space
+usage for required CPU time (~30% space savings for 3-4x CPU) and,
+with some engineering, Bloom-like configurability. The complications
+were managable for use in RocksDB:
+* Ribbon space efficiency does not naturally scale to very large
+  number of keys in a single filter (whole SST file or partition), but
+  with the current 128-bit Ribbon implementation in RocksDB, even 100
+  million keys in one filter saves 27% space vs. Bloom rather than 30%
+  for 100,000 keys in a filter.
+* More temporary memory is required during construction, ~230 bits per
+  key for 128-bit Ribbon vs. ~75 bits per key for Bloom filter. A
+  quick calculation shows that if you are saving 3 bits per key on the
+  generated filter, you only need about 50 generated filters in memory
+  to offset this temporary memory usage. (Thousands of filters in
+  memory is typical.) Starting in RocksDB version 6.27, this temporary
+  memory can be accounted for under block cache using
+  `BlockBasedTableOptions::reserve_table_builder_memory`.
+* Ribbon filter queries use relatively more CPU for lower FP rates
+  (but still O(1) relative to number of keys added to filter). This
+  should be OK because lower FP rates are only appropriate when then
+  cost of a false positive is very high (worth extra query time) or
+  memory is not so constrained (can use Bloom instead).
+
+Future: data in [the paper](https://arxiv.org/abs/2103.02515) suggests
+that 32-bit Balanced Ribbon (new name: [Bump-Once
+Ribbon](https://arxiv.org/pdf/2109.01892.pdf)) would improve all of
+these issues and be better all around (except for code complexity).
+
+## Ribbon vs. Bloom in RocksDB configuration
+Different applications and hardware configurations have different
+constraints, but we can use hardware costs to examine and better
+understand the trade-off between Bloom and Ribbon.
+
+### Same FP rate, RAM vs. CPU hardware cost
+Under ideal conditions where we can adjust our hardware to suit the
+application, in terms of dollars, how much does it cost to construct,
+query, and keep in memory a Bloom filter vs. a Ribbon filter?  The
+Ribbon filter costs more for CPU but less for RAM. Importantly, the
+RAM cost directly depends on how long the filter is kept in memory,
+which in RocksDB is essentially the lifetime of the filter.
+(Temporary RAM during construction is so short-lived that it is
+ignored.)  Using some consumer hardware and electricity prices and a
+predicted balance between construction and queries, we can compute a
+“break even” duration in memory. To minimize cost, filters with a
+lifetime shorter than this should be Bloom and filters with a lifetime
+longer than this should be Ribbon. (Python code)
+
+```
+# Commodity prices based roughly on consumer prices and rough guesses
+# Upfront cost of a CPU per hardware thread
+upfront_dollars_per_cpu_thread = 30.0
+
+# CPU average power usage per hardware thread
+watts_per_cpu_thread = 3.5
+
+# Upfront cost of a GB of RAM
+upfront_dollars_per_gb_ram = 8.0
+
+# RAM average power usage per GB
+# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use
+watts_per_gb_ram = 0.375
+
+# Estimated price of power per kilowatt-hour, including overheads like conversion losses and cooling
+dollars_per_kwh = 0.35
+
+# Assume 3 year hardware lifetime
+hours_per_lifetime = 3 * 365 * 24
+seconds_per_lifetime = hours_per_lifetime * 60 * 60
+
+# Number of filter queries per key added in filter construction is heavily dependent on workload.
+# When replication is in layer above RocksDB, it will be low, likely < 1. When replication is in
+# storage layer below RocksDB, it will likely be > 1. Using a rough and general guesstimate.
+key_query_per_construct = 1.0
+
+#==================================
+# Bloom & Ribbon filter performance
+typical_bloom_bits_per_key = 10.0
+typical_ribbon_bits_per_key = 7.0
+
+# Speeds here are sensitive to many variables, especially query speed because it
+# is so dependent on memory latency. Using this benchmark here:
+# for IMPL in 2 3; do
+#   ./filter_bench -impl=$IMPL -quick -m_keys_total_max=200 -use_full_block_reader
+# done
+# and "Random filter" queries.
+nanoseconds_per_construct_bloom_key = 32.0
+nanoseconds_per_construct_ribbon_key = 140.0
+
+nanoseconds_per_query_bloom_key = 500.0
+nanoseconds_per_query_ribbon_key = 600.0
+
+#==================================
+# Some constants
+kwh_per_watt_lifetime = hours_per_lifetime / 1000.0
+bits_per_gb = 8 * 1024 * 1024 * 1024
+
+#==================================
+# Crunching the numbers
+# on CPU for constructing filters
+dollars_per_cpu_thread_lifetime = upfront_dollars_per_cpu_thread + watts_per_cpu_thread * kwh_per_watt_lifetime * dollars_per_kwh
+dollars_per_cpu_thread_second = dollars_per_cpu_thread_lifetime / seconds_per_lifetime
+
+dollars_per_construct_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_bloom_key / 10**9
+dollars_per_construct_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_ribbon_key / 10**9
+
+dollars_per_query_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_query_bloom_key / 10**9
+dollars_per_query_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_query_ribbon_key / 10**9
+
+dollars_per_bloom_key_cpu = dollars_per_construct_bloom_key + key_query_per_construct * dollars_per_query_bloom_key
+dollars_per_ribbon_key_cpu = dollars_per_construct_ribbon_key + key_query_per_construct * dollars_per_query_ribbon_key
+
+# on holding filters in RAM
+dollars_per_gb_ram_lifetime = upfront_dollars_per_gb_ram + watts_per_gb_ram * kwh_per_watt_lifetime * dollars_per_kwh
+dollars_per_gb_ram_second = dollars_per_gb_ram_lifetime / seconds_per_lifetime
+
+dollars_per_bloom_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_bloom_bits_per_key
+dollars_per_ribbon_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_ribbon_bits_per_key
+
+#==================================
+# How many seconds does it take for the added cost of constructing a ribbon filter instead
+# of bloom to be offset by the added cost of holding the bloom filter in memory?
+break_even_seconds = (dollars_per_ribbon_key_cpu - dollars_per_bloom_key_cpu) / (dollars_per_bloom_key_in_ram_second - dollars_per_ribbon_key_in_ram_second)
+print(break_even_seconds)
+# -> 3235.1647730256936
+```
+
+So roughly speaking, filters that live in memory for more than an hour
+should be Ribbon, and filters that live less than an hour should be
+Bloom. This is very interesting, but how long do filters live in
+RocksDB?
+
+First let's consider the average case. Write-heavy RocksDB loads are
+often backed by flash storage, which has some specified write
+endurance for its intended lifetime. This can be expressed as *device
+writes per day* (DWPD), and supported DWPD is typically < 10.0 even
+for high end devices (excluding NVRAM). Roughly speaking, the DB would
+need to be writing at a rate of 20+ DWPD for data to have an average
+lifetime of less than one hour. Thus, unless you are prematurely
+burning out your flash or massively under-utilizing available storage,
+using the Ribbon filter has the better cost profile *on average*.
+
+### Predictable lifetime
+But we can do even better than optimizing for the average case. LSM
+levels give us very strong data lifetime hints.  Data in L0 might live
+for minutes or a small number of hours. Data in Lmax might live for
+days or weeks. So even if Ribbon filters weren't the best choice on
+average for a workload, they almost certainly make sense for the
+larger, longer-lived levels of the LSM. As of RocksDB 6.24, you can
+specify a minimum LSM level for Ribbon filters with
+`NewRibbonFilterPolicy`, and earlier levels will use Bloom filters.
+
+### Resident filter memory
+The above analysis assumes that nearly all filters for all live SST
+files are resident in memory. This is true if using
+`cache_index_and_filter_blocks=0` and `max_open_files=-1` (defaults),
+but `cache_index_and_filter_blocks=1` is popular. In that case,
+if you use `optimize_filters_for_hits=1` and non-partitioned filters
+(a popular MyRocks configuration), it is also likely that nearly all
+live filters are in memory. However, if you don't use
+`optimize_filters_for_hits` and use partitioned filters, then
+cold data (by age or by key range) can lead to only a portion of
+filters being resident in memory. In that case, benefit from Ribbon
+filter is not as clear, though because Ribbon filters are smaller,
+they are more efficient to read into memory.
+
+RocksDB version 6.21 and later include a rough feature to determine
+block cache usage for data blocks, filter blocks, index blocks, etc.
+Data like this is periodically dumped to LOG file
+(`stats_dump_period_sec`):
+
+```
+Block cache entry stats(count,size,portion): DataBlock(441761,6.82 GB,75.765%) FilterBlock(3002,1.27 GB,14.1387%) IndexBlock(17777,887.75 MB,9.63267%) Misc(1,0.00 KB,0%)
+Block cache LRUCache@0x7fdd08104290#7004432 capacity: 9.00 GB collections: 2573 last_copies: 10 last_secs: 0.143248 secs_since: 0
+```
+
+This indicates that at this moment in time, the block cache object
+identified by `LRUCache@0x7fdd08104290#7004432` (potentially used
+by multiple DBs) uses roughly 14% of its 9GB, about 1.27 GB, on filter
+blocks. This same data is available through `DB::GetMapProperty` with
+`DB::Properties::kBlockCacheEntryStats`, and (with some effort) can
+be compared to total size of all filters (not necessarily in memory)
+using `rocksdb.filter.size` from
+`DB::Properties::kAggregatedTableProperties`.
+
+### Sanity checking lifetime
+Can we be sure that using filters even makes sense for such long-lived
+data? We can apply [the current 5 minute rule for caching SSD data in
+RAM](http://renata.borovica-gajic.com/data/adms2017_5minuterule.pdf). A
+4KB filter page holds data for roughly 4K keys. If we assume at least
+one negative (useful) filter query in its lifetime per added key, it
+can satisfy the 5 minute rule with a lifetime of up to about two
+weeks. Thus, the lifetime threshold for “no filter” is about 300x
+higher than the lifetime threshold for Ribbon filter.
+
+### What to do with saved memory
+The default way to improve overall RocksDB performance with more
+available memory is to use more space for caching, which improves
+latency, CPU load, read IOs, etc.  With
+`cache_index_and_filter_blocks=1`, savings in filters will
+automatically make room for caching more data blocks in block
+cache. With `cache_index_and_filter_blocks=0`, consider increasing
+block cache size.
+
+Using the space savings to lower filter FP rates is also an option,
+but there is less evidence for this commonly improving existing
+*optimized* configurations.
+
+## Generic recommendation
+If using `NewBloomFilterPolicy(bpk)` for a large persistent DB using
+compression, try using `NewRibbonFilterPolicy(bpk)` instead, which
+will generate Ribbon filters during compaction and Bloom filters
+for flush, both with the same FP rate as the old setting. Once new SST
+files are generated under the new policy, this should free up some
+memory for more caching without much effect on burst or sustained
+write speed. Both kinds of filters can be read under either policy, so
+there's always an option to adjust settings or gracefully roll back to
+using Bloom filter only (keeping in mind that SST files must be
+replaced to see effect of that change).

From 0a563ae2781728a8548225a46a4703bde4b75c90 Mon Sep 17 00:00:00 2001
From: mrambacher <mrambach@gmail.com>
Date: Wed, 29 Dec 2021 03:40:45 -0800
Subject: [PATCH 4/6] Change GTEST_SKIP to BYPASS for MemoryAllocatorTest
 (#9340)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/9340

Reviewed By: riversand963

Differential Revision: D33344152

Pulled By: mrambacher

fbshipit-source-id: 283637625b86c33497571c5f52cac3ddf910b6f3
---
 memory/memory_allocator_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/memory/memory_allocator_test.cc b/memory/memory_allocator_test.cc
index 3cc14840e..90aed448b 100644
--- a/memory/memory_allocator_test.cc
+++ b/memory/memory_allocator_test.cc
@@ -141,7 +141,7 @@ TEST_F(CreateMemoryAllocatorTest, JemallocOptionsTest) {
   Status s = MemoryAllocator::CreateFromString(config_options_, id, &allocator);
   if (!JemallocNodumpAllocator::IsSupported()) {
     ASSERT_TRUE(s.IsNotSupported());
-    ROCKSDB_GTEST_SKIP("JEMALLOC not supported");
+    ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
     return;
   }
   ASSERT_OK(s);
@@ -193,7 +193,7 @@ TEST_F(CreateMemoryAllocatorTest, NewJemallocNodumpAllocator) {
   std::string msg;
   if (!JemallocNodumpAllocator::IsSupported(&msg)) {
     ASSERT_TRUE(s.IsNotSupported());
-    ROCKSDB_GTEST_SKIP("JEMALLOC not supported");
+    ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
     return;
   }
   ASSERT_NOK(s);  // Invalid options

From 1c39b7952bfff1beff1d473444cd75c3313b73bd Mon Sep 17 00:00:00 2001
From: mrambacher <mrambach@gmail.com>
Date: Wed, 29 Dec 2021 07:55:17 -0800
Subject: [PATCH 5/6] Remove/Reduce use of Regex in ObjectRegistry/Library
 (#9264)

Summary:
Added new ObjectLibrary::Entry classes to replace/reduce the use of Regex.  For simple factories that only do name matching, there are "StringEntry" and "AltStringEntry" classes.  For classes that use some semblance of regular expressions, there is a PatternEntry class that can match a name and prefixes.  There is also a class for Customizable::IndividualId format matches.

Added tests for the new derivative classes and got all unit tests to pass.

Resolves https://github.com/facebook/rocksdb/issues/9225.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9264

Reviewed By: pdillinger

Differential Revision: D33062001

Pulled By: mrambacher

fbshipit-source-id: c2d2143bd2d38bdf522705c8280c35381b135c03
---
 env/env_encryption.cc                       |  10 +-
 include/rocksdb/utilities/object_registry.h | 211 +++++++++++----
 options/customizable_test.cc                | 112 ++++++--
 options/options_test.cc                     |   5 +-
 table/plain/plain_table_factory.cc          |  23 +-
 test_util/testutil.cc                       |  35 ++-
 util/slice.cc                               |  38 +--
 utilities/merge_operators.cc                |  21 +-
 utilities/object_registry.cc                | 135 +++++++---
 utilities/object_registry_test.cc           | 268 +++++++++++++++++---
 10 files changed, 665 insertions(+), 193 deletions(-)

diff --git a/env/env_encryption.cc b/env/env_encryption.cc
index 97c7439c4..4a2b68bc7 100644
--- a/env/env_encryption.cc
+++ b/env/env_encryption.cc
@@ -1274,10 +1274,10 @@ static void RegisterEncryptionBuiltins() {
   static std::once_flag once;
   std::call_once(once, [&]() {
     auto lib = ObjectRegistry::Default()->AddLibrary("encryption");
-    std::string ctr =
-        std::string(CTREncryptionProvider::kClassName()) + "?(://test)";
+    // Match "CTR" or "CTR://test"
     lib->Register<EncryptionProvider>(
-        std::string(CTREncryptionProvider::kClassName()) + "(://test)?",
+        ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true)
+            .AddSuffix("://test"),
         [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
            std::string* /*errmsg*/) {
           if (EndsWith(uri, "://test")) {
@@ -1300,8 +1300,10 @@ static void RegisterEncryptionBuiltins() {
           return guard->get();
         });
 
+    // Match "ROT13" or "ROT13:[0-9]+"
     lib->Register<BlockCipher>(
-        std::string(ROT13BlockCipher::kClassName()) + "(:.*)?",
+        ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true)
+            .AddNumber(":"),
         [](const std::string& uri, std::unique_ptr<BlockCipher>* guard,
            std::string* /* errmsg */) {
           size_t colon = uri.find(':');
diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h
index b21868709..bc7d7ce3f 100644
--- a/include/rocksdb/utilities/object_registry.h
+++ b/include/rocksdb/utilities/object_registry.h
@@ -16,7 +16,6 @@
 #include <vector>
 
 #include "rocksdb/status.h"
-#include "rocksdb/utilities/regex.h"
 
 namespace ROCKSDB_NAMESPACE {
 class Customizable;
@@ -43,58 +42,158 @@ using ConfigureFunc = std::function<Status(T*)>;
 
 class ObjectLibrary {
  public:
+  // Class for matching target strings to a pattern.
+  // Entries consist of a name that starts the pattern and attributes
+  // The following attributes can be added to the entry:
+  //   -Suffix: Comparable to name(suffix)
+  //   -Separator: Comparable to name(separator).+
+  //   -Number: Comparable to name(separator).[0-9]+
+  //   -AltName: Comparable to (name|alt)
+  //   -Optional: Comparable to name(separator)?
+  // Multiple separators can be combined and cause multiple matches.
+  // For example, Pattern("A").AnotherName("B"),AddSeparator("@").AddNumber("#")
+  // is roughly equivalent to "(A|B)@.+#.+"
+  //
+  // Note that though this class does provide some regex-style matching,
+  // it is not a full regex parser and has some key differences:
+  //   - Separators are matched left-most.  For example, an entry
+  //     Name("Hello").AddSeparator(" ").AddSuffix("!") would match
+  //     "Hello world!", but not "Hello world!!"
+  //   - No backtracking is necessary, enabling reliably efficient matching
+  class PatternEntry {
+   private:
+    enum Quantifier {
+      kMatchPattern,  // [suffix].+
+      kMatchExact,    // [suffix]
+      kMatchNumeric,  // [suffix][0-9]+
+    };
+
+   public:
+    // Short-cut for creating an entry that matches to a
+    // Customizable::IndividualId
+    static PatternEntry AsIndividualId(const std::string& name) {
+      PatternEntry entry(name, true);
+      entry.AddSeparator("@");
+      entry.AddSeparator("#");
+      return entry;
+    }
+
+    // Creates a new pattern entry for "name".  If optional is true,
+    // Matches will also return true if name==target
+    explicit PatternEntry(const std::string& name, bool optional = true)
+        : name_(name), optional_(optional), slength_(0) {
+      nlength_ = name_.size();
+    }
+
+    // Adds a suffix (exact match of separator with no trailing characters) to
+    // the separator
+    PatternEntry& AddSuffix(const std::string& suffix) {
+      separators_.emplace_back(suffix, kMatchExact);
+      slength_ += suffix.size();
+      return *this;
+    }
+
+    // Adds a separator (exact match of separator with trailing characters) to
+    // the entry
+    PatternEntry& AddSeparator(const std::string& separator) {
+      separators_.emplace_back(separator, kMatchPattern);
+      slength_ += separator.size() + 1;
+      return *this;
+    }
+
+    // Adds a separator (exact match of separator with trailing numbers) to the
+    // entry
+    PatternEntry& AddNumber(const std::string& separator) {
+      separators_.emplace_back(separator, kMatchNumeric);
+      slength_ += separator.size() + 1;
+      return *this;
+    }
+
+    // Sets another name that this entry will match, similar to (name|alt)
+    PatternEntry& AnotherName(const std::string& alt) {
+      names_.emplace_back(alt);
+      return *this;
+    }
+
+    // Sets whether the separators are required -- similar to name(separator)?
+    // If optional is true, then name(separator)? would match
+    // If optional is false, then the separators must also match
+    PatternEntry& SetOptional(bool optional) {
+      optional_ = optional;
+      return *this;
+    }
+
+    // Checks to see if the target matches this entry
+    bool Matches(const std::string& target) const;
+    const char* Name() const { return name_.c_str(); }
+
+   private:
+    size_t MatchSeparatorAt(size_t start, Quantifier mode,
+                            const std::string& target, size_t tlen,
+                            const std::string& pattern) const;
+
+    bool MatchesTarget(const std::string& name, size_t nlen,
+                       const std::string& target, size_t ylen) const;
+    std::string name_;                // The base name for this entry
+    size_t nlength_;                  // The length of name_
+    std::vector<std::string> names_;  // Alternative names for this entry
+    bool optional_;   // Whether matching of separators is required
+    size_t slength_;  // The minimum required length to match the separators
+    std::vector<std::pair<std::string, Quantifier>>
+        separators_;  // What to match
+  };                  // End class Entry
+
+ private:
   // Base class for an Entry in the Registry.
   class Entry {
    public:
     virtual ~Entry() {}
-    Entry(const std::string& name) : name_(std::move(name)) {}
-
-    // Checks to see if the target matches this entry
-    virtual bool matches(const std::string& target) const {
-      return name_ == target;
-    }
-    const std::string& Name() const { return name_; }
-
-   private:
-    const std::string name_;  // The name of the Entry
-  };                          // End class Entry
+    virtual bool Matches(const std::string& target) const = 0;
+    virtual const char* Name() const = 0;
+  };
 
   // An Entry containing a FactoryFunc for creating new Objects
-  //
-  // !!!!!! WARNING !!!!!!: The implementation currently uses std::regex, which
-  // has terrible performance in some cases, including possible crash due to
-  // stack overflow. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582
-  // for example. Avoid complicated regexes as much as possible.
   template <typename T>
   class FactoryEntry : public Entry {
    public:
-    FactoryEntry(const std::string& name, FactoryFunc<T> f)
-        : Entry(name), factory_(std::move(f)) {
-      // FIXME: the API needs to expose this failure mode. For now, bad regexes
-      // will match nothing.
-      Regex::Parse(name, &regex_).PermitUncheckedError();
-    }
-    ~FactoryEntry() override {}
-    bool matches(const std::string& target) const override {
-      return regex_.Matches(target);
+    FactoryEntry(const PatternEntry& e, FactoryFunc<T> f)
+        : entry_(e), factory_(std::move(f)) {}
+    bool Matches(const std::string& target) const override {
+      return entry_.Matches(target);
     }
+    const char* Name() const override { return entry_.Name(); }
+
     // Creates a new T object.
     T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard,
                         std::string* msg) const {
       return factory_(target, guard, msg);
     }
+    const FactoryFunc<T>& GetFactory() const { return factory_; }
 
    private:
-    Regex regex_;  // The pattern for this entry
+    PatternEntry entry_;  // The pattern for this entry
     FactoryFunc<T> factory_;
   };  // End class FactoryEntry
  public:
   explicit ObjectLibrary(const std::string& id) { id_ = id; }
 
   const std::string& GetID() const { return id_; }
-  // Finds the entry matching the input name and type
-  const Entry* FindEntry(const std::string& type,
-                         const std::string& name) const;
+
+  template <typename T>
+  FactoryFunc<T> FindFactory(const std::string& pattern) const {
+    std::unique_lock<std::mutex> lock(mu_);
+    auto factories = factories_.find(T::Type());
+    if (factories != factories_.end()) {
+      for (const auto& e : factories->second) {
+        if (e->Matches(pattern)) {
+          const auto* fe =
+              static_cast<const ObjectLibrary::FactoryEntry<T>*>(e.get());
+          return fe->GetFactory();
+        }
+      }
+    }
+    return nullptr;
+  }
 
   // Returns the total number of factories registered for this library.
   // This method returns the sum of all factories registered for all types.
@@ -108,9 +207,18 @@ class ObjectLibrary {
   template <typename T>
   const FactoryFunc<T>& Register(const std::string& pattern,
                                  const FactoryFunc<T>& factory) {
-    std::unique_ptr<Entry> entry(new FactoryEntry<T>(pattern, factory));
-    AddEntry(T::Type(), entry);
-    return factory;
+    PatternEntry entry(pattern);
+    return Register(entry, factory);
+  }
+
+  template <typename T>
+  const FactoryFunc<T>& Register(const PatternEntry& pattern,
+                                 const FactoryFunc<T>& func) {
+    std::unique_ptr<Entry> entry(new FactoryEntry<T>(pattern, func));
+    std::unique_lock<std::mutex> lock(mu_);
+    auto& factories = factories_[T::Type()];
+    factories.emplace_back(std::move(entry));
+    return func;
   }
 
   // Invokes the registrar function with the supplied arg for this library.
@@ -122,13 +230,11 @@ class ObjectLibrary {
   static std::shared_ptr<ObjectLibrary>& Default();
 
  private:
-  // Adds the input entry to the list for the given type
-  void AddEntry(const std::string& type, std::unique_ptr<Entry>& entry);
-
   // Protects the entry map
   mutable std::mutex mu_;
   // ** FactoryFunctions for this loader, organized by type
-  std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>> entries_;
+  std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>>
+      factories_;
 
   // The name for this library
   std::string id_;
@@ -178,11 +284,9 @@ class ObjectRegistry {
   T* NewObject(const std::string& target, std::unique_ptr<T>* guard,
                std::string* errmsg) {
     guard->reset();
-    const auto* basic = FindEntry(T::Type(), target);
-    if (basic != nullptr) {
-      const auto* factory =
-          static_cast<const ObjectLibrary::FactoryEntry<T>*>(basic);
-      return factory->NewFactoryObject(target, guard, errmsg);
+    auto factory = FindFactory<T>(target);
+    if (factory != nullptr) {
+      return factory(target, guard, errmsg);
     } else {
       *errmsg = std::string("Could not load ") + T::Type();
       return nullptr;
@@ -386,8 +490,27 @@ class ObjectRegistry {
   Status SetManagedObject(const std::string& type, const std::string& id,
                           const std::shared_ptr<Customizable>& c);
 
-  const ObjectLibrary::Entry* FindEntry(const std::string& type,
-                                        const std::string& name) const;
+  // Searches (from back to front) the libraries looking for the
+  // factory that matches this pattern.
+  // Returns the factory if it is found, and nullptr otherwise
+  template <typename T>
+  const FactoryFunc<T> FindFactory(const std::string& name) const {
+    {
+      std::unique_lock<std::mutex> lock(library_mutex_);
+      for (auto iter = libraries_.crbegin(); iter != libraries_.crend();
+           ++iter) {
+        const auto factory = iter->get()->FindFactory<T>(name);
+        if (factory != nullptr) {
+          return factory;
+        }
+      }
+    }
+    if (parent_ != nullptr) {
+      return parent_->FindFactory<T>(name);
+    } else {
+      return nullptr;
+    }
+  }
 
   // The set of libraries to search for factories for this registry.
   // The libraries are searched in reverse order (back to front) when
diff --git a/options/customizable_test.cc b/options/customizable_test.cc
index f4872ad9d..ea6e73fb2 100644
--- a/options/customizable_test.cc
+++ b/options/customizable_test.cc
@@ -41,6 +41,10 @@
 #include "util/string_util.h"
 #include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
 #include "utilities/memory_allocators.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
 
 #ifndef GFLAGS
 bool FLAGS_enable_print = false;
@@ -177,7 +181,7 @@ static int A_count = 0;
 static int RegisterCustomTestObjects(ObjectLibrary& library,
                                      const std::string& /*arg*/) {
   library.Register<TestCustomizable>(
-      "A.*",
+      ObjectLibrary::PatternEntry("A", true).AddSeparator("_"),
       [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
          std::string* /* msg */) {
         guard->reset(new ACustomizable(name));
@@ -322,7 +326,7 @@ class CustomizableTest : public testing::Test {
 //    - a property with a name
 TEST_F(CustomizableTest, CreateByNameTest) {
   ObjectLibrary::Default()->Register<TestCustomizable>(
-      "TEST.*",
+      ObjectLibrary::PatternEntry("TEST", false).AddSeparator("_"),
       [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
          std::string* /* msg */) {
         guard->reset(new TestCustomizable(name));
@@ -931,12 +935,12 @@ TEST_F(CustomizableTest, NoNameTest) {
   auto copts = copy.GetOptions<SimpleOptions>();
   sopts->cu.reset(new ACustomizable(""));
   orig.cv.push_back(std::make_shared<ACustomizable>(""));
-  orig.cv.push_back(std::make_shared<ACustomizable>("A1"));
+  orig.cv.push_back(std::make_shared<ACustomizable>("A_1"));
   std::string opt_str, mismatch;
   ASSERT_OK(orig.GetOptionString(config_options_, &opt_str));
   ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str));
   ASSERT_EQ(copy.cv.size(), 1U);
-  ASSERT_EQ(copy.cv[0]->GetId(), "A1");
+  ASSERT_EQ(copy.cv[0]->GetId(), "A_1");
   ASSERT_EQ(copts->cu, nullptr);
 }
 
@@ -1016,19 +1020,27 @@ TEST_F(CustomizableTest, FactoryFunctionTest) {
 
 TEST_F(CustomizableTest, URLFactoryTest) {
   std::unique_ptr<TestCustomizable> unique;
+  config_options_.registry->AddLibrary("URL")->Register<TestCustomizable>(
+      ObjectLibrary::PatternEntry("Z", false).AddSeparator(""),
+      [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
+         std::string* /* msg */) {
+        guard->reset(new TestCustomizable(name));
+        return guard->get();
+      });
+
   ConfigOptions ignore = config_options_;
   ignore.ignore_unsupported_options = false;
   ignore.ignore_unsupported_options = false;
-  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "A=1;x=y", &unique));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1;x=y", &unique));
   ASSERT_NE(unique, nullptr);
-  ASSERT_EQ(unique->GetId(), "A=1;x=y");
-  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "A;x=y", &unique));
+  ASSERT_EQ(unique->GetId(), "Z=1;x=y");
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z;x=y", &unique));
   ASSERT_NE(unique, nullptr);
-  ASSERT_EQ(unique->GetId(), "A;x=y");
+  ASSERT_EQ(unique->GetId(), "Z;x=y");
   unique.reset();
-  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "A=1?x=y", &unique));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1?x=y", &unique));
   ASSERT_NE(unique, nullptr);
-  ASSERT_EQ(unique->GetId(), "A=1?x=y");
+  ASSERT_EQ(unique->GetId(), "Z=1?x=y");
 }
 
 TEST_F(CustomizableTest, MutableOptionsTest) {
@@ -1126,6 +1138,7 @@ TEST_F(CustomizableTest, CustomManagedObjects) {
   std::shared_ptr<TestCustomizable> object1, object2;
   ASSERT_OK(LoadManagedObject<TestCustomizable>(
       config_options_, "id=A_1;int=1;bool=true", &object1));
+  ASSERT_NE(object1, nullptr);
   ASSERT_OK(
       LoadManagedObject<TestCustomizable>(config_options_, "A_1", &object2));
   ASSERT_EQ(object1, object2);
@@ -1165,9 +1178,11 @@ TEST_F(CustomizableTest, CreateManagedObjects) {
 
   config_options_.registry->AddLibrary("Managed")
       ->Register<ManagedCustomizable>(
-          "Managed(@.*)?", [](const std::string& /*name*/,
-                              std::unique_ptr<ManagedCustomizable>* guard,
-                              std::string* /* msg */) {
+          ObjectLibrary::PatternEntry::AsIndividualId(
+              ManagedCustomizable::kClassName()),
+          [](const std::string& /*name*/,
+             std::unique_ptr<ManagedCustomizable>* guard,
+             std::string* /* msg */) {
             guard->reset(new ManagedCustomizable());
             return guard->get();
           });
@@ -1317,7 +1332,8 @@ class MockMemoryAllocator : public BaseMemoryAllocator {
 class MockEncryptionProvider : public EncryptionProvider {
  public:
   explicit MockEncryptionProvider(const std::string& id) : id_(id) {}
-  const char* Name() const override { return "Mock"; }
+  static const char* kClassName() { return "Mock"; }
+  const char* Name() const override { return kClassName(); }
   size_t GetPrefixLength() const override { return 0; }
   Status CreateNewPrefix(const std::string& /*fname*/, char* /*prefix*/,
                          size_t /*prefixLength*/) const override {
@@ -1459,7 +1475,8 @@ static int RegisterLocalObjects(ObjectLibrary& library,
       });
 
   library.Register<EncryptionProvider>(
-      "Mock(://test)?",
+      ObjectLibrary::PatternEntry(MockEncryptionProvider::kClassName(), true)
+          .AddSuffix("://test"),
       [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
          std::string* /* errmsg */) {
         guard->reset(new MockEncryptionProvider(uri));
@@ -1811,9 +1828,74 @@ TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) {
 
   ASSERT_NOK(
       MergeOperator::CreateFromString(config_options_, "Changling", &result));
+  //**TODO: MJR: Use the constants when these names are in public classes
   ASSERT_OK(MergeOperator::CreateFromString(config_options_, "put", &result));
   ASSERT_NE(result, nullptr);
   ASSERT_STREQ(result->Name(), "PutOperator");
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options_, "PutOperator", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "PutOperator");
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options_, "put_v1", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "PutOperator");
+
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options_, "uint64add", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "UInt64AddOperator");
+  ASSERT_OK(MergeOperator::CreateFromString(config_options_,
+                                            "UInt64AddOperator", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "UInt64AddOperator");
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options_, "max", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "MaxOperator");
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options_, "MaxOperator", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "MaxOperator");
+#ifndef ROCKSDB_LITE
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, StringAppendOperator::kNickName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName());
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, StringAppendOperator::kClassName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName());
+
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, StringAppendTESTOperator::kNickName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName());
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, StringAppendTESTOperator::kClassName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName());
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options_,
+                                            SortList::kNickName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), SortList::kClassName());
+  ASSERT_OK(MergeOperator::CreateFromString(config_options_,
+                                            SortList::kClassName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), SortList::kClassName());
+
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, BytesXOROperator::kNickName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), BytesXOROperator::kClassName());
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, BytesXOROperator::kClassName(), &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), BytesXOROperator::kClassName());
+#endif  // ROCKSDB_LITE
+  ASSERT_NOK(
+      MergeOperator::CreateFromString(config_options_, "Changling", &result));
   if (RegisterTests("Test")) {
     ASSERT_OK(
         MergeOperator::CreateFromString(config_options_, "Changling", &result));
diff --git a/options/options_test.cc b/options/options_test.cc
index 0392320a8..36e538dc5 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -2079,8 +2079,9 @@ TEST_F(OptionsTest, OptionTablePropertiesTest) {
   // properties as the original
   cfg_opts.registry->AddLibrary("collector")
       ->Register<TablePropertiesCollectorFactory>(
-          std::string(TestTablePropertiesCollectorFactory::kClassName()) +
-              ":.*",
+          ObjectLibrary::PatternEntry(
+              TestTablePropertiesCollectorFactory::kClassName(), false)
+              .AddSeparator(":"),
           [](const std::string& name,
              std::unique_ptr<TablePropertiesCollectorFactory>* guard,
              std::string* /* errmsg */) {
diff --git a/table/plain/plain_table_factory.cc b/table/plain/plain_table_factory.cc
index 737b6b58f..a6cf42f1e 100644
--- a/table/plain/plain_table_factory.cc
+++ b/table/plain/plain_table_factory.cc
@@ -161,15 +161,14 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
   // The MemTableRepFactory built-in classes will be either a class
   // (VectorRepFactory) or a nickname (vector), followed optionally by ":#",
   // where # is the "size" of the factory.
-  auto AsRegex = [](const std::string& name, const std::string& alt) {
-    std::string regex;
-    regex.append("(").append(name);
-    regex.append("|").append(alt).append(")(:[0-9]*)?");
-    return regex;
+  auto AsPattern = [](const std::string& name, const std::string& alt) {
+    auto pattern = ObjectLibrary::PatternEntry(name, true);
+    pattern.AnotherName(alt);
+    pattern.AddNumber(":");
+    return pattern;
   };
-
   library.Register<MemTableRepFactory>(
-      AsRegex(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()),
+      AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()),
       [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
          std::string* /*errmsg*/) {
         auto colon = uri.find(":");
@@ -182,7 +181,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<MemTableRepFactory>(
-      AsRegex(SkipListFactory::kClassName(), SkipListFactory::kNickName()),
+      AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()),
       [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
          std::string* /*errmsg*/) {
         auto colon = uri.find(":");
@@ -195,7 +194,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<MemTableRepFactory>(
-      AsRegex("HashLinkListRepFactory", "hash_linkedlist"),
+      AsPattern("HashLinkListRepFactory", "hash_linkedlist"),
       [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
          std::string* /*errmsg*/) {
         // Expecting format: hash_linkedlist:<hash_bucket_count>
@@ -209,7 +208,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<MemTableRepFactory>(
-      AsRegex("HashSkipListRepFactory", "prefix_hash"),
+      AsPattern("HashSkipListRepFactory", "prefix_hash"),
       [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
          std::string* /*errmsg*/) {
         // Expecting format: prefix_hash:<hash_bucket_count>
@@ -230,9 +229,11 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
         return nullptr;
       });
 
-  return 5;
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
 }
 #endif  // ROCKSDB_LITE
+
 Status GetMemTableRepFactoryFromString(
     const std::string& opts_str, std::unique_ptr<MemTableRepFactory>* result) {
   ConfigOptions config_options;
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index 0fe789e71..07d0ef252 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -676,6 +676,25 @@ class SpecialMemTableRep : public MemTableRep {
 };
 class SpecialSkipListFactory : public MemTableRepFactory {
  public:
+#ifndef ROCKSDB_LITE
+  static bool Register(ObjectLibrary& library, const std::string& /*arg*/) {
+    library.Register<MemTableRepFactory>(
+        ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true)
+            .AddNumber(":"),
+        [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+           std::string* /* errmsg */) {
+          auto colon = uri.find(":");
+          if (colon != std::string::npos) {
+            auto count = ParseInt(uri.substr(colon + 1));
+            guard->reset(new SpecialSkipListFactory(count));
+          } else {
+            guard->reset(new SpecialSkipListFactory(2));
+          }
+          return guard->get();
+        });
+    return true;
+  }
+#endif  // ROCKSDB_LITE
   // After number of inserts exceeds `num_entries_flush` in a mem table, trigger
   // flush.
   explicit SpecialSkipListFactory(int num_entries_flush)
@@ -717,7 +736,7 @@ MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) {
 
 #ifndef ROCKSDB_LITE
 // This method loads existing test classes into the ObjectRegistry
-int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/) {
+int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) {
   size_t num_types;
   library.Register<const Comparator>(
       test::SimpleSuffixReverseComparator::kClassName(),
@@ -727,19 +746,7 @@ int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/) {
         static test::SimpleSuffixReverseComparator ssrc;
         return &ssrc;
       });
-  library.Register<MemTableRepFactory>(
-      std::string(SpecialSkipListFactory::kClassName()) + "(:[0-9]*)?",
-      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
-         std::string* /* errmsg */) {
-        auto colon = uri.find(":");
-        if (colon != std::string::npos) {
-          auto count = ParseInt(uri.substr(colon + 1));
-          guard->reset(new SpecialSkipListFactory(count));
-        } else {
-          guard->reset(new SpecialSkipListFactory(2));
-        }
-        return guard->get();
-      });
+  SpecialSkipListFactory::Register(library, arg);
   library.Register<MergeOperator>(
       "Changling",
       [](const std::string& uri, std::unique_ptr<MergeOperator>* guard,
diff --git a/util/slice.cc b/util/slice.cc
index f96926cfc..268eb1b4e 100644
--- a/util/slice.cc
+++ b/util/slice.cc
@@ -143,6 +143,9 @@ const SliceTransform* NewNoopTransform() { return new NoopTransform; }
 #ifndef ROCKSDB_LITE
 static int RegisterBuiltinSliceTransform(ObjectLibrary& library,
                                          const std::string& /*arg*/) {
+  // For the builtin transforms, the format is typically
+  // [Name] or [Name].[0-9]+
+  // [NickName]:[0-9]+
   library.Register<const SliceTransform>(
       NoopTransform::kClassName(),
       [](const std::string& /*uri*/,
@@ -152,7 +155,8 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<const SliceTransform>(
-      std::string(FixedPrefixTransform::kNickName()) + ":[0-9]+",
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
       [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
          std::string* /*errmsg*/) {
         auto colon = uri.find(":");
@@ -161,24 +165,22 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<const SliceTransform>(
-      FixedPrefixTransform::kClassName(),
-      [](const std::string& /*uri*/,
-         std::unique_ptr<const SliceTransform>* guard,
-         std::string* /*errmsg*/) {
-        guard->reset(NewFixedPrefixTransform(0));
-        return guard->get();
-      });
-  library.Register<const SliceTransform>(
-      std::string(FixedPrefixTransform::kClassName()) + "\\.[0-9]+",
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kClassName(), true)
+          .AddNumber("."),
       [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
          std::string* /*errmsg*/) {
-        auto len = ParseSizeT(
-            uri.substr(strlen(FixedPrefixTransform::kClassName()) + 1));
-        guard->reset(NewFixedPrefixTransform(len));
+        if (uri == FixedPrefixTransform::kClassName()) {
+          guard->reset(NewFixedPrefixTransform(0));
+        } else {
+          auto len = ParseSizeT(
+              uri.substr(strlen(FixedPrefixTransform::kClassName()) + 1));
+          guard->reset(NewFixedPrefixTransform(len));
+        }
         return guard->get();
       });
   library.Register<const SliceTransform>(
-      std::string(CappedPrefixTransform::kNickName()) + ":[0-9]+",
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
       [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
          std::string* /*errmsg*/) {
         auto colon = uri.find(":");
@@ -187,19 +189,21 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library,
         return guard->get();
       });
   library.Register<const SliceTransform>(
-      std::string(CappedPrefixTransform::kClassName()) + "(\\.[0-9]+)?",
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kClassName(), true)
+          .AddNumber("."),
       [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
          std::string* /*errmsg*/) {
         if (uri == CappedPrefixTransform::kClassName()) {
           guard->reset(NewCappedPrefixTransform(0));
-        } else {  // Length + "."
+        } else {
           auto len = ParseSizeT(
               uri.substr(strlen(CappedPrefixTransform::kClassName()) + 1));
           guard->reset(NewCappedPrefixTransform(len));
         }
         return guard->get();
       });
-  return 5;
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
 }
 #endif  // ROCKSDB_LITE
 
diff --git a/utilities/merge_operators.cc b/utilities/merge_operators.cc
index 7fe0abfaf..180e577db 100644
--- a/utilities/merge_operators.cc
+++ b/utilities/merge_operators.cc
@@ -55,38 +55,33 @@ static bool LoadMergeOperator(const std::string& id,
 static int RegisterBuiltinMergeOperators(ObjectLibrary& library,
                                          const std::string& /*arg*/) {
   size_t num_types;
-  auto AsRegex = [](const std::string& name, const std::string& alt) {
-    std::string regex;
-    regex.append("(").append(name);
-    regex.append("|").append(alt).append(")");
-    return regex;
-  };
-
   library.Register<MergeOperator>(
-      AsRegex(StringAppendOperator::kClassName(),
-              StringAppendOperator::kNickName()),
+      ObjectLibrary::PatternEntry(StringAppendOperator::kClassName())
+          .AnotherName(StringAppendOperator::kNickName()),
       [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
          std::string* /*errmsg*/) {
         guard->reset(new StringAppendOperator(","));
         return guard->get();
       });
   library.Register<MergeOperator>(
-      AsRegex(StringAppendTESTOperator::kClassName(),
-              StringAppendTESTOperator::kNickName()),
+      ObjectLibrary::PatternEntry(StringAppendTESTOperator::kClassName())
+          .AnotherName(StringAppendTESTOperator::kNickName()),
       [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
          std::string* /*errmsg*/) {
         guard->reset(new StringAppendTESTOperator(","));
         return guard->get();
       });
   library.Register<MergeOperator>(
-      AsRegex(SortList::kClassName(), SortList::kNickName()),
+      ObjectLibrary::PatternEntry(SortList::kClassName())
+          .AnotherName(SortList::kNickName()),
       [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
          std::string* /*errmsg*/) {
         guard->reset(new SortList());
         return guard->get();
       });
   library.Register<MergeOperator>(
-      AsRegex(BytesXOROperator::kClassName(), BytesXOROperator::kNickName()),
+      ObjectLibrary::PatternEntry(BytesXOROperator::kClassName())
+          .AnotherName(BytesXOROperator::kNickName()),
       [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
          std::string* /*errmsg*/) {
         guard->reset(new BytesXOROperator());
diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc
index b0a60f53e..e41d3172d 100644
--- a/utilities/object_registry.cc
+++ b/utilities/object_registry.cc
@@ -5,6 +5,8 @@
 
 #include "rocksdb/utilities/object_registry.h"
 
+#include <ctype.h>
+
 #include "logging/logging.h"
 #include "rocksdb/customizable.h"
 #include "rocksdb/env.h"
@@ -12,35 +14,105 @@
 
 namespace ROCKSDB_NAMESPACE {
 #ifndef ROCKSDB_LITE
-// Looks through the "type" factories for one that matches "name".
-// If found, returns the pointer to the Entry matching this name.
-// Otherwise, nullptr is returned
-const ObjectLibrary::Entry *ObjectLibrary::FindEntry(
-    const std::string &type, const std::string &name) const {
-  std::unique_lock<std::mutex> lock(mu_);
-  auto entries = entries_.find(type);
-  if (entries != entries_.end()) {
-    for (const auto &entry : entries->second) {
-      if (entry->matches(name)) {
-        return entry.get();
+size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
+    size_t start, Quantifier mode, const std::string &target, size_t tlen,
+    const std::string &separator) const {
+  size_t slen = separator.size();
+  // See if there is enough space.  If so, find the separator
+  if (tlen < start + slen) {
+    return std::string::npos;  // not enough space left
+  } else if (mode == kMatchExact) {
+    // Exact mode means the next thing we are looking for is the separator
+    if (target.compare(start, slen, separator) != 0) {
+      return std::string::npos;
+    } else {
+      return start + slen;  // Found the separator, return where we found it
+    }
+  } else {
+    auto pos = start + 1;
+    if (!separator.empty()) {
+      pos = target.find(separator, pos);
+    }
+    if (pos == std::string::npos) {
+      return pos;
+    } else if (mode == kMatchNumeric) {
+      // If it is numeric, everything up to the match must be a number
+      while (start < pos) {
+        if (!isdigit(target[start++])) {
+          return std::string::npos;
+        }
+      }
+    }
+    return pos + slen;
+  }
+}
+
+bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
+                                                size_t nlen,
+                                                const std::string &target,
+                                                size_t tlen) const {
+  if (separators_.empty()) {
+    assert(optional_);  // If there are no separators, it must be only a name
+    return nlen == tlen && name == target;
+  } else if (nlen == tlen) {  // The lengths are the same
+    return optional_ && name == target;
+  } else if (tlen < nlen + slength_) {
+    // The target is not long enough
+    return false;
+  } else if (target.compare(0, nlen, name) != 0) {
+    return false;  // Target does not start with name
+  } else {
+    // Loop through all of the separators one at a time matching them.
+    // Note that we first match the separator and then its quantifiers.
+    // Since we expect the separator first, we start with an exact match
+    // Subsequent matches will use the quantifier of the previous separator
+    size_t start = nlen;
+    auto mode = kMatchExact;
+    for (size_t idx = 0; idx < separators_.size(); ++idx) {
+      const auto &separator = separators_[idx];
+      start = MatchSeparatorAt(start, mode, target, tlen, separator.first);
+      if (start == std::string::npos) {
+        return false;
+      } else {
+        mode = separator.second;
+      }
+    }
+    // We have matched all of the separators.  Now check that what is left
+    // unmatched in the target is acceptable.
+    if (mode == kMatchExact) {
+      return (start == tlen);
+    } else if (start >= tlen) {
+      return false;
+    } else if (mode == kMatchNumeric) {
+      while (start < tlen) {
+        if (!isdigit(target[start++])) {
+          return false;
+        }
       }
     }
   }
-  return nullptr;
+  return true;
 }
 
-void ObjectLibrary::AddEntry(const std::string &type,
-                             std::unique_ptr<Entry> &entry) {
-  std::unique_lock<std::mutex> lock(mu_);
-  auto &entries = entries_[type];
-  entries.emplace_back(std::move(entry));
+bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
+  auto tlen = target.size();
+  if (MatchesTarget(name_, nlength_, target, tlen)) {
+    return true;
+  } else if (!names_.empty()) {
+    for (const auto &alt : names_) {
+      if (MatchesTarget(alt, alt.size(), target, tlen)) {
+        return true;
+      }
+    }
+  }
+  return false;
 }
 
 size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
   std::unique_lock<std::mutex> lock(mu_);
-  *types = entries_.size();
+  *types = factories_.size();
   size_t factories = 0;
-  for (const auto &e : entries_) {
+  for (const auto &e : factories_) {
     factories += e.second.size();
   }
   return factories;
@@ -48,13 +120,12 @@ size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
 
 void ObjectLibrary::Dump(Logger *logger) const {
   std::unique_lock<std::mutex> lock(mu_);
-  for (const auto &iter : entries_) {
+  for (const auto &iter : factories_) {
     ROCKS_LOG_HEADER(logger, "    Registered factories for type[%s] ",
                      iter.first.c_str());
     bool printed_one = false;
     for (const auto &e : iter.second) {
-      ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':',
-                       e->Name().c_str());
+      ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name());
       printed_one = true;
     }
   }
@@ -84,26 +155,6 @@ std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance(
   return std::make_shared<ObjectRegistry>(parent);
 }
 
-// Searches (from back to front) the libraries looking for the
-// an entry that matches this pattern.
-// Returns the entry if it is found, and nullptr otherwise
-const ObjectLibrary::Entry *ObjectRegistry::FindEntry(
-    const std::string &type, const std::string &name) const {
-  {
-    std::unique_lock<std::mutex> lock(library_mutex_);
-    for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) {
-      const auto *entry = iter->get()->FindEntry(type, name);
-      if (entry != nullptr) {
-        return entry;
-      }
-    }
-  }
-  if (parent_ != nullptr) {
-    return parent_->FindEntry(type, name);
-  } else {
-    return nullptr;
-  }
-}
 Status ObjectRegistry::SetManagedObject(
     const std::string &type, const std::string &id,
     const std::shared_ptr<Customizable> &object) {
diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc
index d9a5b1526..a2f61c8f6 100644
--- a/utilities/object_registry_test.cc
+++ b/utilities/object_registry_test.cc
@@ -12,32 +12,33 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-class EnvRegistryTest : public testing::Test {
+class ObjRegistryTest : public testing::Test {
  public:
   static int num_a, num_b;
 };
 
-int EnvRegistryTest::num_a = 0;
-int EnvRegistryTest::num_b = 0;
+int ObjRegistryTest::num_a = 0;
+int ObjRegistryTest::num_b = 0;
 static FactoryFunc<Env> test_reg_a = ObjectLibrary::Default()->Register<Env>(
-    "a://.*",
+    ObjectLibrary::PatternEntry("a", false).AddSeparator("://"),
     [](const std::string& /*uri*/, std::unique_ptr<Env>* /*env_guard*/,
        std::string* /* errmsg */) {
-      ++EnvRegistryTest::num_a;
+      ++ObjRegistryTest::num_a;
       return Env::Default();
     });
 
 static FactoryFunc<Env> test_reg_b = ObjectLibrary::Default()->Register<Env>(
-    "b://.*", [](const std::string& /*uri*/, std::unique_ptr<Env>* env_guard,
-                 std::string* /* errmsg */) {
-      ++EnvRegistryTest::num_b;
+    ObjectLibrary::PatternEntry("b", false).AddSeparator("://"),
+    [](const std::string& /*uri*/, std::unique_ptr<Env>* env_guard,
+       std::string* /* errmsg */) {
+      ++ObjRegistryTest::num_b;
       // Env::Default() is a singleton so we can't grant ownership directly to
       // the caller - we must wrap it first.
       env_guard->reset(new EnvWrapper(Env::Default()));
       return env_guard->get();
     });
 
-TEST_F(EnvRegistryTest, Basics) {
+TEST_F(ObjRegistryTest, Basics) {
   std::string msg;
   std::unique_ptr<Env> env_guard;
   auto registry = ObjectRegistry::NewInstance();
@@ -60,7 +61,7 @@ TEST_F(EnvRegistryTest, Basics) {
   ASSERT_EQ(1, num_b);
 }
 
-TEST_F(EnvRegistryTest, LocalRegistry) {
+TEST_F(ObjRegistryTest, LocalRegistry) {
   std::string msg;
   std::unique_ptr<Env> guard;
   auto registry = ObjectRegistry::NewInstance();
@@ -87,7 +88,7 @@ TEST_F(EnvRegistryTest, LocalRegistry) {
   ASSERT_NE(registry->NewObject<Env>("test-global", &guard, &msg), nullptr);
 }
 
-TEST_F(EnvRegistryTest, CheckShared) {
+TEST_F(ObjRegistryTest, CheckShared) {
   std::shared_ptr<Env> shared;
   std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
   std::shared_ptr<ObjectLibrary> library =
@@ -112,7 +113,7 @@ TEST_F(EnvRegistryTest, CheckShared) {
   ASSERT_EQ(shared, nullptr);
 }
 
-TEST_F(EnvRegistryTest, CheckStatic) {
+TEST_F(ObjRegistryTest, CheckStatic) {
   Env* env = nullptr;
   std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
   std::shared_ptr<ObjectLibrary> library =
@@ -137,7 +138,7 @@ TEST_F(EnvRegistryTest, CheckStatic) {
   ASSERT_NE(env, nullptr);
 }
 
-TEST_F(EnvRegistryTest, CheckUnique) {
+TEST_F(ObjRegistryTest, CheckUnique) {
   std::unique_ptr<Env> unique;
   std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
   std::shared_ptr<ObjectLibrary> library =
@@ -162,7 +163,7 @@ TEST_F(EnvRegistryTest, CheckUnique) {
   ASSERT_EQ(unique, nullptr);
 }
 
-TEST_F(EnvRegistryTest, TestRegistryParents) {
+TEST_F(ObjRegistryTest, TestRegistryParents) {
   auto grand = ObjectRegistry::Default();
   auto parent = ObjectRegistry::NewInstance();  // parent with a grandparent
   auto uncle = ObjectRegistry::NewInstance(grand);
@@ -221,7 +222,7 @@ class MyCustomizable : public Customizable {
   std::string name_;
 };
 
-TEST_F(EnvRegistryTest, TestManagedObjects) {
+TEST_F(ObjRegistryTest, TestManagedObjects) {
   auto registry = ObjectRegistry::NewInstance();
   auto m_a1 = std::make_shared<MyCustomizable>("", "A");
   auto m_a2 = std::make_shared<MyCustomizable>("", "A");
@@ -238,7 +239,7 @@ TEST_F(EnvRegistryTest, TestManagedObjects) {
   ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a2);
 }
 
-TEST_F(EnvRegistryTest, TestTwoManagedObjects) {
+TEST_F(ObjRegistryTest, TestTwoManagedObjects) {
   auto registry = ObjectRegistry::NewInstance();
   auto m_a = std::make_shared<MyCustomizable>("", "A");
   auto m_b = std::make_shared<MyCustomizable>("", "B");
@@ -284,7 +285,7 @@ TEST_F(EnvRegistryTest, TestTwoManagedObjects) {
   ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
 }
 
-TEST_F(EnvRegistryTest, TestAlternateNames) {
+TEST_F(ObjRegistryTest, TestAlternateNames) {
   auto registry = ObjectRegistry::NewInstance();
   auto m_a = std::make_shared<MyCustomizable>("", "A");
   auto m_b = std::make_shared<MyCustomizable>("", "B");
@@ -337,7 +338,7 @@ TEST_F(EnvRegistryTest, TestAlternateNames) {
   ASSERT_EQ(objects.size(), 0U);
 }
 
-TEST_F(EnvRegistryTest, TestTwoManagedClasses) {
+TEST_F(ObjRegistryTest, TestTwoManagedClasses) {
   class MyCustomizable2 : public MyCustomizable {
    public:
     static const char* Type() { return "MyCustomizable2"; }
@@ -377,7 +378,7 @@ TEST_F(EnvRegistryTest, TestTwoManagedClasses) {
   ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
 }
 
-TEST_F(EnvRegistryTest, TestManagedObjectsWithParent) {
+TEST_F(ObjRegistryTest, TestManagedObjectsWithParent) {
   auto base = ObjectRegistry::NewInstance();
   auto registry = ObjectRegistry::NewInstance(base);
 
@@ -397,10 +398,10 @@ TEST_F(EnvRegistryTest, TestManagedObjectsWithParent) {
   ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_b);
 }
 
-TEST_F(EnvRegistryTest, TestGetOrCreateManagedObject) {
+TEST_F(ObjRegistryTest, TestGetOrCreateManagedObject) {
   auto registry = ObjectRegistry::NewInstance();
   registry->AddLibrary("test")->Register<MyCustomizable>(
-      "MC(@.*)?",
+      ObjectLibrary::PatternEntry::AsIndividualId("MC"),
       [](const std::string& uri, std::unique_ptr<MyCustomizable>* guard,
          std::string* /* errmsg */) {
         guard->reset(new MyCustomizable("MC", uri));
@@ -411,14 +412,14 @@ TEST_F(EnvRegistryTest, TestGetOrCreateManagedObject) {
 
   std::unordered_map<std::string, std::string> opt_map;
 
-  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A"), nullptr);
-  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@B"), nullptr);
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A", &m_a));
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B", &m_b));
-  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A"), m_a);
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A", &obj));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@B#1"), nullptr);
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &m_b));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), m_a);
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &obj));
   ASSERT_EQ(obj, m_a);
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B", &obj));
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
   ASSERT_EQ(obj, m_b);
   ASSERT_OK(registry->ListManagedObjects(&objs));
   ASSERT_EQ(objs.size(), 2U);
@@ -426,11 +427,216 @@ TEST_F(EnvRegistryTest, TestGetOrCreateManagedObject) {
   objs.clear();
   m_a.reset();
   obj.reset();
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A", &m_a));
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
   ASSERT_EQ(1, m_a.use_count());
-  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B", &obj));
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
   ASSERT_EQ(2, obj.use_count());
 }
+
+class PatternEntryTest : public testing::Test {};
+
+TEST_F(PatternEntryTest, TestSimpleEntry) {
+  ObjectLibrary::PatternEntry entry("ABC", true);
+
+  ASSERT_TRUE(entry.Matches("ABC"));
+  ASSERT_FALSE(entry.Matches("AABC"));
+  ASSERT_FALSE(entry.Matches("ABCA"));
+  ASSERT_FALSE(entry.Matches("AABCA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("BC"));
+  ASSERT_FALSE(entry.Matches("ABD"));
+  ASSERT_FALSE(entry.Matches("BCA"));
+}
+
+TEST_F(PatternEntryTest, TestPatternEntry) {
+  // Matches A:+
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddSeparator(":");
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+
+  entry.SetOptional(true);  // Now matches "A" or "A:+"
+  ASSERT_TRUE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+}
+
+TEST_F(PatternEntryTest, TestSuffixEntry) {
+  ObjectLibrary::PatternEntry entry("AA", true);
+  entry.AddSuffix("BB");
+
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AABB"));
+
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("BB"));
+  ASSERT_FALSE(entry.Matches("ABA"));
+  ASSERT_FALSE(entry.Matches("BBAA"));
+  ASSERT_FALSE(entry.Matches("AABBA"));
+  ASSERT_FALSE(entry.Matches("AABBB"));
+}
+
+TEST_F(PatternEntryTest, TestNumericEntry) {
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddNumber(":");
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_TRUE(entry.Matches("A:1"));
+  ASSERT_TRUE(entry.Matches("A:11"));
+  ASSERT_FALSE(entry.Matches("AA:1"));
+  ASSERT_FALSE(entry.Matches("AA:11"));
+  ASSERT_FALSE(entry.Matches("A:B"));
+  ASSERT_FALSE(entry.Matches("A:1B"));
+  ASSERT_FALSE(entry.Matches("A:B1"));
+}
+
+TEST_F(PatternEntryTest, TestIndividualIdEntry) {
+  auto entry = ObjectLibrary::PatternEntry::AsIndividualId("AA");
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AA@123#456"));
+  ASSERT_TRUE(entry.Matches("AA@deadbeef#id"));
+
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AAA"));
+  ASSERT_FALSE(entry.Matches("AA@123"));
+  ASSERT_FALSE(entry.Matches("AA@123#"));
+  ASSERT_FALSE(entry.Matches("AA@#123"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNameEntry) {
+  ObjectLibrary::PatternEntry entry("A");
+  entry.AnotherName("B");
+  ASSERT_TRUE(entry.Matches("A"));
+  ASSERT_TRUE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BB"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+}
+
+TEST_F(PatternEntryTest, TestTwoPatternEntry) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddSeparator(":");
+  entry.AddSeparator(":");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::12"));
+  ASSERT_TRUE(entry.Matches("AA:1:2"));
+  ASSERT_TRUE(entry.Matches("AA:1:2:"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddSeparator("::");
+  entry2.AddSeparator("##");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA:"));
+  ASSERT_FALSE(entry2.Matches("AA::"));
+  ASSERT_FALSE(entry2.Matches("AA::#"));
+  ASSERT_FALSE(entry2.Matches("AA::##"));
+  ASSERT_FALSE(entry2.Matches("AA##1::2"));
+  ASSERT_FALSE(entry2.Matches("AA::123##"));
+  ASSERT_TRUE(entry2.Matches("AA::1##2"));
+  ASSERT_TRUE(entry2.Matches("AA::12##34:"));
+  ASSERT_TRUE(entry2.Matches("AA::12::34##56"));
+  ASSERT_TRUE(entry2.Matches("AA::12##34::56"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNumbersEntry) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddNumber(":");
+  entry.AddNumber(":");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::12"));
+  ASSERT_FALSE(entry.Matches("AA:1:2:"));
+  ASSERT_TRUE(entry.Matches("AA:1:2"));
+  ASSERT_TRUE(entry.Matches("AA:12:23456"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddNumber(":");
+  entry2.AddNumber("#");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA:"));
+  ASSERT_FALSE(entry2.Matches("AA:#"));
+  ASSERT_FALSE(entry2.Matches("AA#:"));
+  ASSERT_FALSE(entry2.Matches("AA:123#"));
+  ASSERT_FALSE(entry2.Matches("AA:123#B"));
+  ASSERT_FALSE(entry2.Matches("AA:B#123"));
+  ASSERT_TRUE(entry2.Matches("AA:1#2"));
+  ASSERT_FALSE(entry2.Matches("AA:123#23:"));
+  ASSERT_FALSE(entry2.Matches("AA::12#234"));
+}
+
+TEST_F(PatternEntryTest, TestPatternAndSuffix) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddSeparator("::");
+  entry.AddSuffix("##");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::##"));
+  ASSERT_FALSE(entry.Matches("AB::1##"));
+  ASSERT_FALSE(entry.Matches("AB::1##2"));
+  ASSERT_FALSE(entry.Matches("AA##1::"));
+  ASSERT_TRUE(entry.Matches("AA::1##"));
+  ASSERT_FALSE(entry.Matches("AA::1###"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddSuffix("::");
+  entry2.AddSeparator("##");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA::"));
+  ASSERT_FALSE(entry2.Matches("AA::##"));
+  ASSERT_FALSE(entry2.Matches("AB::1##"));
+  ASSERT_FALSE(entry2.Matches("AB::1##2"));
+  ASSERT_TRUE(entry2.Matches("AA::##12"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNamesAndPattern) {
+  ObjectLibrary::PatternEntry entry("AA", true);
+  entry.AddSeparator("::");
+  entry.AnotherName("BBB");
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AA::1"));
+  ASSERT_TRUE(entry.Matches("BBB"));
+  ASSERT_TRUE(entry.Matches("BBB::2"));
+
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AAA::"));
+  ASSERT_FALSE(entry.Matches("BBB::"));
+
+  entry.SetOptional(false);
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BBB"));
+
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AAA::"));
+  ASSERT_FALSE(entry.Matches("BBB::"));
+
+  ASSERT_TRUE(entry.Matches("AA::1"));
+  ASSERT_TRUE(entry.Matches("BBB::2"));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
@@ -442,7 +648,7 @@ int main(int argc, char** argv) {
 #include <stdio.h>
 
 int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr, "SKIPPED as EnvRegistry is not supported in ROCKSDB_LITE\n");
+  fprintf(stderr, "SKIPPED as ObjRegistry is not supported in ROCKSDB_LITE\n");
   return 0;
 }
 

From a931bacf5d1ac118235e818f217c7cc936e84660 Mon Sep 17 00:00:00 2001
From: sdong <siying.d@fb.com>
Date: Wed, 29 Dec 2021 11:13:49 -0800
Subject: [PATCH 6/6] Improve SimulatedHybridFileSystem (#9301)

Summary:
Several improvements to SimulatedHybridFileSystem:
(1) Allow a mode where all I/Os to all files simulate HDD. This can be enabled in db_bench using -simulate_hdd
(2) Latency calculation is slightly more accurate
(3) Allow to simulate more than one HDD spindles.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9301

Test Plan: Run db_bench and observe the results are reasonable.

Reviewed By: jay-zhuang

Differential Revision: D33141662

fbshipit-source-id: b736e58c4ba910d06899cc9ccec79b628275f4fa
---
 HISTORY.md                            |  1 +
 tools/db_bench_tool.cc                | 11 ++++-
 tools/simulated_hybrid_file_system.cc | 61 +++++++++++++++++----------
 tools/simulated_hybrid_file_system.h  | 14 ++++--
 4 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 6e0c54e53..b672f1160 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -7,6 +7,7 @@
 ## 6.28.0 (2021-12-17)
 ### New Features
 * Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps.
+* Introduce SimulatedHybridFileSystem which can help simulating HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD.
 
 ### Bug Fixes
 * Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next.
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 93b780541..fcd3c157a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1154,6 +1154,10 @@ DEFINE_string(simulate_hybrid_fs_file, "",
               "File for Store Metadata for Simulate hybrid FS. Empty means "
               "disable the feature. Now, if it is set, "
               "bottommost_temperature is set to kWarm.");
+DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
+             "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
+             "are simulated.");
+DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
 
 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
 
@@ -8135,12 +8139,15 @@ int db_bench_tool(int argc, char** argv) {
       fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
       exit(1);
     }
-  } else if (FLAGS_simulate_hybrid_fs_file != "") {
+  } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
     //**TODO: Make the simulate fs something that can be loaded
     // from the ObjectRegistry...
     static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
         NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
-            FileSystem::Default(), FLAGS_simulate_hybrid_fs_file));
+            FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
+            /*throughput_multiplier=*/
+            int{FLAGS_simulate_hybrid_hdd_multipliers},
+            /*is_full_fs_warm=*/FLAGS_simulate_hdd));
     FLAGS_env = composite_env.get();
   }
 #endif  // ROCKSDB_LITE
diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc
index 776dc6623..675d2593f 100644
--- a/tools/simulated_hybrid_file_system.cc
+++ b/tools/simulated_hybrid_file_system.cc
@@ -3,6 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#include "util/stop_watch.h"
 #ifndef ROCKSDB_LITE
 
 #include "tools/simulated_hybrid_file_system.h"
@@ -15,7 +16,6 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-const int kLatencyAddedPerRequestUs = 15000;
 const int64_t kUsPerSec = 1000000;
 const int64_t kDummyBytesPerUs = 1024;
 
@@ -43,14 +43,17 @@ void RateLimiterRequest(RateLimiter* rater_limiter, int64_t amount) {
 // warm
 SimulatedHybridFileSystem::SimulatedHybridFileSystem(
     const std::shared_ptr<FileSystem>& base,
-    const std::string& metadata_file_name)
+    const std::string& metadata_file_name, int throughput_multiplier,
+    bool is_full_fs_warm)
     : FileSystemWrapper(base),
       // Limit to 100 requests per second.
       rate_limiter_(NewGenericRateLimiter(
-          kDummyBytesPerUs * kUsPerSec /* rate_bytes_per_sec */,
+          int64_t{throughput_multiplier} * kDummyBytesPerUs *
+              kUsPerSec /* rate_bytes_per_sec */,
           1000 /* refill_period_us */)),
       metadata_file_name_(metadata_file_name),
-      name_("SimulatedHybridFileSystem: " + std::string(target()->Name())) {
+      name_("SimulatedHybridFileSystem: " + std::string(target()->Name())),
+      is_full_fs_warm_(is_full_fs_warm) {
   IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr);
   if (s.IsNotFound()) {
     return;
@@ -77,6 +80,9 @@ SimulatedHybridFileSystem::SimulatedHybridFileSystem(
 // SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the
 // file.
 SimulatedHybridFileSystem::~SimulatedHybridFileSystem() {
+  if (metadata_file_name_.empty()) {
+    return;
+  }
   std::string metadata;
   for (const auto& f : warm_file_set_) {
     metadata += f;
@@ -93,13 +99,15 @@ IOStatus SimulatedHybridFileSystem::NewRandomAccessFile(
     const std::string& fname, const FileOptions& file_opts,
     std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
   Temperature temperature = Temperature::kUnknown;
-  {
+  if (is_full_fs_warm_) {
+    temperature = Temperature::kWarm;
+  } else {
     const std::lock_guard<std::mutex> lock(mutex_);
     if (warm_file_set_.find(fname) != warm_file_set_.end()) {
       temperature = Temperature::kWarm;
     }
+    assert(temperature == file_opts.temperature);
   }
-  assert(temperature == file_opts.temperature);
   IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
   result->reset(
       new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature));
@@ -115,7 +123,7 @@ IOStatus SimulatedHybridFileSystem::NewWritableFile(
   }
 
   IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
-  if (file_opts.temperature == Temperature::kWarm) {
+  if (file_opts.temperature == Temperature::kWarm || is_full_fs_warm_) {
     result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_));
   }
   return s;
@@ -135,8 +143,7 @@ IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n,
                                   const IOOptions& options, Slice* result,
                                   char* scratch, IODebugContext* dbg) const {
   if (temperature_ == Temperature::kWarm) {
-    Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs);
-    RequestRateLimit(n);
+    SimulateIOWait(n);
   }
   return target()->Read(offset, n, options, result, scratch, dbg);
 }
@@ -146,10 +153,8 @@ IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs,
                                        IODebugContext* dbg) {
   if (temperature_ == Temperature::kWarm) {
     for (size_t i = 0; i < num_reqs; i++) {
-      RequestRateLimit(reqs[i].len);
+      SimulateIOWait(reqs[i].len);
     }
-    Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs *
-                                         static_cast<int>(num_reqs));
   }
   return target()->MultiRead(reqs, num_reqs, options, dbg);
 }
@@ -158,24 +163,34 @@ IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n,
                                       const IOOptions& options,
                                       IODebugContext* dbg) {
   if (temperature_ == Temperature::kWarm) {
-    RequestRateLimit(n);
-    Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs);
+    SimulateIOWait(n);
   }
   return target()->Prefetch(offset, n, options, dbg);
 }
 
-void SimulatedHybridRaf::RequestRateLimit(int64_t bytes) const {
-  RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes));
+void SimulatedHybridRaf::SimulateIOWait(int64_t bytes) const {
+  int serve_time = CalculateServeTimeUs(bytes);
+  {
+    StopWatchNano stop_watch(Env::Default()->GetSystemClock().get(),
+                             /*auto_start=*/true);
+    RateLimiterRequest(rate_limiter_.get(), serve_time);
+    int time_passed_us = static_cast<int>(stop_watch.ElapsedNanos() / 1000);
+    if (time_passed_us < serve_time) {
+      Env::Default()->SleepForMicroseconds(serve_time - time_passed_us);
+    }
+  }
 }
 
-void SimulatedWritableFile::RequestRateLimit(int64_t bytes) const {
-  RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes));
+void SimulatedWritableFile::SimulateIOWait(int64_t bytes) const {
+  int serve_time = CalculateServeTimeUs(bytes);
+  Env::Default()->SleepForMicroseconds(serve_time);
+  RateLimiterRequest(rate_limiter_.get(), serve_time);
 }
 
 IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo,
                                        IODebugContext* idc) {
   if (use_direct_io()) {
-    RequestRateLimit(data.size());
+    SimulateIOWait(data.size());
   } else {
     unsynced_bytes += data.size();
   }
@@ -186,7 +201,7 @@ IOStatus SimulatedWritableFile::Append(
     const Slice& data, const IOOptions& options,
     const DataVerificationInfo& verification_info, IODebugContext* dbg) {
   if (use_direct_io()) {
-    RequestRateLimit(data.size());
+    SimulateIOWait(data.size());
   } else {
     unsynced_bytes += data.size();
   }
@@ -198,7 +213,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(const Slice& data,
                                                  const IOOptions& options,
                                                  IODebugContext* dbg) {
   if (use_direct_io()) {
-    RequestRateLimit(data.size());
+    SimulateIOWait(data.size());
   } else {
     // This might be overcalculated, but it's probably OK.
     unsynced_bytes += data.size();
@@ -209,7 +224,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(
     const Slice& data, uint64_t offset, const IOOptions& options,
     const DataVerificationInfo& verification_info, IODebugContext* dbg) {
   if (use_direct_io()) {
-    RequestRateLimit(data.size());
+    SimulateIOWait(data.size());
   } else {
     // This might be overcalculated, but it's probably OK.
     unsynced_bytes += data.size();
@@ -221,7 +236,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(
 IOStatus SimulatedWritableFile::Sync(const IOOptions& options,
                                      IODebugContext* dbg) {
   if (unsynced_bytes > 0) {
-    RequestRateLimit(unsynced_bytes);
+    SimulateIOWait(unsynced_bytes);
     unsynced_bytes = 0;
   }
   return target()->Sync(options, dbg);
diff --git a/tools/simulated_hybrid_file_system.h b/tools/simulated_hybrid_file_system.h
index e1b3393b4..251d89df7 100644
--- a/tools/simulated_hybrid_file_system.h
+++ b/tools/simulated_hybrid_file_system.h
@@ -28,8 +28,13 @@ class SimulatedHybridFileSystem : public FileSystemWrapper {
   // metadata_file_name stores metadata of the files, so that it can be
   // loaded after process restarts. If the file doesn't exist, create
   // one. The file is written when the class is destroyed.
-  explicit SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
-                                     const std::string& metadata_file_name);
+  // throughput_multiplier: multiplier of throughput. For example, 1 is to
+  //      simulate single disk spindle. 4 is to simualte 4 disk spindles.
+  // is_full_fs_warm: if true, all files are all included in slow I/O
+  // simulation.
+  SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
+                            const std::string& metadata_file_name,
+                            int throughput_multiplier, bool is_full_fs_warm);
 
   ~SimulatedHybridFileSystem() override;
 
@@ -55,6 +60,7 @@ class SimulatedHybridFileSystem : public FileSystemWrapper {
   std::unordered_set<std::string> warm_file_set_;
   std::string metadata_file_name_;
   std::string name_;
+  bool is_full_fs_warm_;
 };
 
 // Simulated random access file that can control IOPs and latency to simulate
@@ -84,7 +90,7 @@ class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper {
   std::shared_ptr<RateLimiter> rate_limiter_;
   Temperature temperature_;
 
-  void RequestRateLimit(int64_t num_requests) const;
+  void SimulateIOWait(int64_t num_requests) const;
 };
 
 class SimulatedWritableFile : public FSWritableFileWrapper {
@@ -113,7 +119,7 @@ class SimulatedWritableFile : public FSWritableFileWrapper {
   std::shared_ptr<RateLimiter> rate_limiter_;
   size_t unsynced_bytes = 0;
 
-  void RequestRateLimit(int64_t num_requests) const;
+  void SimulateIOWait(int64_t num_requests) const;
 };
 }  // namespace ROCKSDB_NAMESPACE