From 7fdf735d5dd71b7f92a1ff5a64312ff1bc08c5d3 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Thu, 24 Aug 2017 12:20:15 -0700 Subject: [PATCH] Pinnableslice examples and blog post Summary: Closes https://github.com/facebook/rocksdb/pull/2788 Differential Revision: D5700189 Pulled By: maysamyabandeh fbshipit-source-id: 6f043e652093ff904e52f6d35190855781b87673 --- ...17-05-12-partitioned-index-filter.markdown | 2 +- docs/_posts/2017-08-24-pinnableslice.markdown | 37 +++++++++++++++++++ examples/simple_example.cc | 27 ++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 docs/_posts/2017-08-24-pinnableslice.markdown diff --git a/docs/_posts/2017-05-12-partitioned-index-filter.markdown b/docs/_posts/2017-05-12-partitioned-index-filter.markdown index fb4f62cd8..a537feb0c 100644 --- a/docs/_posts/2017-05-12-partitioned-index-filter.markdown +++ b/docs/_posts/2017-05-12-partitioned-index-filter.markdown @@ -31,4 +31,4 @@ In this example we have a DB of size 86G on HDD and emulate the small memory tha In this example we have a DB of size 300G on SSD and emulate the small memory that would be available in presence of other DBs on the same node by by using direct IO (skipping OS file cache) and block cache of size 6G and 2G. Without partitioning the linkbench throughput drops from 38k tps to 23k when reducing block cache size from 6G to 2G. With partitioning the throughput drops from 38k to only 30k. -Learn more (here)[https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters]. +Learn more [here](https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters). 
diff --git a/docs/_posts/2017-08-24-pinnableslice.markdown b/docs/_posts/2017-08-24-pinnableslice.markdown new file mode 100644 index 000000000..a5026d5c4 --- /dev/null +++ b/docs/_posts/2017-08-24-pinnableslice.markdown @@ -0,0 +1,37 @@ +--- +title: "PinnableSlice: less memcpy with point lookups" +layout: post +author: maysamyabandeh +category: blog +--- + +The classic API for [DB::Get](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L310) receives a std::string as an argument to which it will copy the value. The memcpy overhead could be non-trivial when the value is large. The [new API](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L322) receives a PinnableSlice instead, which avoids memcpy in most cases. + +### What is PinnableSlice? + +Similarly to Slice, PinnableSlice refers to some in-memory data so it does not incur the memcpy cost. To ensure that the data will not be erased while it is being processed by the user, PinnableSlice, as its name suggests, has the data pinned in memory. The pinned data are released when the PinnableSlice object is destructed or when ::Reset is invoked explicitly on it. + +### How good is it? + +Here are the improvements in throughput for an [in-memory benchmark](https://github.com/facebook/rocksdb/pull/1756#issuecomment-286201693): +* value 1k byte: 14% +* value 10k byte: 34% + +### Any limitations? + +PinnableSlice tries to avoid memcpy as much as possible. The primary gain is when reading large values from the block cache. There are, however, cases in which it would still have to copy the data into its internal buffer. The reason is mainly the complexity of implementation; if there is enough motivation on the application side, the scope of PinnableSlice could be extended to such cases too. These include: +* Merged values +* Reads from memtables + +### How to use it? 
+ +```cpp +PinnableSlice pinnable_val; +while (!stopped) { + auto s = db->Get(opt, cf, key, &pinnable_val); + // ... use it + pinnable_val.Reset(); // then release it immediately +} +``` + +You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/master/examples/simple_example.cc) demonstrates that with more examples. diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 52fffff5b..a8f80f091 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -50,6 +50,33 @@ int main() { db->Get(ReadOptions(), "key2", &value); assert(value == "value"); + { + PinnableSlice pinnable_val; + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + } + + { + std::string string_val; + // If it cannot pin the value, it copies the value to its internal buffer. + // The internal buffer could be set during construction. + PinnableSlice pinnable_val(&string_val); + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + // If the value is not pinned, the internal buffer must have the value. + assert(pinnable_val.IsPinned() || string_val == "value"); + } + + PinnableSlice pinnable_val; + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); + assert(s.IsNotFound()); + // Reset PinnableSlice after each use and before each reuse + pinnable_val.Reset(); + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + pinnable_val.Reset(); + // The Slice pointed to by pinnable_val is not valid after this point + delete db; return 0;