diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 828803bfb..71745d259 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -147,6 +147,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/HistogramData.java src/main/java/org/rocksdb/HistogramType.java src/main/java/org/rocksdb/Holder.java + src/main/java/org/rocksdb/IndexShorteningMode.java src/main/java/org/rocksdb/IndexType.java src/main/java/org/rocksdb/InfoLogLevel.java src/main/java/org/rocksdb/IngestExternalFileOptions.java diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 58111c846..9bc1e674e 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -6375,6 +6375,51 @@ class ChecksumTypeJni { } }; +// The portal class for org.rocksdb.IndexShorteningMode +class IndexShorteningModeJni { + public: + // Returns the equivalent org.rocksdb.IndexShorteningMode for the provided + // C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum + static jbyte toJavaIndexShorteningMode( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode& + index_shortening_mode) { + switch (index_shortening_mode) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kNoShortening: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators: + return 0x1; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum for + // the provided Java org.rocksdb.IndexShorteningMode + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode + toCppIndexShorteningMode(jbyte jindex_shortening_mode) { + switch (jindex_shortening_mode) { + case 0x0: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kNoShortening; + case 0x1: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + 
kShortenSeparators; + case 0x2: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators; + } + } +}; + // The portal class for org.rocksdb.Priority class PriorityJni { public: diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 9e3f4b663..1b98cd8b0 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -42,25 +42,25 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZZZZBBDBZJJJJIIIJZZJZZIIZZJIJI)J + * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv*, jobject, jboolean jcache_index_and_filter_blocks, + JNIEnv *, jobject, jboolean jcache_index_and_filter_blocks, jboolean jcache_index_and_filter_blocks_with_high_priority, jboolean jpin_l0_filter_and_index_blocks_in_cache, jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, jbyte jdata_block_index_type_value, jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value, jboolean jno_block_cache, jlong jblock_cache_handle, - jlong jpersistent_cache_handle, - jlong jblock_cache_compressed_handle, jlong jblock_size, - jint jblock_size_deviation, jint jblock_restart_interval, + jlong jpersistent_cache_handle, jlong jblock_cache_compressed_handle, + jlong jblock_size, jint jblock_size_deviation, jint jblock_restart_interval, jint jindex_block_restart_interval, jlong jmetadata_block_size, - jboolean jpartition_filters, jboolean juse_delta_encoding, - jlong jfilter_policy_handle, jboolean jwhole_key_filtering, - jboolean jverify_compression, jint jread_amp_bytes_per_bit, - jint jformat_version, jboolean jenable_index_compression, - jboolean jblock_align, jlong jblock_cache_size, + jboolean jpartition_filters, 
jboolean joptimize_filters_for_memory, + jboolean juse_delta_encoding, jlong jfilter_policy_handle, + jboolean jwhole_key_filtering, jboolean jverify_compression, + jint jread_amp_bytes_per_bit, jint jformat_version, + jboolean jenable_index_compression, jboolean jblock_align, + jbyte jindex_shortening, jlong jblock_cache_size, jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size, jint jblock_cache_compressed_num_shard_bits) { ROCKSDB_NAMESPACE::BlockBasedTableOptions options; @@ -131,6 +131,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.index_block_restart_interval = static_cast(jindex_block_restart_interval); options.metadata_block_size = static_cast(jmetadata_block_size); options.partition_filters = static_cast(jpartition_filters); + options.optimize_filters_for_memory = + static_cast(joptimize_filters_for_memory); options.use_delta_encoding = static_cast(juse_delta_encoding); if (jfilter_policy_handle > 0) { std::shared_ptr *pFilterPolicy = @@ -144,6 +146,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.format_version = static_cast(jformat_version); options.enable_index_compression = static_cast(jenable_index_compression); options.block_align = static_cast(jblock_align); + options.index_shortening = + ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode( + jindex_shortening); return reinterpret_cast( ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options)); diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 5bc694af5..bbd2ca082 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -15,7 +15,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { public BlockBasedTableConfig() { //TODO(AR) flushBlockPolicyFactory cacheIndexAndFilterBlocks = false; - cacheIndexAndFilterBlocksWithHighPriority 
= false; + cacheIndexAndFilterBlocksWithHighPriority = true; pinL0FilterAndIndexBlocksInCache = false; pinTopLevelIndexAndFilter = true; indexType = IndexType.kBinarySearch; @@ -32,14 +32,16 @@ public class BlockBasedTableConfig extends TableFormatConfig { indexBlockRestartInterval = 1; metadataBlockSize = 4096; partitionFilters = false; + optimizeFiltersForMemory = false; useDeltaEncoding = true; filterPolicy = null; wholeKeyFiltering = true; - verifyCompression = true; + verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 2; + formatVersion = 4; enableIndexCompression = true; blockAlign = false; + indexShortening = IndexShorteningMode.kShortenSeparators; // NOTE: ONLY used if blockCache == null blockCacheSize = 8 * 1024 * 1024; @@ -77,7 +79,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { /** * Indicates if index and filter blocks will be treated as high-priority in the block cache. - * See note below about applicability. If not specified, defaults to false. + * See note below about applicability. If not specified, defaults to true. * * @return if index and filter blocks will be treated as high-priority. */ @@ -453,6 +455,65 @@ public class BlockBasedTableConfig extends TableFormatConfig { return this; } + /** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * See {@link #setOptimizeFiltersForMemory(boolean)}. + * + * @return true if bloom filters are used to minimize memory internal + * fragmentation + */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public boolean optimizeFiltersForMemory() { + return optimizeFiltersForMemory; + } + + /** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * When false, malloc_usable_size is not available, or format_version &lt; 5, + * filters are generated without regard to internal fragmentation when + * loaded into memory (historical behavior). 
When true (and + * malloc_usable_size is available and {@link #formatVersion()} &gt;= 5), + * then Bloom filters are generated to "round up" and "round down" their + * sizes to minimize internal fragmentation when loaded into memory, assuming + * the reading DB has the same memory allocation characteristics as the + * generating DB. This option does not break forward or backward + * compatibility. + * + * While individual filters will vary in bits/key and false positive rate + * when setting is true, the implementation attempts to maintain a weighted + * average FP rate for filters consistent with this option set to false. + * + * With Jemalloc for example, this setting is expected to save about 10% of + * the memory footprint and block cache charge of filters, while increasing + * disk usage of filters by about 1-2% due to encoding efficiency losses + * with variance in bits/key. + * + * NOTE: Because some memory counted by block cache might be unmapped pages + * within internal fragmentation, this option can increase observed RSS + * memory usage. With {@link #cacheIndexAndFilterBlocks()} == true, + * this option makes the block cache better at using space it is allowed. + * + * NOTE: Do not set to true if you do not trust malloc_usable_size. With + * this option, RocksDB might access an allocated memory object beyond its + * original size if malloc_usable_size says it is safe to do so. While this + * can be considered bad practice, it should not produce undefined behavior + * unless malloc_usable_size is buggy or broken. + * + * @param optimizeFiltersForMemory true to enable Bloom filters that minimize + * memory internal fragmentation, or false to disable. + * + * @return the reference to the current config. 
+ */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public BlockBasedTableConfig setOptimizeFiltersForMemory(final boolean optimizeFiltersForMemory) { + this.optimizeFiltersForMemory = optimizeFiltersForMemory; + return this; + } + /** * Determine if delta encoding is being used to compress block keys. * @@ -717,6 +778,28 @@ public class BlockBasedTableConfig extends TableFormatConfig { return this; } + /** + * Get the index shortening mode. + * + * @return the index shortening mode. + */ + public IndexShorteningMode indexShortening() { + return indexShortening; + } + + /** + * Set the index shortening mode. + * + * See {@link IndexShorteningMode}. + * + * @param indexShortening the index shortening mode. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexShortening) { + this.indexShortening = indexShortening; + return this; + } /** * Get the size of the cache in bytes that will be used by RocksDB. 
@@ -900,54 +983,35 @@ public class BlockBasedTableConfig extends TableFormatConfig { } return newTableFactoryHandle(cacheIndexAndFilterBlocks, - cacheIndexAndFilterBlocksWithHighPriority, - pinL0FilterAndIndexBlocksInCache, pinTopLevelIndexAndFilter, - indexType.getValue(), dataBlockIndexType.getValue(), - dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, - blockCacheHandle, persistentCacheHandle, blockCacheCompressedHandle, - blockSize, blockSizeDeviation, blockRestartInterval, - indexBlockRestartInterval, metadataBlockSize, partitionFilters, - useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, - verifyCompression, readAmpBytesPerBit, formatVersion, - enableIndexCompression, blockAlign, - blockCacheSize, blockCacheNumShardBits, + cacheIndexAndFilterBlocksWithHighPriority, pinL0FilterAndIndexBlocksInCache, + pinTopLevelIndexAndFilter, indexType.getValue(), dataBlockIndexType.getValue(), + dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, blockCacheHandle, + persistentCacheHandle, blockCacheCompressedHandle, blockSize, blockSizeDeviation, + blockRestartInterval, indexBlockRestartInterval, metadataBlockSize, partitionFilters, + optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, + verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, + indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits, blockCacheCompressedSize, blockCacheCompressedNumShardBits); } - private native long newTableFactoryHandle( - final boolean cacheIndexAndFilterBlocks, + private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, final boolean cacheIndexAndFilterBlocksWithHighPriority, - final boolean pinL0FilterAndIndexBlocksInCache, - final boolean pinTopLevelIndexAndFilter, - final byte indexTypeValue, - final byte dataBlockIndexTypeValue, - final double dataBlockHashTableUtilRatio, - final byte checksumTypeValue, - final boolean noBlockCache, - final 
long blockCacheHandle, - final long persistentCacheHandle, - final long blockCacheCompressedHandle, - final long blockSize, - final int blockSizeDeviation, - final int blockRestartInterval, - final int indexBlockRestartInterval, - final long metadataBlockSize, - final boolean partitionFilters, - final boolean useDeltaEncoding, - final long filterPolicyHandle, - final boolean wholeKeyFiltering, - final boolean verifyCompression, - final int readAmpBytesPerBit, - final int formatVersion, - final boolean enableIndexCompression, - final boolean blockAlign, + final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter, + final byte indexTypeValue, final byte dataBlockIndexTypeValue, + final double dataBlockHashTableUtilRatio, final byte checksumTypeValue, + final boolean noBlockCache, final long blockCacheHandle, final long persistentCacheHandle, + final long blockCacheCompressedHandle, final long blockSize, final int blockSizeDeviation, + final int blockRestartInterval, final int indexBlockRestartInterval, + final long metadataBlockSize, final boolean partitionFilters, + final boolean optimizeFiltersForMemory, final boolean useDeltaEncoding, + final long filterPolicyHandle, final boolean wholeKeyFiltering, + final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, + final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, - @Deprecated final long blockCacheSize, - @Deprecated final int blockCacheNumShardBits, + @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits, @Deprecated final long blockCacheCompressedSize, - @Deprecated final int blockCacheCompressedNumShardBits - ); + @Deprecated final int blockCacheCompressedNumShardBits); //TODO(AR) flushBlockPolicyFactory private boolean cacheIndexAndFilterBlocks; @@ -968,6 +1032,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { private int indexBlockRestartInterval; private long 
metadataBlockSize; private boolean partitionFilters; + private boolean optimizeFiltersForMemory; private boolean useDeltaEncoding; private Filter filterPolicy; private boolean wholeKeyFiltering; @@ -976,6 +1041,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { private int formatVersion; private boolean enableIndexCompression; private boolean blockAlign; + private IndexShorteningMode indexShortening; // NOTE: ONLY used if blockCache == null @Deprecated private long blockCacheSize; diff --git a/java/src/main/java/org/rocksdb/ChecksumType.java b/java/src/main/java/org/rocksdb/ChecksumType.java index def9f2e9f..7b2862df8 100644 --- a/java/src/main/java/org/rocksdb/ChecksumType.java +++ b/java/src/main/java/org/rocksdb/ChecksumType.java @@ -20,7 +20,11 @@ public enum ChecksumType { /** * XX Hash */ - kxxHash((byte) 2); + kxxHash((byte) 2), + /** + * XX Hash 64 + */ + kxxHash64((byte) 3); /** * Returns the byte value of the enumerations value diff --git a/java/src/main/java/org/rocksdb/IndexShorteningMode.java b/java/src/main/java/org/rocksdb/IndexShorteningMode.java new file mode 100644 index 000000000..a68346c38 --- /dev/null +++ b/java/src/main/java/org/rocksdb/IndexShorteningMode.java @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +/** + * This enum allows trading off increased index size for improved iterator + * seek performance in some situations, particularly when block cache is + * disabled ({@link ReadOptions#fillCache()} == false) and direct IO is + * enabled ({@link DBOptions#useDirectReads()} == true). + * The default mode is the best tradeoff for most use cases. + * This option only affects newly written tables. + * + * The index contains a key separating each pair of consecutive blocks. 
+ * Let A be the highest key in one block, B the lowest key in the next block, + * and I the index entry separating these two blocks: + * [ ... A] I [B ...] + * I is allowed to be anywhere in [A, B). + * If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + * first block, then immediately fall through to the second block. + * However, if I=A, this can't happen, and we'll read only the second block. + * In kNoShortening mode, we use I=A. In other modes, we use the shortest + * key in [A, B), which usually significantly reduces index size. + * + * There's a similar story for the last index entry, which is an upper bound + * of the highest key in the file. If it's shortened and therefore + * overestimated, iterator is likely to unnecessarily read the last data block + * from each file on each seek. + */ +public enum IndexShorteningMode { + /** + * Use full keys. + */ + kNoShortening((byte) 0), + /** + * Shorten index keys between blocks, but use full key for the last index + * key, which is the upper bound of the whole file. + */ + kShortenSeparators((byte) 1), + /** + * Shorten both keys between blocks and key after last block. + */ + kShortenSeparatorsAndSuccessor((byte) 2); + + private final byte value; + + IndexShorteningMode(final byte value) { + this.value = value; + } + + /** + * Returns the byte value of the enumerations value. + * + * @return byte representation + */ + byte getValue() { + return value; + } +} diff --git a/java/src/main/java/org/rocksdb/IndexType.java b/java/src/main/java/org/rocksdb/IndexType.java index 04e481465..162edad1b 100644 --- a/java/src/main/java/org/rocksdb/IndexType.java +++ b/java/src/main/java/org/rocksdb/IndexType.java @@ -22,7 +22,21 @@ public enum IndexType { /** * A two-level index implementation. Both levels are binary search indexes. */ - kTwoLevelIndexSearch((byte) 2); + kTwoLevelIndexSearch((byte) 2), + /** + * Like {@link #kBinarySearch}, but index also contains first key of each block. 
+ * This allows iterators to defer reading the block until it's actually + * needed. May significantly reduce read amplification of short range scans. + * Without it, iterator seek usually reads one block from each level-0 file + * and from each level, which may be expensive. + * Works best in combination with: + * - {@link IndexShorteningMode#kNoShortening}, + * - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + * e.g. when prefix changes. + * Makes the index significantly bigger (2x or more), especially when keys + * are long. + */ + kBinarySearchWithFirstKey((byte) 3); /** * Returns the byte value of the enumerations value diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 6fdd314cb..4b5927ebe 100644 --- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -35,9 +35,10 @@ public class BlockBasedTableConfigTest { @Test public void cacheIndexAndFilterBlocksWithHighPriority() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(true); assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()). 
isTrue(); + blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(false); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()).isFalse(); } @Test @@ -59,7 +60,7 @@ public class BlockBasedTableConfigTest { @Test public void indexType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(IndexType.values().length).isEqualTo(3); + assertThat(IndexType.values().length).isEqualTo(4); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); assertThat(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); @@ -83,7 +84,7 @@ public class BlockBasedTableConfigTest { @Test public void checksumType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(ChecksumType.values().length).isEqualTo(3); + assertThat(ChecksumType.values().length).isEqualTo(4); assertThat(ChecksumType.valueOf("kxxHash")). isEqualTo(ChecksumType.kxxHash); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); @@ -258,6 +259,13 @@ public class BlockBasedTableConfigTest { isTrue(); } + @Test + public void optimizeFiltersForMemory() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setOptimizeFiltersForMemory(true); + assertThat(blockBasedTableConfig.optimizeFiltersForMemory()).isTrue(); + } + @Test public void useDeltaEncoding() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); @@ -296,6 +304,7 @@ public class BlockBasedTableConfigTest { @Test public void verifyCompression() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(blockBasedTableConfig.verifyCompression()).isFalse(); blockBasedTableConfig.setVerifyCompression(true); assertThat(blockBasedTableConfig.verifyCompression()). 
isTrue(); @@ -346,6 +355,14 @@ public class BlockBasedTableConfigTest { isTrue(); } + @Test + public void indexShortening() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + assertThat(blockBasedTableConfig.indexShortening()) + .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + } + @Deprecated @Test public void hashIndexAllowCollision() {