760e9a94de
Summary: Currently RocksDB silently ignores this issue and doesn't compress the data. Based on discussion, we agree that this is pretty bad because it can cause confusion for our users. This patch fails DB::Open() if we don't support the compression that is specified in the options. Test Plan: make check with LZ4 not present. If Snappy is not present all tests will just fail because Snappy is our default library. We should make Snappy the requirement, since without it our default DB::Open() fails. Reviewers: sdong, MarkCallaghan, rven, yhchiang Reviewed By: yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D39687
594 lines
18 KiB
C++
594 lines
18 KiB
C++
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <limits>
|
|
|
|
#include "rocksdb/options.h"
|
|
#include "util/coding.h"
|
|
|
|
#ifdef SNAPPY
|
|
#include <snappy.h>
|
|
#endif
|
|
|
|
#ifdef ZLIB
|
|
#include <zlib.h>
|
|
#endif
|
|
|
|
#ifdef BZIP2
|
|
#include <bzlib.h>
|
|
#endif
|
|
|
|
#if defined(LZ4)
|
|
#include <lz4.h>
|
|
#include <lz4hc.h>
|
|
#endif
|
|
|
|
namespace rocksdb {
|
|
|
|
inline bool Snappy_Supported() {
|
|
#ifdef SNAPPY
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline bool Zlib_Supported() {
|
|
#ifdef ZLIB
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline bool BZip2_Supported() {
|
|
#ifdef BZIP2
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline bool LZ4_Supported() {
|
|
#ifdef LZ4
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
inline bool CompressionTypeSupported(CompressionType compression_type) {
|
|
switch (compression_type) {
|
|
case kNoCompression:
|
|
return true;
|
|
case kSnappyCompression:
|
|
return Snappy_Supported();
|
|
case kZlibCompression:
|
|
return Zlib_Supported();
|
|
case kBZip2Compression:
|
|
return BZip2_Supported();
|
|
case kLZ4Compression:
|
|
return LZ4_Supported();
|
|
case kLZ4HCCompression:
|
|
return LZ4_Supported();
|
|
default:
|
|
assert(false);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
inline std::string CompressionTypeToString(CompressionType compression_type) {
|
|
switch (compression_type) {
|
|
case kNoCompression:
|
|
return "NoCompression";
|
|
case kSnappyCompression:
|
|
return "Snappy";
|
|
case kZlibCompression:
|
|
return "Zlib";
|
|
case kBZip2Compression:
|
|
return "BZip2";
|
|
case kLZ4Compression:
|
|
return "LZ4";
|
|
case kLZ4HCCompression:
|
|
return "LZ4HC";
|
|
default:
|
|
assert(false);
|
|
return "";
|
|
}
|
|
}
|
|
|
|
// compress_format_version can have two values:
|
|
// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
|
|
// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
|
|
// way.
|
|
// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
|
|
// start of compressed block. Snappy format is the same as version 1.
|
|
|
|
inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
|
|
size_t length, ::std::string* output) {
|
|
#ifdef SNAPPY
|
|
output->resize(snappy::MaxCompressedLength(length));
|
|
size_t outlen;
|
|
snappy::RawCompress(input, length, &(*output)[0], &outlen);
|
|
output->resize(outlen);
|
|
return true;
|
|
#endif
|
|
|
|
return false;
|
|
}
|
|
|
|
inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
|
|
size_t* result) {
|
|
#ifdef SNAPPY
|
|
return snappy::GetUncompressedLength(input, length, result);
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
inline bool Snappy_Uncompress(const char* input, size_t length,
|
|
char* output) {
|
|
#ifdef SNAPPY
|
|
return snappy::RawUncompress(input, length, output);
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
namespace compression {
|
|
// returns size
|
|
inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
|
|
PutVarint32(output, length);
|
|
return output->size();
|
|
}
|
|
|
|
inline bool GetDecompressedSizeInfo(const char** input_data,
|
|
size_t* input_length,
|
|
uint32_t* output_len) {
|
|
auto new_input_data =
|
|
GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
|
|
if (new_input_data == nullptr) {
|
|
return false;
|
|
}
|
|
*input_length -= (new_input_data - *input_data);
|
|
*input_data = new_input_data;
|
|
return true;
|
|
}
|
|
} // namespace compression
|
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
// block header
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline bool Zlib_Compress(const CompressionOptions& opts,
|
|
uint32_t compress_format_version,
|
|
const char* input, size_t length,
|
|
::std::string* output) {
|
|
#ifdef ZLIB
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
// Can't compress more than 4GB
|
|
return false;
|
|
}
|
|
|
|
size_t output_header_len = 0;
|
|
if (compress_format_version == 2) {
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
output, static_cast<uint32_t>(length));
|
|
}
|
|
// Resize output to be the plain data length.
|
|
// This may not be big enough if the compression actually expands data.
|
|
output->resize(output_header_len + length);
|
|
|
|
// The memLevel parameter specifies how much memory should be allocated for
|
|
// the internal compression state.
|
|
// memLevel=1 uses minimum memory but is slow and reduces compression ratio.
|
|
// memLevel=9 uses maximum memory for optimal speed.
|
|
// The default value is 8. See zconf.h for more details.
|
|
static const int memLevel = 8;
|
|
z_stream _stream;
|
|
memset(&_stream, 0, sizeof(z_stream));
|
|
int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
|
|
memLevel, opts.strategy);
|
|
if (st != Z_OK) {
|
|
return false;
|
|
}
|
|
|
|
// Compress the input, and put compressed data in output.
|
|
_stream.next_in = (Bytef *)input;
|
|
_stream.avail_in = static_cast<unsigned int>(length);
|
|
|
|
// Initialize the output size.
|
|
_stream.avail_out = static_cast<unsigned int>(length);
|
|
_stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
|
|
|
|
bool done = false;
|
|
while (!done) {
|
|
st = deflate(&_stream, Z_FINISH);
|
|
switch (st) {
|
|
case Z_STREAM_END:
|
|
done = true;
|
|
break;
|
|
case Z_OK:
|
|
// No output space. This means the compression is bigger than
|
|
// decompressed size. Just fail the compression in that case.
|
|
// Intentional fallback (to failure case)
|
|
case Z_BUF_ERROR:
|
|
default:
|
|
deflateEnd(&_stream);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
output->resize(output->size() - _stream.avail_out + output_header_len);
|
|
deflateEnd(&_stream);
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
// block header
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline char* Zlib_Uncompress(const char* input_data, size_t input_length,
|
|
int* decompress_size,
|
|
uint32_t compress_format_version,
|
|
int windowBits = -14) {
|
|
#ifdef ZLIB
|
|
uint32_t output_len = 0;
|
|
if (compress_format_version == 2) {
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
&output_len)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
// Assume the decompressed data size will 5x of compressed size, but round
|
|
// to the page size
|
|
size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
|
|
output_len = static_cast<uint32_t>(
|
|
std::min(proposed_output_len,
|
|
static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
|
|
}
|
|
|
|
z_stream _stream;
|
|
memset(&_stream, 0, sizeof(z_stream));
|
|
|
|
// For raw inflate, the windowBits should be -8..-15.
|
|
// If windowBits is bigger than zero, it will use either zlib
|
|
// header or gzip header. Adding 32 to it will do automatic detection.
|
|
int st = inflateInit2(&_stream,
|
|
windowBits > 0 ? windowBits + 32 : windowBits);
|
|
if (st != Z_OK) {
|
|
return nullptr;
|
|
}
|
|
|
|
_stream.next_in = (Bytef *)input_data;
|
|
_stream.avail_in = static_cast<unsigned int>(input_length);
|
|
|
|
char* output = new char[output_len];
|
|
|
|
_stream.next_out = (Bytef *)output;
|
|
_stream.avail_out = static_cast<unsigned int>(output_len);
|
|
|
|
bool done = false;
|
|
while (!done) {
|
|
st = inflate(&_stream, Z_SYNC_FLUSH);
|
|
switch (st) {
|
|
case Z_STREAM_END:
|
|
done = true;
|
|
break;
|
|
case Z_OK: {
|
|
// No output space. Increase the output space by 20%.
|
|
// We should never run out of output space if
|
|
// compress_format_version == 2
|
|
assert(compress_format_version != 2);
|
|
size_t old_sz = output_len;
|
|
size_t output_len_delta = static_cast<size_t>(output_len * 0.2);
|
|
output_len += output_len_delta < 10 ? 10 : output_len_delta;
|
|
char* tmp = new char[output_len];
|
|
memcpy(tmp, output, old_sz);
|
|
delete[] output;
|
|
output = tmp;
|
|
|
|
// Set more output.
|
|
_stream.next_out = (Bytef *)(output + old_sz);
|
|
_stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
|
|
break;
|
|
}
|
|
case Z_BUF_ERROR:
|
|
default:
|
|
delete[] output;
|
|
inflateEnd(&_stream);
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
// If we encoded decompressed block size, we should have no bytes left
|
|
assert(compress_format_version != 2 || _stream.avail_out == 0);
|
|
*decompress_size = static_cast<int>(output_len - _stream.avail_out);
|
|
inflateEnd(&_stream);
|
|
return output;
|
|
#endif
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
// block header
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline bool BZip2_Compress(const CompressionOptions& opts,
|
|
uint32_t compress_format_version,
|
|
const char* input, size_t length,
|
|
::std::string* output) {
|
|
#ifdef BZIP2
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
// Can't compress more than 4GB
|
|
return false;
|
|
}
|
|
size_t output_header_len = 0;
|
|
if (compress_format_version == 2) {
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
output, static_cast<uint32_t>(length));
|
|
}
|
|
// Resize output to be the plain data length.
|
|
// This may not be big enough if the compression actually expands data.
|
|
output->resize(output_header_len + length);
|
|
|
|
|
|
bz_stream _stream;
|
|
memset(&_stream, 0, sizeof(bz_stream));
|
|
|
|
// Block size 1 is 100K.
|
|
// 0 is for silent.
|
|
// 30 is the default workFactor
|
|
int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
|
|
if (st != BZ_OK) {
|
|
return false;
|
|
}
|
|
|
|
// Compress the input, and put compressed data in output.
|
|
_stream.next_in = (char *)input;
|
|
_stream.avail_in = static_cast<unsigned int>(length);
|
|
|
|
// Initialize the output size.
|
|
_stream.avail_out = static_cast<unsigned int>(length);
|
|
_stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
|
|
|
|
while (_stream.next_in != nullptr && _stream.avail_in != 0) {
|
|
st = BZ2_bzCompress(&_stream, BZ_FINISH);
|
|
switch (st) {
|
|
case BZ_STREAM_END:
|
|
break;
|
|
case BZ_FINISH_OK:
|
|
// No output space. This means the compression is bigger than
|
|
// decompressed size. Just fail the compression in that case
|
|
// Intentional fallback (to failure case)
|
|
case BZ_SEQUENCE_ERROR:
|
|
default:
|
|
BZ2_bzCompressEnd(&_stream);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
output->resize(output->size() - _stream.avail_out + output_header_len);
|
|
BZ2_bzCompressEnd(&_stream);
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
// block header
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
|
|
int* decompress_size,
|
|
uint32_t compress_format_version) {
|
|
#ifdef BZIP2
|
|
uint32_t output_len = 0;
|
|
if (compress_format_version == 2) {
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
&output_len)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
// Assume the decompressed data size will 5x of compressed size, but round
|
|
// to the next page size
|
|
size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
|
|
output_len = static_cast<uint32_t>(
|
|
std::min(proposed_output_len,
|
|
static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
|
|
}
|
|
|
|
bz_stream _stream;
|
|
memset(&_stream, 0, sizeof(bz_stream));
|
|
|
|
int st = BZ2_bzDecompressInit(&_stream, 0, 0);
|
|
if (st != BZ_OK) {
|
|
return nullptr;
|
|
}
|
|
|
|
_stream.next_in = (char *)input_data;
|
|
_stream.avail_in = static_cast<unsigned int>(input_length);
|
|
|
|
char* output = new char[output_len];
|
|
|
|
_stream.next_out = (char *)output;
|
|
_stream.avail_out = static_cast<unsigned int>(output_len);
|
|
|
|
bool done = false;
|
|
while (!done) {
|
|
st = BZ2_bzDecompress(&_stream);
|
|
switch (st) {
|
|
case BZ_STREAM_END:
|
|
done = true;
|
|
break;
|
|
case BZ_OK: {
|
|
// No output space. Increase the output space by 20%.
|
|
// We should never run out of output space if
|
|
// compress_format_version == 2
|
|
assert(compress_format_version != 2);
|
|
uint32_t old_sz = output_len;
|
|
output_len = output_len * 1.2;
|
|
char* tmp = new char[output_len];
|
|
memcpy(tmp, output, old_sz);
|
|
delete[] output;
|
|
output = tmp;
|
|
|
|
// Set more output.
|
|
_stream.next_out = (char *)(output + old_sz);
|
|
_stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
|
|
break;
|
|
}
|
|
default:
|
|
delete[] output;
|
|
BZ2_bzDecompressEnd(&_stream);
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
// If we encoded decompressed block size, we should have no bytes left
|
|
assert(compress_format_version != 2 || _stream.avail_out == 0);
|
|
*decompress_size = static_cast<int>(output_len - _stream.avail_out);
|
|
BZ2_bzDecompressEnd(&_stream);
|
|
return output;
|
|
#endif
|
|
return nullptr;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
// block header using memcpy, which makes database non-portable)
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline bool LZ4_Compress(const CompressionOptions& opts,
|
|
uint32_t compress_format_version, const char* input,
|
|
size_t length, ::std::string* output) {
|
|
#ifdef LZ4
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
// Can't compress more than 4GB
|
|
return false;
|
|
}
|
|
|
|
size_t output_header_len = 0;
|
|
if (compress_format_version == 2) {
|
|
// new encoding, using varint32 to store size information
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
output, static_cast<uint32_t>(length));
|
|
} else {
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
// endianness)
|
|
output_header_len = 8;
|
|
output->resize(output_header_len);
|
|
char* p = const_cast<char*>(output->c_str());
|
|
memcpy(p, &length, sizeof(length));
|
|
}
|
|
|
|
int compressBound = LZ4_compressBound(static_cast<int>(length));
|
|
output->resize(static_cast<size_t>(output_header_len + compressBound));
|
|
int outlen =
|
|
LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
|
|
static_cast<int>(length), compressBound);
|
|
if (outlen == 0) {
|
|
return false;
|
|
}
|
|
output->resize(static_cast<size_t>(output_header_len + outlen));
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
// block header using memcpy, which makes database non-portable)
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline char* LZ4_Uncompress(const char* input_data, size_t input_length,
|
|
int* decompress_size,
|
|
uint32_t compress_format_version) {
|
|
#ifdef LZ4
|
|
uint32_t output_len = 0;
|
|
if (compress_format_version == 2) {
|
|
// new encoding, using varint32 to store size information
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
&output_len)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
// endianness)
|
|
if (input_length < 8) {
|
|
return nullptr;
|
|
}
|
|
memcpy(&output_len, input_data, sizeof(output_len));
|
|
input_length -= 8;
|
|
input_data += 8;
|
|
}
|
|
char* output = new char[output_len];
|
|
*decompress_size =
|
|
LZ4_decompress_safe(input_data, output, static_cast<int>(input_length),
|
|
static_cast<int>(output_len));
|
|
if (*decompress_size < 0) {
|
|
delete[] output;
|
|
return nullptr;
|
|
}
|
|
assert(*decompress_size == static_cast<int>(output_len));
|
|
return output;
|
|
#endif
|
|
return nullptr;
|
|
}
|
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
// block header using memcpy, which makes database non-portable)
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
// header in varint32 format
|
|
inline bool LZ4HC_Compress(const CompressionOptions& opts,
|
|
uint32_t compress_format_version, const char* input,
|
|
size_t length, ::std::string* output) {
|
|
#ifdef LZ4
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
// Can't compress more than 4GB
|
|
return false;
|
|
}
|
|
|
|
size_t output_header_len = 0;
|
|
if (compress_format_version == 2) {
|
|
// new encoding, using varint32 to store size information
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
output, static_cast<uint32_t>(length));
|
|
} else {
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
// endianness)
|
|
output_header_len = 8;
|
|
output->resize(output_header_len);
|
|
char* p = const_cast<char*>(output->c_str());
|
|
memcpy(p, &length, sizeof(length));
|
|
}
|
|
|
|
int compressBound = LZ4_compressBound(static_cast<int>(length));
|
|
output->resize(static_cast<size_t>(output_header_len + compressBound));
|
|
int outlen;
|
|
#ifdef LZ4_VERSION_MAJOR // they only started defining this since r113
|
|
outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
|
|
static_cast<int>(length),
|
|
compressBound, opts.level);
|
|
#else
|
|
outlen =
|
|
LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
|
|
static_cast<int>(length), compressBound);
|
|
#endif
|
|
if (outlen == 0) {
|
|
return false;
|
|
}
|
|
output->resize(static_cast<size_t>(output_header_len + outlen));
|
|
return true;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
} // namespace rocksdb
|