2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-01-09 12:57:11 -08:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
//
|
|
|
|
#pragma once
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
#include <algorithm>
|
|
|
|
#include <limits>
|
2015-08-27 15:40:42 -07:00
|
|
|
#include <string>
|
2015-01-14 16:24:24 -08:00
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
#include "rocksdb/options.h"
|
2015-01-14 16:24:24 -08:00
|
|
|
#include "util/coding.h"
|
2018-06-04 12:04:52 -07:00
|
|
|
#include "util/compression_context_cache.h"
|
2015-01-09 12:57:11 -08:00
|
|
|
|
|
|
|
#ifdef SNAPPY
|
|
|
|
#include <snappy.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef ZLIB
|
|
|
|
#include <zlib.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef BZIP2
|
|
|
|
#include <bzlib.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(LZ4)
|
|
|
|
#include <lz4.h>
|
|
|
|
#include <lz4hc.h>
|
|
|
|
#endif
|
|
|
|
|
2015-08-27 15:40:42 -07:00
|
|
|
#if defined(ZSTD)
|
|
|
|
#include <zstd.h>
|
2018-08-22 18:22:10 -07:00
|
|
|
#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+
|
2017-11-02 22:46:13 -07:00
|
|
|
#include <zdict.h>
|
2018-08-22 18:22:10 -07:00
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 10103
|
2018-06-04 12:04:52 -07:00
|
|
|
namespace rocksdb {
|
|
|
|
// Need this for the context allocation override
|
|
|
|
// On windows we need to do this explicitly
|
|
|
|
#if (ZSTD_VERSION_NUMBER >= 500)
|
|
|
|
#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \
|
|
|
|
defined(ZSTD_STATIC_LINKING_ONLY)
|
|
|
|
#define ROCKSDB_ZSTD_CUSTOM_MEM
|
|
|
|
namespace port {
|
|
|
|
ZSTD_customMem GetJeZstdAllocationOverrides();
|
|
|
|
} // namespace port
|
|
|
|
#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) &&
|
|
|
|
// defined(ZSTD_STATIC_LINKING_ONLY)
|
|
|
|
|
|
|
|
// Cached data represents a portion that can be re-used
|
|
|
|
// If, in the future we have more than one native context to
|
|
|
|
// cache we can arrange this as a tuple
|
|
|
|
class ZSTDUncompressCachedData {
|
2018-06-05 12:51:05 -07:00
|
|
|
public:
|
2018-06-04 12:04:52 -07:00
|
|
|
using ZSTDNativeContext = ZSTD_DCtx*;
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTDUncompressCachedData() {}
|
2018-06-04 12:04:52 -07:00
|
|
|
// Init from cache
|
|
|
|
ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete;
|
|
|
|
ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) ROCKSDB_NOEXCEPT
|
|
|
|
: ZSTDUncompressCachedData() {
|
2018-06-04 12:04:52 -07:00
|
|
|
*this = std::move(o);
|
|
|
|
}
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o)
|
|
|
|
ROCKSDB_NOEXCEPT {
|
2018-06-04 12:04:52 -07:00
|
|
|
assert(zstd_ctx_ == nullptr);
|
2018-06-05 12:51:05 -07:00
|
|
|
std::swap(zstd_ctx_, o.zstd_ctx_);
|
|
|
|
std::swap(cache_idx_, o.cache_idx_);
|
2018-06-04 12:04:52 -07:00
|
|
|
return *this;
|
|
|
|
}
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTDNativeContext Get() const { return zstd_ctx_; }
|
|
|
|
int64_t GetCacheIndex() const { return cache_idx_; }
|
2018-06-04 12:04:52 -07:00
|
|
|
void CreateIfNeeded() {
|
|
|
|
if (zstd_ctx_ == nullptr) {
|
|
|
|
#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
|
2018-06-05 12:51:05 -07:00
|
|
|
zstd_ctx_ =
|
|
|
|
ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides());
|
2018-06-04 12:04:52 -07:00
|
|
|
#else // ROCKSDB_ZSTD_CUSTOM_MEM
|
2018-06-05 12:51:05 -07:00
|
|
|
zstd_ctx_ = ZSTD_createDCtx();
|
2018-06-04 12:04:52 -07:00
|
|
|
#endif // ROCKSDB_ZSTD_CUSTOM_MEM
|
|
|
|
cache_idx_ = -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) {
|
|
|
|
zstd_ctx_ = o.zstd_ctx_;
|
|
|
|
cache_idx_ = idx;
|
|
|
|
}
|
|
|
|
~ZSTDUncompressCachedData() {
|
|
|
|
if (zstd_ctx_ != nullptr && cache_idx_ == -1) {
|
|
|
|
ZSTD_freeDCtx(zstd_ctx_);
|
|
|
|
}
|
|
|
|
}
|
2018-06-05 12:51:05 -07:00
|
|
|
|
|
|
|
private:
|
|
|
|
ZSTDNativeContext zstd_ctx_ = nullptr;
|
|
|
|
int64_t cache_idx_ = -1; // -1 means this instance owns the context
|
2018-06-04 12:04:52 -07:00
|
|
|
};
|
2018-06-05 12:51:05 -07:00
|
|
|
#endif // (ZSTD_VERSION_NUMBER >= 500)
|
2018-06-04 12:04:52 -07:00
|
|
|
} // namespace rocksdb
|
2017-11-02 22:46:13 -07:00
|
|
|
#endif // ZSTD
|
2015-08-27 15:40:42 -07:00
|
|
|
|
2018-06-04 12:04:52 -07:00
|
|
|
#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500)
|
|
|
|
namespace rocksdb {
|
|
|
|
class ZSTDUncompressCachedData {
|
2018-06-05 12:51:05 -07:00
|
|
|
void* padding; // unused
|
|
|
|
public:
|
2018-06-04 12:04:52 -07:00
|
|
|
using ZSTDNativeContext = void*;
|
|
|
|
ZSTDUncompressCachedData() {}
|
|
|
|
ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {}
|
|
|
|
ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTDUncompressCachedData(ZSTDUncompressCachedData&&)
|
|
|
|
ROCKSDB_NOEXCEPT = default;
|
|
|
|
ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&)
|
|
|
|
ROCKSDB_NOEXCEPT = default;
|
|
|
|
ZSTDNativeContext Get() const { return nullptr; }
|
|
|
|
int64_t GetCacheIndex() const { return -1; }
|
2018-06-04 12:04:52 -07:00
|
|
|
void CreateIfNeeded() {}
|
|
|
|
void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {}
|
2018-07-13 14:07:53 -07:00
|
|
|
private:
|
|
|
|
void ignore_padding__() { padding = nullptr; }
|
2018-06-04 12:04:52 -07:00
|
|
|
};
|
|
|
|
} // namespace rocksdb
|
|
|
|
#endif
|
|
|
|
|
2016-04-19 22:54:24 -07:00
|
|
|
#if defined(XPRESS)
|
|
|
|
#include "port/xpress.h"
|
|
|
|
#endif
|
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
namespace rocksdb {
|
|
|
|
|
2018-08-23 19:19:16 -07:00
|
|
|
// Holds dictionary and related data, like ZSTD's digested dictionary.
|
|
|
|
struct CompressionDict {
|
|
|
|
enum class Mode {
|
|
|
|
kUninit,
|
|
|
|
kEmpty, // An empty one can be used for both compression and uncompression
|
|
|
|
kCompression,
|
|
|
|
kUncompression,
|
|
|
|
};
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
union {
|
|
|
|
ZSTD_CDict* zstd_cdict_;
|
|
|
|
ZSTD_DDict* zstd_ddict_;
|
|
|
|
};
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
Mode mode_ = Mode::kUninit;
|
|
|
|
Slice dict_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
static const CompressionDict& GetEmptyDict() {
|
|
|
|
static CompressionDict empty_dict{};
|
|
|
|
static bool init = false;
|
|
|
|
if (!init) {
|
|
|
|
empty_dict.Init(Slice() /* dict */, Mode::kEmpty,
|
|
|
|
false /* use_zstd_trainer */);
|
|
|
|
init = true;
|
|
|
|
}
|
|
|
|
return empty_dict;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Init(Slice dict, Mode mode, CompressionType type, int level = -1) {
|
|
|
|
return Init(dict, mode, type == kZSTD || type == kZSTDNotFinalCompression,
|
|
|
|
level);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
void Init(Slice dict, Mode mode, bool use_zstd_trainer, int level = -1) {
|
|
|
|
#else // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
void Init(Slice dict, Mode mode, bool /* use_zstd_trainer */,
|
|
|
|
int /*level*/ = -1) {
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
assert(mode_ == Mode::kUninit);
|
|
|
|
dict_ = std::move(dict);
|
|
|
|
mode_ = mode;
|
|
|
|
switch (mode) {
|
|
|
|
case Mode::kUninit:
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
case Mode::kEmpty:
|
|
|
|
break;
|
|
|
|
case Mode::kCompression:
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
zstd_cdict_ = nullptr;
|
|
|
|
if (!dict_.empty() && use_zstd_trainer) {
|
|
|
|
if (level == CompressionOptions::kDefaultCompressionLevel) {
|
|
|
|
// 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
|
|
|
|
// https://github.com/facebook/zstd/issues/1148
|
|
|
|
level = 3;
|
|
|
|
}
|
|
|
|
// Should be safe (but slower) if below call fails as we'll use the
|
|
|
|
// raw dictionary to compress.
|
|
|
|
zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level);
|
|
|
|
assert(zstd_cdict_ != nullptr);
|
|
|
|
}
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
break;
|
|
|
|
case Mode::kUncompression:
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
zstd_ddict_ = nullptr;
|
|
|
|
if (!dict_.empty() && use_zstd_trainer) {
|
|
|
|
zstd_ddict_ = ZSTD_createDDict(dict_.data(), dict_.size());
|
|
|
|
assert(zstd_ddict_ != nullptr);
|
|
|
|
}
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
~CompressionDict() {
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
size_t res = 0;
|
|
|
|
switch (mode_) {
|
|
|
|
case Mode::kUninit:
|
|
|
|
break;
|
|
|
|
case Mode::kEmpty:
|
|
|
|
break;
|
|
|
|
case Mode::kCompression:
|
|
|
|
if (zstd_cdict_ != nullptr) {
|
|
|
|
res = ZSTD_freeCDict(zstd_cdict_);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Mode::kUncompression:
|
|
|
|
if (zstd_ddict_ != nullptr) {
|
|
|
|
res = ZSTD_freeDDict(zstd_ddict_);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
assert(res == 0); // Last I checked they can't fail
|
|
|
|
(void)res; // prevent unused var warning
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
}
|
|
|
|
|
|
|
|
#if ZSTD_VERSION_NUMBER >= 700
|
|
|
|
const ZSTD_CDict* GetDigestedZstdCDict() const {
|
|
|
|
assert(mode_ != Mode::kUninit);
|
|
|
|
if (mode_ == Mode::kEmpty) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
assert(mode_ == Mode::kCompression);
|
|
|
|
return zstd_cdict_;
|
|
|
|
}
|
|
|
|
|
|
|
|
const ZSTD_DDict* GetDigestedZstdDDict() const {
|
|
|
|
assert(mode_ != Mode::kUninit);
|
|
|
|
if (mode_ == Mode::kEmpty) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
assert(mode_ == Mode::kUncompression);
|
|
|
|
return zstd_ddict_;
|
|
|
|
}
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
|
|
|
|
Slice GetRawDict() const {
|
|
|
|
assert(mode_ != Mode::kUninit);
|
|
|
|
assert(mode_ != Mode::kEmpty || dict_.empty());
|
|
|
|
return dict_;
|
|
|
|
}
|
|
|
|
|
|
|
|
CompressionDict() = default;
|
|
|
|
// Disable copy/move
|
|
|
|
CompressionDict(const CompressionDict&) = delete;
|
|
|
|
CompressionDict& operator=(const CompressionDict&) = delete;
|
|
|
|
CompressionDict(CompressionDict&&) = delete;
|
|
|
|
CompressionDict& operator=(CompressionDict&&) = delete;
|
|
|
|
};
|
|
|
|
|
2018-06-04 12:04:52 -07:00
|
|
|
class CompressionContext {
|
2018-08-23 19:19:16 -07:00
|
|
|
#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500)
|
2018-06-05 12:51:05 -07:00
|
|
|
private:
|
|
|
|
const CompressionType type_;
|
|
|
|
ZSTD_CCtx* zstd_ctx_ = nullptr;
|
2018-06-04 12:04:52 -07:00
|
|
|
void CreateNativeContext() {
|
2018-06-06 23:30:26 -07:00
|
|
|
if (type_ == kZSTD || type_ == kZSTDNotFinalCompression) {
|
2018-06-04 12:04:52 -07:00
|
|
|
#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
|
2018-06-05 12:51:05 -07:00
|
|
|
zstd_ctx_ =
|
|
|
|
ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
|
2018-06-04 12:04:52 -07:00
|
|
|
#else // ROCKSDB_ZSTD_CUSTOM_MEM
|
2018-06-05 12:51:05 -07:00
|
|
|
zstd_ctx_ = ZSTD_createCCtx();
|
2018-06-04 12:04:52 -07:00
|
|
|
#endif // ROCKSDB_ZSTD_CUSTOM_MEM
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void DestroyNativeContext() {
|
|
|
|
if (zstd_ctx_ != nullptr) {
|
|
|
|
ZSTD_freeCCtx(zstd_ctx_);
|
|
|
|
}
|
|
|
|
}
|
2018-06-05 12:51:05 -07:00
|
|
|
|
|
|
|
public:
|
2018-06-04 12:04:52 -07:00
|
|
|
// callable inside ZSTD_Compress
|
2018-06-05 12:51:05 -07:00
|
|
|
ZSTD_CCtx* ZSTDPreallocCtx() const {
|
2018-06-06 23:30:26 -07:00
|
|
|
assert(type_ == kZSTD || type_ == kZSTDNotFinalCompression);
|
2018-06-04 12:04:52 -07:00
|
|
|
return zstd_ctx_;
|
|
|
|
}
|
2018-08-23 19:19:16 -07:00
|
|
|
|
|
|
|
explicit CompressionContext(CompressionType comp_type) : type_(comp_type) {
|
|
|
|
CreateNativeContext();
|
|
|
|
}
|
|
|
|
|
2018-06-05 12:51:05 -07:00
|
|
|
#else // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
|
2018-08-23 19:19:16 -07:00
|
|
|
public:
|
|
|
|
explicit CompressionContext(CompressionType /* comp_type */) {}
|
2018-06-05 12:51:05 -07:00
|
|
|
private:
|
2018-06-04 12:04:52 -07:00
|
|
|
void CreateNativeContext() {}
|
|
|
|
void DestroyNativeContext() {}
|
2018-06-05 12:51:05 -07:00
|
|
|
#endif // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
|
|
|
|
public:
|
|
|
|
~CompressionContext() { DestroyNativeContext(); }
|
2018-06-04 12:04:52 -07:00
|
|
|
CompressionContext(const CompressionContext&) = delete;
|
|
|
|
CompressionContext& operator=(const CompressionContext&) = delete;
|
2018-08-23 19:19:16 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
class CompressionInfo {
|
|
|
|
const CompressionOptions& opts_;
|
|
|
|
const CompressionContext& context_;
|
|
|
|
const CompressionDict& dict_;
|
|
|
|
const CompressionType type_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
CompressionInfo(const CompressionOptions& _opts,
|
|
|
|
const CompressionContext& _context,
|
|
|
|
const CompressionDict& _dict, CompressionType _type)
|
|
|
|
: opts_(_opts), context_(_context), dict_(_dict), type_(_type) {}
|
2018-06-04 12:04:52 -07:00
|
|
|
|
2018-06-05 12:51:05 -07:00
|
|
|
const CompressionOptions& options() const { return opts_; }
|
2018-08-23 19:19:16 -07:00
|
|
|
const CompressionContext& context() const { return context_; }
|
|
|
|
const CompressionDict& dict() const { return dict_; }
|
2018-06-05 12:51:05 -07:00
|
|
|
CompressionType type() const { return type_; }
|
2018-06-04 12:04:52 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
class UncompressionContext {
|
2018-06-05 12:51:05 -07:00
|
|
|
private:
|
2018-08-23 19:19:16 -07:00
|
|
|
const CompressionType type_;
|
2018-06-04 12:04:52 -07:00
|
|
|
CompressionContextCache* ctx_cache_ = nullptr;
|
|
|
|
ZSTDUncompressCachedData uncomp_cached_data_;
|
2018-06-05 12:51:05 -07:00
|
|
|
|
|
|
|
public:
|
2018-06-04 12:04:52 -07:00
|
|
|
struct NoCache {};
|
|
|
|
// Do not use context cache, used by TableBuilder
|
2018-06-05 12:51:05 -07:00
|
|
|
UncompressionContext(NoCache, CompressionType comp_type) : type_(comp_type) {}
|
2018-08-23 19:19:16 -07:00
|
|
|
|
|
|
|
explicit UncompressionContext(CompressionType comp_type) : type_(comp_type) {
|
2018-06-06 23:30:26 -07:00
|
|
|
if (type_ == kZSTD || type_ == kZSTDNotFinalCompression) {
|
2018-06-04 12:04:52 -07:00
|
|
|
ctx_cache_ = CompressionContextCache::Instance();
|
|
|
|
uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
~UncompressionContext() {
|
2018-06-06 23:30:26 -07:00
|
|
|
if ((type_ == kZSTD || type_ == kZSTDNotFinalCompression) &&
|
|
|
|
uncomp_cached_data_.GetCacheIndex() != -1) {
|
2018-06-04 12:04:52 -07:00
|
|
|
assert(ctx_cache_ != nullptr);
|
|
|
|
ctx_cache_->ReturnCachedZSTDUncompressData(
|
2018-06-05 12:51:05 -07:00
|
|
|
uncomp_cached_data_.GetCacheIndex());
|
2018-06-04 12:04:52 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
UncompressionContext(const UncompressionContext&) = delete;
|
|
|
|
UncompressionContext& operator=(const UncompressionContext&) = delete;
|
|
|
|
|
|
|
|
ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const {
|
|
|
|
return uncomp_cached_data_.Get();
|
|
|
|
}
|
2018-08-23 19:19:16 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
class UncompressionInfo {
|
|
|
|
const UncompressionContext& context_;
|
|
|
|
const CompressionDict& dict_;
|
|
|
|
const CompressionType type_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
UncompressionInfo(const UncompressionContext& _context,
|
|
|
|
const CompressionDict& _dict, CompressionType _type)
|
|
|
|
: context_(_context), dict_(_dict), type_(_type) {}
|
|
|
|
|
|
|
|
const UncompressionContext& context() const { return context_; }
|
|
|
|
const CompressionDict& dict() const { return dict_; }
|
2018-06-05 12:51:05 -07:00
|
|
|
CompressionType type() const { return type_; }
|
2018-06-04 12:04:52 -07:00
|
|
|
};
|
|
|
|
|
2015-04-06 12:50:44 -07:00
|
|
|
// Returns true iff this translation unit was compiled with SNAPPY defined.
inline bool Snappy_Supported() {
#if defined(SNAPPY)
  const bool built_with_snappy = true;
#else
  const bool built_with_snappy = false;
#endif
  return built_with_snappy;
}
|
|
|
|
|
|
|
|
// Returns true iff this translation unit was compiled with ZLIB defined.
inline bool Zlib_Supported() {
#if defined(ZLIB)
  const bool built_with_zlib = true;
#else
  const bool built_with_zlib = false;
#endif
  return built_with_zlib;
}
|
|
|
|
|
|
|
|
// Returns true iff this translation unit was compiled with BZIP2 defined.
inline bool BZip2_Supported() {
#if defined(BZIP2)
  const bool built_with_bzip2 = true;
#else
  const bool built_with_bzip2 = false;
#endif
  return built_with_bzip2;
}
|
|
|
|
|
|
|
|
// Returns true iff this translation unit was compiled with LZ4 defined.
inline bool LZ4_Supported() {
#if defined(LZ4)
  const bool built_with_lz4 = true;
#else
  const bool built_with_lz4 = false;
#endif
  return built_with_lz4;
}
|
|
|
|
|
2016-04-19 22:54:24 -07:00
|
|
|
// Returns true iff this translation unit was compiled with XPRESS defined.
inline bool XPRESS_Supported() {
#if defined(XPRESS)
  const bool built_with_xpress = true;
#else
  const bool built_with_xpress = false;
#endif
  return built_with_xpress;
}
|
|
|
|
|
2015-08-27 15:40:42 -07:00
|
|
|
// Returns true iff ZSTD is compiled in and the ZSTD library reports a version
// number of at least 800. Without ZSTD defined, always returns false.
inline bool ZSTD_Supported() {
#if defined(ZSTD)
  // ZSTD format is finalized since version 0.8.0.
  const bool format_is_final = (ZSTD_versionNumber() >= 800);
  return format_is_final;
#else
  return false;
#endif
}
|
|
|
|
|
|
|
|
// Returns true iff this translation unit was compiled with ZSTD defined. No
// library version check is performed here.
inline bool ZSTDNotFinal_Supported() {
#if defined(ZSTD)
  const bool built_with_zstd = true;
#else
  const bool built_with_zstd = false;
#endif
  return built_with_zstd;
}
|
|
|
|
|
2015-06-18 14:55:05 -07:00
|
|
|
// Reports whether `compression_type` can be used in this build by delegating
// to the per-library *_Supported() helpers. kNoCompression is always
// supported. An unrecognized value trips an assert and yields false.
inline bool CompressionTypeSupported(CompressionType compression_type) {
  bool supported = false;
  switch (compression_type) {
    case kNoCompression:
      supported = true;
      break;
    case kSnappyCompression:
      supported = Snappy_Supported();
      break;
    case kZlibCompression:
      supported = Zlib_Supported();
      break;
    case kBZip2Compression:
      supported = BZip2_Supported();
      break;
    // Both LZ4 flavors depend on the same library.
    case kLZ4Compression:
    case kLZ4HCCompression:
      supported = LZ4_Supported();
      break;
    case kXpressCompression:
      supported = XPRESS_Supported();
      break;
    case kZSTDNotFinalCompression:
      supported = ZSTDNotFinal_Supported();
      break;
    case kZSTD:
      supported = ZSTD_Supported();
      break;
    default:
      assert(false);
      break;
  }
  return supported;
}
|
|
|
|
|
|
|
|
// Maps `compression_type` to its display name. An unrecognized value trips an
// assert and yields an empty string.
inline std::string CompressionTypeToString(CompressionType compression_type) {
  const char* name = "";
  switch (compression_type) {
    case kNoCompression:
      name = "NoCompression";
      break;
    case kSnappyCompression:
      name = "Snappy";
      break;
    case kZlibCompression:
      name = "Zlib";
      break;
    case kBZip2Compression:
      name = "BZip2";
      break;
    case kLZ4Compression:
      name = "LZ4";
      break;
    case kLZ4HCCompression:
      name = "LZ4HC";
      break;
    case kXpressCompression:
      name = "Xpress";
      break;
    case kZSTD:
      name = "ZSTD";
      break;
    case kZSTDNotFinalCompression:
      name = "ZSTDNotFinal";
      break;
    default:
      assert(false);
      break;
  }
  return std::string(name);
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version can have two values:
|
|
|
|
// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
|
|
|
|
// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
|
|
|
|
// way.
|
|
|
|
// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
|
|
|
|
// start of compressed block. Snappy format is the same as version 1.
|
|
|
|
|
2018-08-23 19:19:16 -07:00
|
|
|
// Compresses `length` bytes at `input` with Snappy and stores the result in
// `*output`, replacing its previous contents. Returns true when built with
// SNAPPY; otherwise does nothing and returns false.
inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
                            size_t length, ::std::string* output) {
#ifdef SNAPPY
  // Make room for the worst case first, then shrink to the bytes produced.
  const size_t worst_case_len = snappy::MaxCompressedLength(length);
  output->resize(worst_case_len);
  size_t compressed_len = 0;
  snappy::RawCompress(input, length, &(*output)[0], &compressed_len);
  output->resize(compressed_len);
  return true;
#else
  // Silence unused-parameter warnings.
  static_cast<void>(input);
  static_cast<void>(length);
  static_cast<void>(output);
  return false;
#endif
}
|
|
|
|
|
|
|
|
// Forwards to snappy::GetUncompressedLength when built with SNAPPY and
// returns its result. Otherwise leaves `*result` untouched and returns false.
inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
                                         size_t* result) {
#ifdef SNAPPY
  const bool parsed = snappy::GetUncompressedLength(input, length, result);
  return parsed;
#else
  // Silence unused-parameter warnings.
  static_cast<void>(input);
  static_cast<void>(length);
  static_cast<void>(result);
  return false;
#endif
}
|
|
|
|
|
2018-04-12 17:55:14 -07:00
|
|
|
// Forwards to snappy::RawUncompress when built with SNAPPY and returns its
// result. Otherwise leaves `output` untouched and returns false.
inline bool Snappy_Uncompress(const char* input, size_t length, char* output) {
#ifdef SNAPPY
  const bool decoded = snappy::RawUncompress(input, length, output);
  return decoded;
#else
  // Silence unused-parameter warnings.
  static_cast<void>(input);
  static_cast<void>(length);
  static_cast<void>(output);
  return false;
#endif
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
namespace compression {
|
|
|
|
// returns size
|
|
|
|
inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
|
|
|
|
PutVarint32(output, length);
|
|
|
|
return output->size();
|
|
|
|
}
|
|
|
|
|
|
|
|
inline bool GetDecompressedSizeInfo(const char** input_data,
|
|
|
|
size_t* input_length,
|
|
|
|
uint32_t* output_len) {
|
|
|
|
auto new_input_data =
|
|
|
|
GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
|
|
|
|
if (new_input_data == nullptr) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*input_length -= (new_input_data - *input_data);
|
|
|
|
*input_data = new_input_data;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} // namespace compression
|
|
|
|
|
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
|
|
// block header
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param info Supplies the compression options and the dictionary; a
//    non-empty dictionary presets the compression library's dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
// Deflates `length` bytes at `input` into `*output` using zlib.
// Returns true only if the whole input was compressed into no more than
// `length` bytes. Returns false when built without ZLIB, when `length` does
// not fit in 32 bits, when zlib setup fails, or when the compressed form would
// be larger than the input. On a false return `*output` may already have been
// modified.
inline bool Zlib_Compress(const CompressionInfo& info,
                          uint32_t compress_format_version, const char* input,
                          size_t length, ::std::string* output) {
#ifdef ZLIB
  if (length > std::numeric_limits<uint32_t>::max()) {
    // Can't compress more than 4GB
    return false;
  }

  size_t output_header_len = 0;
  if (compress_format_version == 2) {
    output_header_len = compression::PutDecompressedSizeInfo(
        output, static_cast<uint32_t>(length));
  }
  // Resize output to be the plain data length.
  // This may not be big enough if the compression actually expands data.
  output->resize(output_header_len + length);

  // The memLevel parameter specifies how much memory should be allocated for
  // the internal compression state.
  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
  // memLevel=9 uses maximum memory for optimal speed.
  // The default value is 8. See zconf.h for more details.
  static const int memLevel = 8;
  int level;
  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
    level = Z_DEFAULT_COMPRESSION;
  } else {
    level = info.options().level;
  }

  // Value-initialization zeroes every field, including the allocator hooks
  // that deflateInit2 reads.
  z_stream _stream = {};
  int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits,
                        memLevel, info.options().strategy);
  if (st != Z_OK) {
    return false;
  }

  Slice compression_dict = info.dict().GetRawDict();
  if (compression_dict.size()) {
    // Initialize the compression library's dictionary
    st = deflateSetDictionary(
        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
        static_cast<unsigned int>(compression_dict.size()));
    if (st != Z_OK) {
      deflateEnd(&_stream);
      return false;
    }
  }

  // Compress the input, and put compressed data in output.
  // next_in is assigned a non-const Bytef* here, so the const is dropped
  // explicitly rather than through a C-style cast.
  _stream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(input));
  _stream.avail_in = static_cast<unsigned int>(length);

  // Initialize the output size.
  _stream.avail_out = static_cast<unsigned int>(length);
  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);

  // The only return value we really care about is Z_STREAM_END.
  // Z_OK means insufficient output space. This means the compression is
  // bigger than decompressed size. Just fail the compression in that case.
  st = deflate(&_stream, Z_FINISH);
  const bool compressed = (st == Z_STREAM_END);
  if (compressed) {
    // Trim the part of the buffer that deflate did not fill.
    output->resize(output->size() - _stream.avail_out);
  }

  deflateEnd(&_stream);
  return compressed;
#else
  (void)info;
  (void)compress_format_version;
  (void)input;
  (void)length;
  (void)output;
  return false;
#endif
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
|
|
// block header
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param compression_dict Data for presetting the compression library's
|
|
|
|
// dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
inline char* Zlib_Uncompress(const UncompressionInfo& info,
|
2018-06-05 12:51:05 -07:00
|
|
|
const char* input_data, size_t input_length,
|
2015-01-14 16:24:24 -08:00
|
|
|
int* decompress_size,
|
|
|
|
uint32_t compress_format_version,
|
|
|
|
int windowBits = -14) {
|
2015-01-09 12:57:11 -08:00
|
|
|
#ifdef ZLIB
|
2015-01-14 16:24:24 -08:00
|
|
|
uint32_t output_len = 0;
|
|
|
|
if (compress_format_version == 2) {
|
|
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
|
|
&output_len)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Assume the decompressed data size will 5x of compressed size, but round
|
|
|
|
// to the page size
|
|
|
|
size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
|
|
|
|
output_len = static_cast<uint32_t>(
|
|
|
|
std::min(proposed_output_len,
|
|
|
|
static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
|
|
|
|
}
|
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
z_stream _stream;
|
|
|
|
memset(&_stream, 0, sizeof(z_stream));
|
|
|
|
|
|
|
|
// For raw inflate, the windowBits should be -8..-15.
|
|
|
|
// If windowBits is bigger than zero, it will use either zlib
|
|
|
|
// header or gzip header. Adding 32 to it will do automatic detection.
|
2018-04-12 17:55:14 -07:00
|
|
|
int st =
|
|
|
|
inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
|
2015-01-09 12:57:11 -08:00
|
|
|
if (st != Z_OK) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2018-08-23 19:19:16 -07:00
|
|
|
Slice compression_dict = info.dict().GetRawDict();
|
|
|
|
if (compression_dict.size()) {
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// Initialize the compression library's dictionary
|
2018-08-23 19:19:16 -07:00
|
|
|
st = inflateSetDictionary(
|
|
|
|
&_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
|
|
|
|
static_cast<unsigned int>(compression_dict.size()));
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
if (st != Z_OK) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-12 17:55:14 -07:00
|
|
|
_stream.next_in = (Bytef*)input_data;
|
2015-01-09 12:57:11 -08:00
|
|
|
_stream.avail_in = static_cast<unsigned int>(input_length);
|
|
|
|
|
|
|
|
char* output = new char[output_len];
|
|
|
|
|
2018-04-12 17:55:14 -07:00
|
|
|
_stream.next_out = (Bytef*)output;
|
2015-01-09 12:57:11 -08:00
|
|
|
_stream.avail_out = static_cast<unsigned int>(output_len);
|
|
|
|
|
|
|
|
bool done = false;
|
|
|
|
while (!done) {
|
|
|
|
st = inflate(&_stream, Z_SYNC_FLUSH);
|
|
|
|
switch (st) {
|
|
|
|
case Z_STREAM_END:
|
|
|
|
done = true;
|
|
|
|
break;
|
2015-01-14 16:24:24 -08:00
|
|
|
case Z_OK: {
|
2015-01-09 12:57:11 -08:00
|
|
|
// No output space. Increase the output space by 20%.
|
2015-01-14 16:24:24 -08:00
|
|
|
// We should never run out of output space if
|
|
|
|
// compress_format_version == 2
|
|
|
|
assert(compress_format_version != 2);
|
|
|
|
size_t old_sz = output_len;
|
2018-04-12 17:55:14 -07:00
|
|
|
uint32_t output_len_delta = output_len / 5;
|
2015-01-09 12:57:11 -08:00
|
|
|
output_len += output_len_delta < 10 ? 10 : output_len_delta;
|
2015-01-14 16:24:24 -08:00
|
|
|
char* tmp = new char[output_len];
|
2015-01-09 12:57:11 -08:00
|
|
|
memcpy(tmp, output, old_sz);
|
|
|
|
delete[] output;
|
|
|
|
output = tmp;
|
|
|
|
|
|
|
|
// Set more output.
|
2018-04-12 17:55:14 -07:00
|
|
|
_stream.next_out = (Bytef*)(output + old_sz);
|
2015-01-09 12:57:11 -08:00
|
|
|
_stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
|
|
|
|
break;
|
2015-01-14 16:24:24 -08:00
|
|
|
}
|
2015-01-09 12:57:11 -08:00
|
|
|
case Z_BUF_ERROR:
|
|
|
|
default:
|
|
|
|
delete[] output;
|
|
|
|
inflateEnd(&_stream);
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// If we encoded decompressed block size, we should have no bytes left
|
|
|
|
assert(compress_format_version != 2 || _stream.avail_out == 0);
|
2015-01-09 12:57:11 -08:00
|
|
|
*decompress_size = static_cast<int>(output_len - _stream.avail_out);
|
|
|
|
inflateEnd(&_stream);
|
|
|
|
return output;
|
2018-01-31 12:04:52 -08:00
|
|
|
#else
|
2018-08-23 19:19:16 -07:00
|
|
|
(void)info;
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)input_data;
|
|
|
|
(void)input_length;
|
|
|
|
(void)decompress_size;
|
|
|
|
(void)compress_format_version;
|
|
|
|
(void)windowBits;
|
2015-01-09 12:57:11 -08:00
|
|
|
return nullptr;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version == 1 -- decompressed size is not included in the
|
|
|
|
// block header
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
2018-08-23 19:19:16 -07:00
|
|
|
inline bool BZip2_Compress(const CompressionInfo& /*info*/,
|
2018-03-05 13:08:17 -08:00
|
|
|
uint32_t compress_format_version, const char* input,
|
|
|
|
size_t length, ::std::string* output) {
|
2015-01-09 12:57:11 -08:00
|
|
|
#ifdef BZIP2
|
2015-01-14 16:24:24 -08:00
|
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
|
|
// Can't compress more than 4GB
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
size_t output_header_len = 0;
|
|
|
|
if (compress_format_version == 2) {
|
|
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
|
|
output, static_cast<uint32_t>(length));
|
|
|
|
}
|
|
|
|
// Resize output to be the plain data length.
|
|
|
|
// This may not be big enough if the compression actually expands data.
|
|
|
|
output->resize(output_header_len + length);
|
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
bz_stream _stream;
|
|
|
|
memset(&_stream, 0, sizeof(bz_stream));
|
|
|
|
|
|
|
|
// Block size 1 is 100K.
|
|
|
|
// 0 is for silent.
|
|
|
|
// 30 is the default workFactor
|
|
|
|
int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
|
|
|
|
if (st != BZ_OK) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compress the input, and put compressed data in output.
|
2018-04-12 17:55:14 -07:00
|
|
|
_stream.next_in = (char*)input;
|
2015-01-09 12:57:11 -08:00
|
|
|
_stream.avail_in = static_cast<unsigned int>(length);
|
|
|
|
|
|
|
|
// Initialize the output size.
|
|
|
|
_stream.avail_out = static_cast<unsigned int>(length);
|
2015-01-14 16:24:24 -08:00
|
|
|
_stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
|
2015-01-09 12:57:11 -08:00
|
|
|
|
2016-07-08 10:52:25 -07:00
|
|
|
bool compressed = false;
|
|
|
|
st = BZ2_bzCompress(&_stream, BZ_FINISH);
|
|
|
|
if (st == BZ_STREAM_END) {
|
|
|
|
compressed = true;
|
|
|
|
output->resize(output->size() - _stream.avail_out);
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
2016-07-08 10:52:25 -07:00
|
|
|
// The only return value we really care about is BZ_STREAM_END.
|
|
|
|
// BZ_FINISH_OK means insufficient output space. This means the compression
|
|
|
|
// is bigger than decompressed size. Just fail the compression in that case.
|
2015-01-09 12:57:11 -08:00
|
|
|
|
|
|
|
BZ2_bzCompressEnd(&_stream);
|
2016-07-08 10:52:25 -07:00
|
|
|
return compressed;
|
2018-01-31 12:04:52 -08:00
|
|
|
#else
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)compress_format_version;
|
|
|
|
(void)input;
|
|
|
|
(void)length;
|
|
|
|
(void)output;
|
2015-01-09 12:57:11 -08:00
|
|
|
return false;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// Decompresses a bzip2-compressed block into a buffer allocated with new[].
//
// compress_format_version == 1 -- decompressed size is not included in the
// block header
// compress_format_version == 2 -- decompressed size is included in the block
// header in varint32 format
//
// @param input_data Start of the compressed block (including the size header
//    when compress_format_version == 2).
// @param input_length Number of bytes available at input_data.
// @param decompress_size Receives the number of decompressed bytes. It is
//    written only when the function succeeds.
// @param compress_format_version Block header format, see above.
// @return The decompressed data, or nullptr on any failure (bad size header,
//    library initialization error, corrupt or truncated stream, or an output
//    buffer that would have to grow beyond what uint32_t can express).
//    Always nullptr when built without BZIP2.
inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
                              int* decompress_size,
                              uint32_t compress_format_version) {
#ifdef BZIP2
  uint32_t output_len = 0;
  if (compress_format_version == 2) {
    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
                                              &output_len)) {
      return nullptr;
    }
  } else {
    // Assume the decompressed data size will be 5x of the compressed size,
    // but round to the next page size.
    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
    output_len = static_cast<uint32_t>(
        std::min(proposed_output_len,
                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
  }

  bz_stream _stream;
  memset(&_stream, 0, sizeof(bz_stream));

  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
  if (st != BZ_OK) {
    return nullptr;
  }

  _stream.next_in = const_cast<char*>(input_data);
  _stream.avail_in = static_cast<unsigned int>(input_length);

  char* output = new char[output_len];

  _stream.next_out = output;
  _stream.avail_out = static_cast<unsigned int>(output_len);

  bool done = false;
  while (!done) {
    st = BZ2_bzDecompress(&_stream);
    switch (st) {
      case BZ_STREAM_END:
        done = true;
        break;
      case BZ_OK: {
        if (_stream.avail_out != 0) {
          // BZ_OK while output space is still free means the decoder is
          // waiting for more input, but all of it has already been supplied:
          // the stream is truncated. Growing the buffer would never let the
          // loop terminate, so fail instead.
          delete[] output;
          BZ2_bzDecompressEnd(&_stream);
          return nullptr;
        }
        // No output space. Increase the output space by 20%.
        // We should never run out of output space if
        // compress_format_version == 2
        assert(compress_format_version != 2);
        const uint32_t old_sz = output_len;
        // Integer growth with a 10-byte floor (same rule as Zlib_Uncompress):
        // a floating point "* 1.2" truncates back to the same size for tiny
        // buffers and would make no progress.
        const uint32_t output_len_delta =
            std::max<uint32_t>(output_len / 5, 10);
        if (output_len_delta >
            std::numeric_limits<uint32_t>::max() - output_len) {
          // Growing would wrap output_len around, and the memcpy below would
          // then overrun the (smaller) new buffer. Fail instead.
          delete[] output;
          BZ2_bzDecompressEnd(&_stream);
          return nullptr;
        }
        output_len += output_len_delta;
        char* tmp = new char[output_len];
        memcpy(tmp, output, old_sz);
        delete[] output;
        output = tmp;

        // Set more output.
        _stream.next_out = output + old_sz;
        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
        break;
      }
      default:
        delete[] output;
        BZ2_bzDecompressEnd(&_stream);
        return nullptr;
    }
  }

  // If we encoded decompressed block size, we should have no bytes left
  assert(compress_format_version != 2 || _stream.avail_out == 0);
  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
  BZ2_bzDecompressEnd(&_stream);
  return output;
#else
  (void)input_data;
  (void)input_length;
  (void)decompress_size;
  (void)compress_format_version;
  return nullptr;
#endif
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
|
|
// block header using memcpy, which makes database non-portable)
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param compression_dict Data for presetting the compression library's
|
|
|
|
// dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
inline bool LZ4_Compress(const CompressionInfo& info,
|
2015-01-14 16:24:24 -08:00
|
|
|
uint32_t compress_format_version, const char* input,
|
2018-06-04 12:04:52 -07:00
|
|
|
size_t length, ::std::string* output) {
|
2015-01-09 12:57:11 -08:00
|
|
|
#ifdef LZ4
|
2015-01-14 16:24:24 -08:00
|
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
|
|
// Can't compress more than 4GB
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t output_header_len = 0;
|
|
|
|
if (compress_format_version == 2) {
|
|
|
|
// new encoding, using varint32 to store size information
|
|
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
|
|
output, static_cast<uint32_t>(length));
|
|
|
|
} else {
|
|
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
|
|
// endianness)
|
|
|
|
output_header_len = 8;
|
|
|
|
output->resize(output_header_len);
|
|
|
|
char* p = const_cast<char*>(output->c_str());
|
|
|
|
memcpy(p, &length, sizeof(length));
|
|
|
|
}
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
int compress_bound = LZ4_compressBound(static_cast<int>(length));
|
|
|
|
output->resize(static_cast<size_t>(output_header_len + compress_bound));
|
|
|
|
|
|
|
|
int outlen;
|
|
|
|
#if LZ4_VERSION_NUMBER >= 10400 // r124+
|
|
|
|
LZ4_stream_t* stream = LZ4_createStream();
|
2018-08-23 19:19:16 -07:00
|
|
|
Slice compression_dict = info.dict().GetRawDict();
|
|
|
|
if (compression_dict.size()) {
|
|
|
|
LZ4_loadDict(stream, compression_dict.data(),
|
|
|
|
static_cast<int>(compression_dict.size()));
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
}
|
2016-11-21 12:17:06 -08:00
|
|
|
#if LZ4_VERSION_NUMBER >= 10700 // r129+
|
2018-04-12 17:55:14 -07:00
|
|
|
outlen =
|
|
|
|
LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len],
|
|
|
|
static_cast<int>(length), compress_bound, 1);
|
2016-11-21 12:17:06 -08:00
|
|
|
#else // up to r128
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
outlen = LZ4_compress_limitedOutput_continue(
|
|
|
|
stream, input, &(*output)[output_header_len], static_cast<int>(length),
|
|
|
|
compress_bound);
|
2016-11-21 12:17:06 -08:00
|
|
|
#endif
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
LZ4_freeStream(stream);
|
|
|
|
#else // up to r123
|
|
|
|
outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
|
|
|
|
static_cast<int>(length), compress_bound);
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
2015-01-14 16:24:24 -08:00
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
if (outlen == 0) {
|
|
|
|
return false;
|
|
|
|
}
|
2015-01-14 16:24:24 -08:00
|
|
|
output->resize(static_cast<size_t>(output_header_len + outlen));
|
2015-01-09 12:57:11 -08:00
|
|
|
return true;
|
2018-01-31 12:04:52 -08:00
|
|
|
#else // LZ4
|
2018-08-23 19:19:16 -07:00
|
|
|
(void)info;
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)compress_format_version;
|
|
|
|
(void)input;
|
|
|
|
(void)length;
|
|
|
|
(void)output;
|
2015-01-09 12:57:11 -08:00
|
|
|
return false;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
|
|
// block header using memcpy (which makes the database non-portable)
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param compression_dict Data for presetting the compression library's
|
|
|
|
// dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
inline char* LZ4_Uncompress(const UncompressionInfo& info,
|
2018-06-05 12:51:05 -07:00
|
|
|
const char* input_data, size_t input_length,
|
2015-01-14 16:24:24 -08:00
|
|
|
int* decompress_size,
|
2018-06-04 12:04:52 -07:00
|
|
|
uint32_t compress_format_version) {
|
2015-01-09 12:57:11 -08:00
|
|
|
#ifdef LZ4
|
2015-01-14 16:24:24 -08:00
|
|
|
uint32_t output_len = 0;
|
|
|
|
if (compress_format_version == 2) {
|
|
|
|
// new encoding, using varint32 to store size information
|
|
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
|
|
&output_len)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
|
|
// endianness)
|
|
|
|
if (input_length < 8) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
memcpy(&output_len, input_data, sizeof(output_len));
|
|
|
|
input_length -= 8;
|
|
|
|
input_data += 8;
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
char* output = new char[output_len];
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#if LZ4_VERSION_NUMBER >= 10400 // r124+
|
|
|
|
LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
|
2018-08-23 19:19:16 -07:00
|
|
|
Slice compression_dict = info.dict().GetRawDict();
|
|
|
|
if (compression_dict.size()) {
|
|
|
|
LZ4_setStreamDecode(stream, compression_dict.data(),
|
|
|
|
static_cast<int>(compression_dict.size()));
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
}
|
|
|
|
*decompress_size = LZ4_decompress_safe_continue(
|
|
|
|
stream, input_data, output, static_cast<int>(input_length),
|
|
|
|
static_cast<int>(output_len));
|
|
|
|
LZ4_freeStreamDecode(stream);
|
|
|
|
#else // up to r123
|
2015-01-14 16:24:24 -08:00
|
|
|
*decompress_size =
|
|
|
|
LZ4_decompress_safe(input_data, output, static_cast<int>(input_length),
|
|
|
|
static_cast<int>(output_len));
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
if (*decompress_size < 0) {
|
|
|
|
delete[] output;
|
|
|
|
return nullptr;
|
|
|
|
}
|
2015-01-14 16:24:24 -08:00
|
|
|
assert(*decompress_size == static_cast<int>(output_len));
|
2015-01-09 12:57:11 -08:00
|
|
|
return output;
|
2018-01-31 12:04:52 -08:00
|
|
|
#else // LZ4
|
2018-08-23 19:19:16 -07:00
|
|
|
(void)info;
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)input_data;
|
|
|
|
(void)input_length;
|
|
|
|
(void)decompress_size;
|
|
|
|
(void)compress_format_version;
|
2015-01-09 12:57:11 -08:00
|
|
|
return nullptr;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
// compress_format_version == 1 -- decompressed size is included in the
|
|
|
|
// block header using memcpy, which makes database non-portable)
|
|
|
|
// compress_format_version == 2 -- decompressed size is included in the block
|
|
|
|
// header in varint32 format
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param compression_dict Data for presetting the compression library's
|
|
|
|
// dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
inline bool LZ4HC_Compress(const CompressionInfo& info,
|
2015-01-14 16:24:24 -08:00
|
|
|
uint32_t compress_format_version, const char* input,
|
2018-06-04 12:04:52 -07:00
|
|
|
size_t length, ::std::string* output) {
|
2015-01-09 12:57:11 -08:00
|
|
|
#ifdef LZ4
|
2015-01-14 16:24:24 -08:00
|
|
|
if (length > std::numeric_limits<uint32_t>::max()) {
|
|
|
|
// Can't compress more than 4GB
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t output_header_len = 0;
|
|
|
|
if (compress_format_version == 2) {
|
|
|
|
// new encoding, using varint32 to store size information
|
|
|
|
output_header_len = compression::PutDecompressedSizeInfo(
|
|
|
|
output, static_cast<uint32_t>(length));
|
|
|
|
} else {
|
|
|
|
// legacy encoding, which is not really portable (depends on big/little
|
|
|
|
// endianness)
|
|
|
|
output_header_len = 8;
|
|
|
|
output->resize(output_header_len);
|
|
|
|
char* p = const_cast<char*>(output->c_str());
|
|
|
|
memcpy(p, &length, sizeof(length));
|
|
|
|
}
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
int compress_bound = LZ4_compressBound(static_cast<int>(length));
|
|
|
|
output->resize(static_cast<size_t>(output_header_len + compress_bound));
|
2015-01-14 16:24:24 -08:00
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
int outlen;
|
2018-05-23 18:33:00 -07:00
|
|
|
int level;
|
2018-08-23 19:19:16 -07:00
|
|
|
if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
|
2018-05-23 18:33:00 -07:00
|
|
|
level = 0; // lz4hc.h says any value < 1 will be sanitized to default
|
|
|
|
} else {
|
2018-08-23 19:19:16 -07:00
|
|
|
level = info.options().level;
|
2018-05-23 18:33:00 -07:00
|
|
|
}
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#if LZ4_VERSION_NUMBER >= 10400 // r124+
|
|
|
|
LZ4_streamHC_t* stream = LZ4_createStreamHC();
|
2018-05-23 18:33:00 -07:00
|
|
|
LZ4_resetStreamHC(stream, level);
|
2018-08-23 19:19:16 -07:00
|
|
|
Slice compression_dict = info.dict().GetRawDict();
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
const char* compression_dict_data =
|
2018-08-23 19:19:16 -07:00
|
|
|
compression_dict.size() > 0 ? compression_dict.data() : nullptr;
|
|
|
|
size_t compression_dict_size = compression_dict.size();
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
LZ4_loadDictHC(stream, compression_dict_data,
|
|
|
|
static_cast<int>(compression_dict_size));
|
|
|
|
|
|
|
|
#if LZ4_VERSION_NUMBER >= 10700 // r129+
|
|
|
|
outlen =
|
|
|
|
LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len],
|
|
|
|
static_cast<int>(length), compress_bound);
|
|
|
|
#else // r124-r128
|
|
|
|
outlen = LZ4_compressHC_limitedOutput_continue(
|
|
|
|
stream, input, &(*output)[output_header_len], static_cast<int>(length),
|
|
|
|
compress_bound);
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10700
|
|
|
|
LZ4_freeStreamHC(stream);
|
|
|
|
|
|
|
|
#elif LZ4_VERSION_MAJOR // r113-r123
|
2015-01-14 16:24:24 -08:00
|
|
|
outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
|
|
|
|
static_cast<int>(length),
|
2018-05-23 18:33:00 -07:00
|
|
|
compress_bound, level);
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#else // up to r112
|
2015-01-14 16:24:24 -08:00
|
|
|
outlen =
|
|
|
|
LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
static_cast<int>(length), compress_bound);
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
|
|
|
|
2015-01-09 12:57:11 -08:00
|
|
|
if (outlen == 0) {
|
|
|
|
return false;
|
|
|
|
}
|
2015-01-14 16:24:24 -08:00
|
|
|
output->resize(static_cast<size_t>(output_header_len + outlen));
|
2015-01-09 12:57:11 -08:00
|
|
|
return true;
|
2018-01-31 12:04:52 -08:00
|
|
|
#else // LZ4
|
2018-08-23 19:19:16 -07:00
|
|
|
(void)info;
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)compress_format_version;
|
|
|
|
(void)input;
|
|
|
|
(void)length;
|
|
|
|
(void)output;
|
2015-01-09 12:57:11 -08:00
|
|
|
return false;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-01-09 12:57:11 -08:00
|
|
|
}
|
2015-01-14 16:24:24 -08:00
|
|
|
|
2016-04-19 22:54:24 -07:00
|
|
|
// Compresses input[0, length) into *output by forwarding to
// port::xpress::Compress when the build defines XPRESS.
//
// @param input Start of the data to compress.
// @param length Number of bytes to compress.
// @param output Destination string handed to port::xpress::Compress.
// @return The result of port::xpress::Compress; always false (with all
//    arguments left untouched) when XPRESS is not compiled in.
inline bool XPRESS_Compress(const char* input, size_t length,
                            std::string* output) {
#ifdef XPRESS
  return port::xpress::Compress(input, length, output);
#else   // XPRESS
  (void)input;
  (void)length;
  (void)output;
  return false;
#endif  // XPRESS
}
|
2016-04-19 22:54:24 -07:00
|
|
|
|
|
|
|
// Decompresses input_data[0, input_length) by forwarding to
// port::xpress::Decompress when the build defines XPRESS.
//
// @param input_data Start of the compressed data.
// @param input_length Number of compressed bytes.
// @param decompress_size Out-parameter handed to port::xpress::Decompress.
// @return The buffer returned by port::xpress::Decompress; always nullptr
//    (with *decompress_size left untouched) when XPRESS is not compiled in.
inline char* XPRESS_Uncompress(const char* input_data, size_t input_length,
                               int* decompress_size) {
#ifdef XPRESS
  return port::xpress::Decompress(input_data, input_length, decompress_size);
#else   // XPRESS
  (void)input_data;
  (void)input_length;
  (void)decompress_size;
  return nullptr;
#endif  // XPRESS
}
|
2016-04-19 22:54:24 -07:00
|
|
|
|
2018-08-23 19:19:16 -07:00
|
|
|
// Compresses input[0, length) with ZSTD and stores the decompressed-size
// header (written by compression::PutDecompressedSizeInfo) followed by the
// compressed payload in *output.
//
// @param info Supplies the compression level (info.options()), the
//    compression context (info.context()) and the dictionary (info.dict()),
//    either digested or raw, depending on the ZSTD version.
// @param input Start of the data to compress.
// @param length Number of bytes to compress.
// @param output Receives header + compressed bytes. Its contents are
//    unspecified when the function returns false.
// @return true on success; false if the input exceeds 4GB, the compressor
//    reports an error, or the build has no ZSTD support.
inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
                          size_t length, ::std::string* output) {
#ifdef ZSTD
  if (length > std::numeric_limits<uint32_t>::max()) {
    // Can't compress more than 4GB
    return false;
  }

  size_t output_header_len = compression::PutDecompressedSizeInfo(
      output, static_cast<uint32_t>(length));

  size_t compressBound = ZSTD_compressBound(length);
  output->resize(static_cast<size_t>(output_header_len + compressBound));
  size_t outlen = 0;
  int level;
  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
    // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
    // https://github.com/facebook/zstd/issues/1148
    level = 3;
  } else {
    level = info.options().level;
  }
#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
  ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
  assert(context != nullptr);
#if ZSTD_VERSION_NUMBER >= 700  // v0.7.0+
  // Prefer the digested dictionary when one is available.
  if (info.dict().GetDigestedZstdCDict() != nullptr) {
    outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len],
                                      compressBound, input, length,
                                      info.dict().GetDigestedZstdCDict());
  }
#endif  // ZSTD_VERSION_NUMBER >= 700
  // outlen is still 0 when the digested-dictionary path was not taken.
  if (outlen == 0) {
    outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len],
                                     compressBound, input, length,
                                     info.dict().GetRawDict().data(),
                                     info.dict().GetRawDict().size(), level);
  }
#else   // up to v0.4.x
  outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input,
                         length, level);
#endif  // ZSTD_VERSION_NUMBER >= 500
  // ZSTD reports failure as an error code (a very large size_t), not as 0, so
  // it must be tested with ZSTD_isError before it is used as a length.
  if (outlen == 0 || ZSTD_isError(outlen)) {
    return false;
  }
  // Trim the buffer from the worst-case bound down to the bytes produced.
  output->resize(output_header_len + outlen);
  return true;
#else  // ZSTD
  (void)info;
  (void)input;
  (void)length;
  (void)output;
  return false;
#endif
}
|
|
|
|
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// @param info Carries the dictionary (digested or raw, via info.dict()) used
//    for presetting the decompression library's dictionary.
|
2018-08-23 19:19:16 -07:00
|
|
|
inline char* ZSTD_Uncompress(const UncompressionInfo& info,
|
2018-06-05 12:51:05 -07:00
|
|
|
const char* input_data, size_t input_length,
|
2018-06-04 12:04:52 -07:00
|
|
|
int* decompress_size) {
|
2015-08-27 15:40:42 -07:00
|
|
|
#ifdef ZSTD
|
|
|
|
uint32_t output_len = 0;
|
|
|
|
if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
|
|
|
|
&output_len)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
char* output = new char[output_len];
|
2018-08-23 19:19:16 -07:00
|
|
|
size_t actual_output_length = 0;
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+
|
2018-08-23 19:19:16 -07:00
|
|
|
ZSTD_DCtx* context = info.context().GetZSTDContext();
|
2018-06-04 12:04:52 -07:00
|
|
|
assert(context != nullptr);
|
2018-08-23 19:19:16 -07:00
|
|
|
#if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+
|
|
|
|
if (info.dict().GetDigestedZstdDDict() != nullptr) {
|
|
|
|
actual_output_length = ZSTD_decompress_usingDDict(
|
|
|
|
context, output, output_len, input_data, input_length,
|
|
|
|
info.dict().GetDigestedZstdDDict());
|
|
|
|
}
|
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 700
|
|
|
|
if (actual_output_length == 0) {
|
|
|
|
actual_output_length = ZSTD_decompress_usingDict(
|
|
|
|
context, output, output_len, input_data, input_length,
|
|
|
|
info.dict().GetRawDict().data(), info.dict().GetRawDict().size());
|
|
|
|
}
|
2018-04-12 17:55:14 -07:00
|
|
|
#else // up to v0.4.x
|
2018-08-23 19:19:16 -07:00
|
|
|
(void) info;
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
actual_output_length =
|
2015-08-27 15:40:42 -07:00
|
|
|
ZSTD_decompress(output, output_len, input_data, input_length);
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
#endif // ZSTD_VERSION_NUMBER >= 500
|
2015-08-27 15:40:42 -07:00
|
|
|
assert(actual_output_length == output_len);
|
|
|
|
*decompress_size = static_cast<int>(actual_output_length);
|
|
|
|
return output;
|
2018-04-12 17:55:14 -07:00
|
|
|
#else // ZSTD
|
2018-08-23 19:19:16 -07:00
|
|
|
(void)info;
|
2018-04-12 17:55:14 -07:00
|
|
|
(void)input_data;
|
|
|
|
(void)input_length;
|
|
|
|
(void)decompress_size;
|
2015-08-27 15:40:42 -07:00
|
|
|
return nullptr;
|
2018-01-31 12:04:52 -08:00
|
|
|
#endif
|
2015-08-27 15:40:42 -07:00
|
|
|
}
|
|
|
|
|
2017-11-02 22:46:13 -07:00
|
|
|
// Trains a ZSTD dictionary from a buffer of concatenated samples.
//
// "samples" holds the sample payloads back to back and "sample_lens" holds the
// length in bytes of each sample, in the same order. "max_dict_bytes" is the
// capacity of the buffer handed to the trainer.
//
// Returns the trained dictionary, trimmed to the length reported by the
// trainer. Returns an empty string when there is nothing to train on, when
// the trainer reports an error, or when the linked ZSTD is older than v1.1.3.
inline std::string ZSTD_TrainDictionary(const std::string& samples,
                                        const std::vector<size_t>& sample_lens,
                                        size_t max_dict_bytes) {
  // Dictionary trainer is available since v0.6.1 for static linking, but not
  // available for dynamic linking until v1.1.3. For now we enable the feature
  // in v1.1.3+ only.
#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
  assert(samples.empty() == sample_lens.empty());
  // Check both containers: the assert above is compiled out in release
  // builds, and handing the trainer a pointer derived from an empty container
  // would be undefined behavior.
  if (samples.empty() || sample_lens.empty()) {
    return "";
  }
  // Allocate the full capacity up front; the trainer writes into it in place.
  std::string dict_data(max_dict_bytes, '\0');
  size_t dict_len = ZDICT_trainFromBuffer(
      &dict_data[0], max_dict_bytes, samples.data(), sample_lens.data(),
      static_cast<unsigned>(sample_lens.size()));
  if (ZDICT_isError(dict_len)) {
    return "";
  }
  assert(dict_len <= max_dict_bytes);
  // Drop the unused tail of the buffer.
  dict_data.resize(dict_len);
  return dict_data;
#else   // up to v1.1.2
  assert(false);
  (void)samples;
  (void)sample_lens;
  (void)max_dict_bytes;
  return "";
#endif  // ZSTD_VERSION_NUMBER >= 10103
}
|
|
|
|
|
|
|
|
// Trains a ZSTD dictionary from fixed-size samples.
//
// "samples" is treated as a sequence of samples that are each
// (1 << sample_len_shift) bytes long; a trailing partial sample is ignored.
// "max_dict_bytes" is the capacity passed on to the trainer.
//
// Returns the trained dictionary, or an empty string when "samples" does not
// contain at least one full sample, when training fails, or when the linked
// ZSTD is older than v1.1.3.
inline std::string ZSTD_TrainDictionary(const std::string& samples,
                                        size_t sample_len_shift,
                                        size_t max_dict_bytes) {
  // Dictionary trainer is available since v0.6.1 for static linking, but not
  // available for dynamic linking until v1.1.3. For now we enable the feature
  // in v1.1.3+ only.
#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
  // Shifting by the full width of size_t (or more) is undefined behavior, and
  // no sample that large could fit in "samples" anyway.
  if (sample_len_shift >=
      static_cast<size_t>(std::numeric_limits<size_t>::digits)) {
    return "";
  }
  // skips potential partial sample at the end of "samples"
  const size_t num_samples = samples.size() >> sample_len_shift;
  if (num_samples == 0) {
    // Not even one full sample: nothing to train on. Returning here avoids
    // passing a non-empty buffer together with an empty length list.
    return "";
  }
  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
#else   // up to v1.1.2
  assert(false);
  (void)samples;
  (void)sample_len_shift;
  (void)max_dict_bytes;
  return "";
#endif  // ZSTD_VERSION_NUMBER >= 10103
}
|
|
|
|
|
2015-01-14 16:24:24 -08:00
|
|
|
} // namespace rocksdb
|