Improve support for using regexes (#8740)

Summary:
* Consolidate use of std::regex for testing to testharness.cc, to
minimize Facebook linters constantly flagging uses in non-production
code.
* Improve syntax and error messages for asserting some string matches a
regex in tests.
* Add a public Regex wrapper class to encapsulate existing usage in
ObjectRegistry.
* Remove unnecessary include <regex>
* Put warnings that use of Regex in production code could cause bad
performance or stack overflow.

Intended follow-up work:
* Replace std::regex with another underlying implementation like RE2
* Improve ObjectRegistry interface in terms of possibly confusing literal
string matching vs. regex and in terms of reporting invalid regex.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8740

Test Plan:
tests updated, basic unit test for public Regex, and some manual
testing of temporary changes to see example error messages:

utilities/backupable/backupable_db_test.cc:917: Failure
000010_1162373755_138626.blob (child.name)
does not match regex
[0-9]+_[0-9]+_[0-9]+[.]blobHAHAHA (pattern)

db/db_basic_test.cc:74: Failure
R3SHSBA8C4U0CIMV2ZB0 (sid3)
does not match regex [0-9A-Z]{20}HAHAHA

Reviewed By: mrambacher

Differential Revision: D30706246

Pulled By: pdillinger

fbshipit-source-id: ba845e8f563ccad39bdb58f44f04e9da8f78c3fd
This commit is contained in:
Peter Dillinger 2021-09-07 13:04:07 -07:00 committed by Facebook GitHub Bot
parent 4750421ece
commit 0ef88538c6
12 changed files with 254 additions and 30 deletions

View File

@ -836,6 +836,7 @@ set(SOURCES
util/random.cc
util/rate_limiter.cc
util/ribbon_config.cc
util/regex.cc
util/slice.cc
util/file_checksum_helper.cc
util/status.cc

View File

@ -355,6 +355,7 @@ cpp_library(
"util/murmurhash.cc",
"util/random.cc",
"util/rate_limiter.cc",
"util/regex.cc",
"util/ribbon_config.cc",
"util/slice.cc",
"util/status.cc",
@ -676,6 +677,7 @@ cpp_library(
"util/murmurhash.cc",
"util/random.cc",
"util/rate_limiter.cc",
"util/regex.cc",
"util/ribbon_config.cc",
"util/slice.cc",
"util/status.cc",

View File

@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <cstring>
#include <regex>
#include "db/db_test_util.h"
#include "port/stack_trace.h"
@ -69,11 +68,10 @@ TEST_F(DBBasicTest, UniqueSession) {
ASSERT_EQ(sid2, sid4);
// Expected compact format for session ids (see notes in implementation)
std::regex expected("[0-9A-Z]{20}");
const std::string match("match");
EXPECT_EQ(match, std::regex_replace(sid1, expected, match));
EXPECT_EQ(match, std::regex_replace(sid2, expected, match));
EXPECT_EQ(match, std::regex_replace(sid3, expected, match));
TestRegex expected("[0-9A-Z]{20}");
EXPECT_MATCHES_REGEX(sid1, expected);
EXPECT_MATCHES_REGEX(sid2, expected);
EXPECT_MATCHES_REGEX(sid3, expected);
#ifndef ROCKSDB_LITE
Close();

View File

@ -10,12 +10,12 @@
#include <functional>
#include <memory>
#include <mutex>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/status.h"
#include "rocksdb/utilities/regex.h"
namespace ROCKSDB_NAMESPACE {
class Logger;
@ -55,14 +55,23 @@ class ObjectLibrary {
}; // End class Entry
// An Entry containing a FactoryFunc for creating new Objects
//
// !!!!!! WARNING !!!!!!: The implementation currently uses std::regex, which
// has terrible performance in some cases, including possible crash due to
// stack overflow. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582
// for example. Avoid complicated regexes as much as possible.
template <typename T>
class FactoryEntry : public Entry {
public:
FactoryEntry(const std::string& name, FactoryFunc<T> f)
: Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {}
: Entry(name), factory_(std::move(f)) {
// FIXME: the API needs to expose this failure mode. For now, bad regexes
// will match nothing.
Regex::Parse(name, &regex_).PermitUncheckedError();
}
~FactoryEntry() override {}
bool matches(const std::string& target) const override {
return std::regex_match(target, pattern_);
return regex_.Matches(target);
}
// Creates a new T object.
T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard,
@ -71,7 +80,7 @@ class ObjectLibrary {
}
private:
std::regex pattern_; // The pattern for this entry
Regex regex_; // The pattern for this entry
FactoryFunc<T> factory_;
}; // End class FactoryEntry
public:
@ -152,6 +161,9 @@ class ObjectRegistry {
// pattern that matches the provided "target" string according to
// std::regex_match.
//
// WARNING: some regexes are problematic for std::regex; see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 for example
//
// If no registered functions match, returns nullptr. If multiple functions
// match, the factory function used is unspecified.
//

View File

@ -0,0 +1,48 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#ifndef ROCKSDB_LITE
#include <memory>
#include <string>
#include "rocksdb/status.h"
namespace ROCKSDB_NAMESPACE {
// A wrapper for parsed regular expressions. The regex syntax and matching is
// compatible with std::regex.
//
// !!!!!! WARNING !!!!!!: The implementation currently uses std::regex, which
// has terrible performance in some cases, including possible crash due to
// stack overflow. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582
// for example. Avoid use in production as much as possible.
//
// Internal note: see also TestRegex
class Regex {
public:
// Note: Cannot be constructed with a pattern, so that syntax errors can
// be handled without using exceptions.
// Parse returns OK and saves to `out` when the pattern is valid regex
// syntax (modified ECMAScript), or else returns InvalidArgument.
// See https://en.cppreference.com/w/cpp/regex/ecmascript
static Status Parse(const char *pattern, Regex *out);
static Status Parse(const std::string &pattern, Regex *out);
// Checks that the whole of str is matched by this regex. If called on a
// default-constructed Regex, will trigger assertion failure in DEBUG build
// or return false in release build.
bool Matches(const std::string &str) const;
private:
class Impl;
std::shared_ptr<Impl> impl_; // shared_ptr for simple implementation
};
} // namespace ROCKSDB_NAMESPACE
#endif // ROCKSDB_LITE

1
src.mk
View File

@ -220,6 +220,7 @@ LIB_SOURCES = \
util/random.cc \
util/rate_limiter.cc \
util/ribbon_config.cc \
util/regex.cc \
util/slice.cc \
util/file_checksum_helper.cc \
util/status.cc \

View File

@ -8,6 +8,8 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "test_util/testharness.h"
#include <regex>
#include <string>
#include <thread>
@ -60,5 +62,49 @@ int RandomSeed() {
return result;
}
TestRegex::TestRegex(const std::string& pattern)
: impl_(std::make_shared<Impl>(pattern)), pattern_(pattern) {}
TestRegex::TestRegex(const char* pattern)
: impl_(std::make_shared<Impl>(pattern)), pattern_(pattern) {}
const std::string& TestRegex::GetPattern() const { return pattern_; }
// Sorry about code duplication with regex.cc, but it doesn't support LITE
// due to exception handling
class TestRegex::Impl : public std::regex {
public:
using std::regex::basic_regex;
};
bool TestRegex::Matches(const std::string& str) const {
if (impl_) {
return std::regex_match(str, *impl_);
} else {
// Should not call Matches on unset Regex
assert(false);
return false;
}
}
::testing::AssertionResult AssertMatchesRegex(const char* str_expr,
const char* pattern_expr,
const std::string& str,
const TestRegex& pattern) {
if (pattern.Matches(str)) {
return ::testing::AssertionSuccess();
} else if (TestRegex("\".*\"").Matches(pattern_expr)) {
// constant regex string
return ::testing::AssertionFailure()
<< str << " (" << str_expr << ")" << std::endl
<< "does not match regex " << pattern.GetPattern();
} else {
// runtime regex string
return ::testing::AssertionFailure()
<< str << " (" << str_expr << ")" << std::endl
<< "does not match regex" << std::endl
<< pattern.GetPattern() << " (" << pattern_expr << ")";
}
}
} // namespace test
} // namespace ROCKSDB_NAMESPACE

View File

@ -14,6 +14,7 @@
#else
#include <gtest/gtest.h>
#endif
#include <rocksdb/utilities/regex.h>
// A "skipped" test has a specific meaning in Facebook infrastructure: the
// test is in good shape and should be run, but something about the
@ -78,5 +79,40 @@ int RandomSeed();
#define EXPECT_OK(s) \
EXPECT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s)
#define EXPECT_NOK(s) EXPECT_FALSE((s).ok())
// Useful for testing
// * No need to deal with Status like in Regex public API
// * No triggering lint reports on use of std::regex in tests
// * Available in LITE (unlike public API)
class TestRegex {
public:
// These throw on bad pattern
/*implicit*/ TestRegex(const std::string& pattern);
/*implicit*/ TestRegex(const char* pattern);
// Checks that the whole of str is matched by this regex
bool Matches(const std::string& str) const;
const std::string& GetPattern() const;
private:
class Impl;
std::shared_ptr<Impl> impl_; // shared_ptr for simple implementation
std::string pattern_;
};
::testing::AssertionResult AssertMatchesRegex(const char* str_expr,
const char* pattern_expr,
const std::string& str,
const TestRegex& pattern);
#define ASSERT_MATCHES_REGEX(str, pattern) \
ASSERT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern)
#define EXPECT_MATCHES_REGEX(str, pattern) \
EXPECT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern)
} // namespace test
using test::TestRegex;
} // namespace ROCKSDB_NAMESPACE

50
util/regex.cc Normal file
View File

@ -0,0 +1,50 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// LITE not supported here in part because of exception handling
#ifndef ROCKSDB_LITE
#include "rocksdb/utilities/regex.h"
#include <cassert>
#include <regex>
namespace ROCKSDB_NAMESPACE {
// This section would change for alternate underlying implementations other
// than std::regex.
#if 1
class Regex::Impl : public std::regex {
public:
using std::regex::basic_regex;
};
bool Regex::Matches(const std::string &str) const {
if (impl_) {
return std::regex_match(str, *impl_);
} else {
// Should not call Matches on unset Regex
assert(false);
return false;
}
}
Status Regex::Parse(const std::string &pattern, Regex *out) {
try {
out->impl_.reset(new Impl(pattern));
return Status::OK();
} catch (const std::regex_error &e) {
return Status::InvalidArgument(e.what());
}
}
#endif
Status Regex::Parse(const char *pattern, Regex *out) {
return Parse(std::string(pattern), out);
}
} // namespace ROCKSDB_NAMESPACE
#endif // ROCKSDB_LITE

View File

@ -5,10 +5,13 @@
#include "rocksdb/slice.h"
#include <gtest/gtest.h>
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/data_structure.h"
#include "rocksdb/types.h"
#include "rocksdb/utilities/regex.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
@ -157,6 +160,7 @@ TEST_F(PinnableSliceTest, Move) {
ASSERT_EQ(2, res);
}
// ***************************************************************** //
// Unit test for SmallEnumSet
class SmallEnumSetTest : public testing::Test {
public:
@ -173,6 +177,35 @@ TEST_F(SmallEnumSetTest, SmallSetTest) {
ASSERT_FALSE(fs.Contains(FileType::kDBLockFile));
}
// ***************************************************************** //
// Unit test for Regex
#ifndef ROCKSDB_LITE
TEST(RegexTest, ParseEtc) {
Regex r;
ASSERT_OK(Regex::Parse("[abc]{5}", &r));
ASSERT_TRUE(r.Matches("abcba"));
ASSERT_FALSE(r.Matches("abcb")); // too short
ASSERT_FALSE(r.Matches("abcbaa")); // too long
ASSERT_OK(Regex::Parse(".*foo.*", &r));
ASSERT_TRUE(r.Matches("123forfoodie456"));
ASSERT_FALSE(r.Matches("123forfodie456"));
// Ensure copy operator
Regex r2;
r2 = r;
ASSERT_TRUE(r2.Matches("123forfoodie456"));
ASSERT_FALSE(r2.Matches("123forfodie456"));
// Ensure copy constructor
Regex r3{r};
ASSERT_TRUE(r3.Matches("123forfoodie456"));
ASSERT_FALSE(r3.Matches("123forfodie456"));
ASSERT_TRUE(Regex::Parse("*foo.*", &r).IsInvalidArgument());
ASSERT_TRUE(Regex::Parse("[abc", &r).IsInvalidArgument());
ASSERT_TRUE(Regex::Parse("[abc]{1", &r).IsInvalidArgument());
}
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -15,7 +15,6 @@
#include <array>
#include <limits>
#include <random>
#include <regex>
#include <string>
#include <utility>
@ -907,7 +906,7 @@ class BackupEngineTest : public testing::Test {
}
void AssertDirectoryFilesMatchRegex(const std::string& dir,
const std::regex& pattern,
const TestRegex& pattern,
const std::string& file_type,
int minimum_count) {
std::vector<FileAttributes> children;
@ -915,8 +914,7 @@ class BackupEngineTest : public testing::Test {
int found_count = 0;
for (const auto& child : children) {
if (EndsWith(child.name, file_type)) {
ASSERT_TRUE(std::regex_match(child.name, pattern))
<< "File name " << child.name << " does not match regex.";
ASSERT_MATCHES_REGEX(child.name, pattern);
++found_count;
}
}
@ -1764,7 +1762,7 @@ TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
if (sopt == kShareWithChecksum) {
// Ensure we actually got DB manifest checksums by inspecting
// shared_checksum file names for hex checksum component
std::regex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst");
TestRegex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst");
std::vector<FileAttributes> children;
const std::string dir = backupdir_ + "/shared_checksum";
ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
@ -1772,8 +1770,7 @@ TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
if (child.size_bytes == 0) {
continue;
}
const std::string match("match");
EXPECT_EQ(match, std::regex_replace(child.name, expected, match));
EXPECT_MATCHES_REGEX(child.name, expected);
}
}
*/
@ -2000,7 +1997,7 @@ TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) {
FillDB(db_.get(), 0, keys_iteration);
CloseDBAndBackupEngine();
static const std::map<ShareFilesNaming, std::string> option_to_expected = {
static const std::map<ShareFilesNaming, TestRegex> option_to_expected = {
{kLegacyCrc32cAndFileSize, "[0-9]+_[0-9]+_[0-9]+[.]sst"},
// kFlagIncludeFileSize redundant here
{kLegacyCrc32cAndFileSize | kFlagIncludeFileSize,
@ -2010,7 +2007,7 @@ TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) {
"[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"},
};
const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
for (const auto& pair : option_to_expected) {
CloseAndReopenDB();
@ -2019,15 +2016,15 @@ TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) {
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
CloseDBAndBackupEngine();
AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(pair.second), ".sst",
1 /* minimum_count */);
if (std::string::npos != pair.second.find("_[0-9]+[.]sst")) {
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", pair.second,
".sst", 1 /* minimum_count */);
if (std::string::npos != pair.second.GetPattern().find("_[0-9]+[.]sst")) {
AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum",
1 /* minimum_count */);
}
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(blobfile_pattern), ".blob",
blobfile_pattern, ".blob",
1 /* minimum_count */);
}
}
@ -2051,9 +2048,9 @@ TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsOldFileNaming) {
CloseDBAndBackupEngine();
// Old names should always be used on old files
const std::regex expected("[0-9]+_[0-9]+_[0-9]+[.]sst");
const TestRegex sstfile_pattern("[0-9]+_[0-9]+_[0-9]+[.]sst");
const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) {
CloseAndReopenDB();
@ -2062,10 +2059,11 @@ TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsOldFileNaming) {
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
CloseDBAndBackupEngine();
AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", expected,
".sst", 1 /* minimum_count */);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(blobfile_pattern), ".blob",
sstfile_pattern, ".sst",
1 /* minimum_count */);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
blobfile_pattern, ".blob",
1 /* minimum_count */);
}

View File

@ -6,7 +6,6 @@
#include "utilities/persistent_cache/block_cache_tier.h"
#include <regex>
#include <utility>
#include <vector>