diff options
Diffstat (limited to 'third_party/re2/src/re2/testing/filtered_re2_test.cc')
-rw-r--r-- | third_party/re2/src/re2/testing/filtered_re2_test.cc | 342 |
1 files changed, 342 insertions, 0 deletions
diff --git a/third_party/re2/src/re2/testing/filtered_re2_test.cc b/third_party/re2/src/re2/testing/filtered_re2_test.cc new file mode 100644 index 000000000..a8d2dfc72 --- /dev/null +++ b/third_party/re2/src/re2/testing/filtered_re2_test.cc @@ -0,0 +1,342 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stddef.h> +#include <algorithm> +#include <memory> +#include <string> +#include <vector> +#include <utility> + +#include "absl/base/macros.h" +#include "gtest/gtest.h" +#include "util/logging.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" + +namespace re2 { + +struct FilterTestVars { + FilterTestVars() {} + explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} + + std::vector<std::string> atoms; + std::vector<int> atom_indices; + std::vector<int> matches; + RE2::Options opts; + FilteredRE2 f; +}; + +TEST(FilteredRE2Test, EmptyTest) { + FilterTestVars v; + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + // Compile has no effect at all when called before Add: it will not + // record that it has been called and it will not clear the vector. + // The second point does not matter here, but the first point means + // that an error will be logged during the call to AllMatches. + v.f.AllMatches("foo", v.atom_indices, &v.matches); + EXPECT_EQ(0, v.matches.size()); +} + +TEST(FilteredRE2Test, SmallOrTest) { + FilterTestVars v(4); // override the minimum atom length + int id; + v.f.Add("(foo|bar)", v.opts, &id); + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +TEST(FilteredRE2Test, SmallLatinTest) { + FilterTestVars v; + int id; + + v.opts.set_encoding(RE2::Options::EncodingLatin1); + v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); + v.f.Compile(&v.atoms); + EXPECT_EQ(1, v.atoms.size()); + EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + + v.atom_indices.push_back(0); + v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +struct AtomTest { + const char* testname; + // If any test needs more than this many regexps or atoms, increase + // the size of the corresponding array. + const char* regexps[20]; + const char* atoms[20]; +}; + +AtomTest atom_tests[] = { + { + // This test checks to make sure empty patterns are allowed. + "CheckEmptyPattern", + {""}, + {} + }, { + // This test checks that all atoms of length greater than min length + // are found, and no atoms that are of smaller length are found. + "AllAtomsGtMinLengthFound", { + "(abc123|def456|ghi789).*mnop[x-z]+", + "abc..yyy..zz", + "mnmnpp[a-z]+PPP" + }, { + "abc123", + "def456", + "ghi789", + "mnop", + "abc", + "yyy", + "mnmnpp", + "ppp" + } + }, { + // Test to make sure that any atoms that have another atom as a + // substring in an OR are removed; that is, only the shortest + // substring is kept. + "SubstrAtomRemovesSuperStrInOr", { + "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+", + "abcd..yyy..yyyzzz", + "mnmnpp[a-z]+PPP" + }, { + "abc", + "ghi789", + "xyz", + "abcd", + "yyy", + "yyyzzz", + "mnmnpp", + "ppp" + } + }, { + // Test character class expansion. + "CharClassExpansion", { + "m[a-c][d-f]n.*[x-z]+", + "[x-y]bcde[ab]" + }, { + "madn", "maen", "mafn", + "mbdn", "mben", "mbfn", + "mcdn", "mcen", "mcfn", + "xbcdea", "xbcdeb", + "ybcdea", "ybcdeb" + } + }, { + // Test upper/lower of non-ASCII. + "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, +}; + +void AddRegexpsAndCompile(const char* regexps[], + size_t n, + struct FilterTestVars* v) { + for (size_t i = 0; i < n; i++) { + int id; + v->f.Add(regexps[i], v->opts, &id); + } + v->f.Compile(&v->atoms); +} + +bool CheckExpectedAtoms(const char* atoms[], + size_t n, + const char* testname, + struct FilterTestVars* v) { + std::vector<std::string> expected; + for (size_t i = 0; i < n; i++) + expected.push_back(atoms[i]); + + bool pass = expected.size() == v->atoms.size(); + + std::sort(v->atoms.begin(), v->atoms.end()); + std::sort(expected.begin(), expected.end()); + for (size_t i = 0; pass && i < n; i++) + pass = pass && expected[i] == v->atoms[i]; + + if (!pass) { + LOG(ERROR) << "Failed " << testname; + LOG(ERROR) << "Expected #atoms = " << expected.size(); + for (size_t i = 0; i < expected.size(); i++) + LOG(ERROR) << expected[i]; + LOG(ERROR) << "Found #atoms = " << v->atoms.size(); + for (size_t i = 0; i < v->atoms.size(); i++) + LOG(ERROR) << v->atoms[i]; + } + + return pass; +} + +TEST(FilteredRE2Test, AtomTests) { + int nfail = 0; + for (size_t i = 0; i < ABSL_ARRAYSIZE(atom_tests); i++) { + FilterTestVars v; + AtomTest* t = &atom_tests[i]; + size_t nregexp, natom; + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + for (natom = 0; natom < ABSL_ARRAYSIZE(t->atoms); natom++) + if (t->atoms[natom] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) + nfail++; + } + EXPECT_EQ(0, nfail); +} + +void FindAtomIndices(const std::vector<std::string>& atoms, + const std::vector<std::string>& matched_atoms, + std::vector<int>* atom_indices) { + atom_indices->clear(); + for (size_t i = 0; i < matched_atoms.size(); i++) { + for (size_t j = 0; j < atoms.size(); j++) { + if (matched_atoms[i] == atoms[j]) { + atom_indices->push_back(static_cast<int>(j)); + break; + } + } + } +} + +TEST(FilteredRE2Test, MatchEmptyPattern) { + FilterTestVars v; + AtomTest* t = &atom_tests[0]; + // We are using the regexps used in one of the atom tests + // for this test. Adding the EXPECT here to make sure + // the index we use for the test is for the correct test. + EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + std::string text = "0123"; + std::vector<int> atom_ids; + std::vector<int> matching_regexps; + EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); +} + +TEST(FilteredRE2Test, MatchTests) { + FilterTestVars v; + AtomTest* t = &atom_tests[2]; + // We are using the regexps used in one of the atom tests + // for this test. + EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + + std::string text = "abc121212xyz"; + // atoms = abc + std::vector<int> atom_ids; + std::vector<std::string> atoms; + atoms.push_back("abc"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + std::vector<int> matching_regexps; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abc12312yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abcd12yyy32yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("abcd"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + LOG(INFO) << "S: " << atom_ids.size(); + for (size_t i = 0; i < atom_ids.size(); i++) + LOG(INFO) << "i: " << i << " : " << atom_ids[i]; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(2, matching_regexps.size()); +} + +TEST(FilteredRE2Test, EmptyStringInStringSetBug) { + // Bug due to find() finding "" at the start of everything in a string + // set and thus SimplifyStringSet() would end up erasing everything. + // In order to test this, we have to keep PrefilterTree from discarding + // the OR entirely, so we have to make the minimum atom length zero. + + FilterTestVars v(0); // override the minimum atom length + const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; + const char* atoms[] = {"", "-r", "add=;aa", "}"}; + AddRegexpsAndCompile(regexps, ABSL_ARRAYSIZE(regexps), &v); + EXPECT_TRUE(CheckExpectedAtoms(atoms, ABSL_ARRAYSIZE(atoms), + "EmptyStringInStringSetBug", &v)); +} + +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + +} // namespace re2 |