Index: third_party/re2/re2/testing/simplify_test.cc |
diff --git a/third_party/re2/re2/testing/simplify_test.cc b/third_party/re2/re2/testing/simplify_test.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d54837c950723c4f0b16ba9420e3b126f10a00d6 |
--- /dev/null |
+++ b/third_party/re2/re2/testing/simplify_test.cc |
@@ -0,0 +1,167 @@ |
+// Copyright 2006 The RE2 Authors. All Rights Reserved. |
+// Use of this source code is governed by a BSD-style |
+// license that can be found in the LICENSE file. |
+ |
+// Test simplify.cc. |
+ |
+#include <string> |
+#include <vector> |
+#include "util/test.h" |
+#include "re2/regexp.h" |
+ |
+namespace re2 { |
+ |
+struct Test { |
+ const char* regexp; |
+ const char* simplified; |
+}; |
+ |
+static Test tests[] = { |
+ // Already-simple constructs |
+ { "a", "a" }, |
+ { "ab", "ab" }, |
+ { "a|b", "[a-b]" }, |
+ { "ab|cd", "ab|cd" }, |
+ { "(ab)*", "(ab)*" }, |
+ { "(ab)+", "(ab)+" }, |
+ { "(ab)?", "(ab)?" }, |
+ { ".", "." }, |
+ { "^", "^" }, |
+ { "$", "$" }, |
+ { "[ac]", "[ac]" }, |
+ { "[^ac]", "[^ac]" }, |
+ |
+ // Posix character classes |
+ { "[[:alnum:]]", "[0-9A-Za-z]" }, |
+ { "[[:alpha:]]", "[A-Za-z]" }, |
+ { "[[:blank:]]", "[\\t ]" }, |
+ { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, |
+ { "[[:digit:]]", "[0-9]" }, |
+ { "[[:graph:]]", "[!-~]" }, |
+ { "[[:lower:]]", "[a-z]" }, |
+ { "[[:print:]]", "[ -~]" }, |
+ { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, |
+ { "[[:space:]]" , "[\\t-\\r ]" }, |
+ { "[[:upper:]]", "[A-Z]" }, |
+ { "[[:xdigit:]]", "[0-9A-Fa-f]" }, |
+ |
+ // Perl character classes |
+ { "\\d", "[0-9]" }, |
+ { "\\s", "[\\t-\\n\\f-\\r ]" }, |
+ { "\\w", "[0-9A-Z_a-z]" }, |
+ { "\\D", "[^0-9]" }, |
+ { "\\S", "[^\\t-\\n\\f-\\r ]" }, |
+ { "\\W", "[^0-9A-Z_a-z]" }, |
+ { "[\\d]", "[0-9]" }, |
+ { "[\\s]", "[\\t-\\n\\f-\\r ]" }, |
+ { "[\\w]", "[0-9A-Z_a-z]" }, |
+ { "[\\D]", "[^0-9]" }, |
+ { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, |
+ { "[\\W]", "[^0-9A-Z_a-z]" }, |
+ |
+ // Posix repetitions |
+ { "a{1}", "a" }, |
+ { "a{2}", "aa" }, |
+ { "a{5}", "aaaaa" }, |
+ { "a{0,1}", "a?" }, |
+ // The next three are illegible because Simplify inserts (?:) |
+ // parens instead of () parens to avoid creating extra |
+ // captured subexpressions. The comments show a version fewer parens. |
+ { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? |
+ { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? |
+ { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? |
+ { "a{0,2}", "(?:aa?)?" }, // (aa?)? |
+ { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? |
+ { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? |
+ { "a{0,}", "a*" }, |
+ { "a{1,}", "a+" }, |
+ { "a{2,}", "aa+" }, |
+ { "a{5,}", "aaaaa+" }, |
+ |
+ // Test that operators simplify their arguments. |
+ // (Simplify used to not simplify arguments to a {} repeat.) |
+ { "(?:a{1,}){1,}", "a+" }, |
+ { "(a{1,}b{1,})", "(a+b+)" }, |
+ { "a{1,}|b{1,}", "a+|b+" }, |
+ { "(?:a{1,})*", "(?:a+)*" }, |
+ { "(?:a{1,})+", "a+" }, |
+ { "(?:a{1,})?", "(?:a+)?" }, |
+ { "a{0}", "" }, |
+ |
+ // Character class simplification |
+ { "[ab]", "[a-b]" }, |
+ { "[a-za-za-z]", "[a-z]" }, |
+ { "[A-Za-zA-Za-z]", "[A-Za-z]" }, |
+ { "[ABCDEFGH]", "[A-H]" }, |
+ { "[AB-CD-EF-GH]", "[A-H]" }, |
+ { "[W-ZP-XE-R]", "[E-Z]" }, |
+ { "[a-ee-gg-m]", "[a-m]" }, |
+ { "[a-ea-ha-m]", "[a-m]" }, |
+ { "[a-ma-ha-e]", "[a-m]" }, |
+ { "[a-zA-Z0-9 -~]", "[ -~]" }, |
+ |
+ // Empty character classes |
+ { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, |
+ |
+ // Full character classes |
+ { "[[:cntrl:][:^cntrl:]]", "." }, |
+ |
+ // Unicode case folding. |
+ { "(?i)A", "[Aa]" }, |
+ { "(?i)a", "[Aa]" }, |
+ { "(?i)K", "[Kk\\x{212a}]" }, |
+ { "(?i)k", "[Kk\\x{212a}]" }, |
+ { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, |
+ { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, |
+ { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, |
+ { "(?i)[\\x00-\\x{10ffff}]", "." }, |
+ |
+ // Empty string as a regular expression. |
+ // Empty string must be preserved inside parens in order |
+ // to make submatches work right, so these are less |
+ // interesting than they used to be. ToString inserts |
+ // explicit (?:) in place of non-parenthesized empty strings, |
+ // to make them easier to spot for other parsers. |
+ { "(a|b|)", "([a-b]|(?:))" }, |
+ { "(|)", "()" }, |
+ { "a()", "a()" }, |
+ { "(()|())", "(()|())" }, |
+ { "(a|)", "(a|(?:))" }, |
+ { "ab()cd()", "ab()cd()" }, |
+ { "()", "()" }, |
+ { "()*", "()*" }, |
+ { "()+", "()+" }, |
+ { "()?" , "()?" }, |
+ { "(){0}", "" }, |
+ { "(){1}", "()" }, |
+ { "(){1,}", "()+" }, |
+ { "(){0,2}", "(?:()()?)?" }, |
+}; |
+ |
+TEST(TestSimplify, SimpleRegexps) { |
+ for (int i = 0; i < arraysize(tests); i++) { |
+ RegexpStatus status; |
+ VLOG(1) << "Testing " << tests[i].regexp; |
+ Regexp* re = Regexp::Parse(tests[i].regexp, |
+ Regexp::MatchNL | (Regexp::LikePerl & |
+ ~Regexp::OneLine), |
+ &status); |
+ CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); |
+ Regexp* sre = re->Simplify(); |
+ CHECK(sre != NULL); |
+ |
+ // Check that already-simple regexps don't allocate new ones. |
+ if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { |
+ CHECK(re == sre) << " " << tests[i].regexp |
+ << " " << re->ToString() << " " << sre->ToString(); |
+ } |
+ |
+ EXPECT_EQ(tests[i].simplified, sre->ToString()) |
+ << " " << tests[i].regexp << " " << sre->Dump(); |
+ |
+ re->Decref(); |
+ sre->Decref(); |
+ } |
+} |
+ |
+} // namespace re2 |