Index: third_party/re2/re2/testing/regexp_generator.cc |
diff --git a/third_party/re2/re2/testing/regexp_generator.cc b/third_party/re2/re2/testing/regexp_generator.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..cf2db11855f5ab12d3068adb4c3cd900352189a5 |
--- /dev/null |
+++ b/third_party/re2/re2/testing/regexp_generator.cc |
@@ -0,0 +1,264 @@ |
+// Copyright 2008 The RE2 Authors. All Rights Reserved. |
+// Use of this source code is governed by a BSD-style |
+// license that can be found in the LICENSE file. |
+ |
+// Regular expression generator: generates all possible |
+// regular expressions within parameters (see regexp_generator.h for details). |
+ |
+// The regexp generator first generates a sequence of commands in a simple |
+// postfix language. Each command in the language is a string, |
+// like "a" or "%s*" or "%s|%s". |
+// |
+// To evaluate a command, enough arguments are popped from the value stack to |
+// plug into the %s slots. Then the result is pushed onto the stack. |
+// For example, the command sequence |
+// a b %s%s c |
+// results in the stack |
+// ab c |
+// |
+// GeneratePostfix generates all possible command sequences. |
+// Then RunPostfix turns each sequence into a regular expression |
+// and passes the regexp to HandleRegexp. |
+ |
+#include <string.h> |
+#include <string> |
+#include <stack> |
+#include <vector> |
+#include "util/test.h" |
+#include "re2/testing/regexp_generator.h" |
+ |
+namespace re2 { |
+ |
+// Returns a vector of the egrep regexp operators. |
+const vector<string>& RegexpGenerator::EgrepOps() { |
+ static const char *ops[] = { |
+ "%s%s", |
+ "%s|%s", |
+ "%s*", |
+ "%s+", |
+ "%s?", |
+ "%s\\C*", |
+ }; |
+ static vector<string> v(ops, ops + arraysize(ops)); |
+ return v; |
+} |
+ |
+RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, |
+ const vector<string>& atoms, |
+ const vector<string>& ops) |
+ : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { |
+ // Degenerate case. |
+ if (atoms_.size() == 0) |
+ maxatoms_ = 0; |
+ if (ops_.size() == 0) |
+ maxops_ = 0; |
+} |
+ |
+// Generates all possible regular expressions (within the parameters), |
+// calling HandleRegexp for each one. |
+void RegexpGenerator::Generate() { |
+ vector<string> postfix; |
+ GeneratePostfix(&postfix, 0, 0, 0); |
+} |
+ |
+// Generates random regular expressions, calling HandleRegexp for each one. |
+void RegexpGenerator::GenerateRandom(int32 seed, int n) { |
+ ACMRandom acm(seed); |
+ acm_ = &acm; |
+ |
+ for (int i = 0; i < n; i++) { |
+ vector<string> postfix; |
+ GenerateRandomPostfix(&postfix, 0, 0, 0); |
+ } |
+ |
+ acm_ = NULL; |
+} |
+ |
+// Counts and returns the number of occurrences of "%s" in s. |
+static int CountArgs(const string& s) { |
+ const char *p = s.c_str(); |
+ int n = 0; |
+ while ((p = strstr(p, "%s")) != NULL) { |
+ p += 2; |
+ n++; |
+ } |
+ return n; |
+} |
+ |
+// Generates all possible postfix command sequences. |
+// Each sequence is handed off to RunPostfix to generate a regular expression. |
+// The arguments are: |
+// post: the current postfix sequence |
+// nstk: the number of elements that would be on the stack after executing |
+// the sequence |
+// ops: the number of operators used in the sequence |
+// atoms: the number of atoms used in the sequence |
+// For example, if post were ["a", "b", "%s%s", "c"], |
+// then nstk = 2, ops = 1, atoms = 3. |
+// |
+// The initial call should be GeneratePostfix([empty vector], 0, 0, 0). |
+// |
+void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, |
+ int ops, int atoms) { |
+ if (nstk == 1) |
+ RunPostfix(*post); |
+ |
+ // Early out: if used too many operators or can't |
+ // get back down to a single expression on the stack |
+ // using binary operators, give up. |
+ if (ops + nstk - 1 > maxops_) |
+ return; |
+ |
+ // Add atoms if there is room. |
+ if (atoms < maxatoms_) { |
+ for (int i = 0; i < atoms_.size(); i++) { |
+ post->push_back(atoms_[i]); |
+ GeneratePostfix(post, nstk + 1, ops, atoms + 1); |
+ post->pop_back(); |
+ } |
+ } |
+ |
+ // Add operators if there are enough arguments. |
+ if (ops < maxops_) { |
+ for (int i = 0; i < ops_.size(); i++) { |
+ const string& fmt = ops_[i]; |
+ int nargs = CountArgs(fmt); |
+ if (nargs <= nstk) { |
+ post->push_back(fmt); |
+ GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); |
+ post->pop_back(); |
+ } |
+ } |
+ } |
+} |
+ |
+// Generates a random postfix command sequence. |
+// Stops and returns true once a single sequence has been generated. |
+bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, |
+ int ops, int atoms) { |
+ for (;;) { |
+ // Stop if we get to a single element, but only sometimes. |
+ if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) { |
+ RunPostfix(*post); |
+ return true; |
+ } |
+ |
+ // Early out: if used too many operators or can't |
+ // get back down to a single expression on the stack |
+ // using binary operators, give up. |
+ if (ops + nstk - 1 > maxops_) |
+ return false; |
+ |
+ // Add operators if there are enough arguments. |
+ if (ops < maxops_ && acm_->Uniform(2) == 0) { |
+ const string& fmt = ops_[acm_->Uniform(ops_.size())]; |
+ int nargs = CountArgs(fmt); |
+ if (nargs <= nstk) { |
+ post->push_back(fmt); |
+ bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, |
+ ops + 1, atoms); |
+ post->pop_back(); |
+ if (ret) |
+ return true; |
+ } |
+ } |
+ |
+ // Add atoms if there is room. |
+ if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { |
+ post->push_back(atoms_[acm_->Uniform(atoms_.size())]); |
+ bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); |
+ post->pop_back(); |
+ if (ret) |
+ return true; |
+ } |
+ } |
+} |
+ |
+// Interprets the postfix command sequence to create a regular expression |
+// passed to HandleRegexp. The results of operators like %s|%s are wrapped |
+// in (?: ) to avoid needing to maintain a precedence table. |
+void RegexpGenerator::RunPostfix(const vector<string>& post) { |
+ stack<string> regexps; |
+ for (int i = 0; i < post.size(); i++) { |
+ switch (CountArgs(post[i])) { |
+ default: |
+ LOG(FATAL) << "Bad operator: " << post[i]; |
+ case 0: |
+ regexps.push(post[i]); |
+ break; |
+ case 1: { |
+ string a = regexps.top(); |
+ regexps.pop(); |
+ regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); |
+ break; |
+ } |
+ case 2: { |
+ string b = regexps.top(); |
+ regexps.pop(); |
+ string a = regexps.top(); |
+ regexps.pop(); |
+ regexps.push("(?:" + |
+ StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + |
+ ")"); |
+ break; |
+ } |
+ } |
+ } |
+ |
+ if (regexps.size() != 1) { |
+ // Internal error - should never happen. |
+ printf("Bad regexp program:\n"); |
+ for (int i = 0; i < post.size(); i++) { |
+ printf(" %s\n", CEscape(post[i]).c_str()); |
+ } |
+ printf("Stack after running program:\n"); |
+ while (!regexps.empty()) { |
+ printf(" %s\n", CEscape(regexps.top()).c_str()); |
+ regexps.pop(); |
+ } |
+ LOG(FATAL) << "Bad regexp program."; |
+ } |
+ |
+ HandleRegexp(regexps.top()); |
+ HandleRegexp("^(?:" + regexps.top() + ")$"); |
+ HandleRegexp("^(?:" + regexps.top() + ")"); |
+ HandleRegexp("(?:" + regexps.top() + ")$"); |
+} |
+ |
+// Split s into an vector of strings, one for each UTF-8 character. |
+vector<string> Explode(const StringPiece& s) { |
+ vector<string> v; |
+ |
+ for (const char *q = s.begin(); q < s.end(); ) { |
+ const char* p = q; |
+ Rune r; |
+ q += chartorune(&r, q); |
+ v.push_back(string(p, q - p)); |
+ } |
+ |
+ return v; |
+} |
+ |
+// Split string everywhere a substring is found, returning |
+// vector of pieces. |
+vector<string> Split(const StringPiece& sep, const StringPiece& s) { |
+ vector<string> v; |
+ |
+ if (sep.size() == 0) |
+ return Explode(s); |
+ |
+ const char *p = s.begin(); |
+ for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) { |
+ if (StringPiece(q, sep.size()) == sep) { |
+ v.push_back(string(p, q - p)); |
+ p = q + sep.size(); |
+ q = p - 1; // -1 for ++ in loop |
+ continue; |
+ } |
+ } |
+ if (p < s.end()) |
+ v.push_back(string(p, s.end() - p)); |
+ return v; |
+} |
+ |
+} // namespace re2 |