Index: third_party/re2/re2/make_unicode_groups.py |
diff --git a/third_party/re2/re2/make_unicode_groups.py b/third_party/re2/re2/make_unicode_groups.py |
new file mode 100755 |
index 0000000000000000000000000000000000000000..c2e25c1fc75ca0626c5092043feb1cd8cde31c94 |
--- /dev/null |
+++ b/third_party/re2/re2/make_unicode_groups.py |
@@ -0,0 +1,111 @@ |
+#!/usr/bin/python |
+# Copyright 2008 The RE2 Authors. All Rights Reserved. |
+# Use of this source code is governed by a BSD-style |
+# license that can be found in the LICENSE file. |
+ |
+"""Generate C++ tables for Unicode Script and Category groups.""" |
+ |
+import sys |
+import unicode |
+ |
+_header = """ |
+// GENERATED BY make_unicode_groups.py; DO NOT EDIT. |
+// make_unicode_groups.py >unicode_groups.cc |
+ |
+#include "re2/unicode_groups.h" |
+ |
+namespace re2 { |
+ |
+"""  # C++ preamble; main() prints it before any table |
+ |
+_trailer = """ |
+ |
+} // namespace re2 |
+ |
+"""  # closes the namespace opened in _header; main() prints it last |
+ |
+n16 = 0  # running count of 16-bit ranges, incremented by PrintGroup |
+n32 = 0  # running count of 32-bit ranges, incremented by PrintGroup |
+ |
+def MakeRanges(codes):  # only adjacent elements are compared, so input order matters |
+  """Collapse runs of consecutive codes: [1,2,3,7,8,9] -> [[1,3], [7,9]] (inclusive [lo, hi]).""" |
+  ranges = [] |
+  last = -100  # sentinel: the first code starts a new range unless it is -99 |
+  for c in codes: |
+    if c == last+1:  # c continues the current run: raise its upper bound |
+      ranges[-1][1] = c |
+    else:  # gap, repeat, or first code: open a new one-element range |
+      ranges.append([c, c]) |
+    last = c |
+  return ranges |
+ |
+def PrintRanges(type, name, ranges):  # NOTE(review): `type` shadows the builtin |
+  """Print ranges to stdout as a C++ array `static <type> <name>[]` of { lo, hi } entries.""" |
+  print "static %s %s[] = {" % (type, name,) |
+  for lo, hi in ranges:  # each element must unpack into exactly two integers |
+    print "\t{ %d, %d }," % (lo, hi) |
+  print "};" |
+ |
+# def PrintCodes(type, name, codes): |
+# """Print the codes as an array of type named name.""" |
+# print "static %s %s[] = {" % (type, name,) |
+# for c in codes: |
+# print "\t%d," % (c,) |
+# print "};" |
+ |
+def PrintGroup(name, codes): |
+  """Print the range16/range32 arrays for one group and add their sizes to n16/n32. |
+  Return the UGroup initializer text: { "name", +1, arr16, len16, arr32, len32 }.""" |
+ |
+  # See unicode_groups.h for a description of the data structure. |
+ |
+  # Split at 65536: codes below it form range16, all others form range32. |
+  range16 = MakeRanges([c for c in codes if c < 65536]) |
+  range32 = MakeRanges([c for c in codes if c >= 65536]) |
+ |
+  # Pull singleton ranges out of range16. |
+  # code16 = [lo for lo, hi in range16 if lo == hi] |
+  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi] |
+ |
+  global n16  # module-level totals that main() reports in a summary comment |
+  global n32 |
+  n16 += len(range16) |
+  n32 += len(range32) |
+ |
+  ugroup = "{ \"%s\", +1" % (name,)  # opens with the quoted name and the constant +1 |
+  # if len(code16) > 0: |
+  #   PrintCodes("uint16", name+"_code16", code16) |
+  #   ugroup += ", %s_code16, %d" % (name, len(code16)) |
+  # else: |
+  #   ugroup += ", 0, 0" |
+  if len(range16) > 0:  # emit the array, then reference it by name and length |
+    PrintRanges("URange16", name+"_range16", range16) |
+    ugroup += ", %s_range16, %d" % (name, len(range16)) |
+  else:  # no 16-bit ranges: emit 0, 0 in place of array name and length |
+    ugroup += ", 0, 0" |
+  if len(range32) > 0:  # same treatment for the codes >= 65536 |
+    PrintRanges("URange32", name+"_range32", range32) |
+    ugroup += ", %s_range32, %d" % (name, len(range32)) |
+  else:  # no 32-bit ranges: emit 0, 0 in place of array name and length |
+    ugroup += ", 0, 0" |
+  ugroup += " }" |
+  return ugroup |
+ |
+def main():  # write the complete generated C++ source to stdout |
+  print _header |
+  ugroups = []  # UGroup initializer strings, one per category/script |
+  for name, codes in unicode.Categories().iteritems():  # iteritems: Python 2 dict API |
+    ugroups.append(PrintGroup(name, codes)) |
+  for name, codes in unicode.Scripts().iteritems():  # scripts share the same table |
+    ugroups.append(PrintGroup(name, codes)) |
+  print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32) |
+  print "UGroup unicode_groups[] = {";  # the trailing `;` is a redundant Python separator |
+  ugroups.sort()  # plain string sort; every entry begins with its quoted group name |
+  for ug in ugroups: |
+    print "\t%s," % (ug,) |
+  print "};" |
+  print "int num_unicode_groups = %d;" % (len(ugroups),) |
+  print _trailer |
+ |
+if __name__ == '__main__':  # generate only when executed as a script, not on import |
+  main() |