| Index: third_party/re2/re2/make_unicode_casefold.py
|
| diff --git a/third_party/re2/re2/make_unicode_casefold.py b/third_party/re2/re2/make_unicode_casefold.py
|
| new file mode 100755
|
| index 0000000000000000000000000000000000000000..3375d2ef1bf90b2b554a50a07594110555f6341b
|
| --- /dev/null
|
| +++ b/third_party/re2/re2/make_unicode_casefold.py
|
| @@ -0,0 +1,146 @@
|
| +#!/usr/bin/python
|
| +# coding=utf-8
|
| +#
|
| +# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
| +# Use of this source code is governed by a BSD-style
|
| +# license that can be found in the LICENSE file.
|
| +
|
| +# See unicode_casefold.h for description of case folding tables.
|
| +
|
| +"""Generate C++ table for Unicode case folding."""
|
| +
|
| +import unicode, sys
|
| +
|
| +_header = """
|
| +// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
| +// make_unicode_casefold.py >unicode_casefold.cc
|
| +
|
| +#include "re2/unicode_casefold.h"
|
| +
|
| +namespace re2 {
|
| +
|
| +"""
|
| +
|
| +_trailer = """
|
| +
|
| +} // namespace re2
|
| +
|
| +"""
|
| +
|
| +def _Delta(a, b):
|
| + """Compute the delta for b - a. Even/odd and odd/even
|
| + are handled specially, as described above."""
|
| + if a+1 == b:
|
| + if a%2 == 0:
|
| + return 'EvenOdd'
|
| + else:
|
| + return 'OddEven'
|
| + if a == b+1:
|
| + if a%2 == 0:
|
| + return 'OddEven'
|
| + else:
|
| + return 'EvenOdd'
|
| + return b - a
|
| +
|
| +def _AddDelta(a, delta):
|
| + """Return a + delta, handling EvenOdd and OddEven specially."""
|
| + if type(delta) == int:
|
| + return a+delta
|
| + if delta == 'EvenOdd':
|
| + if a%2 == 0:
|
| + return a+1
|
| + else:
|
| + return a-1
|
| + if delta == 'OddEven':
|
| + if a%2 == 1:
|
| + return a+1
|
| + else:
|
| + return a-1
|
| + print >>sys.stderr, "Bad Delta: ", delta
|
| + raise "Bad Delta"
|
| +
|
| +def _MakeRanges(pairs):
|
| + """Turn a list like [(65,97), (66, 98), ..., (90,122)]
|
| + into [(65, 90, +32)]."""
|
| + ranges = []
|
| + last = -100
|
| +
|
| + def evenodd(last, a, b, r):
|
| + if a != last+1 or b != _AddDelta(a, r[2]):
|
| + return False
|
| + r[1] = a
|
| + return True
|
| +
|
| + def evenoddpair(last, a, b, r):
|
| + if a != last+2:
|
| + return False
|
| + delta = r[2]
|
| + d = delta
|
| + if type(delta) is not str:
|
| + return False
|
| + if delta.endswith('Skip'):
|
| + d = delta[:-4]
|
| + else:
|
| + delta = d + 'Skip'
|
| + if b != _AddDelta(a, d):
|
| + return False
|
| + r[1] = a
|
| + r[2] = delta
|
| + return True
|
| +
|
| + for a, b in pairs:
|
| + if ranges and evenodd(last, a, b, ranges[-1]):
|
| + pass
|
| + elif ranges and evenoddpair(last, a, b, ranges[-1]):
|
| + pass
|
| + else:
|
| + ranges.append([a, a, _Delta(a, b)])
|
| + last = a
|
| + return ranges
|
| +
|
| +# The maximum size of a case-folding group.
|
| +# Case folding is implemented in parse.cc by a recursive process
|
| +# with a recursion depth equal to the size of the largest
|
| +# case-folding group, so it is important that this bound be small.
|
| +# The current tables have no group bigger than 4.
|
| +# If there are ever groups bigger than 10 or so, it will be
|
| +# time to rework the code in parse.cc.
|
| +MaxCasefoldGroup = 4
|
| +
|
| +def main():
|
| + lowergroups, casegroups = unicode.CaseGroups()
|
| + foldpairs = []
|
| + seen = {}
|
| + for c in casegroups:
|
| + if len(c) > MaxCasefoldGroup:
|
| + raise unicode.Error("casefold group too long: %s" % (c,))
|
| + for i in range(len(c)):
|
| + if c[i-1] in seen:
|
| + raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
|
| + seen[c[i-1]] = True
|
| + foldpairs.append([c[i-1], c[i]])
|
| +
|
| + lowerpairs = []
|
| + for lower, group in lowergroups.iteritems():
|
| + for g in group:
|
| + if g != lower:
|
| + lowerpairs.append([g, lower])
|
| +
|
| + def printpairs(name, foldpairs):
|
| + foldpairs.sort()
|
| + foldranges = _MakeRanges(foldpairs)
|
| + print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
|
| + print "CaseFold unicode_%s[] = {" % (name,)
|
| + for lo, hi, delta in foldranges:
|
| + print "\t{ %d, %d, %s }," % (lo, hi, delta)
|
| + print "};"
|
| + print "int num_unicode_%s = %d;" % (name, len(foldranges),)
|
| + print ""
|
| +
|
| + print _header
|
| + printpairs("casefold", foldpairs)
|
| + printpairs("tolower", lowerpairs)
|
| + print _trailer
|
| +
|
| +if __name__ == '__main__':
|
| + main()
|
|
|