OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/python |
| 2 # coding=utf-8 |
| 3 # |
| 4 # Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 5 # Use of this source code is governed by a BSD-style |
| 6 # license that can be found in the LICENSE file. |
| 7 |
| 8 # See unicode_casefold.h for description of case folding tables. |
| 9 |
| 10 """Generate C++ table for Unicode case folding.""" |
| 11 |
| 12 import unicode, sys |
| 13 |
| 14 _header = """ |
| 15 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. |
| 16 // make_unicode_casefold.py >unicode_casefold.cc |
| 17 |
| 18 #include "re2/unicode_casefold.h" |
| 19 |
| 20 namespace re2 { |
| 21 |
| 22 """ |
| 23 |
| 24 _trailer = """ |
| 25 |
| 26 } // namespace re2 |
| 27 |
| 28 """ |
| 29 |
| 30 def _Delta(a, b): |
| 31 """Compute the delta for b - a. Even/odd and odd/even |
| 32 are handled specially, as described above.""" |
| 33 if a+1 == b: |
| 34 if a%2 == 0: |
| 35 return 'EvenOdd' |
| 36 else: |
| 37 return 'OddEven' |
| 38 if a == b+1: |
| 39 if a%2 == 0: |
| 40 return 'OddEven' |
| 41 else: |
| 42 return 'EvenOdd' |
| 43 return b - a |
| 44 |
| 45 def _AddDelta(a, delta): |
| 46 """Return a + delta, handling EvenOdd and OddEven specially.""" |
| 47 if type(delta) == int: |
| 48 return a+delta |
| 49 if delta == 'EvenOdd': |
| 50 if a%2 == 0: |
| 51 return a+1 |
| 52 else: |
| 53 return a-1 |
| 54 if delta == 'OddEven': |
| 55 if a%2 == 1: |
| 56 return a+1 |
| 57 else: |
| 58 return a-1 |
| 59 print >>sys.stderr, "Bad Delta: ", delta |
| 60 raise "Bad Delta" |
| 61 |
| 62 def _MakeRanges(pairs): |
| 63 """Turn a list like [(65,97), (66, 98), ..., (90,122)] |
| 64 into [(65, 90, +32)].""" |
| 65 ranges = [] |
| 66 last = -100 |
| 67 |
| 68 def evenodd(last, a, b, r): |
| 69 if a != last+1 or b != _AddDelta(a, r[2]): |
| 70 return False |
| 71 r[1] = a |
| 72 return True |
| 73 |
| 74 def evenoddpair(last, a, b, r): |
| 75 if a != last+2: |
| 76 return False |
| 77 delta = r[2] |
| 78 d = delta |
| 79 if type(delta) is not str: |
| 80 return False |
| 81 if delta.endswith('Skip'): |
| 82 d = delta[:-4] |
| 83 else: |
| 84 delta = d + 'Skip' |
| 85 if b != _AddDelta(a, d): |
| 86 return False |
| 87 r[1] = a |
| 88 r[2] = delta |
| 89 return True |
| 90 |
| 91 for a, b in pairs: |
| 92 if ranges and evenodd(last, a, b, ranges[-1]): |
| 93 pass |
| 94 elif ranges and evenoddpair(last, a, b, ranges[-1]): |
| 95 pass |
| 96 else: |
| 97 ranges.append([a, a, _Delta(a, b)]) |
| 98 last = a |
| 99 return ranges |
| 100 |
| 101 # The maximum size of a case-folding group. |
| 102 # Case folding is implemented in parse.cc by a recursive process |
| 103 # with a recursion depth equal to the size of the largest |
| 104 # case-folding group, so it is important that this bound be small. |
| 105 # The current tables have no group bigger than 4. |
| 106 # If there are ever groups bigger than 10 or so, it will be |
| 107 # time to rework the code in parse.cc. |
| 108 MaxCasefoldGroup = 4 |
| 109 |
| 110 def main(): |
| 111 lowergroups, casegroups = unicode.CaseGroups() |
| 112 foldpairs = [] |
| 113 seen = {} |
| 114 for c in casegroups: |
| 115 if len(c) > MaxCasefoldGroup: |
| 116 raise unicode.Error("casefold group too long: %s" % (c,)) |
| 117 for i in range(len(c)): |
| 118 if c[i-1] in seen: |
| 119 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) |
| 120 seen[c[i-1]] = True |
| 121 foldpairs.append([c[i-1], c[i]]) |
| 122 |
| 123 lowerpairs = [] |
| 124 for lower, group in lowergroups.iteritems(): |
| 125 for g in group: |
| 126 if g != lower: |
| 127 lowerpairs.append([g, lower]) |
| 128 |
| 129 def printpairs(name, foldpairs): |
| 130 foldpairs.sort() |
| 131 foldranges = _MakeRanges(foldpairs) |
| 132 print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs)
, len(foldranges)) |
| 133 print "CaseFold unicode_%s[] = {" % (name,) |
| 134 for lo, hi, delta in foldranges: |
| 135 print "\t{ %d, %d, %s }," % (lo, hi, delta) |
| 136 print "};" |
| 137 print "int num_unicode_%s = %d;" % (name, len(foldranges),) |
| 138 print "" |
| 139 |
| 140 print _header |
| 141 printpairs("casefold", foldpairs) |
| 142 printpairs("tolower", lowerpairs) |
| 143 print _trailer |
| 144 |
| 145 if __name__ == '__main__': |
| 146 main() |
OLD | NEW |