OLD | NEW |
(Empty) | |
| 1 #======================================================================= |
| 2 # |
| 3 # Python Lexical Analyser |
| 4 # |
| 5 # Traditional Regular Expression Syntax |
| 6 # |
| 7 #======================================================================= |
| 8 |
| 9 from Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char |
| 10 from Errors import PlexError |
| 11 |
class RegexpSyntaxError(PlexError):
    """Raised when a traditional-syntax regexp string cannot be parsed."""
| 14 |
def re(s):
    """
    Translate |s|, a regular expression written in traditional string
    syntax, into its Plex representation.

    Raises RegexpSyntaxError (via the parser) if |s| is malformed.
    """
    parser = REParser(s)
    return parser.parse_re()
| 21 |
class REParser(object):
    """
    Recursive-descent parser for traditional regular expression syntax.

    Grammar handled (loosest binding first):

        alt  ::= seq ('|' seq)*
        seq  ::= mod*
        mod  ::= prim ('*' | '+' | '?')*
        prim ::= '.' | '^' | '$' | '(' alt ')' | '[' charset ']'
               | backslash char | char

    Attributes:
        s   -- the string being parsed
        i   -- index of the current char within |s|
        c   -- the current char, or '' once the end has been reached
        end -- true once the position has moved past the last char
    """

    def __init__(self, s):
        self.s = s
        # Start one before the first char so that the initial next()
        # lands on index 0 (or sets |end| immediately for an empty string).
        self.i = -1
        self.end = False
        self.next()

    def parse_re(self):
        """
        Parse the whole string and return the resulting regexp.

        Raises RegexpSyntaxError if input remains after a complete
        alternation has been parsed (e.g. an unmatched ')').
        """
        result = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse a set of alternative regexps separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # No alternation: return the sequence itself, unwrapped.
            return first
        alternatives = [first]
        while self.c == '|':
            self.next()
            alternatives.append(self.parse_seq())
        return Alt(*alternatives)

    def parse_seq(self):
        """Parse a sequence of regexps, stopping at '|', ')' or the end."""
        items = []
        while not self.end and self.c not in "|)":
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive regexp followed by any *, +, ? modifiers."""
        result = self.parse_prim()
        # The |end| check matters: at the end |c| is '' and '' is "in"
        # every string, which would otherwise loop forever.
        while not self.end and self.c in "*+?":
            if self.c == '*':
                result = Rep(result)
            elif self.c == '+':
                result = Rep1(result)
            else:  # self.c == '?'
                result = Opt(result)
            self.next()
        return result

    def parse_prim(self):
        """
        Parse a primitive regexp: '.', '^', '$', a parenthesised group,
        a bracketed charset, or a single (possibly backslash-escaped) char.
        """
        c = self.get()
        if c == '.':
            return AnyBut("\n")
        if c == '^':
            return Bol
        if c == '$':
            return Eol
        if c == '(':
            result = self.parse_alt()
            self.expect(')')
            return result
        if c == '[':
            result = self.parse_charset()
            self.expect(']')
            return result
        if c == '\\':
            # A backslash makes the following char literal.
            c = self.get()
        return Char(c)

    def parse_charset(self):
        """Parse a charset. Does not include the surrounding []."""
        invert, chars = self._scan_charset()
        if invert:
            return AnyBut(chars)
        return Any(chars)

    def _scan_charset(self):
        """
        Consume the body of a charset up to, but not including, the
        closing ']'.

        Returns a pair (invert, chars): |invert| is true if the set
        started with '^', and |chars| is a string of every char named
        by the set, with ranges expanded.
        """
        members = []
        invert = False
        if self.c == '^':
            invert = True
            self.next()
        if self.c == ']':
            # A ']' in first position is a literal member, not the closer.
            members.append(']')
            self.next()
        while not self.end and self.c != ']':
            c1 = self.get()
            # A '-' directly before the closing ']' is a literal '-',
            # not a range operator.
            if self.c == '-' and self.lookahead(1) != ']':
                self.next()
                c2 = self.get()
                # range() rather than xrange() so this runs on Python 3
                # as well. A reversed range (c2 < c1) adds no chars.
                members.extend(chr(a) for a in range(ord(c1), ord(c2) + 1))
            else:
                members.append(c1)
        return invert, ''.join(members)

    def next(self):
        """Advance to the next char, setting |end| when none remain."""
        self.i += 1
        if self.i < len(self.s):
            self.c = self.s[self.i]
        else:
            self.c = ''
            self.end = True

    def get(self):
        """
        Return the current char and advance past it.
        Raises RegexpSyntaxError if already at the end of the string.
        """
        if self.end:
            self.error("Premature end of string")
        current = self.c
        self.next()
        return current

    def lookahead(self, n):
        """
        Look ahead n chars without consuming anything.
        Returns '' if that position is beyond the end of the string.
        """
        j = self.i + n
        if j < len(self.s):
            return self.s[j]
        return ''

    def expect(self, c):
        """
        Expect to find character |c| at current position and skip it.
        Raises an exception otherwise.
        """
        if self.c != c:
            self.error("Missing %s" % repr(c))
        self.next()

    def error(self, mess):
        """Raise exception to signal syntax error in regexp."""
        raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess))
| 152 |
| 153 |
| 154 |
OLD | NEW |