OLD | NEW |
(Empty) | |
| 1 #======================================================================= |
| 2 # |
| 3 # Python Lexical Analyser |
| 4 # |
| 5 # Lexical Analyser Specification |
| 6 # |
| 7 #======================================================================= |
| 8 |
| 9 import types |
| 10 |
| 11 import Actions |
| 12 import DFA |
| 13 import Errors |
| 14 import Machines |
| 15 import Regexps |
| 16 |
| 17 # debug_flags for Lexicon constructor |
| 18 DUMP_NFA = 1 |
| 19 DUMP_DFA = 2 |
| 20 |
class State(object):
    """
    Part of a Plex.Lexicon specification: declares a user-defined
    scanner state together with the token definitions that are
    recognised while the scanner is in that state.

    Constructor:

      State(name, token_specifications)
    """

    name = None    # state name (string)
    tokens = None  # list of (pattern, action) token definitions

    def __init__(self, name, tokens):
        self.name, self.tokens = name, tokens
| 37 |
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

    1) A token definition, which is a tuple:

       (pattern, action)

       The |pattern| is a regular expression built using the
       constructors defined in the Plex module.

       The |action| is the action to be performed when this pattern
       is recognised (see below).

    2) A state definition:

       State(name, tokens)

       where |name| is a character string naming the state,
       and |tokens| is a list of token definitions as
       above. The meaning and usage of states is described
       below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

    1) A function, which is called as follows:

       function(scanner, text)

       where |scanner| is the relevant Scanner instance, and |text|
       is the matched text. If the function returns anything
       other than None, that value is returned as the value of the
       token. If it returns None, scanning continues as if the IGNORE
       action were specified (see below).

    2) One of the following special actions:

       IGNORE means that the recognised characters will be treated as
       white space and ignored. Scanning will continue until
       the next non-ignored token is recognised before returning.

       TEXT causes the scanned text itself to be returned as the
       value of the token.

    3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

    1) Using Begin(state_name) as the action of a token.

    2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # DFA built from the specification (Machines module)
    tables = None   # StateTableMachine

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """
        Build the scanner DFA from |specifications|.

        specifications -- list of token tuples and/or State instances
        debug          -- optional file to which NFA/DFA dumps are written
        debug_flags    -- bitmask of DUMP_NFA | DUMP_DFA selecting which
                          dumps are written to |debug|
        timings        -- optional file to which construction timings
                          are written

        Raises Errors.InvalidScanner or Errors.InvalidToken on a
        malformed specification.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            from Timing import time
            total_time = 0.0
            time1 = time()
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                # A user-defined state: its tokens hang off their own
                # initial state, named after the state.
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number = token_number + 1
            elif isinstance(spec, tuple):
                # A bare (pattern, action) token belongs to the default state.
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number = token_number + 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            time3 = time()
        if debug and (debug_flags & DUMP_NFA):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        # Pass |debug| through only when both dump flags are set, so the
        # conversion itself is traced only at maximum verbosity.
        dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & DUMP_DFA):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Add one (pattern, action) token to the NFA |machine|, starting
        from |initial_state|. Priority is the negated token number, so
        earlier specifications win on equal-length matches.

        Raises Errors.PlexError subclasses annotated with the token number.
        """
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Disabled this -- matching empty strings can be useful
            #if re.nullable:
            #    raise Errors.InvalidToken(
            #        token_number, "Pattern can match 0 input symbols")
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            else:
                # EAFP: any callable becomes a Call action, any other
                # value is wrapped in a Return action.
                try:
                    action_spec.__call__
                except AttributeError:
                    action = Actions.Return(action_spec)
                else:
                    action = Actions.Call(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            # Re-raise the same error class with the token number prepended.
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """
        Validate a token specification and return (pattern, action).

        Raises Errors.InvalidToken if the specification is not a
        2-tuple whose first item is a Regexps.RE instance.
        """
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the DFA initial state for the scanner state |name|."""
        return self.machine.get_initial_state(name)
| 193 |
| 194 |
| 195 |
OLD | NEW |