OLD | NEW |
(Empty) | |
| 1 #======================================================================= |
| 2 # |
| 3 # Python Lexical Analyser |
| 4 # |
| 5 # |
| 6 # Scanning an input stream |
| 7 # |
| 8 #======================================================================= |
| 9 |
import cython
# Declare these module-level names to Cython with type 'object'.
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)

import Errors
from Regexps import BOL, EOL, EOF

# Unique sentinel passed as the default to state.get(...) in the scanner, so
# that "no transition for this character" can be told apart from any value
# actually stored in a state dictionary.
NOT_FOUND = object()
| 17 |
class Scanner(object):
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '', initial_pos = None)

        See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.

    """

    # Instance attributes (all assigned in __init__):
    #
    # lexicon = None          # Lexicon
    # stream = None           # file-like object
    # name = ''
    # buffer = ''
    # buf_start_pos = 0       # position in input of start of buffer
    # next_pos = 0            # position in input of next char to read
    # cur_pos = 0             # position in input of current char
    # cur_line = 1            # line number of current char
    # cur_line_start = 0      # position in input of start of current line
    # start_pos = 0           # position in input of start of token
    # start_line = 0          # line number of start of token
    # start_col = 0           # position in line of start of token
    # text = None             # text of last token read
    # initial_state = None    # Node
    # state_name = ''         # Name of initial state
    # queue = None            # list of tokens to be returned
    # trace = 0
    # cur_char = BOL          # current input symbol (a char, BOL, EOL, EOF or '')
    # input_state = 1         # see next_char() for the meaning of each value

    def __init__(self, lexicon, stream, name = '', initial_pos = None):
        """
        Scanner(lexicon, stream, name = '', initial_pos = None)

        |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
        to be recognised.

        |stream| can be a file object or anything which implements a
        compatible read() method.

        |name| is optional, and may be the name of the file being
        scanned or any other identifying string.

        |initial_pos| is optional.  If given, it must be indexable:
        item 1 is used as the initial line number and item 2 as the
        column offset of the first character (item 0 is not used here).
        """
        self.trace = 0

        self.buffer = u''
        self.buf_start_pos = 0
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line = 1
        self.start_pos = 0
        self.start_line = 0
        self.start_col = 0
        self.text = None
        self.state_name = None

        self.lexicon = lexicon
        self.stream = stream
        self.name = name
        self.queue = []
        self.initial_state = None
        self.begin('')
        self.cur_line_start = 0
        self.cur_char = BOL
        self.input_state = 1
        if initial_pos is not None:
            # A negative line start makes (cur_pos - cur_line_start) come out
            # shifted right by the requested starting column.
            self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]

    def read(self):
        """
        Read the next lexical token from the stream and return a
        tuple (value, text), where |value| is the value associated with
        the token as specified by the Lexicon, and |text| is the actual
        string read from the stream. Returns (None, '') on end of file.
        """
        queue = self.queue
        # Keep scanning until some action (or end of file) has queued a token;
        # actions returning None without calling produce() yield nothing.
        while not queue:
            self.text, action = self.scan_a_token()
            if action is None:
                self.produce(None)
                self.eof()
            else:
                value = action.perform(self, self.text)
                if value is not None:
                    self.produce(value)
        return queue.pop(0)

    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns ('', None) on end of
        file.

        Raises Errors.UnrecognizedInput if no action matches and the
        input is not at end of file.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action is not None:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            # Positions are absolute; convert them to indices into the buffer.
            text = self.buffer[self.start_pos - self.buf_start_pos:
                               self.cur_pos - self.buf_start_pos]
            return (text, action)
        else:
            if self.cur_pos == self.start_pos:
                # Nothing was consumed: step past a pending EOL marker so
                # that a following EOF can be detected.
                if self.cur_char is EOL:
                    self.next_char()
                if self.cur_char is None or self.cur_char is EOF:
                    return (u'', None)
            raise Errors.UnrecognizedInput(self, self.state_name)

    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        Runs the state machine from self.initial_state over the input,
        remembering the scanner position at the most recent state that
        had an action.  When no further transition is possible, the
        scanner is restored to that remembered position and its action is
        returned; None is returned if no state with an action was reached.
        """
        # Scanner state is copied into locals for the duration of the loop
        # and written back at the end.
        state = self.initial_state
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        # b_* hold the backup snapshot taken at the last state with an action.
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
            None, 0, 0, 0, u'', 0, 0
        trace = self.trace
        while 1:
            if trace: #TRACE#
                print("State %d, %d/%d:%s -->" % ( #TRACE#
                    state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
            # Begin inlined self.save_for_backup()
            #action = state.action #@slow
            action = state['action'] #@fast
            if action is not None:
                b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                    action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
            # End inlined self.save_for_backup()
            c = cur_char
            #new_state = state.new_state(c) #@slow
            new_state = state.get(c, NOT_FOUND) #@fast
            if new_state is NOT_FOUND: #@fast
                # No explicit transition: fall back to the 'else' transition,
                # but only for a non-empty input symbol.
                new_state = c and state.get('else') #@fast
            if new_state:
                if trace: #TRACE#
                    print("State %d" % new_state['number']) #TRACE#
                state = new_state
                # Begin inlined: self.next_char()
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos = next_pos + 1
                    else:
                        # Buffer exhausted: drop everything before the start
                        # of the current token and append the next chunk.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos = buf_start_pos + discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index = buf_index - discard
                        if data:
                            c = buffer[buf_index]
                            next_pos = next_pos + 1
                        else:
                            c = u''
                    # End inlined: c = self.read_char()
                    if c == u'\n':
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    cur_char = u'\n'
                    input_state = 3
                elif input_state == 3:
                    cur_line = cur_line + 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    cur_char = EOF
                    input_state = 5
                else: # input_state = 5
                    cur_char = u''
                # End inlined self.next_char()
            else: # not new_state
                if trace: #TRACE#
                    print("blocked") #TRACE#
                # Begin inlined: action = self.back_up()
                if b_action is not None:
                    (action, cur_pos, cur_line, cur_line_start,
                        cur_char, input_state, next_pos) = \
                        (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                         b_cur_char, b_input_state, b_next_pos)
                else:
                    action = None
                break # while 1
                # End inlined: action = self.back_up()
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace: #TRACE#
            if action is not None: #TRACE#
                print("Doing %s" % action) #TRACE#
        return action

    def read_char(self):
        """
        Return the next character of input, refilling the buffer from
        the stream if necessary, and advance next_pos past it.
        Returns '' (without advancing) at end of input.

        This is the non-inlined counterpart of the code between the
        "inlined: c = self.read_char()" markers in run_machine_inlined().
        """
        next_pos = self.next_pos
        buf_index = next_pos - self.buf_start_pos
        if buf_index >= len(self.buffer):
            # Buffer exhausted: drop everything before the start of the
            # current token and append the next chunk from the stream.
            discard = self.start_pos - self.buf_start_pos
            data = self.stream.read(0x1000)
            self.buffer = self.buffer[discard:] + data
            self.buf_start_pos = self.buf_start_pos + discard
            buf_index = buf_index - discard
            if not data:
                return u''
        c = self.buffer[buf_index]
        self.next_pos = next_pos + 1
        return c

    def next_char(self):
        """
        Advance to the next input symbol and store it in self.cur_char.

        The symbol produced depends on self.input_state:
          1 - read a character; a newline yields EOL (-> state 2),
              end of input yields EOL (-> state 4), anything else
              is stored as is
          2 - yield the newline character itself (-> state 3)
          3 - start a new line and yield BOL (-> state 1)
          4 - yield EOF (-> state 5)
          5 - yield '' (state unchanged)
        """
        input_state = self.input_state
        if self.trace:
            print("Scanner: next: %s [%d] %d" % (" "*20, input_state, self.cur_pos))
        if input_state == 1:
            self.cur_pos = self.next_pos
            c = self.read_char()
            if c == u'\n':
                self.cur_char = EOL
                self.input_state = 2
            elif not c:
                self.cur_char = EOL
                self.input_state = 4
            else:
                self.cur_char = c
        elif input_state == 2:
            self.cur_char = u'\n'
            self.input_state = 3
        elif input_state == 3:
            self.cur_line = self.cur_line + 1
            self.cur_line_start = self.cur_pos = self.next_pos
            self.cur_char = BOL
            self.input_state = 1
        elif input_state == 4:
            self.cur_char = EOF
            self.input_state = 5
        else: # input_state = 5
            self.cur_char = u''
        if self.trace:
            # Note: this reports the input state on entry, not the new one.
            print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))

    def position(self):
        """
        Return a tuple (name, line, col) representing the location of
        the last token read using the read() method. |name| is the
        name that was provided to the Scanner constructor; |line|
        is the line number in the stream (1-based); |col| is the
        position within the line of the first character of the token
        (0-based).
        """
        return (self.name, self.start_line, self.start_col)

    def get_position(self):
        """Python accessible wrapper around position(), only for error reporting.
        """
        return self.position()

    def begin(self, state_name):
        """Set the current state of the scanner to the named state."""
        self.initial_state = (
            self.lexicon.get_initial_state(state_name))
        self.state_name = state_name

    def produce(self, value, text = None):
        """
        Called from an action procedure, causes |value| to be returned
        as the token value from read(). If |text| is supplied, it is
        returned in place of the scanned text.

        produce() can be called more than once during a single call to an action
        procedure, in which case the tokens are queued up and returned one
        at a time by subsequent calls to read(), until the queue is empty,
        whereupon scanning resumes.
        """
        if text is None:
            text = self.text
        self.queue.append((value, text))

    def eof(self):
        """
        Override this method if you want something to be done at
        end of file.
        """
OLD | NEW |