#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

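#
# For example, tokenizing the single IDL line
#   interface Foo { long Bar(); };
# is expected to produce this stream of token types (values in parentheses):
#   INTERFACE(interface) SYMBOL(Foo) '{' SYMBOL(long) SYMBOL(Bar)
#   '(' ')' ';' '}' ';'
# This is illustrative only; see TokensFromText and FilesToTokens below for
# how such a list is actually produced.
#
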
import os.path
import re
import sys

#
# Try to load the ply module. If it is not installed, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
    'COMMENT',
    'DESCRIBE',
    'ENUM',
    'LABEL',
    'SYMBOL',
    'INLINE',
    'INTERFACE',
    'STRUCT',
    'TYPEDEF',

    # Extra WebIDL keywords
    'CALLBACK',
    'DICTIONARY',
    'OPTIONAL',
    'STATIC',

    # Invented for apps use
    'NAMESPACE',

    # Data types
    'FLOAT',
    'OCT',
    'INT',
    'HEX',
    'STRING',

    # Operators
    'LSHIFT',
    'RSHIFT'
  ]

  # 'keywords' is a map of string to token type. All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }
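
  # Note: keyword lookup (see t_KEYWORD_SYMBOL below) is an exact,
  # case-sensitive dictionary match; the input word 'enum' becomes an ENUM
  # token, while 'Enum' stays a plain SYMBOL.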

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special value listing characters to skip between tokens.
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

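  # A few illustrative matches for the constant rules above (not exhaustive,
  # and assuming PLY's documented longest-regex-first ordering of string
  # rules):
  #   '1.25e-3' -> FLOAT,  '0x1F' -> HEX,  '123u' -> INT,  '<<' -> LSHIFT
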
  # A line ending '\n'; used to keep the line number in sync.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip the leading underscore so that a symbol can have the same value
    # as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
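
  # For example, the input '_struct' is returned as a SYMBOL token whose value
  # is 'struct', while a bare 'struct' is returned as the STRUCT keyword token.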

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We still
    # fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))
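
  # Illustrative only (the file name 'foo.idl' is hypothetical): an error
  # report produced above is expected to look roughly like
  #   foo.idl(13) : Unrecognized input
  #   <offending source line>
  #           ^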

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)




#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist
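
# Illustrative usage of FilesToTokens (the .idl file name is hypothetical):
#   tokens = FilesToTokens(['graphics_2d.idl'], verbose=True)
#   types = [t.type for t in tokens]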


def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist
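
# For instance, TokensFromText('enum Foo { A = 1 };') is expected to return
# the value list ['enum', 'Foo', '{', 'A', '=', '1', '}', ';'].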

#
# TextToTokens
#
# From a block of text, generate a list of tokens.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TestSame
#
# From a set of token values, generate a new source text by joining them with
# newlines. The new source is then tokenized and compared against the old set.
#
def TestSame(values1):
  # Recreate the source from the tokens. We use newline instead of whitespace
  # since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a list of token pairs, verify that the type of the second token in each
# pair matches the value of the first, so that the input:
#   INT 123 FLOAT 1.1
# generates a passing test, where the first token is the SYMBOL 'INT',
# the second token is the INT 123, the third token is the SYMBOL 'FLOAT' and
# the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
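
# Example invocation (the .idl file name is hypothetical, and the 'test' and
# 'verbose' options are assumed to be defined by idl_option):
#   python idl_lexer.py --test --verbose graphics_2d.idl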