OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 """Converts profile datasets to dictionary list for Autofill profiles. | |
7 | |
8 Used for test autofill.AutofillTest.testMergeDuplicateProfilesInAutofill. | |
9 Can be used as a stand alone script with -h to print out help text by running: | |
10 python autofill_dataset_converter.py -h | |
11 """ | |
12 | |
13 import codecs | |
14 import logging | |
15 import os | |
16 import re | |
17 import sys | |
18 | |
19 | |
20 class _NullHandler(logging.Handler): | |
21 """Prevents warning when running in quiet mode.""" | |
22 def emit(self, record): | |
23 pass | |
24 | |
25 | |
class DatasetConverter(object):
    """Converts a "|"-separated profile dataset into Autofill dictionaries.

    Every non-empty input line is expected to hold exactly one value per entry
    of `_fields`, separated by "|".  Convert() returns the parsed records as a
    list of dictionaries and, when an output filename was supplied, also
    writes them to that file as one dictionary literal per line.
    """

    # Profile field names, in the order their values appear on an input line.
    _fields = [
        u'NAME_FIRST',
        u'NAME_MIDDLE',
        u'NAME_LAST',
        u'EMAIL_ADDRESS',
        u'COMPANY_NAME',
        u'ADDRESS_HOME_LINE1',
        u'ADDRESS_HOME_LINE2',
        u'ADDRESS_HOME_CITY',
        u'ADDRESS_HOME_STATE',
        u'ADDRESS_HOME_ZIP',
        u'ADDRESS_HOME_COUNTRY',
        u'PHONE_HOME_WHOLE_NUMBER',
    ]
    _record_length = len(_fields)
    # Template for one output line, with one '%s' placeholder per field:
    #   {u'NAME_FIRST': u'%s', ..., u'PHONE_HOME_WHOLE_NUMBER': u'%s',},
    # Built with join() so no loop variable is left behind as a class
    # attribute.
    _output_pattern = (
        u'{' + u' '.join(u"u'%s': u'%%s'," % key for key in _fields) + u'},')
    _re_single_quote = re.compile("'", re.UNICODE)
    _logger = logging.getLogger(__name__)
    # A no-op handler keeps the logger from warning about missing handlers
    # when no console handler is attached (quiet mode).
    _logger.addHandler(logging.NullHandler())
    # Shared by all instances so that at most one console handler is ever
    # attached to the shared class-level logger.
    _log_handlers = {'StreamHandler': None}

    def __init__(self, input_filename, output_filename=None,
                 logging_level=None):
        """Constructs a dataset converter object.

        Input format, one record per line:
          NAME_FIRST|NAME_MIDDLE|NAME_LAST|EMAIL_ADDRESS|COMPANY_NAME|
          ADDRESS_HOME_LINE1|ADDRESS_HOME_LINE2|ADDRESS_HOME_CITY|
          ADDRESS_HOME_STATE|ADDRESS_HOME_ZIP|ADDRESS_HOME_COUNTRY|
          PHONE_HOME_WHOLE_NUMBER

        Output format, one record per line:
          {u'NAME_FIRST': u'%s', u'NAME_MIDDLE': u'%s', u'NAME_LAST': u'%s',
          u'EMAIL_ADDRESS': u'%s', u'COMPANY_NAME': u'%s',
          u'ADDRESS_HOME_LINE1': u'%s', u'ADDRESS_HOME_LINE2': u'%s',
          u'ADDRESS_HOME_CITY': u'%s', u'ADDRESS_HOME_STATE': u'%s',
          u'ADDRESS_HOME_ZIP': u'%s', u'ADDRESS_HOME_COUNTRY': u'%s',
          u'PHONE_HOME_WHOLE_NUMBER': u'%s',},

        Args:
          input_filename: path of the input dataset.  It is joined onto the
              directory of the running script (sys.argv[0]), so a relative
              path is resolved against that directory and an absolute path
              is used unchanged.
          output_filename: path of the converted file, used exactly as given.
              Default is None, in which case no file is written.
          logging_level: logging level for console output.  Default is None,
              which detaches the console handler if one is attached.

        Raises:
          IOError: error if input file does not exist.
        """
        console = self._log_handlers['StreamHandler']
        if logging_level:
            if console is None:
                console = logging.StreamHandler()
                self._log_handlers['StreamHandler'] = console
                self._logger.addHandler(console)
            # Applied on every construction so that a handler reused from an
            # earlier instance follows the newly requested level.
            console.setLevel(logging_level)
            self._logger.setLevel(logging_level)
        elif console is not None:
            self._logger.removeHandler(console)
            self._log_handlers['StreamHandler'] = None

        self._input_filename = os.path.join(os.path.dirname(sys.argv[0]),
                                            input_filename)
        if not os.path.isfile(self._input_filename):
            msg = 'File "%s" does not exist' % self._input_filename
            self._logger.error(msg)
            raise IOError(msg)
        self._output_filename = output_filename

    def _CreateDictionaryFromRecord(self, record):
        """Constructs and returns a dictionary from one line of the dataset.

        Single quotes are backslash-escaped first, then the line is split on
        "|".  Because escaping happens before the split, the values of the
        returned dictionary contain the escaped form (\\') as well.

        Example:
          A line with one value per entry of `_fields`, such as
            u'John|Q|Doe|jd@example.com|Acme|1 Main St||Mountain View|CA|'
            u'94043|USA|6505550100'
          yields
            {
              u'NAME_FIRST': u'John',
              u'NAME_MIDDLE': u'Q',
              u'NAME_LAST': u'Doe',
              ...
              u'PHONE_HOME_WHOLE_NUMBER': u'6505550100',
            }

        Args:
          record: one line of the dataset file, as a unicode string.

        Returns:
          A dictionary mapping each field name to its value, or None when the
          line contains no "|" or does not have exactly one value per field
          (the latter case is logged as a warning).
        """
        # Ignore irrelevant lines that do not contain '|'.
        if '|' not in record:
            return None
        # Escaping single quote: "'" -> "\'"
        record = self._re_single_quote.sub(r"\'", record)
        values = record.split('|')
        if len(values) != self._record_length:
            self._logger.warning(
                'A "|" separated line has %d fields instead of %d: %s',
                len(values), self._record_length, record)
            return None
        return dict(zip(self._fields, values))

    def Convert(self):
        """Converts the input dataset into the desired output format.

        Blank lines and lines rejected by _CreateDictionaryFromRecord() are
        skipped.  When an output filename was given, the records are written
        to it (UTF-8 with signature) between a '[' line and a ']' line.

        Returns:
          List that holds one dictionary per converted line.
        """
        records = []
        output_file = None
        # Read raw bytes and decode each line explicitly, so the text handling
        # does not depend on the interpreter's default encoding.  The `with`
        # block closes the input file on every exit path.
        with open(self._input_filename, 'rb') as input_file:
            try:
                if self._output_filename:
                    output_file = codecs.open(self._output_filename,
                                              mode='wb', encoding='utf-8-sig')
                    output_file.write(u'[')
                    output_file.write(os.linesep)
                for raw_line in input_file:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        continue
                    line = raw_line.decode('UTF-8')
                    record = self._CreateDictionaryFromRecord(line)
                    if not record:
                        continue
                    records.append(record)
                    output_line = self._output_pattern % tuple(
                        record[key] for key in self._fields)
                    if output_file is not None:
                        output_file.write(output_line)
                        output_file.write(os.linesep)
                    self._logger.info(u'%d: %s', len(records), line)
                    self._logger.info(u'\tconverted to: %s', output_line)
                if output_file is not None:
                    output_file.write(u']')
                    output_file.write(os.linesep)
                self._logger.info('%d lines converted SUCCESSFULLY!',
                                  len(records))
                self._logger.info('--- FINISHED ---')
                return records
            finally:
                if output_file is not None:
                    output_file.close()
180 | |
181 | |
def main(argv=None):
    """Command-line entry point: parses options and runs the conversion.

    Args:
      argv: optional list of command-line arguments, without the program
          name.  Defaults to None, in which case sys.argv[1:] is parsed.

    Returns:
      0 after a conversion, 1 if unexpected positional arguments were given
      (the help text is printed in that case).  An unrecognised --log value
      ends the program through the option parser's usual usage error.
    """
    from optparse import OptionParser
    input_filename = os.path.join('..', 'data', 'autofill', 'dataset.txt')
    output_filename = os.path.join('..', 'data', 'autofill',
                                   'dataset_duplicate-profiles.txt')
    parser = OptionParser()
    parser.add_option('-i', '--input', dest='input_filename',
                      default=input_filename,
                      help='convert FILE [defaults to "%s"]' % input_filename,
                      metavar='FILE')
    parser.add_option('-o', '--output', dest='output_filename',
                      default=output_filename,
                      help='write output to FILE [defaults to "%s"]' %
                      output_filename, metavar='FILE')
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                      default=True, help='display all [default]')
    parser.add_option('-q', '--quiet', action='store_false', dest='verbose',
                      help='display nothing')
    parser.add_option('-l', '--log', dest='logging_level', default=None,
                      help='specify logging LEVEL: "info", "warning" or "error"',
                      metavar='LEVEL')

    (options, args) = parser.parse_args(argv)
    if args:
        parser.print_help()
        return 1

    # Quiet mode overrides any --log value; verbose mode defaults to "info".
    logging_level = None
    if options.verbose:
        requested = (options.logging_level or 'info').lower()
        # Substring match, checked in this order, so e.g. "WARNING" and
        # "warn" both select logging.WARNING.
        known_levels = (
            ('info', logging.INFO),
            ('warn', logging.WARNING),
            ('error', logging.ERROR),
        )
        for keyword, level in known_levels:
            if keyword in requested:
                logging_level = level
                break
        else:
            # Reject the value here instead of handing an arbitrary string
            # to the logging module.
            parser.error(
                'unknown logging level "%s"; expected "info", "warning" or '
                '"error"' % options.logging_level)

    c = DatasetConverter(options.input_filename, options.output_filename,
                         logging_level)
    c.Convert()
    return 0
224 | |
225 | |
# Script entry point: run main() only when this file is executed directly,
# and hand its return value to the interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
OLD | NEW |