Chromium Code Reviews

Side by Side Diff: tools/isolate/trace_inputs.py

Issue 10537059: Add ApiBase.Tracer.post_process_log() to do implementation-specific one-time mangling of the logs. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 6 months ago
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding=utf-8 2 # coding=utf-8
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be 4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file. 5 # found in the LICENSE file.
6 6
7 """Traces an executable and its child processes and extract the files accessed 7 """Traces an executable and its child processes and extract the files accessed
8 by them. 8 by them.
9 9
10 The implementation uses OS-specific APIs. The native Kernel logger and the ETL 10 The implementation uses OS-specific APIs. The native Kernel logger and the ETL
(...skipping 617 matching lines...)
628 - output: If False, redirects output to PIPEs. 628 - output: If False, redirects output to PIPEs.
629 629
630 Returns a tuple (resultcode, output) and updates the internal trace 630 Returns a tuple (resultcode, output) and updates the internal trace
631 entries. 631 entries.
632 """ 632 """
633 raise NotImplementedError(self.__class__.__name__) 633 raise NotImplementedError(self.__class__.__name__)
634 634
635 def close(self): 635 def close(self):
636 self._initialized = False 636 self._initialized = False
637 637
638 def post_process_log(self):
639 """Post-processes the log so it becomes faster to load afterward.
640
641 Must not be used manually when using the 'with' construct.
642 """
643 assert not self._initialized, 'Must stop tracing first.'
644
638 def __enter__(self): 645 def __enter__(self):
639 """Enables 'with' statement.""" 646 """Enables 'with' statement."""
640 return self 647 return self
641 648
642 def __exit__(self, exc_type, exc_value, traceback): 649 def __exit__(self, exc_type, exc_value, traceback):
643 """Enables 'with' statement.""" 650 """Enables 'with' statement."""
644 self.close() 651 self.close()
652 # If an exception was thrown, do not process logs.
653 if not exc_type:
654 self.post_process_log()
645 655
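A minimal sketch of the intended call pattern for the new hook, assuming any concrete ApiBase subclass; 'api', 'cmd' and 'cwd' are hypothetical placeholders:

    # get_tracer() (below) hands out the Tracer; the 'with' block guarantees
    # close() and, on a clean exit, post_process_log() are both invoked.
    with api.get_tracer('out.json') as tracer:
      returncode, out = tracer.trace(cmd, cwd, 'trace1', True)
    # Calling post_process_log() manually inside the block would trip the
    # assert above, since tracing is still marked as initialized.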
646 def get_tracer(self, logname): 656 def get_tracer(self, logname):
647 """Returns an ApiBase.Tracer instance. 657 """Returns an ApiBase.Tracer instance.
648 658
649 Initializes the tracing subsystem, which is a requirement for kernel-based 659 Initializes the tracing subsystem, which is a requirement for kernel-based
650 tracers. Only one tracer instance should be live at a time! 660 tracers. Only one tracer instance should be live at a time!
651 661
652 logname is the filepath to the json file that will contain the meta-data 662 logname is the filepath to the json file that will contain the meta-data
653 about the logs. 663 about the logs.
654 """ 664 """
655 return self.Tracer(logname) 665 return self.Tracer(logname)
656 666
657 @staticmethod 667 @staticmethod
658 def clean_trace(logname): 668 def clean_trace(logname):
659 """Deletes the old log.""" 669 """Deletes the old log."""
660 raise NotImplementedError() 670 raise NotImplementedError()
661 671
662 @classmethod 672 @classmethod
663 def parse_log(cls, filename, blacklist): 673 def parse_log(cls, logname, blacklist):
664 """Processes a trace log and returns the files opened and the files that do 674 """Processes a trace log and returns the files opened and the files that do
665 not exist. 675 not exist.
666 676
667 It does not track directories. 677 It does not track directories.
668 678
669 Most of the time, files that do not exist are temporary test files that 679 Most of the time, files that do not exist are temporary test files that
670 should be put in /tmp instead. See http://crbug.com/116251. 680 should be put in /tmp instead. See http://crbug.com/116251.
671 681
672 Returns a tuple (existing files, non-existing files, nb_processes_created) 682 Returns a tuple (existing files, non-existing files, nb_processes_created)
673 """ 683 """
(...skipping 349 matching lines...)
1023 @staticmethod 1033 @staticmethod
1024 def clean_trace(logname): 1034 def clean_trace(logname):
1025 if os.path.isfile(logname): 1035 if os.path.isfile(logname):
1026 os.remove(logname) 1036 os.remove(logname)
1027 # Also delete any pid specific file from previous traces. 1037 # Also delete any pid specific file from previous traces.
1028 for i in glob.iglob(logname + '.*'): 1038 for i in glob.iglob(logname + '.*'):
1029 if i.rsplit('.', 1)[1].isdigit(): 1039 if i.rsplit('.', 1)[1].isdigit():
1030 os.remove(i) 1040 os.remove(i)
1031 1041
1032 @classmethod 1042 @classmethod
1033 def parse_log(cls, filename, blacklist): 1043 def parse_log(cls, logname, blacklist):
1034 logging.info('parse_log(%s, %s)' % (filename, blacklist)) 1044 logging.info('parse_log(%s, %s)' % (logname, blacklist))
1035 data = read_json(filename) 1045 data = read_json(logname)
1036 context = cls.Context(blacklist, data['cwd']) 1046 context = cls.Context(blacklist, data['cwd'])
1037 for pidfile in glob.iglob(filename + '.*'): 1047 for pidfile in glob.iglob(logname + '.*'):
1038 pid = pidfile.rsplit('.', 1)[1] 1048 pid = pidfile.rsplit('.', 1)[1]
1039 if pid.isdigit(): 1049 if pid.isdigit():
1040 pid = int(pid) 1050 pid = int(pid)
1041 # TODO(maruel): Load as utf-8 1051 # TODO(maruel): Load as utf-8
1042 for line in open(pidfile, 'rb'): 1052 for line in open(pidfile, 'rb'):
1043 context.on_line(pid, line) 1053 context.on_line(pid, line)
1044 1054
1045 return context.to_results() 1055 return context.to_results()
1046 1056
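The rsplit()/isdigit() dance above keeps only the per-pid side logs and skips unrelated siblings; for example:

    >>> 'out.json.1234'.rsplit('.', 1)[1].isdigit()
    True
    >>> 'out.json.bak'.rsplit('.', 1)[1].isdigit()
    False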
1047 1057
(...skipping 582 matching lines...)
1630 # code manages the dtrace lifetime itself. 1640 # code manages the dtrace lifetime itself.
1631 trace_cmd = [ 1641 trace_cmd = [
1632 'sudo', 1642 'sudo',
1633 'dtrace', 1643 'dtrace',
1634 '-x', 'dynvarsize=4m', 1644 '-x', 'dynvarsize=4m',
1635 '-x', 'evaltime=exec', 1645 '-x', 'evaltime=exec',
1636 '-n', self.code(child.pid, cwd), 1646 '-n', self.code(child.pid, cwd),
1637 '-o', '/dev/stderr', 1647 '-o', '/dev/stderr',
1638 '-q', 1648 '-q',
1639 ] 1649 ]
1640 with open(self._logname, 'w') as logfile: 1650 logname = self._logname + '.log'
1651 with open(logname, 'w') as logfile:
1641 dtrace = subprocess.Popen( 1652 dtrace = subprocess.Popen(
1642 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) 1653 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
1643 logging.debug('Started dtrace pid: %d' % dtrace.pid) 1654 logging.debug('Started dtrace pid: %d' % dtrace.pid)
1644 1655
1645 # Part 3: Read until one line is printed, which signifies dtrace is up and 1656 # Part 3: Read until one line is printed, which signifies dtrace is up and
1646 # ready. 1657 # ready.
1647 with open(self._logname, 'r') as logfile: 1658 with open(logname, 'r') as logfile:
1648 while 'dtrace_BEGIN' not in logfile.readline(): 1659 while 'dtrace_BEGIN' not in logfile.readline():
1649 if dtrace.poll() is not None: 1660 if dtrace.poll() is not None:
1650 break 1661 break
1651 1662
1652 try: 1663 # Part 4: We can now tell our child to go.
1653 # Part 4: We can now tell our child to go. 1664 # TODO(maruel): Another pipe than stdin could be used instead. This
1654 # TODO(maruel): Another pipe than stdin could be used instead. This 1665 # would be more consistent with the other tracing methods.
1655 # would be more consistent with the other tracing methods. 1666 out = child.communicate(signal)[0]
1656 out = child.communicate(signal)[0]
1657 1667
1658 dtrace.wait() 1668 dtrace.wait()
1659 if dtrace.returncode != 0: 1669 if dtrace.returncode != 0:
1660 print 'dtrace failure: %d' % dtrace.returncode 1670 print 'dtrace failure: %d' % dtrace.returncode
1661 with open(self._logname) as logfile: 1671 with open(logname) as logfile:
1662 print ''.join(logfile.readlines()[-100:]) 1672 print ''.join(logfile.readlines()[-100:])
1663 # Find a better way. 1673 # Find a better way.
1664 os.remove(self._logname) 1674 os.remove(logname)
1665 else:
1666 # Sort the log right away to simplify our life. There isn't much
1667 # advantage in keeping it out of order.
1668 self._sort_log(self._logname)
1669 except KeyboardInterrupt:
1670 # Still sort when testing.
1671 self._sort_log(self._logname)
1672 raise
1673 1675
1674 return dtrace.returncode or child.returncode, out 1676 return dtrace.returncode or child.returncode, out
1675 1677
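The dtrace_BEGIN loop above is a readiness handshake: tail the freshly created log until the D script prints its sentinel, bailing out early if dtrace dies. The generic pattern, as a sketch (the sleep is an addition here to avoid spinning, and 'path' is assumed to exist already):

    import time

    def wait_for_sentinel(proc, path, sentinel):
      """Polls 'path' until 'sentinel' appears or 'proc' exits (sketch)."""
      with open(path, 'r') as f:
        while sentinel not in f.readline():
          if proc.poll() is not None:
            return False  # child exited before signalling readiness
          time.sleep(0.01)
      return True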
1676 @staticmethod 1678 def post_process_log(self):
1677 def _sort_log(logname):
1678 """Sorts the log back in order when each call occured. 1679 """Sorts the log back in order when each call occured.
1679 1680
1680 dtrace doesn't save the buffer in strict order since it keeps one buffer 1681 dtrace doesn't save the buffer in strict order since it keeps one buffer
1681 per CPU. 1682 per CPU.
1682 """ 1683 """
1684 super(Dtrace.Tracer, self).post_process_log()
1685 logname = self._logname + '.log'
1683 with open(logname, 'rb') as logfile: 1686 with open(logname, 'rb') as logfile:
1684 lines = [l for l in logfile if l.strip()] 1687 lines = [l for l in logfile if l.strip()]
1685 errors = [l for l in lines if l.startswith('dtrace:')] 1688 errors = [l for l in lines if l.startswith('dtrace:')]
1686 if errors: 1689 if errors:
1687 raise TracingFailure( 1690 raise TracingFailure(
1688 'Found errors in the trace: %s' % '\n'.join(errors), 1691 'Found errors in the trace: %s' % '\n'.join(errors),
1689 None, None, None, logname) 1692 None, None, None, logname)
1690 try: 1693 try:
1691 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) 1694 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
1692 except ValueError: 1695 except ValueError:
1693 raise TracingFailure( 1696 raise TracingFailure(
1694 'Found errors in the trace: %s' % '\n'.join( 1697 'Found errors in the trace: %s' % '\n'.join(
1695 l for l in lines if l.split(' ', 1)[0].isdigit()), 1698 l for l in lines if l.split(' ', 1)[0].isdigit()),
1696 None, None, None, logname) 1699 None, None, None, logname)
1697 with open(logname, 'wb') as logfile: 1700 with open(logname, 'wb') as logfile:
1698 logfile.write(''.join(lines)) 1701 logfile.write(''.join(lines))
1699 1702
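The sort key is the integer timestamp dtrace prints at the start of each line, so the per-CPU buffers interleave back into call order; with made-up lines:

    >>> lines = ['200 close(3)\n', '100 open("/etc/hosts")\n']
    >>> sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
    ['100 open("/etc/hosts")\n', '200 close(3)\n']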
1700 @staticmethod 1703 @staticmethod
1701 def clean_trace(logname): 1704 def clean_trace(logname):
1702 if os.path.isfile(logname): 1705 for ext in ('', '.log'):
1703 os.remove(logname) 1706 if os.path.isfile(logname + ext):
1707 os.remove(logname + ext)
1704 1708
1705 @classmethod 1709 @classmethod
1706 def parse_log(cls, filename, blacklist): 1710 def parse_log(cls, logname, blacklist):
1707 logging.info('parse_log(%s, %s)' % (filename, blacklist)) 1711 logging.info('parse_log(%s, %s)' % (logname, blacklist))
1708 context = cls.Context(blacklist) 1712 context = cls.Context(blacklist)
1709 for line in open(filename, 'rb'): 1713 for line in open(logname + '.log', 'rb'):
1710 context.on_line(line) 1714 context.on_line(line)
1711 return context.to_results() 1715 return context.to_results()
1712 1716
1713 1717
1714 class LogmanTrace(ApiBase): 1718 class LogmanTrace(ApiBase):
1715 """Uses the native Windows ETW based tracing functionality to trace a child 1719 """Uses the native Windows ETW based tracing functionality to trace a child
1716 process. 1720 process.
1717 1721
1718 Caveat: this implementation doesn't track cwd or initial_cwd. It is because 1722 Caveat: this implementation doesn't track cwd or initial_cwd. It is because
1719 the Windows Kernel doesn't have a concept of 'current working directory' at 1723 the Windows Kernel doesn't have a concept of 'current working directory' at
1720 all. A Win32 process has a map of current directories, one per drive letter 1724 all. A Win32 process has a map of current directories, one per drive letter
1721 and it is managed by the user mode kernel32.dll. In kernel, a file is always 1725 and it is managed by the user mode kernel32.dll. In kernel, a file is always
1722 opened relative to another file_object or as an absolute path. All the current 1726 opened relative to another file_object or as an absolute path. All the current
1723 working directory logic is done in user mode. 1727 working directory logic is done in user mode.
1724 """ 1728 """
1725 # The basic headers.
1726 EXPECTED_HEADER = [
1727 u'Event Name',
1728 u'Type',
1729 u'Event ID',
1730 u'Version',
1731 u'Channel',
1732 u'Level', # 5
1733 u'Opcode',
1734 u'Task',
1735 u'Keyword',
1736 u'PID',
1737 u'TID', # 10
1738 u'Processor Number',
1739 u'Instance ID',
1740 u'Parent Instance ID',
1741 u'Activity ID',
1742 u'Related Activity ID', # 15
1743 u'Clock-Time',
1744 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
1745 u'User(ms)', # pretty much useless.
1746 u'User Data', # Extra arguments that are event-specific.
1747 ]
1748 # Only the useful headers common to all entries are listed there. Any column
1749 # at 19 or higher is dependent on the specific event.
1750 EVENT_NAME = 0
1751 TYPE = 1
1752 PID = 9
1753 TID = 10
1754 PROCESSOR_ID = 11
1755 TIMESTAMP = 16
1756 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
1757 USER_DATA = 19
1758
1759 class Context(ApiBase.Context): 1729 class Context(ApiBase.Context):
1760 """Processes a ETW log line and keeps the list of existent and non 1730 """Processes a ETW log line and keeps the list of existent and non
1761 existent files accessed. 1731 existent files accessed.
1762 1732
1763 Ignores directories. 1733 Ignores directories.
1764 """ 1734 """
1765 # These indexes are for the stripped version in json. 1735 # These indexes are for the stripped version in json.
1766 EVENT_NAME = 0 1736 EVENT_NAME = 0
1767 TYPE = 1 1737 TYPE = 1
1768 PID = 2 1738 PID = 2
(...skipping 268 matching lines...)
2037 def supported_events(cls): 2007 def supported_events(cls):
2038 """Returns all the procesed events.""" 2008 """Returns all the procesed events."""
2039 out = [] 2009 out = []
2040 for member in dir(cls): 2010 for member in dir(cls):
2041 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) 2011 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2042 if match: 2012 if match:
2043 out.append(match.groups()) 2013 out.append(match.groups())
2044 return out 2014 return out
2045 2015
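supported_events() discovers handlers purely by naming convention, so adding a handle_<EventName>_<Type>() method is enough to whitelist an event. A self-contained sketch with hypothetical handler names:

    import re

    class Demo(object):
      def handle_FileIo_Create(self, *args): pass
      def handle_Process_Start(self, *args): pass

    out = []
    for member in dir(Demo):
      match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
      if match:
        out.append(match.groups())
    # out == [('FileIo', 'Create'), ('Process', 'Start')]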
2046 class Tracer(ApiBase.Tracer): 2016 class Tracer(ApiBase.Tracer):
2017 # The basic headers.
2018 EXPECTED_HEADER = [
2019 u'Event Name',
2020 u'Type',
2021 u'Event ID',
2022 u'Version',
2023 u'Channel',
2024 u'Level', # 5
2025 u'Opcode',
2026 u'Task',
2027 u'Keyword',
2028 u'PID',
2029 u'TID', # 10
2030 u'Processor Number',
2031 u'Instance ID',
2032 u'Parent Instance ID',
2033 u'Activity ID',
2034 u'Related Activity ID', # 15
2035 u'Clock-Time',
2036 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2037 u'User(ms)', # pretty much useless.
2038 u'User Data', # Extra arguments that are event-specific.
2039 ]
2040 # Only the useful headers common to all entries are listed there. Any column
2041 # at 19 or higher is dependent on the specific event.
2042 EVENT_NAME = 0
2043 TYPE = 1
2044 PID = 9
2045 TID = 10
2046 PROCESSOR_ID = 11
2047 TIMESTAMP = 16
2048 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2049 USER_DATA = 19
2050
2047 def trace(self, cmd, cwd, tracename, output): 2051 def trace(self, cmd, cwd, tracename, output):
2048 """Uses logman.exe to start and stop the NT Kernel Logger while the 2052 """Uses logman.exe to start and stop the NT Kernel Logger while the
2049 executable to be traced is run. 2053 executable to be traced is run.
2050 """ 2054 """
2051 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) 2055 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2052 # Use "logman -?" for help. 2056 # Use "logman -?" for help.
2053 2057
2054 stdout = stderr = None 2058 stdout = stderr = None
2055 if output: 2059 if output:
2056 stdout = subprocess.PIPE 2060 stdout = subprocess.PIPE
(...skipping 19 matching lines...)
2076 cwd=cwd, 2080 cwd=cwd,
2077 stdin=subprocess.PIPE, 2081 stdin=subprocess.PIPE,
2078 stdout=stdout, 2082 stdout=stdout,
2079 stderr=stderr) 2083 stderr=stderr)
2080 logging.debug('Started child pid: %d' % child.pid) 2084 logging.debug('Started child pid: %d' % child.pid)
2081 out = child.communicate()[0] 2085 out = child.communicate()[0]
2082 finally: 2086 finally:
2083 # 3. Stop the log collection. 2087 # 3. Stop the log collection.
2084 self._stop_log() 2088 self._stop_log()
2085 2089
2086 # 4. Convert log
2087 # TODO(maruel): Temporary, remove me.
2088 LogmanTrace.process_log(self._logname)
2089
2090 # 5. Save metadata. 2090 # 5. Save metadata.
2091 value = { 2091 value = {
2092 'pid': child.pid, 2092 'pid': child.pid,
2093 'format': 'csv', 2093 'format': 'csv',
2094 } 2094 }
2095 write_json(self._logname, value, False) 2095 write_json(self._logname, value, False)
2096 return child.returncode, out 2096 return child.returncode, out
2097 2097
2098 @classmethod 2098 @classmethod
2099 def _start_log(cls, etl): 2099 def _start_log(cls, etl):
(...skipping 39 matching lines...)
2139 'NT Kernel Logger', 2139 'NT Kernel Logger',
2140 '-ets', # Sends the command directly to the kernel. 2140 '-ets', # Sends the command directly to the kernel.
2141 ] 2141 ]
2142 logging.debug('Running: %s' % cmd_stop) 2142 logging.debug('Running: %s' % cmd_stop)
2143 subprocess.check_call( 2143 subprocess.check_call(
2144 cmd_stop, 2144 cmd_stop,
2145 stdin=subprocess.PIPE, 2145 stdin=subprocess.PIPE,
2146 stdout=subprocess.PIPE, 2146 stdout=subprocess.PIPE,
2147 stderr=subprocess.STDOUT) 2147 stderr=subprocess.STDOUT)
2148 2148
2149 def post_process_log(self):
2150 """Converts the .etl file into .csv then into .json."""
2151 super(LogmanTrace.Tracer, self).post_process_log()
2152 logformat = 'csv'
2153 self._convert_log(logformat)
2154
2155 if logformat == 'csv_utf16':
2156 def load_file():
2157 def utf_8_encoder(unicode_csv_data):
2158 """Encodes the unicode object as utf-8 encoded str instance"""
2159 for line in unicode_csv_data:
2160 yield line.encode('utf-8')
2161
2162 def unicode_csv_reader(unicode_csv_data, **kwargs):
2163 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2164 """
2165 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2166 for row in csv_reader:
2167 # Decode str utf-8 instances back to unicode instances, cell by
2168 # cell:
2169 yield [cell.decode('utf-8') for cell in row]
2170
2171 # The CSV file is UTF-16 so use codecs.open() to load the file into
2172 # the python internal unicode format (utf-8). Then explicitly
2173 # re-encode as utf8 as str instances so csv can parse it fine. Then
2174 # decode the utf-8 str back into python unicode instances. This
2175 # sounds about right.
2176 for line in unicode_csv_reader(
2177 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2178 # line is a list of unicode objects
2179 # So much white space!
2180 yield [i.strip() for i in line]
2181
2182 elif logformat == 'csv':
2183 def load_file():
2184 def ansi_csv_reader(ansi_csv_data, **kwargs):
2185 """Loads an 'ANSI' code page and returns unicode() objects."""
2186 assert sys.getfilesystemencoding() == 'mbcs'
2187 encoding = get_current_encoding()
2188 for row in csv.reader(ansi_csv_data, **kwargs):
2189 # Decode str 'ansi' instances to unicode instances, cell by cell:
2190 yield [cell.decode(encoding) for cell in row]
2191
2192 # The fastest and smallest format but only supports 'ANSI' file paths.
2193 # I.e. the filenames are encoded in the 'current' encoding.
2194 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2195 # line is a list of unicode objects.
2196 yield [i.strip() for i in line]
2197
2198 supported_events = LogmanTrace.Context.supported_events()
2199
2200 def trim(generator):
2201 for index, line in enumerate(generator):
2202 if not index:
2203 if line != self.EXPECTED_HEADER:
2204 raise TracingFailure(
2205 'Found malformed header: %s' % ' '.join(line),
2206 None, None, None)
2207 continue
2208 # As you can see, the CSV is full of useful non-redundant information:
2209 if (line[2] != '0' or # Event ID
2210 line[3] not in ('2', '3') or # Version
2211 line[4] != '0' or # Channel
2212 line[5] != '0' or # Level
2213 line[7] != '0' or # Task
2214 line[8] != '0x0000000000000000' or # Keyword
2215 line[12] != '' or # Instance ID
2216 line[13] != '' or # Parent Instance ID
2217 line[14] != self.NULL_GUID or # Activity ID
2218 line[15] != ''): # Related Activity ID
2219 raise TracingFailure(
2220 'Found unexpected values in line: %s' % ' '.join(line),
2221 None, None, None)
2222
2223 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2224 continue
2225
2226 # Convert the PID in-place from hex.
2227 line[self.PID] = int(line[self.PID], 16)
2228
2229 yield [
2230 line[self.EVENT_NAME],
2231 line[self.TYPE],
2232 line[self.PID],
2233 line[self.TID],
2234 line[self.PROCESSOR_ID],
2235 line[self.TIMESTAMP],
2236 ] + line[self.USER_DATA:]
2237
2238 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2239
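The csv_utf16 branch works around Python 2's csv module not accepting unicode input: decode UTF-16 via codecs, re-encode each line as UTF-8 for csv, then decode the cells back. A standalone sketch of that round-trip ('log.csv' is a placeholder):

    import codecs
    import csv

    def unicode_csv_reader(unicode_lines, **kwargs):
      """Routes unicode lines through csv via a temporary utf-8 encoding."""
      utf8_lines = (line.encode('utf-8') for line in unicode_lines)
      for row in csv.reader(utf8_lines, **kwargs):
        yield [cell.decode('utf-8') for cell in row]

    # Usage (placeholder filename):
    # for row in unicode_csv_reader(codecs.open('log.csv', 'r', 'utf-16')):
    #   print row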
2240 def _convert_log(self, logformat):
2241 """Converts the ETL trace to text representation.
2242
2243 Normally, 'csv' is sufficient. If complex scripts are used (like East Asian
2244 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2245
2246 Arguments:
2247 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2248
2249 Use "tracerpt -?" for help.
2250 """
2251 LOCALE_INVARIANT = 0x7F
2252 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2253 cmd_convert = [
2254 'tracerpt.exe',
2255 '-l', self._logname + '.etl',
2256 '-o', self._logname + '.' + logformat,
2257 '-gmt', # Use UTC
2258 '-y', # No prompt
2259 # Use -of XML to get the header of each items after column 19, e.g. all
2260 # the actual headers of 'User Data'.
2261 ]
2262
2263 if logformat == 'csv':
2264 # tracerpt localizes the 'Type' column, for major brainfuck
2265 # entertainment. I can't imagine any sane reason to do that.
2266 cmd_convert.extend(['-of', 'CSV'])
2267 elif logformat == 'csv_utf16':
2268 # This causes it to use UTF-16, which doubles the log size but ensures
2269 # the log is readable for non-ASCII characters.
2270 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2271 elif logformat == 'xml':
2272 cmd_convert.extend(['-of', 'XML'])
2273 else:
2274 raise ValueError('Unexpected log format \'%s\'' % logformat)
2275 logging.debug('Running: %s' % cmd_convert)
2276 # This can take tens of minutes for large logs.
2277 # Redirects all output to stderr.
2278 subprocess.check_call(
2279 cmd_convert,
2280 stdin=subprocess.PIPE,
2281 stdout=sys.stderr,
2282 stderr=sys.stderr)
2283
2149 def __init__(self): 2284 def __init__(self):
2150 super(LogmanTrace, self).__init__() 2285 super(LogmanTrace, self).__init__()
2151 # Most ignores need to be determined at runtime. 2286 # Most ignores need to be determined at runtime.
2152 self.IGNORED = set([os.path.dirname(sys.executable)]) 2287 self.IGNORED = set([os.path.dirname(sys.executable)])
2153 # Add many directories from environment variables. 2288 # Add many directories from environment variables.
2154 vars_to_ignore = ( 2289 vars_to_ignore = (
2155 'APPDATA', 2290 'APPDATA',
2156 'LOCALAPPDATA', 2291 'LOCALAPPDATA',
2157 'ProgramData', 2292 'ProgramData',
2158 'ProgramFiles', 2293 'ProgramFiles',
2159 'ProgramFiles(x86)', 2294 'ProgramFiles(x86)',
2160 'ProgramW6432', 2295 'ProgramW6432',
2161 'SystemRoot', 2296 'SystemRoot',
2162 'TEMP', 2297 'TEMP',
2163 'TMP', 2298 'TMP',
2164 ) 2299 )
2165 for i in vars_to_ignore: 2300 for i in vars_to_ignore:
2166 if os.environ.get(i): 2301 if os.environ.get(i):
2167 self.IGNORED.add(os.environ[i]) 2302 self.IGNORED.add(os.environ[i])
2168 2303
2169 # Also add their short path name equivalents. 2304 # Also add their short path name equivalents.
2170 for i in list(self.IGNORED): 2305 for i in list(self.IGNORED):
2171 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep))) 2306 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep)))
2172 2307
2173 # Add these last since they have no short path name equivalent. 2308 # Add these last since they have no short path name equivalent.
2174 self.IGNORED.add('\\SystemRoot') 2309 self.IGNORED.add('\\SystemRoot')
2175 self.IGNORED = tuple(sorted(self.IGNORED)) 2310 self.IGNORED = tuple(sorted(self.IGNORED))
2176 2311
2177 @classmethod
2178 def process_log(cls, logname):
2179 """Converts the .etl file into .csv then into .json."""
2180 logformat = 'csv'
2181 cls._convert_log(logname, logformat)
2182
2183 if logformat == 'csv_utf16':
2184 def load_file():
2185 def utf_8_encoder(unicode_csv_data):
2186 """Encodes the unicode object as utf-8 encoded str instance"""
2187 for line in unicode_csv_data:
2188 yield line.encode('utf-8')
2189
2190 def unicode_csv_reader(unicode_csv_data, **kwargs):
2191 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2192 """
2193 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2194 for row in csv_reader:
2195 # Decode str utf-8 instances back to unicode instances, cell by
2196 # cell:
2197 yield [cell.decode('utf-8') for cell in row]
2198
2199 # The CSV file is UTF-16 so use codecs.open() to load the file into the
2200 # python internal unicode format (utf-8). Then explicitly re-encode as
2201 # utf8 as str instances so csv can parse it fine. Then decode the utf-8
2202 # str back into python unicode instances. This sounds about right.
2203 for line in unicode_csv_reader(
2204 codecs.open(logname + '.' + logformat, 'r', 'utf-16')):
2205 # line is a list of unicode objects
2206 # So much white space!
2207 yield [i.strip() for i in line]
2208
2209 elif logformat == 'csv':
2210 def load_file():
2211 def ansi_csv_reader(ansi_csv_data, **kwargs):
2212 """Loads an 'ANSI' code page and returns unicode() objects."""
2213 assert sys.getfilesystemencoding() == 'mbcs'
2214 encoding = get_current_encoding()
2215 for row in csv.reader(ansi_csv_data, **kwargs):
2216 # Decode str 'ansi' instances to unicode instances, cell by cell:
2217 yield [cell.decode(encoding) for cell in row]
2218
2219 # The fastest and smallest format but only supports 'ANSI' file paths.
2220 # I.e. the filenames are encoded in the 'current' encoding.
2221 for line in ansi_csv_reader(open(logname + '.' + logformat)):
2222 # line is a list of unicode objects.
2223 yield [i.strip() for i in line]
2224
2225 supported_events = cls.Context.supported_events()
2226
2227 def trim(generator):
2228 for index, line in enumerate(generator):
2229 if not index:
2230 if line != cls.EXPECTED_HEADER:
2231 raise TracingFailure(
2232 'Found malformed header: %s' % ' '.join(line),
2233 None, None, None)
2234 continue
2235 # As you can see, the CSV is full of useful non-redundant information:
2236 if (line[2] != '0' or # Event ID
2237 line[3] not in ('2', '3') or # Version
2238 line[4] != '0' or # Channel
2239 line[5] != '0' or # Level
2240 line[7] != '0' or # Task
2241 line[8] != '0x0000000000000000' or # Keyword
2242 line[12] != '' or # Instance ID
2243 line[13] != '' or # Parent Instance ID
2244 line[14] != cls.NULL_GUID or # Activity ID
2245 line[15] != ''): # Related Activity ID
2246 raise TracingFailure(
2247 'Found unexpected values in line: %s' % ' '.join(line),
2248 None, None, None)
2249
2250 if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events:
2251 continue
2252
2253 # Convert the PID in-place from hex.
2254 line[cls.PID] = int(line[cls.PID], 16)
2255
2256 yield [
2257 line[cls.EVENT_NAME],
2258 line[cls.TYPE],
2259 line[cls.PID],
2260 line[cls.TID],
2261 line[cls.PROCESSOR_ID],
2262 line[cls.TIMESTAMP],
2263 ] + line[cls.USER_DATA:]
2264
2265 write_json('%s.json' % logname, list(trim(load_file())), True)
2266
2267 @staticmethod
2268 def _convert_log(logname, logformat):
2269 """Converts the ETL trace to text representation.
2270
2271 Normally, 'csv' is sufficient. If complex scripts are used (like East Asian
2272 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2273
2274 Arguments:
2275 - logname: Base filename to convert.
2276 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2277
2278 Use "tracerpt -?" for help.
2279 """
2280 LOCALE_INVARIANT = 0x7F
2281 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2282 cmd_convert = [
2283 'tracerpt.exe',
2284 '-l', logname + '.etl',
2285 '-o', logname + '.' + logformat,
2286 '-gmt', # Use UTC
2287 '-y', # No prompt
2288 # Use -of XML to get the header of each items after column 19, e.g. all
2289 # the actual headers of 'User Data'.
2290 ]
2291
2292 if logformat == 'csv':
2293 # tracerpt localizes the 'Type' column, for major brainfuck
2294 # entertainment. I can't imagine any sane reason to do that.
2295 cmd_convert.extend(['-of', 'CSV'])
2296 elif logformat == 'csv_utf16':
2297 # This causes it to use UTF-16, which doubles the log size but ensures
2298 # the log is readable for non-ASCII characters.
2299 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2300 elif logformat == 'xml':
2301 cmd_convert.extend(['-of', 'XML'])
2302 else:
2303 raise ValueError('Unexpected log format \'%s\'' % logformat)
2304 logging.debug('Running: %s' % cmd_convert)
2305 # This can take tens of minutes for large logs.
2306 # Redirects all output to stderr.
2307 subprocess.check_call(
2308 cmd_convert,
2309 stdin=subprocess.PIPE,
2310 stdout=sys.stderr,
2311 stderr=sys.stderr)
2312
2313 @staticmethod 2312 @staticmethod
2314 def clean_trace(logname): 2313 def clean_trace(logname):
2315 for ext in ('', '.csv', '.etl', '.json', '.xml'): 2314 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2316 if os.path.isfile(logname + ext): 2315 if os.path.isfile(logname + ext):
2317 os.remove(logname + ext) 2316 os.remove(logname + ext)
2318 2317
2319 @classmethod 2318 @classmethod
2320 def parse_log(cls, logname, blacklist): 2319 def parse_log(cls, logname, blacklist):
2321 logging.info('parse_log(%s, %s)' % (logname, blacklist)) 2320 logging.info('parse_log(%s, %s)' % (logname, blacklist))
2322 2321
(...skipping 282 matching lines...)
2605 for fn in dir(sys.modules[__name__]) 2604 for fn in dir(sys.modules[__name__])
2606 if fn.startswith('CMD'))) 2605 if fn.startswith('CMD')))
2607 2606
2608 command = get_command_handler(argv[0] if argv else None) 2607 command = get_command_handler(argv[0] if argv else None)
2609 parser = gen_parser(command) 2608 parser = gen_parser(command)
2610 return command(parser, argv[1:]) 2609 return command(parser, argv[1:])
2611 2610
2612 2611
2613 if __name__ == '__main__': 2612 if __name__ == '__main__':
2614 sys.exit(main(sys.argv[1:])) 2613 sys.exit(main(sys.argv[1:]))