OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding=utf-8 | 2 # coding=utf-8 |
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
4 # Use of this source code is governed by a BSD-style license that can be | 4 # Use of this source code is governed by a BSD-style license that can be |
5 # found in the LICENSE file. | 5 # found in the LICENSE file. |
6 | 6 |
7 """Traces an executable and its child processes and extract the files accessed | 7 """Traces an executable and its child processes and extract the files accessed |
8 by them. | 8 by them. |
9 | 9 |
10 The implementation uses OS-specific API. The native Kernel logger and the ETL | 10 The implementation uses OS-specific API. The native Kernel logger and the ETL |
(...skipping 617 matching lines...) |
628 - output: If False, redirects output to PIPEs. | 628 - output: If False, redirects output to PIPEs. |
629 | 629 |
630 Returns a tuple (resultcode, output) and updates the internal trace | 630 Returns a tuple (resultcode, output) and updates the internal trace |
631 entries. | 631 entries. |
632 """ | 632 """ |
633 raise NotImplementedError(self.__class__.__name__) | 633 raise NotImplementedError(self.__class__.__name__) |
634 | 634 |
635 def close(self): | 635 def close(self): |
636 self._initialized = False | 636 self._initialized = False |
637 | 637 |
| 638 def post_process_log(self): |
| 639 """Post-processes the log so it becomes faster to load afterward. |
| 640 |
| 641 Must not be called manually when using the 'with' construct. |
| 642 """ |
| 643 assert not self._initialized, 'Must stop tracing first.' |
| 644 |
638 def __enter__(self): | 645 def __enter__(self): |
639 """Enables 'with' statement.""" | 646 """Enables 'with' statement.""" |
640 return self | 647 return self |
641 | 648 |
642 def __exit__(self, exc_type, exc_value, traceback): | 649 def __exit__(self, exc_type, exc_value, traceback): |
643 """Enables 'with' statement.""" | 650 """Enables 'with' statement.""" |
644 self.close() | 651 self.close() |
| 652 # If an exception was thrown, do not process logs. |
| 653 if not exc_type: |
| 654 self.post_process_log() |
645 | 655 |
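The hook above completes a small template-method contract: close() marks the tracer stopped, and __exit__ runs post_process_log() only when the block exits cleanly. A minimal sketch of that contract with a hypothetical no-op subclass (assuming the base Tracer.__init__ takes the logname and marks the instance initialized, as get_tracer() below implies):

    class NoopTracer(ApiBase.Tracer):
      def trace(self, cmd, cwd, tracename, output):
        # A real subclass would run and trace the child here.
        return 0, ''

      def post_process_log(self):
        # The base class assert enforces 'stop tracing first'.
        super(NoopTracer, self).post_process_log()

    with NoopTracer('out.json') as tracer:
      tracer.trace(['echo', 'hi'], '.', 'trace1', True)
    # Exiting the block without an exception calls close(), then
    # post_process_log() exactly once.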
646 def get_tracer(self, logname): | 656 def get_tracer(self, logname): |
647 """Returns an ApiBase.Tracer instance. | 657 """Returns an ApiBase.Tracer instance. |
648 | 658 |
649 Initializes the tracing subsystem, which is a requirement for kernel-based | 659 Initializes the tracing subsystem, which is a requirement for kernel-based |
650 tracers. Only one tracer instance should be live at a time! | 660 tracers. Only one tracer instance should be live at a time! |
651 | 661 |
652 logname is the filepath to the json file that will contain the meta-data | 662 logname is the filepath to the json file that will contain the meta-data |
653 about the logs. | 663 about the logs. |
654 """ | 664 """ |
655 return self.Tracer(logname) | 665 return self.Tracer(logname) |
656 | 666 |
657 @staticmethod | 667 @staticmethod |
658 def clean_trace(logname): | 668 def clean_trace(logname): |
659 """Deletes the old log.""" | 669 """Deletes the old log.""" |
660 raise NotImplementedError() | 670 raise NotImplementedError() |
661 | 671 |
662 @classmethod | 672 @classmethod |
663 def parse_log(cls, filename, blacklist): | 673 def parse_log(cls, logname, blacklist): |
664 """Processes a trace log and returns the files opened and the files that do | 674 """Processes a trace log and returns the files opened and the files that do |
665 not exist. | 675 not exist. |
666 | 676 |
667 It does not track directories. | 677 It does not track directories. |
668 | 678 |
669 Most of the time, files that do not exist are temporary test files that | 679 Most of the time, files that do not exist are temporary test files that |
670 should be put in /tmp instead. See http://crbug.com/116251. | 680 should be put in /tmp instead. See http://crbug.com/116251. |
671 | 681 |
672 Returns a tuple (existing files, non existing files, nb_processes_created) | 682 Returns a tuple (existing files, non existing files, nb_processes_created) |
673 """ | 683 """ |
(...skipping 349 matching lines...) |
1023 @staticmethod | 1033 @staticmethod |
1024 def clean_trace(logname): | 1034 def clean_trace(logname): |
1025 if os.path.isfile(logname): | 1035 if os.path.isfile(logname): |
1026 os.remove(logname) | 1036 os.remove(logname) |
1027 # Also delete any pid specific file from previous traces. | 1037 # Also delete any pid specific file from previous traces. |
1028 for i in glob.iglob(logname + '.*'): | 1038 for i in glob.iglob(logname + '.*'): |
1029 if i.rsplit('.', 1)[1].isdigit(): | 1039 if i.rsplit('.', 1)[1].isdigit(): |
1030 os.remove(i) | 1040 os.remove(i) |
1031 | 1041 |
1032 @classmethod | 1042 @classmethod |
1033 def parse_log(cls, filename, blacklist): | 1043 def parse_log(cls, logname, blacklist): |
1034 logging.info('parse_log(%s, %s)' % (filename, blacklist)) | 1044 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
1035 data = read_json(filename) | 1045 data = read_json(logname) |
1036 context = cls.Context(blacklist, data['cwd']) | 1046 context = cls.Context(blacklist, data['cwd']) |
1037 for pidfile in glob.iglob(filename + '.*'): | 1047 for pidfile in glob.iglob(logname + '.*'): |
1038 pid = pidfile.rsplit('.', 1)[1] | 1048 pid = pidfile.rsplit('.', 1)[1] |
1039 if pid.isdigit(): | 1049 if pid.isdigit(): |
1040 pid = int(pid) | 1050 pid = int(pid) |
1041 # TODO(maruel): Load as utf-8 | 1051 # TODO(maruel): Load as utf-8 |
1042 for line in open(pidfile, 'rb'): | 1052 for line in open(pidfile, 'rb'): |
1043 context.on_line(pid, line) | 1053 context.on_line(pid, line) |
1044 | 1054 |
1045 return context.to_results() | 1055 return context.to_results() |
1046 | 1056 |
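The classmethod above pairs with clean_trace(): the tracer writes one json metadata file plus one raw log per traced process, each suffixed with its pid. A hedged usage sketch (filenames invented; TraceApi stands in for the enclosing ApiBase subclass, whose name falls outside the visible hunk, and blacklist is the same predicate-like object Context expects):

    # trace.json       metadata, e.g. {'cwd': '/build'}
    # trace.json.1234  raw log for pid 1234
    # trace.json.1240  raw log for a child process, pid 1240
    results = TraceApi.parse_log('trace.json', blacklist)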
1047 | 1057 |
(...skipping 582 matching lines...) |
1630 # code manages the dtrace lifetime itself. | 1640 # code manages the dtrace lifetime itself. |
1631 trace_cmd = [ | 1641 trace_cmd = [ |
1632 'sudo', | 1642 'sudo', |
1633 'dtrace', | 1643 'dtrace', |
1634 '-x', 'dynvarsize=4m', | 1644 '-x', 'dynvarsize=4m', |
1635 '-x', 'evaltime=exec', | 1645 '-x', 'evaltime=exec', |
1636 '-n', self.code(child.pid, cwd), | 1646 '-n', self.code(child.pid, cwd), |
1637 '-o', '/dev/stderr', | 1647 '-o', '/dev/stderr', |
1638 '-q', | 1648 '-q', |
1639 ] | 1649 ] |
1640 with open(self._logname, 'w') as logfile: | 1650 logname = self._logname + '.log' |
| 1651 with open(logname, 'w') as logfile: |
1641 dtrace = subprocess.Popen( | 1652 dtrace = subprocess.Popen( |
1642 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) | 1653 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) |
1643 logging.debug('Started dtrace pid: %d' % dtrace.pid) | 1654 logging.debug('Started dtrace pid: %d' % dtrace.pid) |
1644 | 1655 |
1645 # Part 3: Read until one line is printed, which signifies dtrace is up and | 1656 # Part 3: Read until one line is printed, which signifies dtrace is up and |
1646 # ready. | 1657 # ready. |
1647 with open(self._logname, 'r') as logfile: | 1658 with open(logname, 'r') as logfile: |
1648 while 'dtrace_BEGIN' not in logfile.readline(): | 1659 while 'dtrace_BEGIN' not in logfile.readline(): |
1649 if dtrace.poll() is not None: | 1660 if dtrace.poll() is not None: |
1650 break | 1661 break |
1651 | 1662 |
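The loop above is a readiness handshake: dtrace prints a dtrace_BEGIN marker once its probes are installed, and the parent spins on the log until the marker shows up or dtrace dies. The same pattern as a generic sketch (hedged; it adds a short sleep the original omits):

    import time

    def wait_for_sentinel(proc, logpath, sentinel):
      """Returns True once a log line contains sentinel, False if the
      process exits first."""
      with open(logpath, 'r') as logfile:
        while True:
          line = logfile.readline()
          if sentinel in line:
            return True
          if proc.poll() is not None:
            return False
          if not line:
            time.sleep(0.01)  # readline() returns '' at EOF; don't busy-wait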
1652 try: | 1663 # Part 4: We can now tell our child to go. |
1653 # Part 4: We can now tell our child to go. | 1664 # TODO(maruel): A pipe other than stdin could be used instead. This |
1654 # TODO(maruel): A pipe other than stdin could be used instead. This | 1665 # would be more consistent with the other tracing methods. |
1655 # would be more consistent with the other tracing methods. | 1666 out = child.communicate(signal)[0] |
1656 out = child.communicate(signal)[0] | |
1657 | 1667 |
1658 dtrace.wait() | 1668 dtrace.wait() |
1659 if dtrace.returncode != 0: | 1669 if dtrace.returncode != 0: |
1660 print 'dtrace failure: %d' % dtrace.returncode | 1670 print 'dtrace failure: %d' % dtrace.returncode |
1661 with open(self._logname) as logfile: | 1671 with open(logname) as logfile: |
1662 print ''.join(logfile.readlines()[-100:]) | 1672 print ''.join(logfile.readlines()[-100:]) |
1663 # Find a better way. | 1673 # Find a better way. |
1664 os.remove(self._logname) | 1674 os.remove(logname) |
1665 else: | |
1666 # Sort the log right away to simplify our life. There isn't much | |
1667 # advantage in keeping it out of order. | |
1668 self._sort_log(self._logname) | |
1669 except KeyboardInterrupt: | |
1670 # Still sort when testing. | |
1671 self._sort_log(self._logname) | |
1672 raise | |
1673 | 1675 |
1674 return dtrace.returncode or child.returncode, out | 1676 return dtrace.returncode or child.returncode, out |
1675 | 1677 |
1676 @staticmethod | 1678 def post_process_log(self): |
1677 def _sort_log(logname): | |
1678 """Sorts the log back in order when each call occured. | 1679 """Sorts the log back in order when each call occured. |
1679 | 1680 |
1680 dtrace doesn't save the buffer in strict order since it keeps one buffer | 1681 dtrace doesn't save the buffer in strict order since it keeps one buffer |
1681 per CPU. | 1682 per CPU. |
1682 """ | 1683 """ |
| 1684 super(Dtrace.Tracer, self).post_process_log() |
| 1685 logname = self._logname + '.log' |
1683 with open(logname, 'rb') as logfile: | 1686 with open(logname, 'rb') as logfile: |
1684 lines = [l for l in logfile if l.strip()] | 1687 lines = [l for l in logfile if l.strip()] |
1685 errors = [l for l in lines if l.startswith('dtrace:')] | 1688 errors = [l for l in lines if l.startswith('dtrace:')] |
1686 if errors: | 1689 if errors: |
1687 raise TracingFailure( | 1690 raise TracingFailure( |
1688 'Found errors in the trace: %s' % '\n'.join(errors), | 1691 'Found errors in the trace: %s' % '\n'.join(errors), |
1689 None, None, None, logname) | 1692 None, None, None, logname) |
1690 try: | 1693 try: |
1691 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) | 1694 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) |
1692 except ValueError: | 1695 except ValueError: |
1693 raise TracingFailure( | 1696 raise TracingFailure( |
1694 'Found errors in the trace: %s' % '\n'.join( | 1697 'Found errors in the trace: %s' % '\n'.join( |
1695 l for l in lines if l.split(' ', 1)[0].isdigit()), | 1698 l for l in lines if l.split(' ', 1)[0].isdigit()), |
1696 None, None, None, logname) | 1699 None, None, None, logname) |
1697 with open(logname, 'wb') as logfile: | 1700 with open(logname, 'wb') as logfile: |
1698 logfile.write(''.join(lines)) | 1701 logfile.write(''.join(lines)) |
1699 | 1702 |
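The numeric sort is the core of this step: dtrace flushes one buffer per CPU, so lines land in the file out of order, but each line starts with a timestamp that a sort on the first field uses to restore global order. A tiny illustration with invented lines:

    lines = ['12 close(3)\n', '7 open("/etc/passwd")\n', '9 read(3)\n']
    lines.sort(key=lambda l: int(l.split(' ', 1)[0]))
    assert [l.split(' ', 1)[0] for l in lines] == ['7', '9', '12']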
1700 @staticmethod | 1703 @staticmethod |
1701 def clean_trace(logname): | 1704 def clean_trace(logname): |
1702 if os.path.isfile(logname): | 1705 for ext in ('', '.log'): |
1703 os.remove(logname) | 1706 if os.path.isfile(logname + ext): |
| 1707 os.remove(logname + ext) |
1704 | 1708 |
1705 @classmethod | 1709 @classmethod |
1706 def parse_log(cls, filename, blacklist): | 1710 def parse_log(cls, logname, blacklist): |
1707 logging.info('parse_log(%s, %s)' % (filename, blacklist)) | 1711 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
1708 context = cls.Context(blacklist) | 1712 context = cls.Context(blacklist) |
1709 for line in open(filename, 'rb'): | 1713 for line in open(logname + '.log', 'rb'): |
1710 context.on_line(line) | 1714 context.on_line(line) |
1711 return context.to_results() | 1715 return context.to_results() |
1712 | 1716 |
1713 | 1717 |
1714 class LogmanTrace(ApiBase): | 1718 class LogmanTrace(ApiBase): |
1715 """Uses the native Windows ETW based tracing functionality to trace a child | 1719 """Uses the native Windows ETW based tracing functionality to trace a child |
1716 process. | 1720 process. |
1717 | 1721 |
1718 Caveat: this implementation doesn't track cwd or initial_cwd. This is because | 1722 Caveat: this implementation doesn't track cwd or initial_cwd. This is because |
1719 the Windows Kernel doesn't have a concept of 'current working directory' at | 1723 the Windows Kernel doesn't have a concept of 'current working directory' at |
1720 all. A Win32 process has a map of current directories, one per drive letter, | 1724 all. A Win32 process has a map of current directories, one per drive letter, |
1721 and it is managed by the user-mode kernel32.dll. In the kernel, a file is always | 1725 and it is managed by the user-mode kernel32.dll. In the kernel, a file is always |
1722 opened relative to another file_object or as an absolute path. All the current | 1726 opened relative to another file_object or as an absolute path. All the current |
1723 working directory logic is done in user mode. | 1727 working directory logic is done in user mode. |
1724 """ | 1728 """ |
1725 # The basic headers. | |
1726 EXPECTED_HEADER = [ | |
1727 u'Event Name', | |
1728 u'Type', | |
1729 u'Event ID', | |
1730 u'Version', | |
1731 u'Channel', | |
1732 u'Level', # 5 | |
1733 u'Opcode', | |
1734 u'Task', | |
1735 u'Keyword', | |
1736 u'PID', | |
1737 u'TID', # 10 | |
1738 u'Processor Number', | |
1739 u'Instance ID', | |
1740 u'Parent Instance ID', | |
1741 u'Activity ID', | |
1742 u'Related Activity ID', # 15 | |
1743 u'Clock-Time', | |
1744 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them | |
1745 u'User(ms)', # pretty much useless. | |
1746 u'User Data', # Extra arguments that are event-specific. | |
1747 ] | |
1748 # Only the useful headers common to all entries are listed here. Any column | |
1749 # at 19 or higher is dependent on the specific event. | |
1750 EVENT_NAME = 0 | |
1751 TYPE = 1 | |
1752 PID = 9 | |
1753 TID = 10 | |
1754 PROCESSOR_ID = 11 | |
1755 TIMESTAMP = 16 | |
1756 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' | |
1757 USER_DATA = 19 | |
1758 | |
1759 class Context(ApiBase.Context): | 1729 class Context(ApiBase.Context): |
1760 """Processes a ETW log line and keeps the list of existent and non | 1730 """Processes a ETW log line and keeps the list of existent and non |
1761 existent files accessed. | 1731 existent files accessed. |
1762 | 1732 |
1763 Ignores directories. | 1733 Ignores directories. |
1764 """ | 1734 """ |
1765 # These indexes are for the stripped version in json. | 1735 # These indexes are for the stripped version in json. |
1766 EVENT_NAME = 0 | 1736 EVENT_NAME = 0 |
1767 TYPE = 1 | 1737 TYPE = 1 |
1768 PID = 2 | 1738 PID = 2 |
(...skipping 268 matching lines...) |
2037 def supported_events(cls): | 2007 def supported_events(cls): |
2038 """Returns all the procesed events.""" | 2008 """Returns all the procesed events.""" |
2039 out = [] | 2009 out = [] |
2040 for member in dir(cls): | 2010 for member in dir(cls): |
2041 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) | 2011 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) |
2042 if match: | 2012 if match: |
2043 out.append(match.groups()) | 2013 out.append(match.groups()) |
2044 return out | 2014 return out |
2045 | 2015 |
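supported_events() turns a naming convention into registration: any method named handle_<EventName>_<Type> on a Context subclass is picked up by the regexp above. A sketch with a hypothetical handler:

    class MyContext(LogmanTrace.Context):
      def handle_FileIo_Create(self, line):
        pass  # a real handler would record the path being opened

    assert ('FileIo', 'Create') in MyContext.supported_events()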
2046 class Tracer(ApiBase.Tracer): | 2016 class Tracer(ApiBase.Tracer): |
| 2017 # The basic headers. |
| 2018 EXPECTED_HEADER = [ |
| 2019 u'Event Name', |
| 2020 u'Type', |
| 2021 u'Event ID', |
| 2022 u'Version', |
| 2023 u'Channel', |
| 2024 u'Level', # 5 |
| 2025 u'Opcode', |
| 2026 u'Task', |
| 2027 u'Keyword', |
| 2028 u'PID', |
| 2029 u'TID', # 10 |
| 2030 u'Processor Number', |
| 2031 u'Instance ID', |
| 2032 u'Parent Instance ID', |
| 2033 u'Activity ID', |
| 2034 u'Related Activity ID', # 15 |
| 2035 u'Clock-Time', |
| 2036 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them |
| 2037 u'User(ms)', # pretty much useless. |
| 2038 u'User Data', # Extra arguments that are event-specific. |
| 2039 ] |
| 2040 # Only the useful headers common to all entries are listed here. Any column |
| 2041 # at 19 or higher is dependent on the specific event. |
| 2042 EVENT_NAME = 0 |
| 2043 TYPE = 1 |
| 2044 PID = 9 |
| 2045 TID = 10 |
| 2046 PROCESSOR_ID = 11 |
| 2047 TIMESTAMP = 16 |
| 2048 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' |
| 2049 USER_DATA = 19 |
| 2050 |
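These constants index into the raw tracerpt CSV rows. A hedged sketch of one row (all values invented for illustration; the PID column is hex, which is why post_process_log() converts it below):

    T = LogmanTrace.Tracer
    row = [u'FileIo', u'Create', u'0', u'2', u'0', u'0', u'', u'0',
           u'0x0000000000000000', u'0x1a2b', u'0x1a2c', u'0', u'', u'',
           T.NULL_GUID, u'', u'129496397558960000', u'15', u'15',
           u'c:\\foo.txt']
    print row[T.EVENT_NAME], int(row[T.PID], 16), row[T.USER_DATA:]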
2047 def trace(self, cmd, cwd, tracename, output): | 2051 def trace(self, cmd, cwd, tracename, output): |
2048 """Uses logman.exe to start and stop the NT Kernel Logger while the | 2052 """Uses logman.exe to start and stop the NT Kernel Logger while the |
2049 executable to be traced is run. | 2053 executable to be traced is run. |
2050 """ | 2054 """ |
2051 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | 2055 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) |
2052 # Use "logman -?" for help. | 2056 # Use "logman -?" for help. |
2053 | 2057 |
2054 stdout = stderr = None | 2058 stdout = stderr = None |
2055 if output: | 2059 if output: |
2056 stdout = subprocess.PIPE | 2060 stdout = subprocess.PIPE |
(...skipping 19 matching lines...) |
2076 cwd=cwd, | 2080 cwd=cwd, |
2077 stdin=subprocess.PIPE, | 2081 stdin=subprocess.PIPE, |
2078 stdout=stdout, | 2082 stdout=stdout, |
2079 stderr=stderr) | 2083 stderr=stderr) |
2080 logging.debug('Started child pid: %d' % child.pid) | 2084 logging.debug('Started child pid: %d' % child.pid) |
2081 out = child.communicate()[0] | 2085 out = child.communicate()[0] |
2082 finally: | 2086 finally: |
2083 # 3. Stop the log collection. | 2087 # 3. Stop the log collection. |
2084 self._stop_log() | 2088 self._stop_log() |
2085 | 2089 |
2086 # 4. Convert log | |
2087 # TODO(maruel): Temporary, remove me. | |
2088 LogmanTrace.process_log(self._logname) | |
2089 | |
2090 # 5. Save metadata. | 2090 # 5. Save metadata. |
2091 value = { | 2091 value = { |
2092 'pid': child.pid, | 2092 'pid': child.pid, |
2093 'format': 'csv', | 2093 'format': 'csv', |
2094 } | 2094 } |
2095 write_json(self._logname, value, False) | 2095 write_json(self._logname, value, False) |
2096 return child.returncode, out | 2096 return child.returncode, out |
2097 | 2097 |
2098 @classmethod | 2098 @classmethod |
2099 def _start_log(cls, etl): | 2099 def _start_log(cls, etl): |
(...skipping 39 matching lines...) |
2139 'NT Kernel Logger', | 2139 'NT Kernel Logger', |
2140 '-ets', # Sends the command directly to the kernel. | 2140 '-ets', # Sends the command directly to the kernel. |
2141 ] | 2141 ] |
2142 logging.debug('Running: %s' % cmd_stop) | 2142 logging.debug('Running: %s' % cmd_stop) |
2143 subprocess.check_call( | 2143 subprocess.check_call( |
2144 cmd_stop, | 2144 cmd_stop, |
2145 stdin=subprocess.PIPE, | 2145 stdin=subprocess.PIPE, |
2146 stdout=subprocess.PIPE, | 2146 stdout=subprocess.PIPE, |
2147 stderr=subprocess.STDOUT) | 2147 stderr=subprocess.STDOUT) |
2148 | 2148 |
| 2149 def post_process_log(self): |
| 2150 """Converts the .etl file into .csv then into .json.""" |
| 2151 super(LogmanTrace.Tracer, self).post_process_log() |
| 2152 logformat = 'csv' |
| 2153 self._convert_log(logformat) |
| 2154 |
| 2155 if logformat == 'csv_utf16': |
| 2156 def load_file(): |
| 2157 def utf_8_encoder(unicode_csv_data): |
| 2158 """Encodes the unicode object as utf-8 encoded str instance""" |
| 2159 for line in unicode_csv_data: |
| 2160 yield line.encode('utf-8') |
| 2161 |
| 2162 def unicode_csv_reader(unicode_csv_data, **kwargs): |
| 2163 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. |
| 2164 """ |
| 2165 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) |
| 2166 for row in csv_reader: |
| 2167 # Decode str utf-8 instances back to unicode instances, cell by |
| 2168 # cell: |
| 2169 yield [cell.decode('utf-8') for cell in row] |
| 2170 |
| 2171 # The CSV file is UTF-16 so use codecs.open() to load the file into |
| 2172 # the python internal unicode format (utf-8). Then explicitly |
| 2173 # re-encode as utf8 as str instances so csv can parse it fine. Then |
| 2174 # decode the utf-8 str back into python unicode instances. This |
| 2175 # sounds about right. |
| 2176 for line in unicode_csv_reader( |
| 2177 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')): |
| 2178 # line is a list of unicode objects |
| 2179 # So much white space! |
| 2180 yield [i.strip() for i in line] |
| 2181 |
| 2182 elif logformat == 'csv': |
| 2183 def load_file(): |
| 2184 def ansi_csv_reader(ansi_csv_data, **kwargs): |
| 2185 """Loads an 'ANSI' code page and returns unicode() objects.""" |
| 2186 assert sys.getfilesystemencoding() == 'mbcs' |
| 2187 encoding = get_current_encoding() |
| 2188 for row in csv.reader(ansi_csv_data, **kwargs): |
| 2189 # Decode str 'ansi' instances to unicode instances, cell by cell: |
| 2190 yield [cell.decode(encoding) for cell in row] |
| 2191 |
| 2192 # The fastest and smallest format but only supports 'ANSI' file paths. |
| 2193 # E.g. the filenames are encoded in the 'current' encoding. |
| 2194 for line in ansi_csv_reader(open(self._logname + '.' + logformat)): |
| 2195 # line is a list of unicode objects. |
| 2196 yield [i.strip() for i in line] |
| 2197 |
| 2198 supported_events = LogmanTrace.Context.supported_events() |
| 2199 |
| 2200 def trim(generator): |
| 2201 for index, line in enumerate(generator): |
| 2202 if not index: |
| 2203 if line != self.EXPECTED_HEADER: |
| 2204 raise TracingFailure( |
| 2205 'Found malformed header: %s' % ' '.join(line), |
| 2206 None, None, None) |
| 2207 continue |
| 2208 # As you can see, the CSV is full of useful non-redundant information: |
| 2209 if (line[2] != '0' or # Event ID |
| 2210 line[3] not in ('2', '3') or # Version |
| 2211 line[4] != '0' or # Channel |
| 2212 line[5] != '0' or # Level |
| 2213 line[7] != '0' or # Task |
| 2214 line[8] != '0x0000000000000000' or # Keyword |
| 2215 line[12] != '' or # Instance ID |
| 2216 line[13] != '' or # Parent Instance ID |
| 2217 line[14] != self.NULL_GUID or # Activity ID |
| 2218 line[15] != ''): # Related Activity ID |
| 2219 raise TracingFailure( |
| 2220 'Found unexpected values in line: %s' % ' '.join(line), |
| 2221 None, None, None) |
| 2222 |
| 2223 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: |
| 2224 continue |
| 2225 |
| 2226 # Convert the PID in-place from hex. |
| 2227 line[self.PID] = int(line[self.PID], 16) |
| 2228 |
| 2229 yield [ |
| 2230 line[self.EVENT_NAME], |
| 2231 line[self.TYPE], |
| 2232 line[self.PID], |
| 2233 line[self.TID], |
| 2234 line[self.PROCESSOR_ID], |
| 2235 line[self.TIMESTAMP], |
| 2236 ] + line[self.USER_DATA:] |
| 2237 |
| 2238 write_json('%s.json' % self._logname, list(trim(load_file())), True) |
| 2239 |
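After trim(), each event keeps only the six common columns plus the event-specific User Data tail, which is why LogmanTrace.Context declares the stripped indexes (EVENT_NAME=0, TYPE=1, PID=2) instead of the raw CSV positions. A trimmed row would look like (values invented):

    [u'FileIo', u'Create', 6699, u'0x1a2c', u'0',
     u'129496397558960000', u'c:\\foo.txt']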
| 2240 def _convert_log(self, logformat): |
| 2241 """Converts the ETL trace to text representation. |
| 2242 |
| 2243 Normally, 'csv' is sufficient. If complex scripts are used (like eastern |
| 2244 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. |
| 2245 |
| 2246 Arguments: |
| 2247 - logformat: Text format to be generated, csv, csv_utf16 or xml. |
| 2248 |
| 2249 Use "tracerpt -?" for help. |
| 2250 """ |
| 2251 LOCALE_INVARIANT = 0x7F |
| 2252 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) |
| 2253 cmd_convert = [ |
| 2254 'tracerpt.exe', |
| 2255 '-l', self._logname + '.etl', |
| 2256 '-o', self._logname + '.' + logformat, |
| 2257 '-gmt', # Use UTC |
| 2258 '-y', # No prompt |
| 2259 # Use -of XML to get the header of each item after column 19, e.g. all |
| 2260 # the actual headers of 'User Data'. |
| 2261 ] |
| 2262 |
| 2263 if logformat == 'csv': |
| 2264 # tracerpt localizes the 'Type' column, for major brainfuck |
| 2265 # entertainment. I can't imagine any sane reason to do that. |
| 2266 cmd_convert.extend(['-of', 'CSV']) |
| 2267 elif logformat == 'csv_utf16': |
| 2268 # This causes it to use UTF-16, which doubles the log size but ensures |
| 2269 # the log is readable for non-ASCII characters. |
| 2270 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) |
| 2271 elif logformat == 'xml': |
| 2272 cmd_convert.extend(['-of', 'XML']) |
| 2273 else: |
| 2274 raise ValueError('Unexpected log format \'%s\'' % logformat) |
| 2275 logging.debug('Running: %s' % cmd_convert) |
| 2276 # This can take tens of minutes for large logs. |
| 2277 # Redirects all output to stderr. |
| 2278 subprocess.check_call( |
| 2279 cmd_convert, |
| 2280 stdin=subprocess.PIPE, |
| 2281 stdout=sys.stderr, |
| 2282 stderr=sys.stderr) |
| 2283 |
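For logname 'trace' and the default 'csv' format, _convert_log() assembles the following command (flags verbatim from the code above):

    tracerpt.exe -l trace.etl -o trace.csv -gmt -y -of CSV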
2149 def __init__(self): | 2284 def __init__(self): |
2150 super(LogmanTrace, self).__init__() | 2285 super(LogmanTrace, self).__init__() |
2151 # Most ignores need to be determined at runtime. | 2286 # Most ignores need to be determined at runtime. |
2152 self.IGNORED = set([os.path.dirname(sys.executable)]) | 2287 self.IGNORED = set([os.path.dirname(sys.executable)]) |
2153 # Add many directories from environment variables. | 2288 # Add many directories from environment variables. |
2154 vars_to_ignore = ( | 2289 vars_to_ignore = ( |
2155 'APPDATA', | 2290 'APPDATA', |
2156 'LOCALAPPDATA', | 2291 'LOCALAPPDATA', |
2157 'ProgramData', | 2292 'ProgramData', |
2158 'ProgramFiles', | 2293 'ProgramFiles', |
2159 'ProgramFiles(x86)', | 2294 'ProgramFiles(x86)', |
2160 'ProgramW6432', | 2295 'ProgramW6432', |
2161 'SystemRoot', | 2296 'SystemRoot', |
2162 'TEMP', | 2297 'TEMP', |
2163 'TMP', | 2298 'TMP', |
2164 ) | 2299 ) |
2165 for i in vars_to_ignore: | 2300 for i in vars_to_ignore: |
2166 if os.environ.get(i): | 2301 if os.environ.get(i): |
2167 self.IGNORED.add(os.environ[i]) | 2302 self.IGNORED.add(os.environ[i]) |
2168 | 2303 |
2169 # Also add their short path name equivalents. | 2304 # Also add their short path name equivalents. |
2170 for i in list(self.IGNORED): | 2305 for i in list(self.IGNORED): |
2171 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep))) | 2306 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep))) |
2172 | 2307 |
2173 # Add these last since they have no short path name equivalent. | 2308 # Add these last since they have no short path name equivalent. |
2174 self.IGNORED.add('\\SystemRoot') | 2309 self.IGNORED.add('\\SystemRoot') |
2175 self.IGNORED = tuple(sorted(self.IGNORED)) | 2310 self.IGNORED = tuple(sorted(self.IGNORED)) |
2176 | 2311 |
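The short-name pass matters because ETW events may report either spelling of the same directory. Typical 8.3 aliases, for illustration only (actual values vary per machine, which is why they are resolved at runtime through GetShortPathName):

    C:\Program Files        ->  C:\PROGRA~1
    C:\Program Files (x86)  ->  C:\PROGRA~2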
2177 @classmethod | |
2178 def process_log(cls, logname): | |
2179 """Converts the .etl file into .csv then into .json.""" | |
2180 logformat = 'csv' | |
2181 cls._convert_log(logname, logformat) | |
2182 | |
2183 if logformat == 'csv_utf16': | |
2184 def load_file(): | |
2185 def utf_8_encoder(unicode_csv_data): | |
2186 """Encodes the unicode object as utf-8 encoded str instance""" | |
2187 for line in unicode_csv_data: | |
2188 yield line.encode('utf-8') | |
2189 | |
2190 def unicode_csv_reader(unicode_csv_data, **kwargs): | |
2191 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. | |
2192 """ | |
2193 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) | |
2194 for row in csv_reader: | |
2195 # Decode str utf-8 instances back to unicode instances, cell by | |
2196 # cell: | |
2197 yield [cell.decode('utf-8') for cell in row] | |
2198 | |
2199 # The CSV file is UTF-16 so use codecs.open() to load the file into the | |
2200 # python internal unicode format (utf-8). Then explicitly re-encode as | |
2201 # utf8 as str instances so csv can parse it fine. Then decode the utf-8 | |
2202 # str back into python unicode instances. This sounds about right. | |
2203 for line in unicode_csv_reader( | |
2204 codecs.open(logname + '.' + logformat, 'r', 'utf-16')): | |
2205 # line is a list of unicode objects | |
2206 # So much white space! | |
2207 yield [i.strip() for i in line] | |
2208 | |
2209 elif logformat == 'csv': | |
2210 def load_file(): | |
2211 def ansi_csv_reader(ansi_csv_data, **kwargs): | |
2212 """Loads an 'ANSI' code page and returns unicode() objects.""" | |
2213 assert sys.getfilesystemencoding() == 'mbcs' | |
2214 encoding = get_current_encoding() | |
2215 for row in csv.reader(ansi_csv_data, **kwargs): | |
2216 # Decode str 'ansi' instances to unicode instances, cell by cell: | |
2217 yield [cell.decode(encoding) for cell in row] | |
2218 | |
2219 # The fastest and smallest format but only supports 'ANSI' file paths. | |
2220 # E.g. the filenames are encoded in the 'current' encoding. | |
2221 for line in ansi_csv_reader(open(logname + '.' + logformat)): | |
2222 # line is a list of unicode objects. | |
2223 yield [i.strip() for i in line] | |
2224 | |
2225 supported_events = cls.Context.supported_events() | |
2226 | |
2227 def trim(generator): | |
2228 for index, line in enumerate(generator): | |
2229 if not index: | |
2230 if line != cls.EXPECTED_HEADER: | |
2231 raise TracingFailure( | |
2232 'Found malformed header: %s' % ' '.join(line), | |
2233 None, None, None) | |
2234 continue | |
2235 # As you can see, the CSV is full of useful non-redundant information: | |
2236 if (line[2] != '0' or # Event ID | |
2237 line[3] not in ('2', '3') or # Version | |
2238 line[4] != '0' or # Channel | |
2239 line[5] != '0' or # Level | |
2240 line[7] != '0' or # Task | |
2241 line[8] != '0x0000000000000000' or # Keyword | |
2242 line[12] != '' or # Instance ID | |
2243 line[13] != '' or # Parent Instance ID | |
2244 line[14] != cls.NULL_GUID or # Activity ID | |
2245 line[15] != ''): # Related Activity ID | |
2246 raise TracingFailure( | |
2247 'Found unexpected values in line: %s' % ' '.join(line), | |
2248 None, None, None) | |
2249 | |
2250 if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events: | |
2251 continue | |
2252 | |
2253 # Convert the PID in-place from hex. | |
2254 line[cls.PID] = int(line[cls.PID], 16) | |
2255 | |
2256 yield [ | |
2257 line[cls.EVENT_NAME], | |
2258 line[cls.TYPE], | |
2259 line[cls.PID], | |
2260 line[cls.TID], | |
2261 line[cls.PROCESSOR_ID], | |
2262 line[cls.TIMESTAMP], | |
2263 ] + line[cls.USER_DATA:] | |
2264 | |
2265 write_json('%s.json' % logname, list(trim(load_file())), True) | |
2266 | |
2267 @staticmethod | |
2268 def _convert_log(logname, logformat): | |
2269 """Converts the ETL trace to text representation. | |
2270 | |
2271 Normally, 'csv' is sufficient. If complex scripts are used (like eastern | |
2272 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. | |
2273 | |
2274 Arguments: | |
2275 - logname: Base filename to convert. | |
2276 - logformat: Text format to be generated, csv, csv_utf16 or xml. | |
2277 | |
2278 Use "tracerpt -?" for help. | |
2279 """ | |
2280 LOCALE_INVARIANT = 0x7F | |
2281 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) | |
2282 cmd_convert = [ | |
2283 'tracerpt.exe', | |
2284 '-l', logname + '.etl', | |
2285 '-o', logname + '.' + logformat, | |
2286 '-gmt', # Use UTC | |
2287 '-y', # No prompt | |
2288 # Use -of XML to get the header of each item after column 19, e.g. all | |
2289 # the actual headers of 'User Data'. | |
2290 ] | |
2291 | |
2292 if logformat == 'csv': | |
2293 # tracerpt localizes the 'Type' column, for major brainfuck | |
2294 # entertainment. I can't imagine any sane reason to do that. | |
2295 cmd_convert.extend(['-of', 'CSV']) | |
2296 elif logformat == 'csv_utf16': | |
2297 # This causes it to use UTF-16, which doubles the log size but ensures | |
2298 # the log is readable for non-ASCII characters. | |
2299 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) | |
2300 elif logformat == 'xml': | |
2301 cmd_convert.extend(['-of', 'XML']) | |
2302 else: | |
2303 raise ValueError('Unexpected log format \'%s\'' % logformat) | |
2304 logging.debug('Running: %s' % cmd_convert) | |
2305 # This can take tens of minutes for large logs. | |
2306 # Redirects all output to stderr. | |
2307 subprocess.check_call( | |
2308 cmd_convert, | |
2309 stdin=subprocess.PIPE, | |
2310 stdout=sys.stderr, | |
2311 stderr=sys.stderr) | |
2312 | |
2313 @staticmethod | 2312 @staticmethod |
2314 def clean_trace(logname): | 2313 def clean_trace(logname): |
2315 for ext in ('', '.csv', '.etl', '.json', '.xml'): | 2314 for ext in ('', '.csv', '.etl', '.json', '.xml'): |
2316 if os.path.isfile(logname + ext): | 2315 if os.path.isfile(logname + ext): |
2317 os.remove(logname + ext) | 2316 os.remove(logname + ext) |
2318 | 2317 |
2319 @classmethod | 2318 @classmethod |
2320 def parse_log(cls, logname, blacklist): | 2319 def parse_log(cls, logname, blacklist): |
2321 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | 2320 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
2322 | 2321 |
(...skipping 282 matching lines...) |
2605 for fn in dir(sys.modules[__name__]) | 2604 for fn in dir(sys.modules[__name__]) |
2606 if fn.startswith('CMD'))) | 2605 if fn.startswith('CMD'))) |
2607 | 2606 |
2608 command = get_command_handler(argv[0] if argv else None) | 2607 command = get_command_handler(argv[0] if argv else None) |
2609 parser = gen_parser(command) | 2608 parser = gen_parser(command) |
2610 return command(parser, argv[1:]) | 2609 return command(parser, argv[1:]) |
2611 | 2610 |
2612 | 2611 |
2613 if __name__ == '__main__': | 2612 if __name__ == '__main__': |
2614 sys.exit(main(sys.argv[1:])) | 2613 sys.exit(main(sys.argv[1:])) |