OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding=utf-8 | 2 # coding=utf-8 |
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
4 # Use of this source code is governed by a BSD-style license that can be | 4 # Use of this source code is governed by a BSD-style license that can be |
5 # found in the LICENSE file. | 5 # found in the LICENSE file. |
6 | 6 |
7 """Traces an executable and its child processes and extract the files accessed | 7 """Traces an executable and its child processes and extract the files accessed |
8 by them. | 8 by them. |
9 | 9 |
10 The implementation uses OS-specific API. The native Kernel logger and the ETL | 10 The implementation uses OS-specific API. The native Kernel logger and the ETL |
(...skipping 617 matching lines...) |
628 - output: If False, redirects output to PIPEs. | 628 - output: If False, redirects output to PIPEs. |
629 | 629 |
630 Returns a tuple (resultcode, output) and updates the internal trace | 630 Returns a tuple (resultcode, output) and updates the internal trace |
631 entries. | 631 entries. |
632 """ | 632 """ |
633 raise NotImplementedError(self.__class__.__name__) | 633 raise NotImplementedError(self.__class__.__name__) |
634 | 634 |
635 def close(self): | 635 def close(self): |
636 self._initialized = False | 636 self._initialized = False |
637 | 637 |
| 638 def post_process_log(self): |
| 639 """Post-processes the log so it becomes faster to load afterward. |
| 640 |
| 641 Must not be called manually when using the 'with' construct. |
| 642 """ |
| 643 assert not self._initialized, 'Must stop tracing first.' |
| 644 |
638 def __enter__(self): | 645 def __enter__(self): |
639 """Enables 'with' statement.""" | 646 """Enables 'with' statement.""" |
640 return self | 647 return self |
641 | 648 |
642 def __exit__(self, exc_type, exc_value, traceback): | 649 def __exit__(self, exc_type, exc_value, traceback): |
643 """Enables 'with' statement.""" | 650 """Enables 'with' statement.""" |
644 self.close() | 651 self.close() |
| 652 # If an exception was thrown, do not process logs. |
| 653 if not exc_type: |
| 654 self.post_process_log() |
645 | 655 |
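The hook above completes a small template-method contract: close() marks the tracer stopped, and __exit__ runs post_process_log() only when the block exits cleanly. A minimal sketch of that contract with a hypothetical no-op subclass (assuming the base Tracer.__init__ takes the logname and marks the instance initialized, as get_tracer() below implies):

    class NoopTracer(ApiBase.Tracer):
      def trace(self, cmd, cwd, tracename, output):
        # A real subclass would run and trace the child here.
        return 0, ''

      def post_process_log(self):
        # The base class assert enforces 'stop tracing first'.
        super(NoopTracer, self).post_process_log()

    with NoopTracer('out.json') as tracer:
      tracer.trace(['echo', 'hi'], '.', 'trace1', True)
    # Exiting the block without an exception calls close(), then
    # post_process_log() exactly once.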
646 def get_tracer(self, logname): | 656 def get_tracer(self, logname): |
647 """Returns an ApiBase.Tracer instance. | 657 """Returns an ApiBase.Tracer instance. |
648 | 658 |
649 Initializes the tracing subsystem, which is a requirement for kernel-based | 659 Initializes the tracing subsystem, which is a requirement for kernel-based |
650 tracers. Only one tracer instance should be live at a time! | 660 tracers. Only one tracer instance should be live at a time! |
651 | 661 |
652 logname is the filepath to the json file that will contain the meta-data | 662 logname is the filepath to the json file that will contain the meta-data |
653 about the logs. | 663 about the logs. |
654 """ | 664 """ |
655 return self.Tracer(logname) | 665 return self.Tracer(logname) |
656 | 666 |
657 @staticmethod | 667 @staticmethod |
658 def clean_trace(logname): | 668 def clean_trace(logname): |
659 """Deletes the old log.""" | 669 """Deletes the old log.""" |
660 raise NotImplementedError() | 670 raise NotImplementedError() |
661 | 671 |
662 @classmethod | 672 @classmethod |
663 def parse_log(cls, filename, blacklist): | 673 def parse_log(cls, logname, blacklist): |
664 """Processes a trace log and returns the files opened and the files that do | 674 """Processes a trace log and returns the files opened and the files that do |
665 not exist. | 675 not exist. |
666 | 676 |
667 It does not track directories. | 677 It does not track directories. |
668 | 678 |
669 Most of the time, files that do not exist are temporary test files that | 679 Most of the time, files that do not exist are temporary test files that |
670 should be put in /tmp instead. See http://crbug.com/116251. | 680 should be put in /tmp instead. See http://crbug.com/116251. |
671 | 681 |
672 Returns a tuple (existing files, non existing files, nb_processes_created) | 682 Returns a tuple (existing files, non existing files, nb_processes_created) |
673 """ | 683 """ |
(...skipping 349 matching lines...) |
1023 @staticmethod | 1033 @staticmethod |
1024 def clean_trace(logname): | 1034 def clean_trace(logname): |
1025 if os.path.isfile(logname): | 1035 if os.path.isfile(logname): |
1026 os.remove(logname) | 1036 os.remove(logname) |
1027 # Also delete any pid specific file from previous traces. | 1037 # Also delete any pid specific file from previous traces. |
1028 for i in glob.iglob(logname + '.*'): | 1038 for i in glob.iglob(logname + '.*'): |
1029 if i.rsplit('.', 1)[1].isdigit(): | 1039 if i.rsplit('.', 1)[1].isdigit(): |
1030 os.remove(i) | 1040 os.remove(i) |
1031 | 1041 |
1032 @classmethod | 1042 @classmethod |
1033 def parse_log(cls, filename, blacklist): | 1043 def parse_log(cls, logname, blacklist): |
1034 logging.info('parse_log(%s, %s)' % (filename, blacklist)) | 1044 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
1035 data = read_json(filename) | 1045 data = read_json(logname) |
1036 context = cls.Context(blacklist, data['cwd']) | 1046 context = cls.Context(blacklist, data['cwd']) |
1037 for pidfile in glob.iglob(filename + '.*'): | 1047 for pidfile in glob.iglob(logname + '.*'): |
1038 pid = pidfile.rsplit('.', 1)[1] | 1048 pid = pidfile.rsplit('.', 1)[1] |
1039 if pid.isdigit(): | 1049 if pid.isdigit(): |
1040 pid = int(pid) | 1050 pid = int(pid) |
1041 # TODO(maruel): Load as utf-8 | 1051 # TODO(maruel): Load as utf-8 |
1042 for line in open(pidfile, 'rb'): | 1052 for line in open(pidfile, 'rb'): |
1043 context.on_line(pid, line) | 1053 context.on_line(pid, line) |
1044 | 1054 |
1045 return context.to_results() | 1055 return context.to_results() |
1046 | 1056 |
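The classmethod above pairs with clean_trace(): the tracer writes one json metadata file plus one raw log per traced process, each suffixed with its pid. A hedged usage sketch (filenames invented; TraceApi stands in for the enclosing ApiBase subclass, whose name falls outside the visible hunk, and blacklist is the same predicate-like object Context expects):

    # trace.json       metadata, e.g. {'cwd': '/build'}
    # trace.json.1234  raw log for pid 1234
    # trace.json.1240  raw log for a child process, pid 1240
    results = TraceApi.parse_log('trace.json', blacklist)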
1047 | 1057 |
(...skipping 582 matching lines...) |
1630 # code manages the dtrace lifetime itself. | 1640 # code manages the dtrace lifetime itself. |
1631 trace_cmd = [ | 1641 trace_cmd = [ |
1632 'sudo', | 1642 'sudo', |
1633 'dtrace', | 1643 'dtrace', |
1634 '-x', 'dynvarsize=4m', | 1644 '-x', 'dynvarsize=4m', |
1635 '-x', 'evaltime=exec', | 1645 '-x', 'evaltime=exec', |
1636 '-n', self.code(child.pid, cwd), | 1646 '-n', self.code(child.pid, cwd), |
1637 '-o', '/dev/stderr', | 1647 '-o', '/dev/stderr', |
1638 '-q', | 1648 '-q', |
1639 ] | 1649 ] |
1640 with open(self._logname, 'w') as logfile: | 1650 logname = self._logname + '.log' |
| 1651 with open(logname, 'w') as logfile: |
1641 dtrace = subprocess.Popen( | 1652 dtrace = subprocess.Popen( |
1642 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) | 1653 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) |
1643 logging.debug('Started dtrace pid: %d' % dtrace.pid) | 1654 logging.debug('Started dtrace pid: %d' % dtrace.pid) |
1644 | 1655 |
1645 # Part 3: Read until one line is printed, which signifies dtrace is up and | 1656 # Part 3: Read until one line is printed, which signifies dtrace is up and |
1646 # ready. | 1657 # ready. |
1647 with open(self._logname, 'r') as logfile: | 1658 with open(logname, 'r') as logfile: |
1648 while 'dtrace_BEGIN' not in logfile.readline(): | 1659 while 'dtrace_BEGIN' not in logfile.readline(): |
1649 if dtrace.poll() is not None: | 1660 if dtrace.poll() is not None: |
1650 break | 1661 break |
1651 | 1662 |
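The loop above is a readiness handshake: dtrace prints a dtrace_BEGIN marker once its probes are installed, and the parent spins on the log until the marker shows up or dtrace dies. The same pattern as a generic sketch (hedged; it adds a short sleep the original omits):

    import time

    def wait_for_sentinel(proc, logpath, sentinel):
      """Returns True once a log line contains sentinel, False if the
      process exits first."""
      with open(logpath, 'r') as logfile:
        while True:
          line = logfile.readline()
          if sentinel in line:
            return True
          if proc.poll() is not None:
            return False
          if not line:
            time.sleep(0.01)  # readline() returns '' at EOF; don't busy-wait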
1652 try: | 1663 # Part 4: We can now tell our child to go. |
1653 # Part 4: We can now tell our child to go. | 1664 # TODO(maruel): A pipe other than stdin could be used instead. This |
1654 # TODO(maruel): A pipe other than stdin could be used instead. This | 1665 # would be more consistent with the other tracing methods. |
1655 # would be more consistent with the other tracing methods. | 1666 out = child.communicate(signal)[0] |
1656 out = child.communicate(signal)[0] | |
1657 | 1667 |
1658 dtrace.wait() | 1668 dtrace.wait() |
1659 if dtrace.returncode != 0: | 1669 if dtrace.returncode != 0: |
1660 print 'dtrace failure: %d' % dtrace.returncode | 1670 print 'dtrace failure: %d' % dtrace.returncode |
1661 with open(self._logname) as logfile: | 1671 with open(logname) as logfile: |
1662 print ''.join(logfile.readlines()[-100:]) | 1672 print ''.join(logfile.readlines()[-100:]) |
1663 # Find a better way. | 1673 # Find a better way. |
1664 os.remove(self._logname) | 1674 os.remove(logname) |
1665 else: | |
1666 # Sort the log right away to simplify our life. There isn't much | |
1667 # advantage in keeping it out of order. | |
1668 self._sort_log(self._logname) | |
1669 except KeyboardInterrupt: | |
1670 # Still sort when testing. | |
1671 self._sort_log(self._logname) | |
1672 raise | |
1673 | 1675 |
1674 return dtrace.returncode or child.returncode, out | 1676 return dtrace.returncode or child.returncode, out |
1675 | 1677 |
1676 @staticmethod | 1678 def post_process_log(self): |
1677 def _sort_log(logname): | |
1678 """Sorts the log back in order when each call occured. | 1679 """Sorts the log back in order when each call occured. |
1679 | 1680 |
1680 dtrace doesn't save the buffer in strict order since it keeps one buffer | 1681 dtrace doesn't save the buffer in strict order since it keeps one buffer |
1681 per CPU. | 1682 per CPU. |
1682 """ | 1683 """ |
| 1684 super(Dtrace.Tracer, self).post_process_log() |
| 1685 logname = self._logname + '.log' |
1683 with open(logname, 'rb') as logfile: | 1686 with open(logname, 'rb') as logfile: |
1684 lines = [l for l in logfile if l.strip()] | 1687 lines = [l for l in logfile if l.strip()] |
1685 errors = [l for l in lines if l.startswith('dtrace:')] | 1688 errors = [l for l in lines if l.startswith('dtrace:')] |
1686 if errors: | 1689 if errors: |
1687 raise TracingFailure( | 1690 raise TracingFailure( |
1688 'Found errors in the trace: %s' % '\n'.join(errors), | 1691 'Found errors in the trace: %s' % '\n'.join(errors), |
1689 None, None, None, logname) | 1692 None, None, None, logname) |
1690 try: | 1693 try: |
1691 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) | 1694 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) |
1692 except ValueError: | 1695 except ValueError: |
1693 raise TracingFailure( | 1696 raise TracingFailure( |
1694 'Found errors in the trace: %s' % '\n'.join( | 1697 'Found errors in the trace: %s' % '\n'.join( |
1695 l for l in lines if l.split(' ', 1)[0].isdigit()), | 1698 l for l in lines if l.split(' ', 1)[0].isdigit()), |
1696 None, None, None, logname) | 1699 None, None, None, logname) |
1697 with open(logname, 'wb') as logfile: | 1700 with open(logname, 'wb') as logfile: |
1698 logfile.write(''.join(lines)) | 1701 logfile.write(''.join(lines)) |
1699 | 1702 |
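The numeric sort is the core of this step: dtrace flushes one buffer per CPU, so lines land in the file out of order, but each line starts with a timestamp that a sort on the first field uses to restore global order. A tiny illustration with invented lines:

    lines = ['12 close(3)\n', '7 open("/etc/passwd")\n', '9 read(3)\n']
    lines.sort(key=lambda l: int(l.split(' ', 1)[0]))
    assert [l.split(' ', 1)[0] for l in lines] == ['7', '9', '12']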
1700 @staticmethod | 1703 @staticmethod |
1701 def clean_trace(logname): | 1704 def clean_trace(logname): |
1702 if os.path.isfile(logname): | 1705 for ext in ('', '.log'): |
1703 os.remove(logname) | 1706 if os.path.isfile(logname + ext): |
| 1707 os.remove(logname + ext) |
1704 | 1708 |
1705 @classmethod | 1709 @classmethod |
1706 def parse_log(cls, filename, blacklist): | 1710 def parse_log(cls, logname, blacklist): |
1707 logging.info('parse_log(%s, %s)' % (filename, blacklist)) | 1711 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
1708 context = cls.Context(blacklist) | 1712 context = cls.Context(blacklist) |
1709 for line in open(filename, 'rb'): | 1713 for line in open(logname + '.log', 'rb'): |
1710 context.on_line(line) | 1714 context.on_line(line) |
1711 return context.to_results() | 1715 return context.to_results() |
1712 | 1716 |
1713 | 1717 |
1714 class LogmanTrace(ApiBase): | 1718 class LogmanTrace(ApiBase): |
1715 """Uses the native Windows ETW based tracing functionality to trace a child | 1719 """Uses the native Windows ETW based tracing functionality to trace a child |
1716 process. | 1720 process. |
1717 | 1721 |
1718 Caveat: this implementation doesn't track cwd or initial_cwd. This is because | 1722 Caveat: this implementation doesn't track cwd or initial_cwd. This is because |
1719 the Windows Kernel doesn't have a concept of 'current working directory' at | 1723 the Windows Kernel doesn't have a concept of 'current working directory' at |
1720 all. A Win32 process has a map of current directories, one per drive letter, | 1724 all. A Win32 process has a map of current directories, one per drive letter, |
1721 and it is managed by the user-mode kernel32.dll. In the kernel, a file is always | 1725 and it is managed by the user-mode kernel32.dll. In the kernel, a file is always |
1722 opened relative to another file_object or as an absolute path. All the current | 1726 opened relative to another file_object or as an absolute path. All the current |
1723 working directory logic is done in user mode. | 1727 working directory logic is done in user mode. |
1724 """ | 1728 """ |
1725 # The basic headers. | |
1726 EXPECTED_HEADER = [ | |
1727 u'Event Name', | |
1728 u'Type', | |
1729 u'Event ID', | |
1730 u'Version', | |
1731 u'Channel', | |
1732 u'Level', # 5 | |
1733 u'Opcode', | |
1734 u'Task', | |
1735 u'Keyword', | |
1736 u'PID', | |
1737 u'TID', # 10 | |
1738 u'Processor Number', | |
1739 u'Instance ID', | |
1740 u'Parent Instance ID', | |
1741 u'Activity ID', | |
1742 u'Related Activity ID', # 15 | |
1743 u'Clock-Time', | |
1744 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them | |
1745 u'User(ms)', # pretty much useless. | |
1746 u'User Data', # Extra arguments that are event-specific. | |
1747 ] | |
1748 # Only the useful headers common to all entries are listed here. Any column | |
1749 # at 19 or higher is dependent on the specific event. | |
1750 EVENT_NAME = 0 | |
1751 TYPE = 1 | |
1752 PID = 9 | |
1753 TID = 10 | |
1754 PROCESSOR_ID = 11 | |
1755 TIMESTAMP = 16 | |
1756 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' | |
1757 USER_DATA = 19 | |
1758 | |
1759 class Context(ApiBase.Context): | 1729 class Context(ApiBase.Context): |
1760 """Processes a ETW log line and keeps the list of existent and non | 1730 """Processes a ETW log line and keeps the list of existent and non |
1761 existent files accessed. | 1731 existent files accessed. |
1762 | 1732 |
1763 Ignores directories. | 1733 Ignores directories. |
1764 """ | 1734 """ |
1765 # These indexes are for the stripped version in json. | 1735 # These indexes are for the stripped version in json. |
1766 EVENT_NAME = 0 | 1736 EVENT_NAME = 0 |
1767 TYPE = 1 | 1737 TYPE = 1 |
1768 PID = 2 | 1738 PID = 2 |
(...skipping 268 matching lines...) |
2037 def supported_events(cls): | 2007 def supported_events(cls): |
2038 """Returns all the procesed events.""" | 2008 """Returns all the procesed events.""" |
2039 out = [] | 2009 out = [] |
2040 for member in dir(cls): | 2010 for member in dir(cls): |
2041 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) | 2011 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) |
2042 if match: | 2012 if match: |
2043 out.append(match.groups()) | 2013 out.append(match.groups()) |
2044 return out | 2014 return out |
2045 | 2015 |
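supported_events() turns a naming convention into registration: any method named handle_<EventName>_<Type> on a Context subclass is picked up by the regexp above. A sketch with a hypothetical handler:

    class MyContext(LogmanTrace.Context):
      def handle_FileIo_Create(self, line):
        pass  # a real handler would record the path being opened

    assert ('FileIo', 'Create') in MyContext.supported_events()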
2046 class Tracer(ApiBase.Tracer): | 2016 class Tracer(ApiBase.Tracer): |
| 2017 # The basic headers. |
| 2018 EXPECTED_HEADER = [ |
| 2019 u'Event Name', |
| 2020 u'Type', |
| 2021 u'Event ID', |
| 2022 u'Version', |
| 2023 u'Channel', |
| 2024 u'Level', # 5 |
| 2025 u'Opcode', |
| 2026 u'Task', |
| 2027 u'Keyword', |
| 2028 u'PID', |
| 2029 u'TID', # 10 |
| 2030 u'Processor Number', |
| 2031 u'Instance ID', |
| 2032 u'Parent Instance ID', |
| 2033 u'Activity ID', |
| 2034 u'Related Activity ID', # 15 |
| 2035 u'Clock-Time', |
| 2036 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them |
| 2037 u'User(ms)', # pretty much useless. |
| 2038 u'User Data', # Extra arguments that are event-specific. |
| 2039 ] |
| 2040 # Only the useful headers common to all entries are listed here. Any column |
| 2041 # at 19 or higher is dependent on the specific event. |
| 2042 EVENT_NAME = 0 |
| 2043 TYPE = 1 |
| 2044 PID = 9 |
| 2045 TID = 10 |
| 2046 PROCESSOR_ID = 11 |
| 2047 TIMESTAMP = 16 |
| 2048 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' |
| 2049 USER_DATA = 19 |
| 2050 |
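These constants index into the raw tracerpt CSV rows. A hedged sketch of one row (all values invented for illustration; the PID column is hex, which is why post_process_log() converts it below):

    T = LogmanTrace.Tracer
    row = [u'FileIo', u'Create', u'0', u'2', u'0', u'0', u'', u'0',
           u'0x0000000000000000', u'0x1a2b', u'0x1a2c', u'0', u'', u'',
           T.NULL_GUID, u'', u'129496397558960000', u'15', u'15',
           u'c:\\foo.txt']
    print row[T.EVENT_NAME], int(row[T.PID], 16), row[T.USER_DATA:]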
2047 def trace(self, cmd, cwd, tracename, output): | 2051 def trace(self, cmd, cwd, tracename, output): |
2048 """Uses logman.exe to start and stop the NT Kernel Logger while the | 2052 """Uses logman.exe to start and stop the NT Kernel Logger while the |
2049 executable to be traced is run. | 2053 executable to be traced is run. |
2050 """ | 2054 """ |
2051 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | 2055 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) |
2052 # Use "logman -?" for help. | 2056 # Use "logman -?" for help. |
2053 | 2057 |
2054 stdout = stderr = None | 2058 stdout = stderr = None |
2055 if output: | 2059 if output: |
2056 stdout = subprocess.PIPE | 2060 stdout = subprocess.PIPE |
(...skipping 19 matching lines...) |
2076 cwd=cwd, | 2080 cwd=cwd, |
2077 stdin=subprocess.PIPE, | 2081 stdin=subprocess.PIPE, |
2078 stdout=stdout, | 2082 stdout=stdout, |
2079 stderr=stderr) | 2083 stderr=stderr) |
2080 logging.debug('Started child pid: %d' % child.pid) | 2084 logging.debug('Started child pid: %d' % child.pid) |
2081 out = child.communicate()[0] | 2085 out = child.communicate()[0] |
2082 finally: | 2086 finally: |
2083 # 3. Stop the log collection. | 2087 # 3. Stop the log collection. |
2084 self._stop_log() | 2088 self._stop_log() |
2085 | 2089 |
2086 # 4. Convert log | |
2087 # TODO(maruel): Temporary, remove me. | |
2088 LogmanTrace.process_log(self._logname) | |
2089 | |
2090 # 5. Save metadata. | 2090 # 5. Save metadata. |
2091 value = { | 2091 value = { |
2092 'pid': child.pid, | 2092 'pid': child.pid, |
2093 'format': 'csv', | 2093 'format': 'csv', |
2094 } | 2094 } |
2095 write_json(self._logname, value, False) | 2095 write_json(self._logname, value, False) |
2096 return child.returncode, out | 2096 return child.returncode, out |
2097 | 2097 |
2098 @classmethod | 2098 @classmethod |
2099 def _start_log(cls, etl): | 2099 def _start_log(cls, etl): |
(...skipping 39 matching lines...) |
2139 'NT Kernel Logger', | 2139 'NT Kernel Logger', |
2140 '-ets', # Sends the command directly to the kernel. | 2140 '-ets', # Sends the command directly to the kernel. |
2141 ] | 2141 ] |
2142 logging.debug('Running: %s' % cmd_stop) | 2142 logging.debug('Running: %s' % cmd_stop) |
2143 subprocess.check_call( | 2143 subprocess.check_call( |
2144 cmd_stop, | 2144 cmd_stop, |
2145 stdin=subprocess.PIPE, | 2145 stdin=subprocess.PIPE, |
2146 stdout=subprocess.PIPE, | 2146 stdout=subprocess.PIPE, |
2147 stderr=subprocess.STDOUT) | 2147 stderr=subprocess.STDOUT) |
2148 | 2148 |
| 2149 def post_process_log(self): |
| 2150 """Converts the .etl file into .csv then into .json.""" |
| 2151 super(LogmanTrace.Tracer, self).post_process_log() |
| 2152 logformat = 'csv' |
| 2153 self._convert_log(logformat) |
| 2154 |
| 2155 if logformat == 'csv_utf16': |
| 2156 def load_file(): |
| 2157 def utf_8_encoder(unicode_csv_data): |
| 2158 """Encodes the unicode object as utf-8 encoded str instance""" |
| 2159 for line in unicode_csv_data: |
| 2160 yield line.encode('utf-8') |
| 2161 |
| 2162 def unicode_csv_reader(unicode_csv_data, **kwargs): |
| 2163 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. |
| 2164 """ |
| 2165 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) |
| 2166 for row in csv_reader: |
| 2167 # Decode str utf-8 instances back to unicode instances, cell by |
| 2168 # cell: |
| 2169 yield [cell.decode('utf-8') for cell in row] |
| 2170 |
| 2171 # The CSV file is UTF-16 so use codecs.open() to load the file into |
| 2172 # the python internal unicode format (utf-8). Then explicitly |
| 2173 # re-encode as utf8 as str instances so csv can parse it fine. Then |
| 2174 # decode the utf-8 str back into python unicode instances. This |
| 2175 # sounds about right. |
| 2176 for line in unicode_csv_reader( |
| 2177 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')): |
| 2178 # line is a list of unicode objects |
| 2179 # So much white space! |
| 2180 yield [i.strip() for i in line] |
| 2181 |
| 2182 elif logformat == 'csv': |
| 2183 def load_file(): |
| 2184 def ansi_csv_reader(ansi_csv_data, **kwargs): |
| 2185 """Loads an 'ANSI' code page and returns unicode() objects.""" |
| 2186 assert sys.getfilesystemencoding() == 'mbcs' |
| 2187 encoding = get_current_encoding() |
| 2188 for row in csv.reader(ansi_csv_data, **kwargs): |
| 2189 # Decode str 'ansi' instances to unicode instances, cell by cell: |
| 2190 yield [cell.decode(encoding) for cell in row] |
| 2191 |
| 2192 # The fastest and smallest format but only supports 'ANSI' file paths. |
| 2193 # E.g. the filenames are encoded in the 'current' encoding. |
| 2194 for line in ansi_csv_reader(open(self._logname + '.' + logformat)): |
| 2195 # line is a list of unicode objects. |
| 2196 yield [i.strip() for i in line] |
| 2197 |
| 2198 supported_events = LogmanTrace.Context.supported_events() |
| 2199 |
| 2200 def trim(generator): |
| 2201 for index, line in enumerate(generator): |
| 2202 if not index: |
| 2203 if line != self.EXPECTED_HEADER: |
| 2204 raise TracingFailure( |
| 2205 'Found malformed header: %s' % ' '.join(line), |
| 2206 None, None, None) |
| 2207 continue |
| 2208 # As you can see, the CSV is full of useful non-redundant information: |
| 2209 if (line[2] != '0' or # Event ID |
| 2210 line[3] not in ('2', '3') or # Version |
| 2211 line[4] != '0' or # Channel |
| 2212 line[5] != '0' or # Level |
| 2213 line[7] != '0' or # Task |
| 2214 line[8] != '0x0000000000000000' or # Keyword |
| 2215 line[12] != '' or # Instance ID |
| 2216 line[13] != '' or # Parent Instance ID |
| 2217 line[14] != self.NULL_GUID or # Activity ID |
| 2218 line[15] != ''): # Related Activity ID |
| 2219 raise TracingFailure( |
| 2220 'Found unexpected values in line: %s' % ' '.join(line), |
| 2221 None, None, None) |
| 2222 |
| 2223 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: |
| 2224 continue |
| 2225 |
| 2226 # Convert the PID in-place from hex. |
| 2227 line[self.PID] = int(line[self.PID], 16) |
| 2228 |
| 2229 yield [ |
| 2230 line[self.EVENT_NAME], |
| 2231 line[self.TYPE], |
| 2232 line[self.PID], |
| 2233 line[self.TID], |
| 2234 line[self.PROCESSOR_ID], |
| 2235 line[self.TIMESTAMP], |
| 2236 ] + line[self.USER_DATA:] |
| 2237 |
| 2238 write_json('%s.json' % self._logname, list(trim(load_file())), True) |
| 2239 |
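After trim(), each event keeps only the six common columns plus the event-specific User Data tail, which is why LogmanTrace.Context declares the stripped indexes (EVENT_NAME=0, TYPE=1, PID=2) instead of the raw CSV positions. A trimmed row would look like (values invented):

    [u'FileIo', u'Create', 6699, u'0x1a2c', u'0',
     u'129496397558960000', u'c:\\foo.txt']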
| 2240 def _convert_log(self, logformat): |
| 2241 """Converts the ETL trace to text representation. |
| 2242 |
| 2243 Normally, 'csv' is sufficient. If complex scripts are used (like eastern |
| 2244 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. |
| 2245 |
| 2246 Arguments: |
| 2247 - logformat: Text format to be generated, csv, csv_utf16 or xml. |
| 2248 |
| 2249 Use "tracerpt -?" for help. |
| 2250 """ |
| 2251 LOCALE_INVARIANT = 0x7F |
| 2252 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) |
| 2253 cmd_convert = [ |
| 2254 'tracerpt.exe', |
| 2255 '-l', self._logname + '.etl', |
| 2256 '-o', self._logname + '.' + logformat, |
| 2257 '-gmt', # Use UTC |
| 2258 '-y', # No prompt |
| 2259 # Use -of XML to get the header of each item after column 19, e.g. all |
| 2260 # the actual headers of 'User Data'. |
| 2261 ] |
| 2262 |
| 2263 if logformat == 'csv': |
| 2264 # tracerpt localizes the 'Type' column, for major brainfuck |
| 2265 # entertainment. I can't imagine any sane reason to do that. |
| 2266 cmd_convert.extend(['-of', 'CSV']) |
| 2267 elif logformat == 'csv_utf16': |
| 2268 # This causes it to use UTF-16, which doubles the log size but ensures |
| 2269 # the log is readable for non-ASCII characters. |
| 2270 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) |
| 2271 elif logformat == 'xml': |
| 2272 cmd_convert.extend(['-of', 'XML']) |
| 2273 else: |
| 2274 raise ValueError('Unexpected log format \'%s\'' % logformat) |
| 2275 logging.debug('Running: %s' % cmd_convert) |
| 2276 # This can take tens of minutes for large logs. |
| 2277 # Redirects all output to stderr. |
| 2278 subprocess.check_call( |
| 2279 cmd_convert, |
| 2280 stdin=subprocess.PIPE, |
| 2281 stdout=sys.stderr, |
| 2282 stderr=sys.stderr) |
| 2283 |
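For logname 'trace' and the default 'csv' format, _convert_log() assembles the following command (flags verbatim from the code above):

    tracerpt.exe -l trace.etl -o trace.csv -gmt -y -of CSV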
2149 def __init__(self): | 2284 def __init__(self): |
2150 super(LogmanTrace, self).__init__() | 2285 super(LogmanTrace, self).__init__() |
2151 # Most ignores need to be determined at runtime. | 2286 # Most ignores need to be determined at runtime. |
2152 self.IGNORED = set([os.path.dirname(sys.executable)]) | 2287 self.IGNORED = set([os.path.dirname(sys.executable)]) |
2153 # Add many directories from environment variables. | 2288 # Add many directories from environment variables. |
2154 vars_to_ignore = ( | 2289 vars_to_ignore = ( |
2155 'APPDATA', | 2290 'APPDATA', |
2156 'LOCALAPPDATA', | 2291 'LOCALAPPDATA', |
2157 'ProgramData', | 2292 'ProgramData', |
2158 'ProgramFiles', | 2293 'ProgramFiles', |
2159 'ProgramFiles(x86)', | 2294 'ProgramFiles(x86)', |
2160 'ProgramW6432', | 2295 'ProgramW6432', |
2161 'SystemRoot', | 2296 'SystemRoot', |
2162 'TEMP', | 2297 'TEMP', |
2163 'TMP', | 2298 'TMP', |
2164 ) | 2299 ) |
2165 for i in vars_to_ignore: | 2300 for i in vars_to_ignore: |
2166 if os.environ.get(i): | 2301 if os.environ.get(i): |
2167 self.IGNORED.add(os.environ[i]) | 2302 self.IGNORED.add(os.environ[i]) |
2168 | 2303 |
2169 # Also add their short path name equivalents. | 2304 # Also add their short path name equivalents. |
2170 for i in list(self.IGNORED): | 2305 for i in list(self.IGNORED): |
2171 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep))) | 2306 self.IGNORED.add(GetShortPathName(i.replace('/', os.path.sep))) |
2172 | 2307 |
2173 # Add these last since they have no short path name equivalent. | 2308 # Add these last since they have no short path name equivalent. |
2174 self.IGNORED.add('\\SystemRoot') | 2309 self.IGNORED.add('\\SystemRoot') |
2175 self.IGNORED = tuple(sorted(self.IGNORED)) | 2310 self.IGNORED = tuple(sorted(self.IGNORED)) |
2176 | 2311 |
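The short-name pass matters because ETW events may report either spelling of the same directory. Typical 8.3 aliases, for illustration only (actual values vary per machine, which is why they are resolved at runtime through GetShortPathName):

    C:\Program Files        ->  C:\PROGRA~1
    C:\Program Files (x86)  ->  C:\PROGRA~2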
2177 @classmethod | |
2178 def process_log(cls, logname): | |
2179 """Converts the .etl file into .csv then into .json.""" | |
2180 logformat = 'csv' | |
2181 cls._convert_log(logname, logformat) | |
2182 | |
2183 if logformat == 'csv_utf16': | |
2184 def load_file(): | |
2185 def utf_8_encoder(unicode_csv_data): | |
2186 """Encodes the unicode object as utf-8 encoded str instance""" | |
2187 for line in unicode_csv_data: | |
2188 yield line.encode('utf-8') | |
2189 | |
2190 def unicode_csv_reader(unicode_csv_data, **kwargs): | |
2191 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. | |
2192 """ | |
2193 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) | |
2194 for row in csv_reader: | |
2195 # Decode str utf-8 instances back to unicode instances, cell by | |
2196 # cell: | |
2197 yield [cell.decode('utf-8') for cell in row] | |
2198 | |
2199 # The CSV file is UTF-16 so use codecs.open() to load the file into the | |
2200 # python internal unicode format (utf-8). Then explicitly re-encode as | |
2201 # utf8 as str instances so csv can parse it fine. Then decode the utf-8 | |
2202 # str back into python unicode instances. This sounds about right. | |
2203 for line in unicode_csv_reader( | |
2204 codecs.open(logname + '.' + logformat, 'r', 'utf-16')): | |
2205 # line is a list of unicode objects | |
2206 # So much white space! | |
2207 yield [i.strip() for i in line] | |
2208 | |
2209 elif logformat == 'csv': | |
2210 def load_file(): | |
2211 def ansi_csv_reader(ansi_csv_data, **kwargs): | |
2212 """Loads an 'ANSI' code page and returns unicode() objects.""" | |
2213 assert sys.getfilesystemencoding() == 'mbcs' | |
2214 encoding = get_current_encoding() | |
2215 for row in csv.reader(ansi_csv_data, **kwargs): | |
2216 # Decode str 'ansi' instances to unicode instances, cell by cell: | |
2217 yield [cell.decode(encoding) for cell in row] | |
2218 | |
2219 # The fastest and smallest format but only supports 'ANSI' file paths. | |
2220 # E.g. the filenames are encoded in the 'current' encoding. | |
2221 for line in ansi_csv_reader(open(logname + '.' + logformat)): | |
2222 # line is a list of unicode objects. | |
2223 yield [i.strip() for i in line] | |
2224 | |
2225 supported_events = cls.Context.supported_events() | |
2226 | |
2227 def trim(generator): | |
2228 for index, line in enumerate(generator): | |
2229 if not index: | |
2230 if line != cls.EXPECTED_HEADER: | |
2231 raise TracingFailure( | |
2232 'Found malformed header: %s' % ' '.join(line), | |
2233 None, None, None) | |
2234 continue | |
2235 # As you can see, the CSV is full of useful non-redundant information: | |
2236 if (line[2] != '0' or # Event ID | |
2237 line[3] not in ('2', '3') or # Version | |
2238 line[4] != '0' or # Channel | |
2239 line[5] != '0' or # Level | |
2240 line[7] != '0' or # Task | |
2241 line[8] != '0x0000000000000000' or # Keyword | |
2242 line[12] != '' or # Instance ID | |
2243 line[13] != '' or # Parent Instance ID | |
2244 line[14] != cls.NULL_GUID or # Activity ID | |
2245 line[15] != ''): # Related Activity ID | |
2246 raise TracingFailure( | |
2247 'Found unexpected values in line: %s' % ' '.join(line), | |
2248 None, None, None) | |
2249 | |
2250 if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events: | |
2251 continue | |
2252 | |
2253 # Convert the PID in-place from hex. | |
2254 line[cls.PID] = int(line[cls.PID], 16) | |
2255 | |
2256 yield [ | |
2257 line[cls.EVENT_NAME], | |
2258 line[cls.TYPE], | |
2259 line[cls.PID], | |
2260 line[cls.TID], | |
2261 line[cls.PROCESSOR_ID], | |
2262 line[cls.TIMESTAMP], | |
2263 ] + line[cls.USER_DATA:] | |
2264 | |
2265 write_json('%s.json' % logname, list(trim(load_file())), True) | |
2266 | |
2267 @staticmethod | |
2268 def _convert_log(logname, logformat): | |
2269 """Converts the ETL trace to text representation. | |
2270 | |
2271 Normally, 'csv' is sufficient. If complex scripts are used (like eastern | |
2272 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. | |
2273 | |
2274 Arguments: | |
2275 - logname: Base filename to convert. | |
2276 - logformat: Text format to be generated, csv, csv_utf16 or xml. | |
2277 | |
2278 Use "tracerpt -?" for help. | |
2279 """ | |
2280 LOCALE_INVARIANT = 0x7F | |
2281 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) | |
2282 cmd_convert = [ | |
2283 'tracerpt.exe', | |
2284 '-l', logname + '.etl', | |
2285 '-o', logname + '.' + logformat, | |
2286 '-gmt', # Use UTC | |
2287 '-y', # No prompt | |
2288 # Use -of XML to get the header of each item after column 19, e.g. all | |
2289 # the actual headers of 'User Data'. | |
2290 ] | |
2291 | |
2292 if logformat == 'csv': | |
2293 # tracerpt localizes the 'Type' column, for major brainfuck | |
2294 # entertainment. I can't imagine any sane reason to do that. | |
2295 cmd_convert.extend(['-of', 'CSV']) | |
2296 elif logformat == 'csv_utf16': | |
2297 # This causes it to use UTF-16, which doubles the log size but ensures | |
2298 # the log is readable for non-ASCII characters. | |
2299 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) | |
2300 elif logformat == 'xml': | |
2301 cmd_convert.extend(['-of', 'XML']) | |
2302 else: | |
2303 raise ValueError('Unexpected log format \'%s\'' % logformat) | |
2304 logging.debug('Running: %s' % cmd_convert) | |
2305 # This can take tens of minutes for large logs. | |
2306 # Redirects all output to stderr. | |
2307 subprocess.check_call( | |
2308 cmd_convert, | |
2309 stdin=subprocess.PIPE, | |
2310 stdout=sys.stderr, | |
2311 stderr=sys.stderr) | |
2312 | |
2313 @staticmethod | 2312 @staticmethod |
2314 def clean_trace(logname): | 2313 def clean_trace(logname): |
2315 for ext in ('', '.csv', '.etl', '.json', '.xml'): | 2314 for ext in ('', '.csv', '.etl', '.json', '.xml'): |
2316 if os.path.isfile(logname + ext): | 2315 if os.path.isfile(logname + ext): |
2317 os.remove(logname + ext) | 2316 os.remove(logname + ext) |
2318 | 2317 |
2319 @classmethod | 2318 @classmethod |
2320 def parse_log(cls, logname, blacklist): | 2319 def parse_log(cls, logname, blacklist): |
2321 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | 2320 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
2322 | 2321 |
(...skipping 282 matching lines...) |
2605 for fn in dir(sys.modules[__name__]) | 2604 for fn in dir(sys.modules[__name__]) |
2606 if fn.startswith('CMD'))) | 2605 if fn.startswith('CMD'))) |
2607 | 2606 |
2608 command = get_command_handler(argv[0] if argv else None) | 2607 command = get_command_handler(argv[0] if argv else None) |
2609 parser = gen_parser(command) | 2608 parser = gen_parser(command) |
2610 return command(parser, argv[1:]) | 2609 return command(parser, argv[1:]) |
2611 | 2610 |
2612 | 2611 |
2613 if __name__ == '__main__': | 2612 if __name__ == '__main__': |
2614 sys.exit(main(sys.argv[1:])) | 2613 sys.exit(main(sys.argv[1:])) |