Chromium Code Reviews

Unified Diff: tools/isolate/trace_inputs.py

Issue 10537059: Add ApiBase.Tracer.post_process_log() to do implementation specific one-time mangling of the logs. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 6 months ago
Index: tools/isolate/trace_inputs.py
diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py
index 3c8a81360d28caf36aee2c5ed2b1dad8303bc0ef..b68c3a530ffba28d216e435078f53a34773485cc 100755
--- a/tools/isolate/trace_inputs.py
+++ b/tools/isolate/trace_inputs.py
@@ -635,6 +635,13 @@ class ApiBase(object):
def close(self):
self._initialized = False
+ def post_process_log(self):
+ """Post-processes the log so it becomes faster to load afterward.
+
+ Must not be called manually when using the 'with' construct.
+ """
+ assert not self._initialized, 'Must stop tracing first.'
+
def __enter__(self):
"""Enables 'with' statement."""
return self
@@ -642,6 +649,9 @@ class ApiBase(object):
def __exit__(self, exc_type, exc_value, traceback):
"""Enables 'with' statement."""
self.close()
+ # If an exception was thrown, do not process logs.
+ if not exc_type:
+ self.post_process_log()
def get_tracer(self, logname):
"""Returns an ApiBase.Tracer instance.
@@ -660,7 +670,7 @@ class ApiBase(object):
raise NotImplementedError()
@classmethod
- def parse_log(cls, filename, blacklist):
+ def parse_log(cls, logname, blacklist):
"""Processes a trace log and returns the files opened and the files that do
not exist.
@@ -1030,11 +1040,11 @@ class Strace(ApiBase):
os.remove(i)
@classmethod
- def parse_log(cls, filename, blacklist):
- logging.info('parse_log(%s, %s)' % (filename, blacklist))
- data = read_json(filename)
+ def parse_log(cls, logname, blacklist):
+ logging.info('parse_log(%s, %s)' % (logname, blacklist))
+ data = read_json(logname)
context = cls.Context(blacklist, data['cwd'])
- for pidfile in glob.iglob(filename + '.*'):
+ for pidfile in glob.iglob(logname + '.*'):
pid = pidfile.rsplit('.', 1)[1]
if pid.isdigit():
pid = int(pid)
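As an aside, a hedged sketch of the on-disk layout this loop assumes: strace leaves one '<logname>.<pid>' file per traced process next to the JSON summary. The paths below are made up; only the glob pattern and pid parsing come from the code above.

import glob

logname = '/tmp/trace'  # JSON summary written by the tracer (holds 'cwd')
# Raw per-process logs sit next to it: /tmp/trace.1234, /tmp/trace.1250, ...
for pidfile in glob.iglob(logname + '.*'):
  pid = pidfile.rsplit('.', 1)[1]
  if pid.isdigit():
    print 'would feed %s to Strace.Context as pid %d' % (pidfile, int(pid))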
@@ -1637,49 +1647,42 @@ class Dtrace(ApiBase):
'-o', '/dev/stderr',
'-q',
]
- with open(self._logname, 'w') as logfile:
+ logname = self._logname + '.log'
+ with open(logname, 'w') as logfile:
dtrace = subprocess.Popen(
trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
logging.debug('Started dtrace pid: %d' % dtrace.pid)
# Part 3: Read until one line is printed, which signifies dtrace is up and
# ready.
- with open(self._logname, 'r') as logfile:
+ with open(logname, 'r') as logfile:
while 'dtrace_BEGIN' not in logfile.readline():
if dtrace.poll() is not None:
break
- try:
- # Part 4: We can now tell our child to go.
- # TODO(maruel): Another pipe than stdin could be used instead. This
- # would be more consistent with the other tracing methods.
- out = child.communicate(signal)[0]
-
- dtrace.wait()
- if dtrace.returncode != 0:
- print 'dtrace failure: %d' % dtrace.returncode
- with open(self._logname) as logfile:
- print ''.join(logfile.readlines()[-100:])
- # Find a better way.
- os.remove(self._logname)
- else:
- # Short the log right away to simplify our life. There isn't much
- # advantage in keeping it out of order.
- self._sort_log(self._logname)
- except KeyboardInterrupt:
- # Still sort when testing.
- self._sort_log(self._logname)
- raise
+ # Part 4: We can now tell our child to go.
+ # TODO(maruel): Another pipe than stdin could be used instead. This
+ # would be more consistent with the other tracing methods.
+ out = child.communicate(signal)[0]
+
+ dtrace.wait()
+ if dtrace.returncode != 0:
+ print 'dtrace failure: %d' % dtrace.returncode
+ with open(logname) as logfile:
+ print ''.join(logfile.readlines()[-100:])
+ # Find a better way.
+ os.remove(logname)
return dtrace.returncode or child.returncode, out
- @staticmethod
- def _sort_log(logname):
+ def post_process_log(self):
"""Sorts the log back in order when each call occured.
dtrace doesn't save the buffer in strict order since it keeps one buffer
- per CPU.
+ per CPU.
"""
+ super(Dtrace.Tracer, self).post_process_log()
+ logname = self._logname + '.log'
with open(logname, 'rb') as logfile:
lines = [l for l in logfile if l.strip()]
errors = [l for l in lines if l.startswith('dtrace:')]
@@ -1699,14 +1702,15 @@ class Dtrace(ApiBase):
@staticmethod
def clean_trace(logname):
- if os.path.isfile(logname):
- os.remove(logname)
+ for ext in ('', '.log'):
+ if os.path.isfile(logname + ext):
+ os.remove(logname + ext)
@classmethod
- def parse_log(cls, filename, blacklist):
- logging.info('parse_log(%s, %s)' % (filename, blacklist))
+ def parse_log(cls, logname, blacklist):
+ logging.info('parse_log(%s, %s)' % (logname, blacklist))
context = cls.Context(blacklist)
- for line in open(filename, 'rb'):
+ for line in open(logname + '.log', 'rb'):
context.on_line(line)
return context.to_results()
@@ -1722,40 +1726,6 @@ class LogmanTrace(ApiBase):
opened relative to another file_object or as an absolute path. All the current
working directory logic is done in user mode.
"""
- # The basic headers.
- EXPECTED_HEADER = [
- u'Event Name',
- u'Type',
- u'Event ID',
- u'Version',
- u'Channel',
- u'Level', # 5
- u'Opcode',
- u'Task',
- u'Keyword',
- u'PID',
- u'TID', # 10
- u'Processor Number',
- u'Instance ID',
- u'Parent Instance ID',
- u'Activity ID',
- u'Related Activity ID', # 15
- u'Clock-Time',
- u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
- u'User(ms)', # pretty much useless.
- u'User Data', # Extra arguments that are event-specific.
- ]
- # Only the useful headers common to all entries are listed there. Any column
- # at 19 or higher is dependent on the specific event.
- EVENT_NAME = 0
- TYPE = 1
- PID = 9
- TID = 10
- PROCESSOR_ID = 11
- TIMESTAMP = 16
- NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
- USER_DATA = 19
-
class Context(ApiBase.Context):
"""Processes a ETW log line and keeps the list of existent and non
existent files accessed.
@@ -2044,6 +2014,40 @@ class LogmanTrace(ApiBase):
return out
class Tracer(ApiBase.Tracer):
+ # The basic headers.
+ EXPECTED_HEADER = [
+ u'Event Name',
+ u'Type',
+ u'Event ID',
+ u'Version',
+ u'Channel',
+ u'Level', # 5
+ u'Opcode',
+ u'Task',
+ u'Keyword',
+ u'PID',
+ u'TID', # 10
+ u'Processor Number',
+ u'Instance ID',
+ u'Parent Instance ID',
+ u'Activity ID',
+ u'Related Activity ID', # 15
+ u'Clock-Time',
+ u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
+ u'User(ms)', # pretty much useless.
+ u'User Data', # Extra arguments that are event-specific.
+ ]
+ # Only the useful headers common to all entries are listed there. Any column
+ # at 19 or higher is dependent on the specific event.
+ EVENT_NAME = 0
+ TYPE = 1
+ PID = 9
+ TID = 10
+ PROCESSOR_ID = 11
+ TIMESTAMP = 16
+ NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
+ USER_DATA = 19
+
def trace(self, cmd, cwd, tracename, output):
"""Uses logman.exe to start and stop the NT Kernel Logger while the
executable to be traced is run.
@@ -2083,10 +2087,6 @@ class LogmanTrace(ApiBase):
# 3. Stop the log collection.
self._stop_log()
- # 4. Convert log
- # TODO(maruel): Temporary, remove me.
- LogmanTrace.process_log(self._logname)
-
# 5. Save metadata.
value = {
'pid': child.pid,
@@ -2146,6 +2146,141 @@ class LogmanTrace(ApiBase):
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
+ def post_process_log(self):
+ """Converts the .etl file into .csv then into .json."""
+ super(LogmanTrace.Tracer, self).post_process_log()
+ logformat = 'csv'
+ self._convert_log(logformat)
+
+ if logformat == 'csv_utf16':
+ def load_file():
+ def utf_8_encoder(unicode_csv_data):
+ """Encodes the unicode object as utf-8 encoded str instance"""
+ for line in unicode_csv_data:
+ yield line.encode('utf-8')
+
+ def unicode_csv_reader(unicode_csv_data, **kwargs):
+ """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
+ """
+ csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
+ for row in csv_reader:
+ # Decode str utf-8 instances back to unicode instances, cell by
+ # cell:
+ yield [cell.decode('utf-8') for cell in row]
+
+ # The CSV file is UTF-16 so use codecs.open() to load the file into
+ # the python internal unicode format (utf-8). Then explicitly
+ # re-encode as utf8 as str instances so csv can parse it fine. Then
+ # decode the utf-8 str back into python unicode instances. This
+ # sounds about right.
+ for line in unicode_csv_reader(
+ codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
+ # line is a list of unicode objects
+ # So much white space!
+ yield [i.strip() for i in line]
+
+ elif logformat == 'csv':
+ def load_file():
+ def ansi_csv_reader(ansi_csv_data, **kwargs):
+ """Loads an 'ANSI' code page and returns unicode() objects."""
+ assert sys.getfilesystemencoding() == 'mbcs'
+ encoding = get_current_encoding()
+ for row in csv.reader(ansi_csv_data, **kwargs):
+ # Decode str 'ansi' instances to unicode instances, cell by cell:
+ yield [cell.decode(encoding) for cell in row]
+
+ # The fastest and smallest format but only supports 'ANSI' file paths.
+ # I.e. the filenames are encoded in the 'current' encoding.
+ for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
+ # line is a list of unicode objects.
+ yield [i.strip() for i in line]
+
+ supported_events = LogmanTrace.Context.supported_events()
+
+ def trim(generator):
+ for index, line in enumerate(generator):
+ if not index:
+ if line != self.EXPECTED_HEADER:
+ raise TracingFailure(
+ 'Found malformed header: %s' % ' '.join(line),
+ None, None, None)
+ continue
+ # As you can see, the CSV is full of useful non-redundant information:
+ if (line[2] != '0' or # Event ID
+ line[3] not in ('2', '3') or # Version
+ line[4] != '0' or # Channel
+ line[5] != '0' or # Level
+ line[7] != '0' or # Task
+ line[8] != '0x0000000000000000' or # Keyword
+ line[12] != '' or # Instance ID
+ line[13] != '' or # Parent Instance ID
+ line[14] != self.NULL_GUID or # Activity ID
+ line[15] != ''): # Related Activity ID
+ raise TracingFailure(
+ 'Found unexpected values in line: %s' % ' '.join(line),
+ None, None, None)
+
+ if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
+ continue
+
+ # Convert the PID in-place from hex.
+ line[self.PID] = int(line[self.PID], 16)
+
+ yield [
+ line[self.EVENT_NAME],
+ line[self.TYPE],
+ line[self.PID],
+ line[self.TID],
+ line[self.PROCESSOR_ID],
+ line[self.TIMESTAMP],
+ ] + line[self.USER_DATA:]
+
+ write_json('%s.json' % self._logname, list(trim(load_file())), True)
+
+ def _convert_log(self, logformat):
+ """Converts the ETL trace to text representation.
+
+ Normally, 'csv' is sufficient. If complex scripts are used (like eastern
+ languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
+
+ Arguments:
+ - logformat: Text format to be generated, csv, csv_utf16 or xml.
+
+ Use "tracerpt -?" for help.
+ """
+ LOCALE_INVARIANT = 0x7F
+ windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
+ cmd_convert = [
+ 'tracerpt.exe',
+ '-l', self._logname + '.etl',
+ '-o', self._logname + '.' + logformat,
+ '-gmt', # Use UTC
+ '-y', # No prompt
+ # Use -of XML to get the header of each item after column 19, e.g. all
+ # the actual headers of 'User Data'.
+ ]
+
+ if logformat == 'csv':
+ # tracerpt localizes the 'Type' column, for major brainfuck
+ # entertainment. I can't imagine any sane reason to do that.
+ cmd_convert.extend(['-of', 'CSV'])
+ elif logformat == 'csv_utf16':
+ # This causes it to use UTF-16, which doubles the log size but ensures
+ # the log is readable for non-ASCII characters.
+ cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
+ elif logformat == 'xml':
+ cmd_convert.extend(['-of', 'XML'])
+ else:
+ raise ValueError('Unexpected log format \'%s\'' % logformat)
+ logging.debug('Running: %s' % cmd_convert)
+ # This can take tens of minutes for large logs.
+ # Redirects all output to stderr.
+ subprocess.check_call(
+ cmd_convert,
+ stdin=subprocess.PIPE,
+ stdout=sys.stderr,
+ stderr=sys.stderr)
+
def __init__(self):
super(LogmanTrace, self).__init__()
# Most ignores need to be determined at runtime.
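The csv_utf16 branch in post_process_log() above relies on the usual Python 2 workaround for the csv module's lack of unicode support. A self-contained sketch of the same round-trip, on a hypothetical file name:

import codecs
import csv

def read_utf16_csv(path):
  # UTF-16 file -> unicode lines (codecs) -> UTF-8 bytes for csv.reader()
  # -> unicode cells, stripped, as the load_file() closure above does.
  def utf_8_encoder(unicode_lines):
    for line in unicode_lines:
      yield line.encode('utf-8')
  for row in csv.reader(utf_8_encoder(codecs.open(path, 'r', 'utf-16'))):
    yield [cell.decode('utf-8').strip() for cell in row]

for row in read_utf16_csv('trace.csv'):  # 'trace.csv' is an assumption
  print row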
@@ -2174,142 +2309,6 @@ class LogmanTrace(ApiBase):
self.IGNORED.add('\\SystemRoot')
self.IGNORED = tuple(sorted(self.IGNORED))
- @classmethod
- def process_log(cls, logname):
- """Converts the .etl file into .csv then into .json."""
- logformat = 'csv'
- cls._convert_log(logname, logformat)
-
- if logformat == 'csv_utf16':
- def load_file():
- def utf_8_encoder(unicode_csv_data):
- """Encodes the unicode object as utf-8 encoded str instance"""
- for line in unicode_csv_data:
- yield line.encode('utf-8')
-
- def unicode_csv_reader(unicode_csv_data, **kwargs):
- """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
- """
- csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
- for row in csv_reader:
- # Decode str utf-8 instances back to unicode instances, cell by
- # cell:
- yield [cell.decode('utf-8') for cell in row]
-
- # The CSV file is UTF-16 so use codecs.open() to load the file into the
- # python internal unicode format (utf-8). Then explicitly re-encode as
- # utf8 as str instances so csv can parse it fine. Then decode the utf-8
- # str back into python unicode instances. This sounds about right.
- for line in unicode_csv_reader(
- codecs.open(logname + '.' + logformat, 'r', 'utf-16')):
- # line is a list of unicode objects
- # So much white space!
- yield [i.strip() for i in line]
-
- elif logformat == 'csv':
- def load_file():
- def ansi_csv_reader(ansi_csv_data, **kwargs):
- """Loads an 'ANSI' code page and returns unicode() objects."""
- assert sys.getfilesystemencoding() == 'mbcs'
- encoding = get_current_encoding()
- for row in csv.reader(ansi_csv_data, **kwargs):
- # Decode str 'ansi' instances to unicode instances, cell by cell:
- yield [cell.decode(encoding) for cell in row]
-
- # The fastest and smallest format but only supports 'ANSI' file paths.
- # E.g. the filenames are encoding in the 'current' encoding.
- for line in ansi_csv_reader(open(logname + '.' + logformat)):
- # line is a list of unicode objects.
- yield [i.strip() for i in line]
-
- supported_events = cls.Context.supported_events()
-
- def trim(generator):
- for index, line in enumerate(generator):
- if not index:
- if line != cls.EXPECTED_HEADER:
- raise TracingFailure(
- 'Found malformed header: %s' % ' '.join(line),
- None, None, None)
- continue
- # As you can see, the CSV is full of useful non-redundant information:
- if (line[2] != '0' or # Event ID
- line[3] not in ('2', '3') or # Version
- line[4] != '0' or # Channel
- line[5] != '0' or # Level
- line[7] != '0' or # Task
- line[8] != '0x0000000000000000' or # Keyword
- line[12] != '' or # Instance ID
- line[13] != '' or # Parent Instance ID
- line[14] != cls.NULL_GUID or # Activity ID
- line[15] != ''): # Related Activity ID
- raise TracingFailure(
- 'Found unexpected values in line: %s' % ' '.join(line),
- None, None, None)
-
- if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events:
- continue
-
- # Convert the PID in-place from hex.
- line[cls.PID] = int(line[cls.PID], 16)
-
- yield [
- line[cls.EVENT_NAME],
- line[cls.TYPE],
- line[cls.PID],
- line[cls.TID],
- line[cls.PROCESSOR_ID],
- line[cls.TIMESTAMP],
- ] + line[cls.USER_DATA:]
-
- write_json('%s.json' % logname, list(trim(load_file())), True)
-
- @staticmethod
- def _convert_log(logname, logformat):
- """Converts the ETL trace to text representation.
-
- Normally, 'csv' is sufficient. If complex scripts are used (like eastern
- languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
-
- Arguments:
- - logname: Base filename to convert.
- - logformat: Text format to be generated, csv, csv_utf16 or xml.
-
- Use "tracerpt -?" for help.
- """
- LOCALE_INVARIANT = 0x7F
- windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
- cmd_convert = [
- 'tracerpt.exe',
- '-l', logname + '.etl',
- '-o', logname + '.' + logformat,
- '-gmt', # Use UTC
- '-y', # No prompt
- # Use -of XML to get the header of each items after column 19, e.g. all
- # the actual headers of 'User Data'.
- ]
-
- if logformat == 'csv':
- # tracerpt localizes the 'Type' column, for major brainfuck
- # entertainment. I can't imagine any sane reason to do that.
- cmd_convert.extend(['-of', 'CSV'])
- elif logformat == 'csv_utf16':
- # This causes it to use UTF-16, which doubles the log size but ensures
- # the log is readable for non-ASCII characters.
- cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
- elif logformat == 'xml':
- cmd_convert.extend(['-of', 'XML'])
- else:
- raise ValueError('Unexpected log format \'%s\'' % logformat)
- logging.debug('Running: %s' % cmd_convert)
- # This can takes tens of minutes for large logs.
- # Redirects all output to stderr.
- subprocess.check_call(
- cmd_convert,
- stdin=subprocess.PIPE,
- stdout=sys.stderr,
- stderr=sys.stderr)
-
@staticmethod
def clean_trace(logname):
for ext in ('', '.csv', '.etl', '.json', '.xml'):
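Taken together, a hedged sketch of the call sequence a caller now follows. The command, paths, blacklist callable and the trace() return shape are assumptions; the method names and the trace -> close -> post_process_log -> parse_log ordering come from this patch.

logname = '/tmp/trace.log'   # hypothetical log path
api = Dtrace()               # or Strace() / LogmanTrace(), per platform
with api.get_tracer(logname) as tracer:
  # Argument order follows LogmanTrace.Tracer.trace(cmd, cwd, tracename,
  # output) above; the (returncode, out) return shape is assumed.
  returncode, out = tracer.trace(['python', 'child.py'], '/src', 'trace1', True)
# __exit__ called close() and, since nothing raised, post_process_log():
# sorting for dtrace, .etl -> .csv -> .json conversion for logman.
results = api.parse_log(logname, lambda path: False)  # assumed: ignore nothing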