Index: tools/isolate/trace_inputs.py
diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py
index 3c8a81360d28caf36aee2c5ed2b1dad8303bc0ef..b68c3a530ffba28d216e435078f53a34773485cc 100755
--- a/tools/isolate/trace_inputs.py
+++ b/tools/isolate/trace_inputs.py
@@ -635,6 +635,13 @@ class ApiBase(object):
     def close(self):
       self._initialized = False
 
+    def post_process_log(self):
+      """Post-processes the log so it becomes faster to load afterward.
+
+      Must not be called manually when using the 'with' statement.
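+
+      Typical use (a sketch built from the calls in this file; see __exit__
+      below):
+        with api.get_tracer(logname) as tracer:
+          tracer.trace(cmd, cwd, tracename, output)
+        # __exit__() calls post_process_log() automatically on success.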
+ """ |
+ assert not self._initialized, 'Must stop tracing first.' |
+ |
def __enter__(self): |
"""Enables 'with' statement.""" |
return self |
@@ -642,6 +649,9 @@ class ApiBase(object):
     def __exit__(self, exc_type, exc_value, traceback):
       """Enables 'with' statement."""
       self.close()
+      # If an exception was thrown, do not process logs.
+      if not exc_type:
+        self.post_process_log()
 
   def get_tracer(self, logname):
     """Returns an ApiBase.Tracer instance.
@@ -660,7 +670,7 @@ class ApiBase(object):
     raise NotImplementedError()
 
   @classmethod
-  def parse_log(cls, filename, blacklist):
+  def parse_log(cls, logname, blacklist):
     """Processes a trace log and returns the files opened and the files that do
     not exist.
@@ -1030,11 +1040,11 @@ class Strace(ApiBase):
       os.remove(i)
 
   @classmethod
-  def parse_log(cls, filename, blacklist):
-    logging.info('parse_log(%s, %s)' % (filename, blacklist))
-    data = read_json(filename)
+  def parse_log(cls, logname, blacklist):
+    logging.info('parse_log(%s, %s)' % (logname, blacklist))
+    data = read_json(logname)
     context = cls.Context(blacklist, data['cwd'])
-    for pidfile in glob.iglob(filename + '.*'):
+    for pidfile in glob.iglob(logname + '.*'):
       pid = pidfile.rsplit('.', 1)[1]
       if pid.isdigit():
         pid = int(pid)
@@ -1637,49 +1647,42 @@ class Dtrace(ApiBase):
           '-o', '/dev/stderr',
           '-q',
       ]
-      with open(self._logname, 'w') as logfile:
+      logname = self._logname + '.log'
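+      # The raw dtrace output goes to a separate '.log' file; it is read back
+      # by post_process_log() and parse_log() below.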
+      with open(logname, 'w') as logfile:
         dtrace = subprocess.Popen(
             trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
       logging.debug('Started dtrace pid: %d' % dtrace.pid)
 
       # Part 3: Read until one line is printed, which signifies dtrace is up and
       # ready.
-      with open(self._logname, 'r') as logfile:
+      with open(logname, 'r') as logfile:
         while 'dtrace_BEGIN' not in logfile.readline():
           if dtrace.poll() is not None:
             break
-      try:
-        # Part 4: We can now tell our child to go.
-        # TODO(maruel): Another pipe than stdin could be used instead. This
-        # would be more consistent with the other tracing methods.
-        out = child.communicate(signal)[0]
-
-        dtrace.wait()
-        if dtrace.returncode != 0:
-          print 'dtrace failure: %d' % dtrace.returncode
-          with open(self._logname) as logfile:
-            print ''.join(logfile.readlines()[-100:])
-          # Find a better way.
-          os.remove(self._logname)
-        else:
-          # Short the log right away to simplify our life. There isn't much
-          # advantage in keeping it out of order.
-          self._sort_log(self._logname)
-      except KeyboardInterrupt:
-        # Still sort when testing.
-        self._sort_log(self._logname)
-        raise
+      # Part 4: We can now tell our child to go.
+      # TODO(maruel): Another pipe than stdin could be used instead. This
+      # would be more consistent with the other tracing methods.
+      out = child.communicate(signal)[0]
+
+      dtrace.wait()
+      if dtrace.returncode != 0:
+        print 'dtrace failure: %d' % dtrace.returncode
+        with open(logname) as logfile:
+          print ''.join(logfile.readlines()[-100:])
+        # Find a better way.
+        os.remove(logname)
       return dtrace.returncode or child.returncode, out
 
-    @staticmethod
-    def _sort_log(logname):
+    def post_process_log(self):
       """Sorts the log back in order when each call occurred.
 
       dtrace doesn't save the buffer in strict order since it keeps one buffer
-      per CPU. 
+      per CPU.
       """
+      super(Dtrace.Tracer, self).post_process_log()
+      logname = self._logname + '.log'
       with open(logname, 'rb') as logfile:
         lines = [l for l in logfile if l.strip()]
       errors = [l for l in lines if l.startswith('dtrace:')]
@@ -1699,14 +1702,15 @@ class Dtrace(ApiBase):
   @staticmethod
   def clean_trace(logname):
-    if os.path.isfile(logname):
-      os.remove(logname)
+    for ext in ('', '.log'):
+      if os.path.isfile(logname + ext):
+        os.remove(logname + ext)
 
   @classmethod
-  def parse_log(cls, filename, blacklist):
-    logging.info('parse_log(%s, %s)' % (filename, blacklist))
+  def parse_log(cls, logname, blacklist):
+    logging.info('parse_log(%s, %s)' % (logname, blacklist))
     context = cls.Context(blacklist)
-    for line in open(filename, 'rb'):
+    for line in open(logname + '.log', 'rb'):
       context.on_line(line)
     return context.to_results()
@@ -1722,40 +1726,6 @@ class LogmanTrace(ApiBase):
   opened relative to another file_object or as an absolute path. All the current
   working directory logic is done in user mode.
   """
-  # The basic headers.
-  EXPECTED_HEADER = [
-    u'Event Name',
-    u'Type',
-    u'Event ID',
-    u'Version',
-    u'Channel',
-    u'Level',  # 5
-    u'Opcode',
-    u'Task',
-    u'Keyword',
-    u'PID',
-    u'TID',  # 10
-    u'Processor Number',
-    u'Instance ID',
-    u'Parent Instance ID',
-    u'Activity ID',
-    u'Related Activity ID',  # 15
-    u'Clock-Time',
-    u'Kernel(ms)',  # Both have a resolution of ~15ms which makes them
-    u'User(ms)',  # pretty much useless.
-    u'User Data',  # Extra arguments that are event-specific.
-  ]
-  # Only the useful headers common to all entries are listed there. Any column
-  # at 19 or higher is dependent on the specific event.
-  EVENT_NAME = 0
-  TYPE = 1
-  PID = 9
-  TID = 10
-  PROCESSOR_ID = 11
-  TIMESTAMP = 16
-  NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
-  USER_DATA = 19
-
   class Context(ApiBase.Context):
     """Processes an ETW log line and keeps the list of existent and non
     existent files accessed.
@@ -2044,6 +2014,40 @@ class LogmanTrace(ApiBase):
       return out
 
   class Tracer(ApiBase.Tracer):
+    # The basic headers.
+    EXPECTED_HEADER = [
+      u'Event Name',
+      u'Type',
+      u'Event ID',
+      u'Version',
+      u'Channel',
+      u'Level',  # 5
+      u'Opcode',
+      u'Task',
+      u'Keyword',
+      u'PID',
+      u'TID',  # 10
+      u'Processor Number',
+      u'Instance ID',
+      u'Parent Instance ID',
+      u'Activity ID',
+      u'Related Activity ID',  # 15
+      u'Clock-Time',
+      u'Kernel(ms)',  # Both have a resolution of ~15ms which makes them
+      u'User(ms)',  # pretty much useless.
+      u'User Data',  # Extra arguments that are event-specific.
+    ]
+    # Only the useful headers common to all entries are listed here. Any column
+    # at 19 or higher is dependent on the specific event.
+    EVENT_NAME = 0
+    TYPE = 1
+    PID = 9
+    TID = 10
+    PROCESSOR_ID = 11
+    TIMESTAMP = 16
+    NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
+    USER_DATA = 19
+
     def trace(self, cmd, cwd, tracename, output):
       """Uses logman.exe to start and stop the NT Kernel Logger while the
       executable to be traced is run.
@@ -2083,10 +2087,6 @@ class LogmanTrace(ApiBase):
 
       # 3. Stop the log collection.
       self._stop_log()
 
-      # 4. Convert log
-      # TODO(maruel): Temporary, remove me.
-      LogmanTrace.process_log(self._logname)
-
       # 5. Save metadata.
       value = {
         'pid': child.pid,
@@ -2146,6 +2146,141 @@ class LogmanTrace(ApiBase):
           stdout=subprocess.PIPE,
           stderr=subprocess.STDOUT)
 
+    def post_process_log(self):
+      """Converts the .etl file into .csv then into .json."""
+      super(LogmanTrace.Tracer, self).post_process_log()
+      logformat = 'csv'
+      self._convert_log(logformat)
+
+      if logformat == 'csv_utf16':
+        def load_file():
+          def utf_8_encoder(unicode_csv_data):
+            """Encodes unicode objects as utf-8 encoded str instances."""
+            for line in unicode_csv_data:
+              yield line.encode('utf-8')
+
+          def unicode_csv_reader(unicode_csv_data, **kwargs):
+            """Encodes temporarily as UTF-8 since the csv module doesn't do
+            unicode.
+            """
+            csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
+            for row in csv_reader:
+              # Decode str utf-8 instances back to unicode instances, cell by
+              # cell:
+              yield [cell.decode('utf-8') for cell in row]
+
+          # The CSV file is UTF-16 so use codecs.open() to load the file into
+          # the python internal unicode format (utf-8). Then explicitly
+          # re-encode as utf8 as str instances so csv can parse it fine. Then
+          # decode the utf-8 str back into python unicode instances. This
+          # sounds about right.
+          for line in unicode_csv_reader(
+              codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
+            # line is a list of unicode objects
+            # So much white space!
+            yield [i.strip() for i in line]
+
+      elif logformat == 'csv':
+        def load_file():
+          def ansi_csv_reader(ansi_csv_data, **kwargs):
+            """Loads an 'ANSI' code page and returns unicode() objects."""
+            assert sys.getfilesystemencoding() == 'mbcs'
+            encoding = get_current_encoding()
+            for row in csv.reader(ansi_csv_data, **kwargs):
+              # Decode str 'ansi' instances to unicode instances, cell by cell:
+              yield [cell.decode(encoding) for cell in row]
+
+          # The fastest and smallest format but only supports 'ANSI' file
+          # paths. I.e. the filenames are encoded in the 'current' encoding.
+          for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
+            # line is a list of unicode objects.
+            yield [i.strip() for i in line]
+
+      supported_events = LogmanTrace.Context.supported_events()
+
+      def trim(generator):
+        for index, line in enumerate(generator):
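+          # Row 0 is the CSV header; verify it and skip it, then keep only
+          # the supported event rows.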
+          if not index:
+            if line != self.EXPECTED_HEADER:
+              raise TracingFailure(
+                  'Found malformed header: %s' % ' '.join(line),
+                  None, None, None)
+            continue
+          # As you can see, the CSV is full of useful non-redundant information:
+          if (line[2] != '0' or  # Event ID
+              line[3] not in ('2', '3') or  # Version
+              line[4] != '0' or  # Channel
+              line[5] != '0' or  # Level
+              line[7] != '0' or  # Task
+              line[8] != '0x0000000000000000' or  # Keyword
+              line[12] != '' or  # Instance ID
+              line[13] != '' or  # Parent Instance ID
+              line[14] != self.NULL_GUID or  # Activity ID
+              line[15] != ''):  # Related Activity ID
+            raise TracingFailure(
+                'Found unexpected values in line: %s' % ' '.join(line),
+                None, None, None)
+
+          if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
+            continue
+
+          # Convert the PID in-place from hex.
+          line[self.PID] = int(line[self.PID], 16)
+
+          yield [
+              line[self.EVENT_NAME],
+              line[self.TYPE],
+              line[self.PID],
+              line[self.TID],
+              line[self.PROCESSOR_ID],
+              line[self.TIMESTAMP],
+          ] + line[self.USER_DATA:]
+
+      write_json('%s.json' % self._logname, list(trim(load_file())), True)
+
+    def _convert_log(self, logformat):
+      """Converts the ETL trace to text representation.
+
+      Normally, 'csv' is sufficient. If complex scripts are used (like eastern
+      languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
+
+      Arguments:
+      - logformat: Text format to be generated, csv, csv_utf16 or xml.
+
+      Use "tracerpt -?" for help.
+      """
+      LOCALE_INVARIANT = 0x7F
+      windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
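+      # Presumably this keeps tracerpt from localizing its output; see the
+      # 'csv' note below.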
+      cmd_convert = [
+        'tracerpt.exe',
+        '-l', self._logname + '.etl',
+        '-o', self._logname + '.' + logformat,
+        '-gmt',  # Use UTC
+        '-y',  # No prompt
+        # Use -of XML to get the header of each item after column 19, e.g. all
+        # the actual headers of 'User Data'.
+      ]
+
+      if logformat == 'csv':
+        # tracerpt localizes the 'Type' column, for major brainfuck
+        # entertainment. I can't imagine any sane reason to do that.
+        cmd_convert.extend(['-of', 'CSV'])
+      elif logformat == 'csv_utf16':
+        # This causes it to use UTF-16, which doubles the log size but ensures
+        # the log is readable for non-ASCII characters.
+        cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
+      elif logformat == 'xml':
+        cmd_convert.extend(['-of', 'XML'])
+      else:
+        raise ValueError('Unexpected log format \'%s\'' % logformat)
+      logging.debug('Running: %s' % cmd_convert)
+      # This can take tens of minutes for large logs.
+      # Redirects all output to stderr.
+      subprocess.check_call(
+          cmd_convert,
+          stdin=subprocess.PIPE,
+          stdout=sys.stderr,
+          stderr=sys.stderr)
+
   def __init__(self):
     super(LogmanTrace, self).__init__()
     # Most ignores need to be determined at runtime.
@@ -2174,142 +2309,6 @@ class LogmanTrace(ApiBase):
     self.IGNORED.add('\\SystemRoot')
     self.IGNORED = tuple(sorted(self.IGNORED))
 
-  @classmethod
-  def process_log(cls, logname):
-    """Converts the .etl file into .csv then into .json."""
-    logformat = 'csv'
-    cls._convert_log(logname, logformat)
-
-    if logformat == 'csv_utf16':
-      def load_file():
-        def utf_8_encoder(unicode_csv_data):
-          """Encodes the unicode object as utf-8 encoded str instance"""
-          for line in unicode_csv_data:
-            yield line.encode('utf-8')
-
-        def unicode_csv_reader(unicode_csv_data, **kwargs):
-          """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
-          """
-          csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
-          for row in csv_reader:
-            # Decode str utf-8 instances back to unicode instances, cell by
-            # cell:
-            yield [cell.decode('utf-8') for cell in row]
-
-        # The CSV file is UTF-16 so use codecs.open() to load the file into the
-        # python internal unicode format (utf-8). Then explicitly re-encode as
-        # utf8 as str instances so csv can parse it fine. Then decode the utf-8
-        # str back into python unicode instances. This sounds about right.
-        for line in unicode_csv_reader(
-            codecs.open(logname + '.' + logformat, 'r', 'utf-16')):
-          # line is a list of unicode objects
-          # So much white space!
-          yield [i.strip() for i in line]
-
-    elif logformat == 'csv':
-      def load_file():
-        def ansi_csv_reader(ansi_csv_data, **kwargs):
-          """Loads an 'ANSI' code page and returns unicode() objects."""
-          assert sys.getfilesystemencoding() == 'mbcs'
-          encoding = get_current_encoding()
-          for row in csv.reader(ansi_csv_data, **kwargs):
-            # Decode str 'ansi' instances to unicode instances, cell by cell:
-            yield [cell.decode(encoding) for cell in row]
-
-        # The fastest and smallest format but only supports 'ANSI' file paths.
-        # E.g. the filenames are encoding in the 'current' encoding.
-        for line in ansi_csv_reader(open(logname + '.' + logformat)):
-          # line is a list of unicode objects.
-          yield [i.strip() for i in line]
-
-    supported_events = cls.Context.supported_events()
-
-    def trim(generator):
-      for index, line in enumerate(generator):
-        if not index:
-          if line != cls.EXPECTED_HEADER:
-            raise TracingFailure(
-                'Found malformed header: %s' % ' '.join(line),
-                None, None, None)
-          continue
-        # As you can see, the CSV is full of useful non-redundant information:
-        if (line[2] != '0' or  # Event ID
-            line[3] not in ('2', '3') or  # Version
-            line[4] != '0' or  # Channel
-            line[5] != '0' or  # Level
-            line[7] != '0' or  # Task
-            line[8] != '0x0000000000000000' or  # Keyword
-            line[12] != '' or  # Instance ID
-            line[13] != '' or  # Parent Instance ID
-            line[14] != cls.NULL_GUID or  # Activity ID
-            line[15] != ''):  # Related Activity ID
-          raise TracingFailure(
-              'Found unexpected values in line: %s' % ' '.join(line),
-              None, None, None)
-
-        if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events:
-          continue
-
-        # Convert the PID in-place from hex.
-        line[cls.PID] = int(line[cls.PID], 16)
-
-        yield [
-            line[cls.EVENT_NAME],
-            line[cls.TYPE],
-            line[cls.PID],
-            line[cls.TID],
-            line[cls.PROCESSOR_ID],
-            line[cls.TIMESTAMP],
-        ] + line[cls.USER_DATA:]
-
-    write_json('%s.json' % logname, list(trim(load_file())), True)
-
-  @staticmethod
-  def _convert_log(logname, logformat):
-    """Converts the ETL trace to text representation.
-
-    Normally, 'csv' is sufficient. If complex scripts are used (like eastern
-    languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
-
-    Arguments:
-    - logname: Base filename to convert.
-    - logformat: Text format to be generated, csv, csv_utf16 or xml.
-
-    Use "tracerpt -?" for help.
-    """
-    LOCALE_INVARIANT = 0x7F
-    windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
-    cmd_convert = [
-      'tracerpt.exe',
-      '-l', logname + '.etl',
-      '-o', logname + '.' + logformat,
-      '-gmt',  # Use UTC
-      '-y',  # No prompt
-      # Use -of XML to get the header of each items after column 19, e.g. all
-      # the actual headers of 'User Data'.
-    ]
-
-    if logformat == 'csv':
-      # tracerpt localizes the 'Type' column, for major brainfuck
-      # entertainment. I can't imagine any sane reason to do that.
-      cmd_convert.extend(['-of', 'CSV'])
-    elif logformat == 'csv_utf16':
-      # This causes it to use UTF-16, which doubles the log size but ensures
-      # the log is readable for non-ASCII characters.
-      cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
-    elif logformat == 'xml':
-      cmd_convert.extend(['-of', 'XML'])
-    else:
-      raise ValueError('Unexpected log format \'%s\'' % logformat)
-    logging.debug('Running: %s' % cmd_convert)
-    # This can takes tens of minutes for large logs.
-    # Redirects all output to stderr.
-    subprocess.check_call(
-        cmd_convert,
-        stdin=subprocess.PIPE,
-        stdout=sys.stderr,
-        stderr=sys.stderr)
-
   @staticmethod
   def clean_trace(logname):
     for ext in ('', '.csv', '.etl', '.json', '.xml'):