Chromium Code Reviews| Index: tools/isolate/trace_inputs.py | 
| diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py | 
| index b51caeb1373183b08a113c9d378ef5e6402690fc..29b595db156b724589112ce4623955794eacdb2b 100755 | 
| --- a/tools/isolate/trace_inputs.py | 
| +++ b/tools/isolate/trace_inputs.py | 
| @@ -277,6 +277,41 @@ class ApiBase(object): | 
| self.initial_cwd = initial_cwd | 
| self.cwd = None | 
| self.files = set() | 
| + self.executable = None | 
| + self.command = None | 
| + | 
| + if parentid: | 
| + self.root().processes[parentid].children.append(pid) | 
| + | 
    def to_results_process(self):
        """Converts this traced process into a Results.Process instance.

        Resolves file path case sensitivity and/or late-bound strings, and
        recursively converts the children processes.
        """
        children = [
            self.root().processes[c].to_results_process() for c in self.children
        ]
        # When resolving files, it's normal to get a dupe because a file could
        # be opened multiple times with different case. Resolve the
        # deduplication here.
        def render_to_string_and_fix_case(x):
            """Returns the native file path case if the file exists.

            Converts late-bound strings.
            """
            if not x:
                return x
            # TODO(maruel): Do not upconvert to unicode here, on linux we don't
            # know the file path encoding so they must be treated as bytes.
            x = unicode(x)
            if not os.path.exists(x):
                return x
            return get_native_path_case(x)

        return Results.Process(
            self.pid,
            set(map(render_to_string_and_fix_case, self.files)),
            render_to_string_and_fix_case(self.executable),
            self.command,
            render_to_string_and_fix_case(self.initial_cwd),
            children)
| def add_file(self, filepath): | 
| if self.root().blacklist(unicode(filepath)): | 
| @@ -288,23 +323,6 @@ class ApiBase(object): | 
| self.blacklist = blacklist | 
| self.processes = {} | 
| - def resolve(self): | 
| - """Resolve all the filenames and returns them.""" | 
| - files = set() | 
| - non_existent = set() | 
| - for p in self.processes.itervalues(): | 
| - for filepath in p.files: | 
| - filepath = unicode(filepath) | 
| - # For late-bound file paths, it could be blacklisted after all the | 
| - # processes are processed so it needs to be checked again. | 
| - if self.blacklist(filepath): | 
| - break | 
| - if os.path.isfile(filepath): | 
| - files.add(filepath) | 
| - else: | 
| - non_existent.add(filepath) | 
| - return files, non_existent | 
| - | 
| @staticmethod | 
| def clean_trace(logname): | 
| """Deletes the old log.""" | 
| @@ -333,6 +351,222 @@ class ApiBase(object): | 
| raise NotImplementedError(cls.__class__.__name__) | 
class Results(object):
    """Results of a trace session."""

    class File(object):
        """A file that was accessed by the traced processes.

        The file may not be present on disk anymore.
        """
        def __init__(self, root, path):
            """Represents a file accessed. May not be present anymore.

            Exactly one of |root| or an absolute |path| must be provided.
            """
            logging.debug('%s(%s, %s)' % (self.__class__.__name__, root, path))
            self.root = root
            self.path = path

            # Lazily computed by the |size| property; -1 means non existent.
            self._size = None
            # For compatibility with Directory object interface.
            # Shouldn't be used normally, only exists to simplify algorithms.
            self.nb_files = 1

            assert path, path
            assert bool(root) != bool(isabs(path)), (root, path)
            # When the file exists, the path must already be in native case.
            assert (
                not os.path.exists(self.full_path) or
                self.full_path == get_native_path_case(self.full_path))

        @property
        def existent(self):
            """True if the file exists on disk."""
            return self.size != -1

        @property
        def size(self):
            """File's size. -1 is not existent."""
            if self._size is None:
                try:
                    self._size = os.stat(self.full_path).st_size
                except OSError:
                    self._size = -1
            return self._size

        @property
        def full_path(self):
            """Absolute path to the file."""
            if self.root:
                return os.path.join(self.root, self.path)
            return self.path

        def flatten(self):
            """Returns a JSON-serializable representation."""
            return {
                'path': self.path,
                'size': self.size,
            }

        def strip_root(self, root):
            """Returns a clone of itself with 'root' stripped off.

            Returns None when the file is not under |root|.
            """
            assert isabs(root) and root.endswith(os.path.sep), root
            if not self.full_path.startswith(root):
                return None
            out = self.__class__(root, self.full_path[len(root):])
            # Keep size cache.
            out._size = self._size
            return out

    class Directory(File):
        """A directory of files. Must exist."""
        def __init__(self, root, path, size, nb_files):
            """path='.' is a valid value and must be handled appropriately."""
            # A trailing separator is appended below; the caller must not have
            # added one already, otherwise full_path would be malformed.
            assert not path.endswith(os.path.sep), path
            super(Results.Directory, self).__init__(root, path)
            self.path = self.path + os.path.sep
            self.nb_files = nb_files
            self._size = size

        def flatten(self):
            out = super(Results.Directory, self).flatten()
            out['nb_files'] = self.nb_files
            return out

    class Process(object):
        """A process that was traced.

        Contains references to the files accessed by this process and its
        children.
        """
        def __init__(
                self, pid, files, executable, command, initial_cwd, children):
            logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
            self.pid = pid
            # Sorted list of Results.File instances, unique by path.
            self.files = sorted(
                (Results.File(None, f) for f in files), key=lambda x: x.path)
            assert len(set(f.path for f in self.files)) == len(self.files), [
                f.path for f in self.files]
            assert isinstance(children, list)
            assert isinstance(self.files, list)
            self.children = children
            self.executable = executable
            self.command = command
            self.initial_cwd = initial_cwd

        @property
        def all(self):
            """Iterates over all the children processes, then self."""
            for child in self.children:
                for i in child.all:
                    yield i
            yield self

        def flatten(self):
            """Returns a JSON-serializable representation of the tree."""
            return {
                'children': [c.flatten() for c in self.children],
                'command': self.command,
                'executable': self.executable,
                'files': [f.flatten() for f in self.files],
                'initial_cwd': self.initial_cwd,
                'pid': self.pid,
            }

        def strip_root(self, root):
            """Returns a clone keeping only the files under |root|, with
            their paths made relative to it.
            """
            assert isabs(root) and root.endswith(os.path.sep), root
            out = self.__class__(
                self.pid,
                [],
                self.executable,
                self.command,
                self.initial_cwd,
                [c.strip_root(root) for c in self.children])
            # Override the files property.
            out.files = filter(None, (f.strip_root(root) for f in self.files))
            logging.debug(
                'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
            return out

    def __init__(self, process):
        """|process| is the root Results.Process of the traced process tree."""
        self.process = process
        # Cache of the aggregated, sorted file list.
        self._files = None

    def flatten(self):
        """Returns a JSON-serializable representation."""
        return {
            'root': self.process.flatten(),
        }

    @property
    def files(self):
        """All the files accessed, aggregated over the whole process tree."""
        if self._files is None:
            self._files = sorted(
                sum((p.files for p in self.process.all), []),
                key=lambda x: x.path)
        return self._files

    @property
    def existent(self):
        """Accessed files that still exist on disk."""
        return [f for f in self.files if f.existent]

    @property
    def non_existent(self):
        """Accessed files that do not exist on disk anymore."""
        return [f for f in self.files if not f.existent]

    def strip_root(self, root):
        """Returns a clone with all the files outside the directory |root|
        removed and converts all the path to be relative paths.
        """
        root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
        logging.debug('strip_root(%s)' % root)
        return Results(self.process.strip_root(root))
| + | 
def extract_directories(files):
    """Detects if all the files in a directory are in |files| and if so, replace
    the individual files by a Results.Directory instance.

    Takes an array of Results.File instances and returns an array of
    Results.File and Results.Directory instances.
    """
    assert not any(isinstance(f, Results.Directory) for f in files)
    # Remove non existent files.
    files = [f for f in files if f.existent]
    if not files:
        return files
    # All files must share the same root, which can be None.
    assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

    def blacklist(f):
        return f in ('.git', '.svn') or f.endswith('.pyc')

    # Creates a {directory: {filename: File}} mapping, up to root.
    root = files[0].root
    if root:
        # The loop below compares 'directory + os.path.sep' against root, so a
        # root without a trailing separator would never match.
        assert root.endswith(os.path.sep), root
    buckets = {}
    if root:
        buckets[root.rstrip(os.path.sep)] = {}
    for fileobj in files:
        path = fileobj.full_path
        directory = os.path.dirname(path)
        # |directory| is path's dirname, so os.path.basename(path) is exactly
        # the remainder of |path| after |directory| and the separator.
        files_in_directory = buckets.setdefault(directory, {})
        files_in_directory[os.path.basename(path)] = fileobj
        # Add all the directories recursively up to root.
        while True:
            old_d = directory
            directory = os.path.dirname(directory)
            if directory + os.path.sep == root or directory == old_d:
                break
            buckets.setdefault(directory, {})

    # Process deepest directories first so a fully-covered subtree collapses
    # bottom-up into its parent bucket.
    for directory in sorted(buckets, reverse=True):
        actual = set(f for f in os.listdir(directory) if not blacklist(f))
        expected = set(buckets[directory])
        if not (actual - expected):
            # Everything in the directory was accessed: replace the individual
            # entries with a single Results.Directory in the parent's bucket.
            parent = os.path.dirname(directory)
            buckets[parent][os.path.basename(directory)] = Results.Directory(
                root,
                directory[len(root):],
                sum(f.size for f in buckets[directory].itervalues()),
                sum(f.nb_files for f in buckets[directory].itervalues()))
            # Remove the whole bucket.
            del buckets[directory]

    # Reverse the mapping with what remains. The original instances are
    # returned, so the cached meta data is kept.
    return sorted(
        sum((bucket.values() for bucket in buckets.itervalues()), []),
        key=lambda x: x.path)
| + | 
| + | 
| class Strace(ApiBase): | 
| """strace implies linux.""" | 
| IGNORED = ( | 
| @@ -563,6 +797,15 @@ class Strace(ApiBase): | 
    def on_line(self, pid, line):
        """Routes one strace output line to the Process context owning |pid|."""
        self.get_or_set_proc(pid).on_line(line.strip())
| + def to_results(self): | 
| + """Finds back the root process and verify consistency.""" | 
| + # TODO(maruel): Absolutely unecessary, fix me. | 
| + root = [p for p in self.processes.itervalues() if not p.parentid] | 
| + assert len(root) == 1 | 
| + process = root[0].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all) | 
| + return Results(process) | 
| + | 
| def get_or_set_proc(self, pid): | 
| """Returns the Context.Process instance for this pid or creates a new one. | 
| """ | 
| @@ -635,12 +878,8 @@ class Strace(ApiBase): | 
| # TODO(maruel): Load as utf-8 | 
| for line in open(pidfile, 'rb'): | 
| context.on_line(pid, line) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| + | 
| + return context.to_results() | 
| class Dtrace(ApiBase): | 
| @@ -827,6 +1066,13 @@ class Dtrace(ApiBase): | 
| match.group(4), | 
| match.group(5)) | 
| + def to_results(self): | 
| + """Uses self._initial_pid to determine the initial process.""" | 
| + process = self.processes[self._initial_pid].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all), ( | 
| + sorted(self.processes), sorted(p.pid for p in process.all)) | 
| + return Results(process) | 
| + | 
| def handle_dtrace_BEGIN(self, _ppid, pid, _function, args, _result): | 
| assert not self._tracer_pid and not self._initial_pid | 
| self._tracer_pid = pid | 
| @@ -1000,12 +1246,7 @@ class Dtrace(ApiBase): | 
| context = cls.Context(blacklist) | 
| for line in open(filename, 'rb'): | 
| context.on_line(line) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| + return context.to_results() | 
| @staticmethod | 
| def _sort_log(logname): | 
| @@ -1131,6 +1372,13 @@ class LogmanTrace(ApiBase): | 
| else: | 
| assert False, '%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]) | 
| + def to_results(self): | 
| + """Uses self._initial_pid to determine the initial process.""" | 
| + process = self.processes[self._initial_pid].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all), ( | 
| + sorted(self.processes), sorted(p.pid for p in process.all)) | 
| + return Results(process) | 
| + | 
| def _thread_to_process(self, tid): | 
| """Finds the process from the thread id.""" | 
| tid = int(tid, 16) | 
| @@ -1553,47 +1801,7 @@ class LogmanTrace(ApiBase): | 
| else: | 
| raise NotImplementedError('Implement %s' % logformat) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| - | 
| - | 
| -def relevant_files(files, root): | 
| - """Trims the list of files to keep the expected files and unexpected files. | 
| - | 
| - Unexpected files are files that are not based inside the |root| directory. | 
| - """ | 
| - expected = [] | 
| - unexpected = [] | 
| - for f in files: | 
| - if f.startswith(root): | 
| - f = f[len(root):] | 
| - assert f | 
| - expected.append(f) | 
| - else: | 
| - unexpected.append(f) | 
| - return sorted(set(expected)), sorted(set(unexpected)) | 
| - | 
| - | 
| -def extract_directories(files, root): | 
| - """Detects if all the files in a directory were loaded and if so, replace the | 
| - individual files by the directory entry. | 
| - """ | 
| - directories = set(os.path.dirname(f) for f in files) | 
| - files = set(files) | 
| - for directory in sorted(directories, reverse=True): | 
| - actual = set( | 
| - os.path.join(directory, f) for f in | 
| - os.listdir(os.path.join(root, directory)) | 
| - if not f.endswith(('.svn', '.pyc')) | 
| - ) | 
| - if not (actual - files): | 
| - files -= actual | 
| - files.add(directory + os.path.sep) | 
| - return sorted(files) | 
| + return context.to_results() | 
| def pretty_print(variables, stdout): | 
| @@ -1749,16 +1957,10 @@ def load_trace(logfile, root_dir, api): | 
| trace or not. | 
| - api: a tracing api instance. | 
| """ | 
| - root_dir = get_native_path_case(root_dir) | 
| - files, non_existent, processes = api.parse_log(logfile, get_blacklist(api)) | 
| - expected, unexpected = relevant_files( | 
| - files, root_dir.rstrip(os.path.sep) + os.path.sep) | 
| - # In case the file system is case insensitive. | 
| - expected = sorted(set( | 
| - get_native_path_case(os.path.join(root_dir, f))[len(root_dir)+1:] | 
| - for f in expected)) | 
| - simplified = extract_directories(expected, root_dir) | 
| - return files, expected, unexpected, non_existent, simplified, processes | 
| + results = api.parse_log(logfile, get_blacklist(api)) | 
| + results = results.strip_root(root_dir) | 
| + simplified = extract_directories(results.files) | 
| + return results, simplified | 
| def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): | 
| @@ -1805,26 +2007,24 @@ def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): | 
| return returncode | 
| print_if('Loading traces... %s' % logfile) | 
| - files, expected, unexpected, non_existent, simplified, _ = load_trace( | 
| - logfile, root_dir, api) | 
| - | 
| - print_if('Total: %d' % len(files)) | 
| - print_if('Non existent: %d' % len(non_existent)) | 
| - for f in non_existent: | 
| - print_if(' %s' % f) | 
| - if unexpected: | 
| - print_if('Unexpected: %d' % len(unexpected)) | 
| - for f in unexpected: | 
| - print_if(' %s' % f) | 
| - print_if('Interesting: %d reduced to %d' % (len(expected), len(simplified))) | 
| + results, simplified = load_trace(logfile, root_dir, api) | 
| + | 
| + print_if('Total: %d' % len(results.files)) | 
| + print_if('Non existent: %d' % len(results.non_existent)) | 
| + for f in results.non_existent: | 
| + print_if(' %s' % f.path) | 
| + print_if( | 
| + 'Interesting: %d reduced to %d' % ( | 
| + len(results.existent), len(simplified))) | 
| for f in simplified: | 
| - print_if(' %s' % f) | 
| + print_if(' %s' % f.path) | 
| if cwd_dir is not None: | 
| value = { | 
| 'conditions': [ | 
| ['OS=="%s"' % get_flavor(), { | 
| - 'variables': generate_dict(simplified, cwd_dir, product_dir), | 
| + 'variables': generate_dict( | 
| + [f.path for f in simplified], cwd_dir, product_dir), | 
| }], | 
| ], | 
| } |