Index: tools/isolate/trace_inputs.py |
diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py |
index 3dd64bde8052bc80ad1f19f0a7322bf5ab5ef658..0ba03f0a7222a8c73407caff41b66edb777acb62 100755 |
--- a/tools/isolate/trace_inputs.py |
+++ b/tools/isolate/trace_inputs.py |
@@ -277,6 +277,41 @@ class ApiBase(object): |
self.initial_cwd = initial_cwd |
self.cwd = None |
self.files = set() |
+ self.executable = None |
+ self.command = None |
+ |
+ if parentid: |
+ self.root().processes[parentid].children.append(pid) |
+ |
+ def to_results_process(self): |
+    """Resolves file case sensitivity and/or late-bound strings.""" |
+ children = [ |
+ self.root().processes[c].to_results_process() for c in self.children |
+ ] |
+      # When resolving files, it's normal to get dupes because a file could be |
+      # opened multiple times with different case. Perform the deduplication |
+      # here. |
+ def render_to_string_and_fix_case(x): |
+ """Returns the native file path case if the file exists. |
+ |
+ Converts late-bound strings. |
+ """ |
+ if not x: |
+ return x |
+ # TODO(maruel): Do not upconvert to unicode here, on linux we don't |
+ # know the file path encoding so they must be treated as bytes. |
+ x = unicode(x) |
+ if not os.path.exists(x): |
+ return x |
+ return get_native_path_case(x) |
+ |
+ return Results.Process( |
+ self.pid, |
+ set(map(render_to_string_and_fix_case, self.files)), |
+ render_to_string_and_fix_case(self.executable), |
+ self.command, |
+ render_to_string_and_fix_case(self.initial_cwd), |
+ children) |
def add_file(self, filepath): |
if self.root().blacklist(unicode(filepath)): |
@@ -288,23 +323,6 @@ class ApiBase(object): |
self.blacklist = blacklist |
self.processes = {} |
- def resolve(self): |
- """Resolve all the filenames and returns them.""" |
- files = set() |
- non_existent = set() |
- for p in self.processes.itervalues(): |
- for filepath in p.files: |
- filepath = unicode(filepath) |
- # For late-bound file paths, it could be blacklisted after all the |
- # processes are processed so it needs to be checked again. |
- if self.blacklist(filepath): |
- break |
- if os.path.isfile(filepath): |
- files.add(filepath) |
- else: |
- non_existent.add(filepath) |
- return files, non_existent |
- |
@staticmethod |
def clean_trace(logname): |
"""Deletes the old log.""" |
@@ -333,6 +351,226 @@ class ApiBase(object): |
raise NotImplementedError(cls.__class__.__name__) |
+class Results(object): |
+ """Results of a trace session.""" |
+ |
+ class File(object): |
+ """A file that was accessed.""" |
+ def __init__(self, root, path): |
+ """Represents a file accessed. May not be present anymore.""" |
+ logging.debug('%s(%s, %s)' % (self.__class__.__name__, root, path)) |
+ self.root = root |
+ self.path = path |
+ |
+ self._size = None |
+ # For compatibility with Directory object interface. |
+ # Shouldn't be used normally, only exists to simplify algorithms. |
+ self.nb_files = 1 |
+ |
+ assert path, path |
+ assert bool(root) != bool(isabs(path)), (root, path) |
+ assert ( |
+ not os.path.exists(self.full_path) or |
+ self.full_path == get_native_path_case(self.full_path)) |
+ |
+ @property |
+ def existent(self): |
+ return self.size != -1 |
+ |
+ @property |
+ def size(self): |
+        """File's size. -1 if the file doesn't exist.""" |
+ if self._size is None: |
+ try: |
+ self._size = os.stat(self.full_path).st_size |
+ except OSError: |
+ self._size = -1 |
+ return self._size |
+ |
+ @property |
+ def full_path(self): |
+ if self.root: |
+ return os.path.join(self.root, self.path) |
+ return self.path |
+ |
+ def flatten(self): |
+ return { |
+ 'path': self.path, |
+ 'size': self.size, |
+ } |
+ |
+ def strip_root(self, root): |
+ """Returns a clone of itself with 'root' stripped off.""" |
+ assert isabs(root) and root.endswith(os.path.sep), root |
+ if not self.full_path.startswith(root): |
+ return None |
+ out = self.__class__(root, self.full_path[len(root):]) |
+ # Keep size cache. |
+ out._size = self._size |
+ return out |
+ |
+ class Directory(File): |
+ """A directory of files. Must exist.""" |
+ def __init__(self, root, path, size, nb_files): |
+ """path='.' is a valid value and must be handled appropriately.""" |
+ super(Results.Directory, self).__init__(root, path) |
+ assert not self.path.endswith(os.path.sep) |
+ self.path = self.path + os.path.sep |
+ self.nb_files = nb_files |
+ self._size = size |
+ |
+ def flatten(self): |
+ out = super(Results.Directory, self).flatten() |
+ out['nb_files'] = self.nb_files |
+ return out |
+ |
+ class Process(object): |
+ """A process that was traced. |
+ |
+ Contains references to the files accessed by this process and its children. |
+ """ |
+ def __init__( |
+ self, pid, files, executable, command, initial_cwd, children): |
+ logging.debug('Process(%s, %d, ...)' % (pid, len(files))) |
+ self.pid = pid |
+ self.files = sorted( |
+ (Results.File(None, f) for f in files), key=lambda x: x.path) |
+ assert len(set(f.path for f in self.files)) == len(self.files), [ |
+ f.path for f in self.files] |
+ assert isinstance(children, list) |
+ assert isinstance(self.files, list) |
+ self.children = children |
+ self.executable = executable |
+ self.command = command |
+ self.initial_cwd = initial_cwd |
+ |
+ @property |
+ def all(self): |
+ for child in self.children: |
+ for i in child.all: |
+ yield i |
+ yield self |
+ |
+ def flatten(self): |
+ return { |
+ 'children': [c.flatten() for c in self.children], |
+ 'command': self.command, |
+ 'executable': self.executable, |
+ 'files': [f.flatten() for f in self.files], |
+ 'initial_cwd': self.initial_cwd, |
+ 'pid': self.pid, |
+ } |
+ |
+ def strip_root(self, root): |
+ assert isabs(root) and root.endswith(os.path.sep), root |
+ out = self.__class__( |
+ self.pid, |
+ [], |
+ self.executable, |
+ self.command, |
+ self.initial_cwd, |
+ [c.strip_root(root) for c in self.children]) |
+ # Override the files property. |
+ out.files = filter(None, (f.strip_root(root) for f in self.files)) |
+ logging.debug( |
+ 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files))) |
+ return out |
+ |
+ |
+ def __init__(self, process): |
+ self.process = process |
+ # Cache. |
+ self._files = None |
+ |
+ def flatten(self): |
+ return { |
+ 'root': self.process.flatten(), |
+ } |
+ |
+ @property |
+ def files(self): |
+ if self._files is None: |
+ self._files = sorted( |
+ sum((p.files for p in self.process.all), []), |
+ key=lambda x: x.path) |
+ return self._files |
+ |
+ @property |
+ def existent(self): |
+ return [f for f in self.files if f.existent] |
+ |
+ @property |
+ def non_existent(self): |
+ return [f for f in self.files if not f.existent] |
+ |
+ def strip_root(self, root): |
+    """Returns a clone with all the files outside the directory |root| removed |
+    and converts all the paths to be relative. |
+ """ |
+ root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep |
+ logging.debug('strip_root(%s)' % root) |
+ return Results(self.process.strip_root(root)) |
+ |
+ |
+def extract_directories(files): |
+  """Detects if all the files in a directory are in |files| and if so, replaces |
+  the individual files by a Results.Directory instance. |
+ |
+ Takes an array of Results.File instances and returns an array of |
+ Results.File and Results.Directory instances. |
+ """ |
+ assert not any(isinstance(f, Results.Directory) for f in files) |
+ # Remove non existent files. |
+ files = [f for f in files if f.existent] |
+ if not files: |
+ return files |
+ # All files must share the same root, which can be None. |
+ assert len(set(f.root for f in files)) == 1, set(f.root for f in files) |
+ |
+ def blacklist(f): |
+ return f in ('.git', '.svn') or f.endswith('.pyc') |
+ |
+ # Creates a {directory: {filename: File}} mapping, up to root. |
+ root = files[0].root |
+ assert root.endswith(os.path.sep) |
+ buckets = {} |
+ if root: |
+ buckets[root.rstrip(os.path.sep)] = {} |
+ for fileobj in files: |
+ path = fileobj.full_path |
+ directory = os.path.dirname(path) |
+ # Do not use os.path.basename() so trailing os.path.sep is kept. |
+ basename = path[len(directory)+1:] |
+ files_in_directory = buckets.setdefault(directory, {}) |
+ files_in_directory[basename] = fileobj |
+ # Add all the directories recursively up to root. |
+ while True: |
+ old_d = directory |
+ directory = os.path.dirname(directory) |
+ if directory + os.path.sep == root or directory == old_d: |
+ break |
+ buckets.setdefault(directory, {}) |
+ |
+ for directory in sorted(buckets, reverse=True): |
+ actual = set(f for f in os.listdir(directory) if not blacklist(f)) |
+ expected = set(buckets[directory]) |
+ if not (actual - expected): |
+ parent = os.path.dirname(directory) |
+ buckets[parent][os.path.basename(directory)] = Results.Directory( |
+ root, |
+ directory[len(root):], |
+ sum(f.size for f in buckets[directory].itervalues()), |
+ sum(f.nb_files for f in buckets[directory].itervalues())) |
+ # Remove the whole bucket. |
+ del buckets[directory] |
+ |
+ # Reverse the mapping with what remains. The original instances are returned, |
+ # so the cached meta data is kept. |
+ return sorted( |
+ sum((x.values() for x in buckets.itervalues()), []), |
+ key=lambda x: x.path) |
+ |
+ |
class Strace(ApiBase): |
"""strace implies linux.""" |
IGNORED = ( |
@@ -563,6 +801,15 @@ class Strace(ApiBase): |
def on_line(self, pid, line): |
self.get_or_set_proc(pid).on_line(line.strip()) |
+ def to_results(self): |
+    """Finds the root process and verifies consistency.""" |
+      # TODO(maruel): Absolutely unnecessary, fix me. |
+ root = [p for p in self.processes.itervalues() if not p.parentid] |
+ assert len(root) == 1 |
+ process = root[0].to_results_process() |
+ assert sorted(self.processes) == sorted(p.pid for p in process.all) |
+ return Results(process) |
+ |
def get_or_set_proc(self, pid): |
"""Returns the Context.Process instance for this pid or creates a new one. |
""" |
@@ -635,12 +882,8 @@ class Strace(ApiBase): |
# TODO(maruel): Load as utf-8 |
for line in open(pidfile, 'rb'): |
context.on_line(pid, line) |
- files, non_existent = context.resolve() |
- # Resolve any symlink we hit. |
- return ( |
- set(os.path.realpath(f) for f in files), |
- set(os.path.realpath(f) for f in non_existent), |
- len(context.processes)) |
+ |
+ return context.to_results() |
class Dtrace(ApiBase): |
@@ -827,6 +1070,13 @@ class Dtrace(ApiBase): |
match.group(4), |
match.group(5)) |
+ def to_results(self): |
+ """Uses self._initial_pid to determine the initial process.""" |
+ process = self.processes[self._initial_pid].to_results_process() |
+ assert sorted(self.processes) == sorted(p.pid for p in process.all), ( |
+ sorted(self.processes), sorted(p.pid for p in process.all)) |
+ return Results(process) |
+ |
def handle_dtrace_BEGIN(self, _ppid, pid, _function, args, _result): |
assert not self._tracer_pid and not self._initial_pid |
self._tracer_pid = pid |
@@ -1000,12 +1250,7 @@ class Dtrace(ApiBase): |
context = cls.Context(blacklist) |
for line in open(filename, 'rb'): |
context.on_line(line) |
- files, non_existent = context.resolve() |
- # Resolve any symlink we hit. |
- return ( |
- set(os.path.realpath(f) for f in files), |
- set(os.path.realpath(f) for f in non_existent), |
- len(context.processes)) |
+ return context.to_results() |
@staticmethod |
def _sort_log(logname): |
@@ -1131,6 +1376,13 @@ class LogmanTrace(ApiBase): |
else: |
assert False, '%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]) |
+ def to_results(self): |
+ """Uses self._initial_pid to determine the initial process.""" |
+ process = self.processes[self._initial_pid].to_results_process() |
+ assert sorted(self.processes) == sorted(p.pid for p in process.all), ( |
+ sorted(self.processes), sorted(p.pid for p in process.all)) |
+ return Results(process) |
+ |
def _thread_to_process(self, tid): |
"""Finds the process from the thread id.""" |
tid = int(tid, 16) |
@@ -1554,47 +1806,7 @@ class LogmanTrace(ApiBase): |
else: |
raise NotImplementedError('Implement %s' % logformat) |
- files, non_existent = context.resolve() |
- # Resolve any symlink we hit. |
- return ( |
- set(os.path.realpath(f) for f in files), |
- set(os.path.realpath(f) for f in non_existent), |
- len(context.processes)) |
- |
- |
-def relevant_files(files, root): |
- """Trims the list of files to keep the expected files and unexpected files. |
- |
- Unexpected files are files that are not based inside the |root| directory. |
- """ |
- expected = [] |
- unexpected = [] |
- for f in files: |
- if f.startswith(root): |
- f = f[len(root):] |
- assert f |
- expected.append(f) |
- else: |
- unexpected.append(f) |
- return sorted(set(expected)), sorted(set(unexpected)) |
- |
- |
-def extract_directories(files, root): |
- """Detects if all the files in a directory were loaded and if so, replace the |
- individual files by the directory entry. |
- """ |
- directories = set(os.path.dirname(f) for f in files) |
- files = set(files) |
- for directory in sorted(directories, reverse=True): |
- actual = set( |
- os.path.join(directory, f) for f in |
- os.listdir(os.path.join(root, directory)) |
- if not f.endswith(('.svn', '.pyc')) |
- ) |
- if not (actual - files): |
- files -= actual |
- files.add(directory + os.path.sep) |
- return sorted(files) |
+ return context.to_results() |
def pretty_print(variables, stdout): |
@@ -1750,16 +1962,10 @@ def load_trace(logfile, root_dir, api): |
trace or not. |
- api: a tracing api instance. |
""" |
- root_dir = get_native_path_case(root_dir) |
- files, non_existent, processes = api.parse_log(logfile, get_blacklist(api)) |
- expected, unexpected = relevant_files( |
- files, root_dir.rstrip(os.path.sep) + os.path.sep) |
- # In case the file system is case insensitive. |
- expected = sorted(set( |
- get_native_path_case(os.path.join(root_dir, f))[len(root_dir)+1:] |
- for f in expected)) |
- simplified = extract_directories(expected, root_dir) |
- return files, expected, unexpected, non_existent, simplified, processes |
+ results = api.parse_log(logfile, get_blacklist(api)) |
+ results = results.strip_root(root_dir) |
+ simplified = extract_directories(results.files) |
+ return results, simplified |
def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): |
@@ -1806,26 +2012,24 @@ def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): |
return returncode |
print_if('Loading traces... %s' % logfile) |
- files, expected, unexpected, non_existent, simplified, _ = load_trace( |
- logfile, root_dir, api) |
- |
- print_if('Total: %d' % len(files)) |
- print_if('Non existent: %d' % len(non_existent)) |
- for f in non_existent: |
- print_if(' %s' % f) |
- if unexpected: |
- print_if('Unexpected: %d' % len(unexpected)) |
- for f in unexpected: |
- print_if(' %s' % f) |
- print_if('Interesting: %d reduced to %d' % (len(expected), len(simplified))) |
+ results, simplified = load_trace(logfile, root_dir, api) |
+ |
+ print_if('Total: %d' % len(results.files)) |
+ print_if('Non existent: %d' % len(results.non_existent)) |
+ for f in results.non_existent: |
+ print_if(' %s' % f.path) |
+ print_if( |
+ 'Interesting: %d reduced to %d' % ( |
+ len(results.existent), len(simplified))) |
for f in simplified: |
- print_if(' %s' % f) |
+ print_if(' %s' % f.path) |
if cwd_dir is not None: |
value = { |
'conditions': [ |
['OS=="%s"' % get_flavor(), { |
- 'variables': generate_dict(simplified, cwd_dir, product_dir), |
+ 'variables': generate_dict( |
+ [f.path for f in simplified], cwd_dir, product_dir), |
}], |
], |
} |