Chromium Code Reviews| Index: tools/isolate/trace_inputs.py | 
| diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py | 
| index b51caeb1373183b08a113c9d378ef5e6402690fc..29b595db156b724589112ce4623955794eacdb2b 100755 | 
| --- a/tools/isolate/trace_inputs.py | 
| +++ b/tools/isolate/trace_inputs.py | 
| @@ -277,6 +277,41 @@ class ApiBase(object): | 
| self.initial_cwd = initial_cwd | 
| self.cwd = None | 
| self.files = set() | 
| + self.executable = None | 
| + self.command = None | 
| + | 
| + if parentid: | 
| + self.root().processes[parentid].children.append(pid) | 
| + | 
    def to_results_process(self):
        """Converts this traced process into a Results.Process instance.

        Resolves file path case sensitivity and/or late-bound strings, and
        recursively converts the children processes.
        """
        children = [
            self.root().processes[c].to_results_process() for c in self.children
        ]
        # When resolving files, it's normal to get a dupe because a file could
        # be opened multiple times with different case. Resolve the
        # deduplication here.
        def render_to_string_and_fix_case(x):
            """Returns the native file path case if the file exists.

            Converts late-bound strings.
            """
            if not x:
                return x
            # TODO(maruel): Do not upconvert to unicode here, on linux we don't
            # know the file path encoding so they must be treated as bytes.
            x = unicode(x)
            if not os.path.exists(x):
                return x
            return get_native_path_case(x)

        return Results.Process(
            self.pid,
            set(map(render_to_string_and_fix_case, self.files)),
            render_to_string_and_fix_case(self.executable),
            self.command,
            render_to_string_and_fix_case(self.initial_cwd),
            children)
| def add_file(self, filepath): | 
| if self.root().blacklist(unicode(filepath)): | 
| @@ -288,23 +323,6 @@ class ApiBase(object): | 
| self.blacklist = blacklist | 
| self.processes = {} | 
| - def resolve(self): | 
| - """Resolve all the filenames and returns them.""" | 
| - files = set() | 
| - non_existent = set() | 
| - for p in self.processes.itervalues(): | 
| - for filepath in p.files: | 
| - filepath = unicode(filepath) | 
| - # For late-bound file paths, it could be blacklisted after all the | 
| - # processes are processed so it needs to be checked again. | 
| - if self.blacklist(filepath): | 
| - break | 
| - if os.path.isfile(filepath): | 
| - files.add(filepath) | 
| - else: | 
| - non_existent.add(filepath) | 
| - return files, non_existent | 
| - | 
| @staticmethod | 
| def clean_trace(logname): | 
| """Deletes the old log.""" | 
| @@ -333,6 +351,222 @@ class ApiBase(object): | 
| raise NotImplementedError(cls.__class__.__name__) | 
class Results(object):
    """Results of a trace session."""

    class File(object):
        """A file that was accessed by the traced processes.

        The file may not be present on disk anymore.
        """
        def __init__(self, root, path):
            """Represents a file accessed. May not be present anymore.

            Exactly one of |root| or an absolute |path| must be provided.
            """
            logging.debug('%s(%s, %s)' % (self.__class__.__name__, root, path))
            self.root = root
            self.path = path

            # Lazily computed by the |size| property; -1 means non existent.
            self._size = None
            # For compatibility with Directory object interface.
            # Shouldn't be used normally, only exists to simplify algorithms.
            self.nb_files = 1

            assert path, path
            assert bool(root) != bool(isabs(path)), (root, path)
            # When the file exists, the path must already be in native case.
            assert (
                not os.path.exists(self.full_path) or
                self.full_path == get_native_path_case(self.full_path))

        @property
        def existent(self):
            """True if the file exists on disk."""
            return self.size != -1

        @property
        def size(self):
            """File's size. -1 is not existent."""
            if self._size is None:
                try:
                    self._size = os.stat(self.full_path).st_size
                except OSError:
                    self._size = -1
            return self._size

        @property
        def full_path(self):
            """Absolute path to the file."""
            if self.root:
                return os.path.join(self.root, self.path)
            return self.path

        def flatten(self):
            """Returns a JSON-serializable representation."""
            return {
                'path': self.path,
                'size': self.size,
            }

        def strip_root(self, root):
            """Returns a clone of itself with 'root' stripped off.

            Returns None when the file is not under |root|.
            """
            assert isabs(root) and root.endswith(os.path.sep), root
            if not self.full_path.startswith(root):
                return None
            out = self.__class__(root, self.full_path[len(root):])
            # Keep size cache.
            out._size = self._size
            return out

    class Directory(File):
        """A directory of files. Must exist."""
        def __init__(self, root, path, size, nb_files):
            """path='.' is a valid value and must be handled appropriately."""
            # A trailing separator is appended below; the caller must not have
            # added one already, otherwise full_path would be malformed.
            assert not path.endswith(os.path.sep), path
            super(Results.Directory, self).__init__(root, path)
            self.path = self.path + os.path.sep
            self.nb_files = nb_files
            self._size = size

        def flatten(self):
            out = super(Results.Directory, self).flatten()
            out['nb_files'] = self.nb_files
            return out

    class Process(object):
        """A process that was traced.

        Contains references to the files accessed by this process and its
        children.
        """
        def __init__(
                self, pid, files, executable, command, initial_cwd, children):
            logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
            self.pid = pid
            # Sorted list of Results.File instances, unique by path.
            self.files = sorted(
                (Results.File(None, f) for f in files), key=lambda x: x.path)
            assert len(set(f.path for f in self.files)) == len(self.files), [
                f.path for f in self.files]
            assert isinstance(children, list)
            assert isinstance(self.files, list)
            self.children = children
            self.executable = executable
            self.command = command
            self.initial_cwd = initial_cwd

        @property
        def all(self):
            """Iterates over all the children processes, then self."""
            for child in self.children:
                for i in child.all:
                    yield i
            yield self

        def flatten(self):
            """Returns a JSON-serializable representation of the tree."""
            return {
                'children': [c.flatten() for c in self.children],
                'command': self.command,
                'executable': self.executable,
                'files': [f.flatten() for f in self.files],
                'initial_cwd': self.initial_cwd,
                'pid': self.pid,
            }

        def strip_root(self, root):
            """Returns a clone keeping only the files under |root|, with
            their paths made relative to it.
            """
            assert isabs(root) and root.endswith(os.path.sep), root
            out = self.__class__(
                self.pid,
                [],
                self.executable,
                self.command,
                self.initial_cwd,
                [c.strip_root(root) for c in self.children])
            # Override the files property.
            out.files = filter(None, (f.strip_root(root) for f in self.files))
            logging.debug(
                'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
            return out

    def __init__(self, process):
        """|process| is the root Results.Process of the traced process tree."""
        self.process = process
        # Cache of the aggregated, sorted file list.
        self._files = None

    def flatten(self):
        """Returns a JSON-serializable representation."""
        return {
            'root': self.process.flatten(),
        }

    @property
    def files(self):
        """All the files accessed, aggregated over the whole process tree."""
        if self._files is None:
            self._files = sorted(
                sum((p.files for p in self.process.all), []),
                key=lambda x: x.path)
        return self._files

    @property
    def existent(self):
        """Accessed files that still exist on disk."""
        return [f for f in self.files if f.existent]

    @property
    def non_existent(self):
        """Accessed files that do not exist on disk anymore."""
        return [f for f in self.files if not f.existent]

    def strip_root(self, root):
        """Returns a clone with all the files outside the directory |root|
        removed and converts all the path to be relative paths.
        """
        root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
        logging.debug('strip_root(%s)' % root)
        return Results(self.process.strip_root(root))
| + | 
def extract_directories(files):
    """Detects if all the files in a directory are in |files| and if so, replace
    the individual files by a Results.Directory instance.

    Takes an array of Results.File instances and returns an array of
    Results.File and Results.Directory instances.
    """
    assert not any(isinstance(f, Results.Directory) for f in files)
    # Remove non existent files.
    files = [f for f in files if f.existent]
    if not files:
        return files
    # All files must share the same root, which can be None.
    assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

    def blacklist(f):
        return f in ('.git', '.svn') or f.endswith('.pyc')

    # Creates a {directory: {filename: File}} mapping, up to root.
    root = files[0].root
    if root:
        # The loop below compares 'directory + os.path.sep' against root, so a
        # root without a trailing separator would never match.
        assert root.endswith(os.path.sep), root
    buckets = {}
    if root:
        buckets[root.rstrip(os.path.sep)] = {}
    for fileobj in files:
        path = fileobj.full_path
        directory = os.path.dirname(path)
        # |directory| is path's dirname, so os.path.basename(path) is exactly
        # the remainder of |path| after |directory| and the separator.
        files_in_directory = buckets.setdefault(directory, {})
        files_in_directory[os.path.basename(path)] = fileobj
        # Add all the directories recursively up to root.
        while True:
            old_d = directory
            directory = os.path.dirname(directory)
            if directory + os.path.sep == root or directory == old_d:
                break
            buckets.setdefault(directory, {})

    # Process deepest directories first so a fully-covered subtree collapses
    # bottom-up into its parent bucket.
    for directory in sorted(buckets, reverse=True):
        actual = set(f for f in os.listdir(directory) if not blacklist(f))
        expected = set(buckets[directory])
        if not (actual - expected):
            # Everything in the directory was accessed: replace the individual
            # entries with a single Results.Directory in the parent's bucket.
            parent = os.path.dirname(directory)
            buckets[parent][os.path.basename(directory)] = Results.Directory(
                root,
                directory[len(root):],
                sum(f.size for f in buckets[directory].itervalues()),
                sum(f.nb_files for f in buckets[directory].itervalues()))
            # Remove the whole bucket.
            del buckets[directory]

    # Reverse the mapping with what remains. The original instances are
    # returned, so the cached meta data is kept.
    return sorted(
        sum((bucket.values() for bucket in buckets.itervalues()), []),
        key=lambda x: x.path)
| + | 
| + | 
| class Strace(ApiBase): | 
| """strace implies linux.""" | 
| IGNORED = ( | 
| @@ -563,6 +797,15 @@ class Strace(ApiBase): | 
    def on_line(self, pid, line):
        """Routes one strace output line to the Process context owning |pid|."""
        self.get_or_set_proc(pid).on_line(line.strip())
| + def to_results(self): | 
| + """Finds back the root process and verify consistency.""" | 
| + # TODO(maruel): Absolutely unecessary, fix me. | 
| + root = [p for p in self.processes.itervalues() if not p.parentid] | 
| + assert len(root) == 1 | 
| + process = root[0].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all) | 
| + return Results(process) | 
| + | 
| def get_or_set_proc(self, pid): | 
| """Returns the Context.Process instance for this pid or creates a new one. | 
| """ | 
| @@ -635,12 +878,8 @@ class Strace(ApiBase): | 
| # TODO(maruel): Load as utf-8 | 
| for line in open(pidfile, 'rb'): | 
| context.on_line(pid, line) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| + | 
| + return context.to_results() | 
| class Dtrace(ApiBase): | 
| @@ -827,6 +1066,13 @@ class Dtrace(ApiBase): | 
| match.group(4), | 
| match.group(5)) | 
| + def to_results(self): | 
| + """Uses self._initial_pid to determine the initial process.""" | 
| + process = self.processes[self._initial_pid].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all), ( | 
| + sorted(self.processes), sorted(p.pid for p in process.all)) | 
| + return Results(process) | 
| + | 
| def handle_dtrace_BEGIN(self, _ppid, pid, _function, args, _result): | 
| assert not self._tracer_pid and not self._initial_pid | 
| self._tracer_pid = pid | 
| @@ -1000,12 +1246,7 @@ class Dtrace(ApiBase): | 
| context = cls.Context(blacklist) | 
| for line in open(filename, 'rb'): | 
| context.on_line(line) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| + return context.to_results() | 
| @staticmethod | 
| def _sort_log(logname): | 
| @@ -1131,6 +1372,13 @@ class LogmanTrace(ApiBase): | 
| else: | 
| assert False, '%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]) | 
| + def to_results(self): | 
| + """Uses self._initial_pid to determine the initial process.""" | 
| + process = self.processes[self._initial_pid].to_results_process() | 
| + assert sorted(self.processes) == sorted(p.pid for p in process.all), ( | 
| + sorted(self.processes), sorted(p.pid for p in process.all)) | 
| + return Results(process) | 
| + | 
| def _thread_to_process(self, tid): | 
| """Finds the process from the thread id.""" | 
| tid = int(tid, 16) | 
| @@ -1553,47 +1801,7 @@ class LogmanTrace(ApiBase): | 
| else: | 
| raise NotImplementedError('Implement %s' % logformat) | 
| - files, non_existent = context.resolve() | 
| - # Resolve any symlink we hit. | 
| - return ( | 
| - set(os.path.realpath(f) for f in files), | 
| - set(os.path.realpath(f) for f in non_existent), | 
| - len(context.processes)) | 
| - | 
| - | 
| -def relevant_files(files, root): | 
| - """Trims the list of files to keep the expected files and unexpected files. | 
| - | 
| - Unexpected files are files that are not based inside the |root| directory. | 
| - """ | 
| - expected = [] | 
| - unexpected = [] | 
| - for f in files: | 
| - if f.startswith(root): | 
| - f = f[len(root):] | 
| - assert f | 
| - expected.append(f) | 
| - else: | 
| - unexpected.append(f) | 
| - return sorted(set(expected)), sorted(set(unexpected)) | 
| - | 
| - | 
| -def extract_directories(files, root): | 
| - """Detects if all the files in a directory were loaded and if so, replace the | 
| - individual files by the directory entry. | 
| - """ | 
| - directories = set(os.path.dirname(f) for f in files) | 
| - files = set(files) | 
| - for directory in sorted(directories, reverse=True): | 
| - actual = set( | 
| - os.path.join(directory, f) for f in | 
| - os.listdir(os.path.join(root, directory)) | 
| - if not f.endswith(('.svn', '.pyc')) | 
| - ) | 
| - if not (actual - files): | 
| - files -= actual | 
| - files.add(directory + os.path.sep) | 
| - return sorted(files) | 
| + return context.to_results() | 
| def pretty_print(variables, stdout): | 
| @@ -1749,16 +1957,10 @@ def load_trace(logfile, root_dir, api): | 
| trace or not. | 
| - api: a tracing api instance. | 
| """ | 
| - root_dir = get_native_path_case(root_dir) | 
| - files, non_existent, processes = api.parse_log(logfile, get_blacklist(api)) | 
| - expected, unexpected = relevant_files( | 
| - files, root_dir.rstrip(os.path.sep) + os.path.sep) | 
| - # In case the file system is case insensitive. | 
| - expected = sorted(set( | 
| - get_native_path_case(os.path.join(root_dir, f))[len(root_dir)+1:] | 
| - for f in expected)) | 
| - simplified = extract_directories(expected, root_dir) | 
| - return files, expected, unexpected, non_existent, simplified, processes | 
| + results = api.parse_log(logfile, get_blacklist(api)) | 
| + results = results.strip_root(root_dir) | 
| + simplified = extract_directories(results.files) | 
| + return results, simplified | 
| def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): | 
| @@ -1805,26 +2007,24 @@ def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace): | 
| return returncode | 
| print_if('Loading traces... %s' % logfile) | 
| - files, expected, unexpected, non_existent, simplified, _ = load_trace( | 
| - logfile, root_dir, api) | 
| - | 
| - print_if('Total: %d' % len(files)) | 
| - print_if('Non existent: %d' % len(non_existent)) | 
| - for f in non_existent: | 
| - print_if(' %s' % f) | 
| - if unexpected: | 
| - print_if('Unexpected: %d' % len(unexpected)) | 
| - for f in unexpected: | 
| - print_if(' %s' % f) | 
| - print_if('Interesting: %d reduced to %d' % (len(expected), len(simplified))) | 
| + results, simplified = load_trace(logfile, root_dir, api) | 
| + | 
| + print_if('Total: %d' % len(results.files)) | 
| + print_if('Non existent: %d' % len(results.non_existent)) | 
| + for f in results.non_existent: | 
| + print_if(' %s' % f.path) | 
| + print_if( | 
| + 'Interesting: %d reduced to %d' % ( | 
| + len(results.existent), len(simplified))) | 
| for f in simplified: | 
| - print_if(' %s' % f) | 
| + print_if(' %s' % f.path) | 
| if cwd_dir is not None: | 
| value = { | 
| 'conditions': [ | 
| ['OS=="%s"' % get_flavor(), { | 
| - 'variables': generate_dict(simplified, cwd_dir, product_dir), | 
| + 'variables': generate_dict( | 
| + [f.path for f in simplified], cwd_dir, product_dir), | 
| }], | 
| ], | 
| } |