OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # |
| 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 4 # Use of this source code is governed by a BSD-style license that can be |
| 5 # found in the LICENSE file. |
| 6 |
| 7 """Helper script to shard build bot steps and save results to disk. |
| 8 |
| 9 Our buildbot infrastructure requires each slave to run steps serially. |
| 10 This is sub-optimal for Android, where these steps can run independently on
| 11 multiple connected devices. |
| 12 |
| 13 The buildbots will run this script multiple times per cycle: |
| 14 - First, without params: all steps will be executed in parallel using all |
| 15 connected devices. Step results will be pickled to disk (each step has a unique |
| 16 name). |
| 17 The buildbot will treat this step as a regular step, and will not process any |
| 18 graph data. |
| 19 |
| 20 - Then, with -p STEP_NAME: at this stage, we'll simply print the file with the |
| 21 step results previously saved. The buildbot will then process the graph data |
| 22 accordingly. |
| 23 |
| 24 The JSON config is a file containing a dictionary in the format:
| 25 { |
| 26 'step_name_foo': 'script_to_execute foo', |
| 27 'step_name_bar': 'script_to_execute bar' |
| 28 } |
| 29 |
| 30 Note that script_to_execute must take at least the following
| 31 options:
| 32 --device: the serial number to be passed to all adb commands. |
| 33 --keep_test_server_ports: indicates it's being run as a shard, and shouldn't |
| 34 reset test server port allocation. |
| 35 """ |
| 36 |
| 37 |
| 38 import datetime |
| 39 import json |
| 40 import logging |
| 41 import multiprocessing |
| 42 import optparse |
| 43 import pexpect |
| 44 import pickle |
| 45 import os |
| 46 import signal |
| 47 import shutil |
| 48 import sys |
| 49 |
| 50 from pylib import android_commands |
| 51 from pylib import cmd_helper |
| 52 from pylib import constants |
| 53 from pylib import ports |
| 54 |
| 55 |
| 56 _OUTPUT_DIR = os.path.join(constants.CHROME_DIR, 'out', 'step_results') |
| 57 |
| 58 |
| 59 def _SaveResult(result): |
| 60   with open(os.path.join(_OUTPUT_DIR, result['name']), 'w') as f:
| 61 f.write(pickle.dumps(result)) |
| 62 |
| 63 |
| 64 def _RunStepsPerDevice(steps): |
| 65 results = [] |
| 66 for step in steps: |
| 67 start_time = datetime.datetime.now() |
| 68     print 'Starting %s: %s at %s on %s' % (step['name'], step['cmd'],
| 69                                            start_time, step['device'])
| 70 output, exit_code = pexpect.run( |
| 71 step['cmd'], cwd=os.path.abspath(constants.CHROME_DIR), |
| 72 withexitstatus=True, logfile=sys.stdout, timeout=1800, |
| 73 env=os.environ) |
| 74 end_time = datetime.datetime.now() |
| 75     print 'Finished %s: %s at %s on %s' % (step['name'], step['cmd'],
| 76                                            end_time, step['device'])
| 77 result = {'name': step['name'], |
| 78 'output': output, |
| 79 'exit_code': exit_code or 0, |
| 80 'total_time': (end_time - start_time).seconds, |
| 81 'device': step['device']} |
| 82 _SaveResult(result) |
| 83 results += [result] |
| 84 return results |
| 85 |
| 86 |
| 87 def _RunShardedSteps(steps, devices): |
| 88 assert steps |
| 89 assert devices, 'No devices connected?' |
| 90 if os.path.exists(_OUTPUT_DIR): |
| 91 assert '/step_results' in _OUTPUT_DIR |
| 92 shutil.rmtree(_OUTPUT_DIR) |
| 93 if not os.path.exists(_OUTPUT_DIR): |
| 94 os.makedirs(_OUTPUT_DIR) |
| 95 step_names = sorted(steps.keys()) |
| 96 all_params = [] |
| 97 num_devices = len(devices) |
| 98 shard_size = (len(steps) + num_devices - 1) / num_devices |
| 99 for i, device in enumerate(devices): |
| 100 steps_per_device = [] |
| 101     for s in step_names[i * shard_size:(i + 1) * shard_size]:
| 102 steps_per_device += [{'name': s, |
| 103 'device': device, |
| 104 'cmd': steps[s] + ' --device ' + device + |
| 105 ' --keep_test_server_ports'}] |
| 106 all_params += [steps_per_device] |
| 107 print 'Start sharding (note: output is not synchronized...)' |
| 108 print '*' * 80 |
| 109 start_time = datetime.datetime.now() |
| 110 pool = multiprocessing.Pool(processes=num_devices) |
| 111 async_results = pool.map_async(_RunStepsPerDevice, all_params) |
| 112 results_per_device = async_results.get(999999) |
| 113 end_time = datetime.datetime.now() |
| 114 print '*' * 80 |
| 115 print 'Finished sharding.' |
| 116 print 'Summary' |
| 117 total_time = 0 |
| 118 for results in results_per_device: |
| 119 for result in results: |
| 120 print('%s : exit_code=%d in %d secs at %s' % |
| 121 (result['name'], result['exit_code'], result['total_time'], |
| 122 result['device'])) |
| 123 total_time += result['total_time'] |
| 124 print 'Step time: %d secs' % ((end_time - start_time).seconds) |
| 125 print 'Bots time: %d secs' % total_time |
| 126 # No exit_code for the sharding step: the individual _PrintResults step |
| 127 # will return the corresponding exit_code. |
| 128 return 0 |
| 129 |
| 130 |
| 131 def _PrintStepOutput(step_name): |
| 132 file_name = os.path.join(_OUTPUT_DIR, step_name) |
| 133 if not os.path.exists(file_name): |
| 134     print 'File not found:', file_name
| 135 return 1 |
| 136   with open(file_name, 'r') as f:
| 137 result = pickle.loads(f.read()) |
| 138 print result['output'] |
| 139 return result['exit_code'] |
| 140 |
| 141 |
| 142 def _KillPendingServers(): |
| 143 for retry in range(5): |
| 144 for server in ['lighttpd', 'web-page-replay']: |
| 145 pids = cmd_helper.GetCmdOutput(['pgrep', '-f', server]) |
| 146 pids = [pid.strip() for pid in pids.split('\n') if pid.strip()] |
| 147 for pid in pids: |
| 148 try: |
| 149 logging.warning('Killing %s %s', server, pid) |
| 150 os.kill(int(pid), signal.SIGQUIT) |
| 151 except Exception as e: |
| 152 logging.warning('Failed killing %s %s %s', server, pid, e) |
| 153 |
| 154 |
| 155 def main(argv): |
| 156 parser = optparse.OptionParser() |
| 157 parser.add_option('-s', '--steps', |
| 158 help='A JSON file containing all the steps to be ' |
| 159 'sharded.') |
| 160 parser.add_option('-p', '--print_results', |
| 161 help='Only prints the results for the previously ' |
| 162 'executed step, do not run it again.') |
| 163   options, _ = parser.parse_args(argv)
| 164 if options.print_results: |
| 165 return _PrintStepOutput(options.print_results) |
| 166 |
| 167 # At this point, we should kill everything that may have been left over from |
| 168 # previous runs. |
| 169 _KillPendingServers() |
| 170 |
| 171 # Reset the test port allocation. It's important to do it before starting |
| 172 # to dispatch any step. |
| 173 if not ports.ResetTestServerPortAllocation(): |
| 174 raise Exception('Failed to reset test server port.') |
| 175 |
| 176 # Sort the devices so that we'll try to always run a step in the same device. |
| 177 devices = sorted(android_commands.GetAttachedDevices()) |
| 178 if not devices: |
| 179 print 'You must attach a device' |
| 180 return 1 |
| 181 |
| 182   with open(options.steps, 'r') as f:
| 183 steps = json.load(f) |
| 184 return _RunShardedSteps(steps, devices) |
| 185 |
| 186 |
| 187 if __name__ == '__main__': |
| 188 sys.exit(main(sys.argv)) |
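
For reference, a minimal sketch of the two-phase invocation described in the docstring. The sharder file name (bb_sharder.py), the steps file name, the step names and the commands are all hypothetical; only the -s and -p options come from the script above.

import json
import subprocess

SHARDER = 'bb_sharder.py'          # hypothetical name for the script above
STEPS_FILE = 'sharded_steps.json'  # hypothetical steps config

steps = {
    'step_name_foo': 'script_to_execute foo',
    'step_name_bar': 'script_to_execute bar',
}
with open(STEPS_FILE, 'w') as f:
  json.dump(steps, f)

# Phase 1: run every step, sharded across all attached devices; each result is
# pickled to out/step_results/<step_name> under the Chrome root.
subprocess.call(['python', SHARDER, '-s', STEPS_FILE])

# Phase 2: replay each step's saved output so the buildbot can process the
# graph data and pick up the step's real exit code.
for name in sorted(steps):
  exit_code = subprocess.call(['python', SHARDER, '-p', name])
  print '%s finished with exit code %d' % (name, exit_code)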
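
And a minimal sketch of the contract a script_to_execute has to honour: per the docstring it must accept --device and --keep_test_server_ports. The script body and everything else here is made up for illustration.

#!/usr/bin/env python
# Hypothetical step script; only the two options below come from the sharder's
# contract, the rest is illustrative.
import optparse
import sys


def main(argv):
  parser = optparse.OptionParser()
  parser.add_option('--device',
                    help='Serial number passed to all adb commands.')
  parser.add_option('--keep_test_server_ports', action='store_true',
                    help='Run as a shard; do not reset test server port '
                         'allocation.')
  options, _ = parser.parse_args(argv)
  if not options.device:
    parser.error('--device is required when run as a shard.')
  # A real step would drive adb/tests here; this sketch only reports back.
  print 'Running step on device %s' % options.device
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))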