| #!/usr/bin/python |
| |
| """ |
| Poll server-status on cautotest to watch for RPCs taking longer than 10s. Then |
| we go and ssh around to figure out what the command line of the process that |
| caused the RPC was so that one can track down what is generating the expensive |
| RPC load. |
| """ |
| |
| try: |
| from bs4 import BeautifulSoup |
| except ImportError: |
| print 'Run `apt-get install python-bs4`' |
| raise |
| |
| import time |
| import subprocess |
| import multiprocessing |
| |
| import common |
| import requests |
| |
| |
def check_cautotest(threshold=10, timeout=30):
    """Return the Apache workers on cautotest that are stuck in a long request.

    Fetches http://cautotest/server-status and scans the rows of the first
    table on the page. A row is reported when its fourth cell is 'W', its
    sixth cell is an integer greater than `threshold`, and its second cell
    (the process id) is not '-'.

    Args:
        threshold: Minimum value (exclusive) of the sixth cell, in seconds,
            for a worker to be reported. Defaults to 10.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of (pid, status, seconds) tuples of strings, copied verbatim
        from the table cells. The list is empty when the page cannot be
        fetched or contains no table.
    """
    try:
        page = requests.get('http://cautotest/server-status',
                            timeout=timeout).text
    except requests.exceptions.RequestException as e:
        # The server is most likely to be slow or unreachable exactly when it
        # is overloaded; skip this poll instead of hanging or crashing.
        print('Could not fetch server-status: %s' % e)
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on this machine.
    soup = BeautifulSoup(page, 'html.parser')
    if soup.table is None:
        return []

    pids = []
    for row in soup.table.findAll('tr'):
        cols = [x.text.strip() for x in row.findAll('td')]
        # Header rows have no <td> cells at all; anything shorter than six
        # cells cannot be a worker row either.
        if len(cols) < 6:
            continue
        if cols[3] != 'W' or cols[1] == '-':
            continue
        try:
            seconds = int(cols[5])
        except ValueError:
            continue
        if seconds > threshold:
            pids.append((cols[1], cols[3], cols[5]))
    return pids
| |
def pull_cautotest_info(proc_id):
    """Find the remote endpoint of a cautotest process's established connection.

    Runs `sudo lsof -i` on cautotest (through `become`) and looks for the
    first ESTABLISHED line whose PID column is exactly `proc_id`. The remote
    side of that line's NAME column ("local->remote") is split on ':'.

    Args:
        proc_id: Process id on cautotest, as a string or int.

    Returns:
        The remote endpoint split on ':' (normally [host, port]), or None if
        the command fails or no matching connection is found.
    """
    try:
        listing = subprocess.check_output(
            'become chromeos-test@cautotest -- "sudo lsof -i"',
            shell=True, universal_newlines=True)
        wanted = str(proc_id)
        remote_info = None
        for line in listing.splitlines():
            fields = line.split()
            # Compare the PID column exactly: a substring match on the whole
            # line would also hit other PIDs, ports and addresses that merely
            # contain the same digits.
            if (len(fields) > 8 and fields[1] == wanted
                    and 'ESTABLISHED' in line and '->' in fields[8]):
                remote_info = fields[8].split('->')[1].split(':')
                break
    except Exception:
        # Best effort: any failure just means no connection info is available.
        remote_info = None
    return remote_info
| |
def strace_cautotest(proc_id):
    """Capture the start of an strace of a process running on cautotest.

    Attaches `strace -s 500 -p <proc_id>` on cautotest (through `become`),
    folds stderr into stdout and keeps only the first 20 lines.

    Args:
        proc_id: Process id on cautotest to attach to.

    Returns:
        The captured output, or an empty string if the command exits with a
        non-zero status.
    """
    # NOTE(review): presumably this blocks until the pipeline has produced
    # 20 lines or the traced process goes away -- confirm whether a hung
    # worker can stall the caller here.
    remote_cmd = '"sudo strace -s 500 -p %s 2>&1 | head -n 20"' % proc_id
    full_cmd = 'become chromeos-test@cautotest -- ' + remote_cmd
    try:
        return subprocess.check_output(full_cmd, shell=True)
    except subprocess.CalledProcessError:
        return ""
| |
def pull_drone_info(host, port):
    """Return the command line of the process on `host` that owns `port`.

    Runs `sudo lsof -i` on `host` (through `become`), picks the first
    ESTABLISHED line with an endpoint whose port is exactly `port`, and then
    reads /proc/<pid>/cmdline for the PID listed on that line.

    Args:
        host: Machine to inspect.
        port: Port of the connection, exactly as lsof displays it.

    Returns:
        The raw contents of /proc/<pid>/cmdline (arguments are separated by
        NUL bytes and returned unmodified), or '' if any step fails or no
        matching connection is found.
    """
    try:
        listing = subprocess.check_output(
            'become chromeos-test@%s -- "sudo lsof -i"' % host,
            shell=True, universal_newlines=True)
        wanted = str(port)
        proc_id = None
        for line in listing.splitlines():
            fields = line.split()
            if len(fields) < 9 or 'ESTABLISHED' not in line:
                continue
            # Compare whole port values: a plain ":<port>" substring match
            # would also accept longer ports sharing the same prefix
            # (":4567" inside ":45678").
            endpoints = fields[8].split('->')
            if any(end.rsplit(':', 1)[-1] == wanted for end in endpoints):
                proc_id = fields[1]
                break
        if proc_id is None:
            return ''
        cmdline = subprocess.check_output(
            'become chromeos-test@%s -- "cat /proc/%s/cmdline"'
            % (host, proc_id), shell=True, universal_newlines=True)
    except Exception:
        # Best effort: the caller treats an empty string as "unknown".
        cmdline = ''
    return cmdline
| |
def pull_all_data(pid, queue):
    """Collect every diagnostic for one stuck worker and post it to `queue`.

    Intended to run in a worker process. Exactly one item is put on the
    queue per call so the consumer can count results: a
    (pid, remote_info, drone_info, straced) tuple on success, or None if
    anything raised along the way.

    Args:
        pid: Tuple describing the stuck worker; its first element is the
            process id on cautotest.
        queue: Queue-like object with a put() method that receives the result.
    """
    try:
        proc_id = pid[0]
        remote_info = pull_cautotest_info(proc_id)
        # Drone details can only be looked up once the remote end is known.
        drone_info = pull_drone_info(*remote_info) if remote_info else None
        straced = strace_cautotest(proc_id)
        queue.put((pid, remote_info, drone_info, straced))
    except Exception:
        queue.put(None)
def print_data(x):
    """Print a report for one stuck worker and ring the terminal bell.

    Args:
        x: A (pid, remote_info, drone_info, straced) tuple, where pid is
            itself a 3-tuple of (process id, status, seconds).
    """
    worker, remote_info, drone_info, straced = x
    print("*** %s stuck in %s for %s secs" % worker)
    for detail in (remote_info, drone_info, straced):
        print(detail)
    # BEL character: gives an audible alert on terminals that support it.
    print('\a')
| |
def main(poll_interval=5, worker_timeout=60):
    """Poll cautotest forever, reporting details about every stuck worker.

    Each cycle starts one process per stuck worker returned by
    check_cautotest(), prints each result as it arrives, then terminates all
    worker processes and sleeps before polling again.

    Args:
        poll_interval: Seconds to sleep between polling cycles.
        worker_timeout: Maximum total seconds to wait for the results of one
            cycle. Workers that have not reported by then are terminated
            without a report, so one hung remote command cannot stall the
            whole script.
    """
    # multiprocessing queues signal a get() timeout with the Empty exception
    # of the standard queue module, whose name differs between Python 2 and 3.
    try:
        from Queue import Empty
    except ImportError:
        from queue import Empty

    while True:
        results = multiprocessing.Queue()
        workers = []
        for pid in check_cautotest():
            worker = multiprocessing.Process(target=pull_all_data,
                                             args=(pid, results))
            worker.start()
            workers.append(worker)

        # Every worker puts exactly one item on the queue, so read one item
        # per worker, but never wait past the shared deadline.
        deadline = time.time() + worker_timeout
        for _ in workers:
            remaining = max(deadline - time.time(), 0)
            try:
                data = results.get(timeout=remaining)
            except Empty:
                print('Timed out waiting for worker results; moving on.')
                break
            if data:
                print_data(data)

        for worker in workers:
            worker.terminate()
            # Reap the child so finished workers do not linger as zombies.
            worker.join()
        time.sleep(poll_interval)


# Guard the entry point so that importing this module (which multiprocessing
# itself may do on some platforms) does not start the polling loop.
if __name__ == '__main__':
    main()