blob: 86aeb0fc151af93567bc89cb0635ce267eeebeca [file] [log] [blame]
# Copyright 2021 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# This script checks WiFi/Bluetooth peer devices in the lab and creates
# a Google spreadsheet listing the ones that are down.
# The Google sheet is displayed at go/wifi-down
#
# This is used by ACS lab to detect down devices
#
# This script gets data from 3 sources
# 1) data from dhcp file /usr/local/google/home/<user>/chromiumos/
# \chromeos-admin/puppet/modules/lab/files/dhcp-server/dhcpd.conf
# 2) Swarming data of all bots with label-wificell
# 3) data from g/cros_conn_device_lifecycle
#
# Once data from these three sources is combined, the script pings the devices
# that we are interested in. Any unreachable device is displayed in the dashboard
# for the lab team to rectify.
#
# Data from all sources is collected into device data, which has the following format.
# At each stage the 'ignore' flag is set to False if the device meets the criteria to be monitored.
# Any host/peer with the ignore flag set is not displayed in the dashboard.
#
# 'chromeos15-row8-rack2-host2': {'dhcp': True,
# 'doc': True,
# 'doc_data': {'board': 'gnawty',
# 'btpeers': [],
# 'model': 'gnawty',
# 'pool': 'wificell_perbuild'},
# 'ignore': False,
# 'ignore_reason' : ''
# 'peers': {'chromeos15-row8-rack2-host2-pcap': {'dhcp': True,
# 'doc': False,
# 'ignore': True,
# 'ssh_status': False,
# 'swarming': True},
# 'chromeos15-row8-rack2-host2-router': {'dhcp': True,
# 'doc': False,
# 'ignore': True,
# 'ssh_status': False,
# 'swarming': True}},
# 'ssh_status': False,
# 'swarming': True,
# 'swarming_data': {'bluetooth_label': True,
# 'board': 'gnawty',
# 'bt_label': False,
# 'bt_peers': [],
# 'conductive': True,
# 'deleted': False,
# 'host': 'chromeos15-row8-rack2-host2',
# 'hw_phase': 'PHASE_PVT',
# 'is_dead': False,
# 'missing': False,
# 'model': 'gnawty',
# 'pool': 'wificell_perbuild',
# 'servo': False,
# 'wifichip': 'wireless_intel'}},
#
#
# Note 1: Only devices in chromeos15- are checked
# Note 2: Currently the following peer devices are considered: PCAP, ROUTER, BTPEER1-4, SERVO, ATTENUATOR
# Note 3: Standalone RPMs in chromeos3 are added as special cases
# Note 4: For debugging this script, use debug_main and store intermediate results in files.
#
#
#TODO
# debug the hang
# servo already there
# rpm already there?
# attenuator already there
# separate doc issues to different sheet
# send mail
import csv
import datetime
import gspread
import json
import logging
import os
import pprint
import subprocess
import sys
import time
import queue
from oauth2client.service_account import ServiceAccountCredentials
from credentials import json_keyfile
from multiprocessing import Process
from multiprocessing import Queue
import get_wificell_data
import get_wifisheet_data
import get_dhcp_data
import rpm_list
# Change logging level to DEBUG for more logs
#logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
DASHBOARD_REFRESH_INTERVAL = 1000 # Time to wait between dashboard refreshes in seconds
CONNECTIVITY_RETEST_INTERVAL = 180 # Time to wait before rechecking connectivity to down devices in seconds
# Status strings stored in a device's 'ssh_status' field after a connectivity check.
HOST_UP = 'UP'
HOST_DOWN = 'DOWN'
HOST_NO_SSH = 'Online w/o SSH Con'
# Number of ping requests per probe (presumably; confirm where it is consumed).
PING_COUNT = 2
# Name of the Google spreadsheet passed to populate_dashboard().
SPREADSHEET_ALL = 'WiFi Devices DOWN'
# NOTE(review): WORKSHEET1/WORKSHEET2 do not appear to be referenced in this
# file - confirm before relying on or removing them.
WORKSHEET1 = 'LAB'
WORKSHEET2 = 'Documentation'
# Mapping integers to host status strings.
# The integer keys are the status codes computed in getHostStatus.
HOST_STATUS = {0: HOST_UP, 1: HOST_DOWN, 3: HOST_NO_SSH}
# Ignore devices in these pools
POOLS_TO_IGNORE = ['cross_device_multi_cb']
# Names of bluetooth peers (suffix appended to the host name)
BT_PEERS = ['btpeer1', 'btpeer2', 'btpeer3', 'btpeer4']
# Names of wifi peer devices (suffix appended to the host name)
WIFI_PEERS = ['router', 'pcap']
# Pools with attenuator
ATTENUATOR_POOLS = ['groamer', 'groamer_two', 'bt_groamer']
def _pretty_print(d, msg=''):
print('------------------------------------------------------------')
if msg != '':
print('====== %s =========' % msg)
if type(d) == dict:
pp = pprint.PrettyPrinter(indent=1)
pp.pprint(d)
print('length is %s' % len(d))
elif type(d) == list:
for i in d:
print(i)
print('length is %s' % len(d))
else:
print(d)
print('------------------------------------------------------------')
def _parse_doc_model_name(m):
""" parse Model name in the go/cros-conn-lifecycle sheet so it can be compared with swarming model name
It can be 'Mordin (Barla)' which be be parsed as [mordin, barla]
veyron_/auron_ prefixes should be removed
There can be WIP in the name which means that is should be ignored
"""
result = []
m = m.lower()
if '[wip]' in m:
result.append('[wip]')
m = m.strip('[wip]')
logging.debug('WIP device found')
if '(' in m:
for i in m.split('('):
i = i.strip().replace(')', '').lower()
result.append(i)
else:
result.append(m.strip().lower())
logging.debug('Returning %s for %s', result, m)
return result
def _make_peers(h, l):
if type(l) == list:
res = []
for p in l:
res.append(h + '-' + p)
return res
else:
return h + '-' + p
def getHostStatus(q, host):
    """Ping the host and check if it is ssh-able.

    Intended to run as a multiprocessing target: the outcome is not returned
    but put on `q` as a `(host, status)` tuple, where status is one of
    HOST_UP, HOST_DOWN or HOST_NO_SSH.

    Args:
        q: queue receiving the `(host, status)` result.
        host: host name to probe.
    """
    # Default so that the finally-block always has a status to report, even
    # if something other than Exception (e.g. KeyboardInterrupt) is raised.
    ret_status = HOST_DOWN
    try:
        logging.debug('Checking status of %s', host)
        # Grab the ping exit code.
        host_status_code = subprocess.call(
            ['ping', '-c%d' % PING_COUNT, host])
        # if the device is pingable, we check if port 22 is open to accept ssh connection.
        if host_status_code == 0:
            try:
                nc_output_code = subprocess.call(
                    ['nc', '-zv', '-w3', host, '22'])
            except OSError as e:
                # netcat could not be launched; the host answered ping but
                # ssh reachability could not be confirmed.
                logging.debug('netcat failed: %s (%s)', host, e)
                nc_output_code = None
            if nc_output_code != 0:
                host_status_code = 3
        # Any exit code without a mapping (e.g. unknown host) counts as down.
        ret_status = HOST_STATUS.get(host_status_code, HOST_DOWN)
    except Exception as e:
        logging.error('!!!!!!!! Exception %s while checking %s', str(e), host)
        ret_status = HOST_DOWN
    finally:
        logging.debug('Host %s returning status %s', host, ret_status)
        q.put((host, ret_status))
def get_rpm_list():
    """Return the list of RPM hosts exposed by the `rpm_list` module."""
    rpms = rpm_list.rpm_list
    return rpms
def update_rpm_data(device_data, rpm_list):
    """Register every RPM in `rpm_list` as a monitored device.

    Each RPM gets its own fresh entry in `device_data` (modified in place).
    The entry is marked as present in dhcp, swarming and doc so that only
    connectivity decides its dashboard status.
    """
    for rpm_host in rpm_list:
        entry = dict(
            ignore=False,
            ignore_reason='RPM not ignored',
            dhcp=True,  # Found in dhcp file
            ssh_status=False,
            swarming=True,  # RPM wont be in swarming
            doc=True,  # RPM wont be in doc
            pool='RPM',  # Add a false pool
            peers={},
            chromeos=False,
        )
        device_data[rpm_host] = entry
        logging.debug('dhcp other device added %s %s', rpm_host, entry)
def update_dhcp_data(device_data, hosts, peer_devices, other_devices):
    """Seed `device_data` (in place) with everything found in the dhcp file.

    * `other_devices` and `hosts` become top-level entries that are ignored
      by default; later stages clear the flag for devices of interest.
    * Each name in `peer_devices` is attached to the host whose name is the
      first four dash-separated components of the peer name. A placeholder
      host entry (marked as not in dhcp) is created if that host is unknown.
    * Peers whose name contains 'rpm' or 'servo' are never ignored, since
      they cannot be detected from swarming or the doc.
    """

    def _top_level_entry(reason, in_dhcp, is_chromeos):
        # Fresh dict (and fresh 'peers' dict) on every call.
        return {
            'ignore': True,
            'ignore_reason': reason,
            'dhcp': in_dhcp,
            'ssh_status': False,
            'swarming': False,
            'doc': False,
            'pool': None,
            'peers': {},
            'chromeos': is_chromeos,
        }

    for name in other_devices:
        device_data[name] = _top_level_entry(
            'Other devices ignored in update_dhcp', True, False)
        logging.debug('dhcp other device added %s %s', name, device_data[name])

    for name in hosts:
        # Hosts stay ignored unless a later stage finds them to be wificell.
        device_data[name] = _top_level_entry('host ignored in update_dhcp',
                                             True, True)
        logging.debug('dhcp host added %s %s', name, device_data[name])

    for peer in peer_devices:
        # rpm/servo are always tracked; they are flagged as present in
        # swarming and doc because neither source can report them.
        if 'rpm' in peer:
            reason, tracked = 'peer rpm not ignored in update_dhcp', True
        elif 'servo' in peer:
            reason, tracked = 'peer servo not ignored in update_dhcp', True
        else:
            # Ignore other peers unless they are found in swarming or doc.
            reason, tracked = 'peer ignored in update_dhcp', False
        peer_dict = {
            'ignore': not tracked,
            'ignore_reason': reason,
            'dhcp': True,  # Found in dhcp file
            'ssh_status': False,
            'swarming': tracked,
            'doc': tracked,
            'chromeos': False,
        }

        owner = '-'.join(peer.split('-')[:4])
        logging.debug('derived host %s from peername %s', owner, peer)
        if owner not in device_data:
            # The peer is in dhcp but its host is not.
            logging.debug('peer %s present in dhcp but host %s is not', peer,
                          owner)
            device_data[owner] = _top_level_entry(
                'host derived from peer ignored in update_dhcp', False, True)
        device_data[owner]['peers'][peer] = peer_dict
def update_swarming_data(device_data, swarming_data):
    """Update device data (in place) with swarming data.

    Every swarming host that is neither a chromeos3 device nor in an ignored
    pool is marked as a monitored wificell host, and the peers it is expected
    to have are marked as monitored as well.

    Args:
        device_data: dict keyed by host name, as built by the dhcp stage.
        swarming_data: dict keyed by host name; the keys 'pool', 'bt_peers'
            and 'servo' are read from each value.
    """
    for h, v in swarming_data.items():
        if 'chromeos3' in h:
            logging.debug('Igonring chaos device %s in chromeos3', h)
            continue
        pool = v['pool']
        if pool in POOLS_TO_IGNORE:
            logging.debug(' %s is in ignored pool %s', h, pool)
            continue
        if h not in device_data:
            logging.error(
                'host %s in swarming but not in dhcp. This should never happen',
                h)
            device_data[h] = {
                'ignore': False,
                'ignore_reason':
                'wificell host not ignored in update_swarming',
                'dhcp': False,  # Not Found in dhcp file
                'ssh_status': False,
                'swarming': True,
                'doc': False,
                'pool': None,
                'peers': {},
                'chromeos': True
            }
        host_entry = device_data[h]
        host_entry['ignore'] = False
        # Plain string: a trailing comma here used to store a 1-tuple.
        host_entry['ignore_reason'] = (
            'wificell host not ignored in update_swarming')
        host_entry['swarming'] = True
        # Recorded for every swarming host, including the ones that were not
        # in dhcp, so that later stages can rely on both keys.
        host_entry['pool'] = pool
        host_entry['swarming_data'] = v

        # update status of peer devices
        expected_peers = []
        # wificell devices always have the wifi peers, except in the
        # bt_groamer pool
        if pool != 'bt_groamer':
            expected_peers.extend(h + '-' + suffix for suffix in WIFI_PEERS)
        # some pools have attenuator
        if pool in ATTENUATOR_POOLS:
            expected_peers.append(h + '-attenuator')
        # number of btpeers vary. Get the number from swarming
        expected_peers.extend(
            h + '-' + suffix for suffix in BT_PEERS[:len(v['bt_peers'])])
        # check only servo v3
        # NOTE(review): peer name '<host>-servo' is assumed - confirm against
        # the servo names used in the dhcp file.
        if v['servo']:
            expected_peers.append(h + '-servo')
        logging.debug('Expected peers for host %s is %s', h, expected_peers)

        peers = host_entry['peers']
        for peer in expected_peers:
            if peer not in peers:
                # Peer indicated in swarming data but not in dhcp
                logging.debug('Peer %s not in dhcp but in swarming', peer)
                peers[peer] = {
                    'ignore': False,
                    'ignore_reason':
                    'peer of wificell host not ignored in update_swarming',
                    'dhcp': False,  # Not found in dhcp file
                    'ssh_status': False,
                    'swarming': True,
                    'doc': False,
                    'chromeos': False
                }
            else:
                peers[peer]['swarming'] = True
                peers[peer]['ignore'] = False
                peers[peer]['ignore_reason'] = (
                    'peer of wificell host not ignored in update_swarming')
def update_conn_doc_data(device_data, conn_doc_data):
""" Update device data using go/cros_conn_device_lifecycle data.

device_data: dict keyed by host name; it is modified in place.
conn_doc_data: dict keyed by host name. The keys 'pool', 'model', 'btpeers'
and 'labels' are read from each value.

Hosts found in the doc are marked 'doc': True and un-ignored, unless the
doc pool is in POOLS_TO_IGNORE, the model is empty, or the model is marked
[WIP]. The peers documented for the host get the same treatment.
"""
for h, v in conn_doc_data.items():
if h not in device_data:
logging.debug(
'host %s not in swarming or dhcp but in go/cros_conn_device_lifecycle',
h)
device_data[h] = {
'ignore': False, # All DUT in doc is important
'ignore_reason': 'device found in conn_doc',
'dhcp': False, # not found in dhcp file
'ssh_status': False,
'swarming': False, # not found in swarming
'pool': None,
'doc': True,
'peers': {},
'chromeos': True
}
else:
device_data[h]['doc'] = True
device_data[h]['ignore'] = False
# Keep the raw doc row; generate_dashboard compares it with swarming data.
device_data[h]['doc_data'] = v
# Ignore this host and peers
# Used for test bed until construction
ignore_test_bed = False
# 'pool' is None when no earlier stage recorded a pool for this host.
if device_data[h]['pool'] is None:
if v['pool'] in POOLS_TO_IGNORE:
logging.debug(
'device %s doc data has pool %s which is to be ignored', h,
v['pool'])
device_data[h]['ignore'] = True
device_data[h]['ignore_reason'] = 'pool ignored'
ignore_test_bed = True
if v['model'] == '':
logging.debug('Empty model. Ignoring %s', h)
device_data[h]['ignore'] = True
device_data[h]['ignore_reason'] = 'empty model ignored'
ignore_test_bed = True
# _parse_doc_model_name puts the '[wip]' marker in its result for WIP models.
if '[wip]' in _parse_doc_model_name(v['model']):
logging.debug('WIP Ignoring %s', h)
device_data[h]['ignore'] = True
device_data[h]['ignore_reason'] = 'WIP device ignored'
ignore_test_bed = True
# update status of peers
documented_peers = []
# Peer names are '<host>-<suffix>'.
for i in v['btpeers']:
documented_peers.append(h + '-' + i)
# bt_groamer doesn't have wifi peers
if 'wificell' in v['labels'] and v['pool'] != 'bt_groamer':
for i in WIFI_PEERS:
documented_peers.append(h + '-' + i)
logging.debug('documented peers for %s is %s', h, documented_peers)
if v['pool'] in ATTENUATOR_POOLS:
documented_peers.extend(_make_peers(h, ['attenuator']))
for peer in documented_peers:
if peer not in device_data[h]['peers']:
logging.debug('%s in doc data but not swarming', peer)
device_data[h]['peers'][peer] = {
'ignore':
ignore_test_bed, # peer is ignored only when its host's test bed is ignored
'ignore_reason':
'peer if dut with ignore_test_bed %s ' % ignore_test_bed,
'dhcp': False, # Not found in dhcp file
'ssh_status': False,
'swarming': False, # Not found in swarming
'doc': True,
'chromeos': False
}
else:
device_data[h]['peers'][peer]['doc'] = True
device_data[h]['peers'][peer]['ignore'] = ignore_test_bed
def check_connectivity(device_data, recheck=False):
""" Check if devices are pingable and sshable.

device_data: dict keyed by host name; the 'ssh_status' of every checked
host and peer is overwritten with the status string reported by
getHostStatus.
recheck: when True, only devices whose current 'ssh_status' is not HOST_UP
are probed again.

Raises ValueError if a device reports twice or if the number of results
does not match the number of devices probed.
"""
def _add_to_result(result_dict, rhost, result):
# Store one probe result; a duplicate result for the same host is an error.
logging.debug('Adding to result %s %s', rhost, result)
if rhost in result_dict:
logging.error('rhost %s already present in result', rhost)
logging.error('This should not happen###')
raise ValueError
result_dict[rhost] = result
# 'hosts' is a list of host names; 'peers' maps peer name -> owning host name.
devices_to_check = {
'hosts': [],
'peers': {},
}
# Only check devices which are present in DHCP data
for host, host_value in device_data.items():
if not host_value['ignore'] and host_value['dhcp']:
# On recheck, check devices which is not up
if not recheck or host_value['ssh_status'] != HOST_UP:
devices_to_check['hosts'].append(host)
for peer, peer_value in host_value['peers'].items():
if not peer_value['ignore'] and peer_value['dhcp']:
if not recheck or peer_value['ssh_status'] != HOST_UP:
devices_to_check['peers'][peer] = host
# Flat list of every name to probe: hosts first, then peers.
device_list = devices_to_check['hosts'][:]
device_list.extend(list(devices_to_check['peers'].keys()))
#
# getHostStatus is called in a separate process for each dut.
# Each of these processes puts its result in a queue.
# The main process gets results from the queue and joins the processes.
# The processes were getting hung, probably since the queue was growing large.
# Adding code to remove items from the queue resolved the issue.
#
q = Queue(32000)
result_dict = {}
process_list = []
count = 0
for host in device_list:
p = Process(target=getHostStatus, args=(q, host))
p.start()
process_list.append((p, host))
logging.debug('starting check %s %s', host, count)
count += 1
# Opportunistically drain one result (non-blocking) while spawning.
try:
(rhost, result) = q.get(block=False)
_add_to_result(result_dict, rhost, result)
except queue.Empty:
pass
# Poll until every process has finished.
while process_list != []:
logging.info('{} processes remaining '.format(len(process_list)))
logging.debug(' process list %s result %s queue size %s ',
len(process_list), len(result_dict), q.qsize())
# NOTE(review): process_list is modified by remove() below while it is
# being iterated, which can skip an entry in the current pass; the
# enclosing while loop visits the remaining entries on a later pass.
for (p, host) in process_list:
# empty queue to prevent the process from hanging
try:
(rhost, result) = q.get(block=False)
_add_to_result(result_dict, rhost, result)
except queue.Empty:
pass
if not p.is_alive():
logging.info('{} process has ended'.format(host))
p.join()
process_list.remove((p, host))
else:
logging.info('{} process pending'.format(host))
logging.debug('sleeping for 3 seconds')
time.sleep(3)
# Collect whatever results are still queued.
while not q.empty():
(rhost, result) = q.get(timeout=2)
_add_to_result(result_dict, rhost, result)
# Every probed device must have produced exactly one result.
if len(result_dict) != len(device_list):
logging.error(
'Length of result %s is not equal to length'
'of device list %s', len(result_dict), len(device_list))
for h in result_dict:
if h not in device_list:
logging.error('%s not in device_list', h)
for h in device_list:
if h not in result_dict:
logging.error('%s not in result', h)
raise ValueError
_pretty_print(result_dict, 'result_dict')
# Write the results back into device_data.
for h in devices_to_check['hosts']:
device_data[h]['ssh_status'] = result_dict[h]
for p, h in devices_to_check['peers'].items():
device_data[h]['peers'][p]['ssh_status'] = result_dict[p]
# error conditions
# Status strings assigned to 'device_status' by generate_dashboard.
IGNORED = 'IGNORED'
IMPOSSIBLE = 'ERROR' # Impossible combination like device not in DHCP but ssh-able
NOT_DOCUMENTED = 'NOT DOCUMENTED'
NOT_IN_DHCP = 'NOT IN DHCP BUT DOCUMENTED'
NOT_IN_SWARMING = 'NOT IN SWARMING BUT IN DHCP'
NOT_REACHABLE = 'NOT PINGABLE OR SSH-ABLE'
IN_SWARMING_NOT_IN_DHCP = 'IN SWARMING BUT NOT IN DHCP FILE '
ONLINE_BUT_NOT_IN_DHCP = 'DEVICE IS UP BUT NOT IN DHCP FILE!'
ALL_OK = 'UP'
# Every status that counts as a problem (everything except IGNORED and ALL_OK).
BAD_STATES = [
NOT_REACHABLE, NOT_IN_SWARMING, NOT_DOCUMENTED, IMPOSSIBLE, NOT_IN_DHCP,
IN_SWARMING_NOT_IN_DHCP, ONLINE_BUT_NOT_IN_DHCP
]
# Truth table: (ignore, dhcp, ssh, swarming, doc) -> status.
# generate_dashboard passes `ssh_status != HOST_DOWN` as the ssh flag.
# Any combination with ignore == True maps to IGNORED.
error_dict = {
(False, False, False, False, False): IMPOSSIBLE,
(False, False, False, False, True): NOT_IN_DHCP,
(False, False, False, True, False): IN_SWARMING_NOT_IN_DHCP,
(False, False, False, True, True): IN_SWARMING_NOT_IN_DHCP,
(False, False, True, False, False): ONLINE_BUT_NOT_IN_DHCP,
(False, False, True, False, True): ONLINE_BUT_NOT_IN_DHCP,
(False, False, True, True, False): IN_SWARMING_NOT_IN_DHCP,
(False, False, True, True, True): IN_SWARMING_NOT_IN_DHCP,
(False, True, False, False, False): NOT_REACHABLE,
(False, True, False, False, True): NOT_REACHABLE,
(False, True, False, True, False): NOT_REACHABLE,
(False, True, False, True, True): NOT_REACHABLE,
(False, True, True, False, False): NOT_IN_SWARMING,
(False, True, True, False, True): NOT_IN_SWARMING,
(False, True, True, True, False): NOT_DOCUMENTED,
(False, True, True, True, True): ALL_OK,
(True, False, False, False, False): IGNORED,
(True, False, False, False, True): IGNORED,
(True, False, False, True, False): IGNORED,
(True, False, False, True, True): IGNORED,
(True, False, True, False, False): IGNORED,
(True, False, True, False, True): IGNORED,
(True, False, True, True, False): IGNORED,
(True, False, True, True, True): IGNORED,
(True, True, False, False, False): IGNORED,
(True, True, False, False, True): IGNORED,
(True, True, False, True, False): IGNORED,
(True, True, False, True, True): IGNORED,
(True, True, True, False, False): IGNORED,
(True, True, True, False, True): IGNORED,
(True, True, True, True, False): IGNORED,
(True, True, True, True, True): IGNORED,
}
def generate_dashboard(device_data):
""" Analyse device_data and prepare the result to be populated in the dashboard.

device_data is modified in place. Every host gains the keys
'device_status', 'documentation_errors', 'peer_error_found' and
'issue_found'; every peer gains 'device_status'.
"""
for host, hv in device_data.items():
logging.debug(host)
_pretty_print(hv)
peer_error_found = False # Unreachable peer which should be flagged in main dashboard
issue_found = False # Any other issue which is displayed in secondary dashboard
# The ssh flag is True unless ssh_status is exactly HOST_DOWN, so a device
# that was never probed is treated as reachable here.
hv['device_status'] = error_dict[(hv['ignore'], hv['dhcp'],
not (hv['ssh_status'] == HOST_DOWN),
hv['swarming'], hv['doc'])]
logging.debug(
'ignore %s dhcp %s swarming %s ssh_status %s not ssh_status == HOST_DOWN %s doc %s status %s',
hv['ignore'], hv['dhcp'], hv['swarming'], hv['ssh_status'],
not (hv['ssh_status'] == HOST_DOWN), hv['doc'],
hv['device_status'])
logging.debug(error_dict[(False, True, True, False, True)])
# main dashboard need not show status of DUT since there is a separate dashboard for that.
if hv['device_status'] != IGNORED and hv['device_status'] in BAD_STATES:
issue_found = True
logging.debug('device status is %s', hv['device_status'])
if 'peers' in hv.keys():
for peer, pv in hv['peers'].items():
logging.debug(peer)
logging.debug(pv)
# Same truth-table lookup as for the host.
pv['device_status'] = error_dict[(
pv['ignore'], pv['dhcp'],
not (pv['ssh_status'] == HOST_DOWN), pv['swarming'],
pv['doc'])]
logging.debug(
'ignore %s dhcp %s swarming %s ssh_status %s not(ssh_status == HOST_DOWN) %s doc %s',
pv['ignore'], pv['dhcp'], pv['swarming'], pv['ssh_status'],
not (pv['ssh_status'] == HOST_DOWN), pv['doc'])
# If the host is ignored then do not show it in the main dashboard;
# a bad peer is recorded as a secondary issue instead.
if hv['device_status'] == IGNORED:
logging.debug('device status is %s ignoring %s',
hv['device_status'], host)
issue_found = issue_found or pv[
'device_status'] in BAD_STATES
else:
peer_error_found = peer_error_found or pv[
'device_status'] == NOT_REACHABLE
logging.debug('device status is %s', pv['device_status'])
# Documentation errors
hv['documentation_errors'] = []
# check only chromeos devices and avoid ignored devices
if hv['device_status'] != IGNORED and hv['chromeos']:
# model/boards of host in go/conn-device-lifecycle is different from swarming
if hv['swarming'] and hv['doc']:
if hv['swarming_data']['model'] not in _parse_doc_model_name(
hv['doc_data']['model']):
hv['documentation_errors'].append(
'model in swarming "%s" differs from model in doc "%s"'
% (hv['swarming_data']['model'],
hv['doc_data']['model']))
if hv['swarming_data']['board'] != hv['doc_data'][
'board'].strip():
hv['documentation_errors'].append(
'board in swarming "%s" differs from board in doc "%s"'
% (hv['swarming_data']['board'],
hv['doc_data']['board']))
# Pool differs
if hv['swarming'] and hv['doc']:
if hv['swarming_data']['pool'] != hv['doc_data']['pool']:
hv['documentation_errors'].append(
'pool in swarming "%s" differs from pool in doc "%s"' %
(hv['swarming_data']['pool'], hv['doc_data']['pool']))
# wificell / conductive label differ
# NOTE(review): reads hv['swarming_data']['wificell'] - confirm that the
# swarming data always carries a 'wificell' key.
if hv['swarming'] and hv['doc']:
if hv['swarming_data']['wificell'] != (
'wificell' in hv['doc_data']['labels']):
hv['documentation_errors'].append(
'label:wificell differs between doc and swarming')
_pretty_print(hv)
logging.debug('label wificell discrepencise %s %s',
hv['swarming_data']['wificell'],
'wificell' in hv['doc_data']['labels'])
# bluetooth label not found
if hv['swarming'] and not hv['swarming_data']['bluetooth_label']:
hv['documentation_errors'].append('Bluetooth label not found')
if hv['documentation_errors'] != []:
logging.debug(hv['documentation_errors'])
hv['peer_error_found'] = peer_error_found
hv['issue_found'] = issue_found
_pretty_print(device_data)
# Debug listings of the ignored and impossible devices.
logging.debug('## IGNORED devices')
for host, hv in device_data.items():
if hv['device_status'] == IGNORED:
logging.debug('IGNORED DEVICE %s', (host))
_pretty_print(hv)
logging.debug('## IGNORED devices END')
logging.debug('## IMPOSSIBLE devices')
for host, hv in device_data.items():
if hv['device_status'] == IMPOSSIBLE:
logging.debug('IMPOSSIBLE DEVICE %s', (host))
_pretty_print(hv)
logging.debug('## IMPOSSIBLE devices END')
def populate_dashboard(spreadsheet_name, device_data):
    """Display the data in the dashboard.

    Opens the Google spreadsheet `spreadsheet_name` and rewrites its
    'DOWN PEERS', 'DOCUMENTATION ERRORS' and 'OTHER ERRORS' worksheets
    (which must already exist) from `device_data`.

    Args:
        spreadsheet_name: title of the spreadsheet to open.
        device_data: dict keyed by host name, as annotated by
            generate_dashboard ('device_status', 'documentation_errors',
            'peer_error_found' and 'issue_found' are read).
    """
    # Index n gives the letter of the n-th spreadsheet column (1-based).
    column_letters = '-ABCDEFGHIJKLMNOPQR'
    header_color = {'backgroundColor': {'red': 0.0, 'green': 0.5, 'blue': 0.5}}
    white = {'backgroundColor': {'red': 1.0, 'green': 1.0, 'blue': 1.0}}

    def _find_header(d):
        """Return the column header: fixed columns plus sorted peer suffixes."""
        header = [
            'pool',
            'host',
            'model',
            'host_status',
        ]
        peer_header = []
        for v in d.values():
            if 'peers' not in v:
                continue
            for p, pv in v['peers'].items():
                if pv['ignore']:
                    continue
                logging.debug(p)
                parts = p.split('-')
                # Peer names are normally <4-part host>-<suffix>; fall back to
                # the last component instead of raising IndexError on
                # shorter names.
                peer_suffix = parts[4] if len(parts) > 4 else parts[-1]
                if peer_suffix not in peer_header:
                    peer_header.append(peer_suffix)
        peer_header.sort()
        header.extend(peer_header)
        logging.debug('header is %s', header)
        return header

    def _write_preamble(wsheet, msgs, header):
        """Write message rows and the coloured header row.

        Returns the row number of the first data row.
        """
        for row, m in enumerate(msgs, start=1):
            # Each word of the message goes into its own cell.
            wsheet.insert_row(m.split(' '), row)
            logging.debug('Writing %s at %s', m, row)
        header_row = 1 + len(msgs)
        wsheet.insert_row([h.upper() for h in header], header_row)
        logging.debug('writing header at %s', header_row)
        wsheet.format('A%s:S%s' % (header_row, header_row), header_color)
        return header_row + 1

    def _fetch_cells(wsheet, first_row, column_count, data):
        """Fetch the cell range able to hold one row per entry of `data`."""
        range_label = 'A%s:%s%s' % (first_row, column_letters[column_count],
                                    first_row + len(data))
        logging.debug('range_label %s', range_label)
        cell_list = wsheet.range(range_label)
        logging.debug('cell_list Info: %s', (cell_list))
        return cell_list

    def _populate_document_sheet(wsheet, msgs, header, data):
        """Write one row per host that has documentation errors."""
        first_row = _write_preamble(wsheet, msgs, header)
        row_length = 12
        cell_list = _fetch_cells(wsheet, first_row, row_length, data)
        cell_list_index = 0
        for host in sorted(data):
            hv = data[host]
            if hv['documentation_errors'] == []:
                continue
            logging.debug('%s %s', host, hv['documentation_errors'])
            _pretty_print(hv)
            model = hv['swarming_data']['model'] if hv['swarming'] else '--'
            logging.debug('%s %s %s %s', hv['pool'], host, model,
                          hv['documentation_errors'])
            row = [hv['pool'], host, model]
            row.extend(hv['documentation_errors'])
            # Pad with blanks so that the next host starts on a new row.
            row.extend([''] * (row_length - len(row)))
            for value in row:
                cell_list[cell_list_index].value = value
                cell_list_index += 1
        wsheet.update_cells(cell_list)

    def _populate_lab_sheet(wsheet,
                            msgs,
                            header,
                            data,
                            error_field='peer_error_found'):
        """Write one row per host whose `error_field` flag is set."""

        def _shown_status(entry):
            # Unreachable devices show the raw connectivity result instead
            # of the generic NOT_REACHABLE status.
            if entry['device_status'] == NOT_REACHABLE:
                return entry['ssh_status']
            return entry['device_status']

        first_row = _write_preamble(wsheet, msgs, header)
        cell_list = _fetch_cells(wsheet, first_row, len(header), data)
        cell_list_index = 0
        for host in sorted(data):
            hv = data[host]
            if not hv[error_field]:
                continue
            if 'rpm' in host:
                print('error found')
            logging.debug('%s %s', host, hv['device_status'])
            _pretty_print(hv)
            model = hv['swarming_data']['model'] if hv['swarming'] else '--'
            row = [hv['pool'], host, model, _shown_status(hv)]
            logging.debug('%s %s %s', hv['pool'], host, hv['device_status'])
            # One column per peer suffix found by _find_header.
            for suffix in header[4:]:
                peername = host + '-' + suffix
                if 'peers' in hv and peername in hv['peers']:
                    row.append(_shown_status(hv['peers'][peername]))
                    logging.debug('%s %s', peername,
                                  hv['peers'][peername]['device_status'])
                else:
                    row.append('--')
                    logging.debug('peername not found %s', peername)
            for value in row:
                cell_list[cell_list_index].value = value
                cell_list_index += 1
        wsheet.update_cells(cell_list)

    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        json_keyfile, scope)
    gc = gspread.authorize(credentials)
    spreadsheet = gc.open(spreadsheet_name)

    # Wipe the three worksheets and reset their background to white.
    sheets = []
    for title in ('DOWN PEERS', 'DOCUMENTATION ERRORS', 'OTHER ERRORS'):
        wsheet = spreadsheet.worksheet(title)
        wsheet.clear()
        wsheet.format('A1:S1000', white)
        sheets.append(wsheet)
    wsheet1, wsheet2, wsheet3 = sheets

    lab_issues = any(v['peer_error_found'] for v in device_data.values())
    documentation_issues = any(
        v['documentation_errors'] != [] for v in device_data.values())
    other_issues = any(v['issue_found'] for v in device_data.values())

    no_issues = 'No Issues Found. Check other tabs'
    lab_messages = [] if lab_issues else [no_issues]
    doc_messages = [] if documentation_issues else [no_issues]
    other_messages = [] if other_issues else [no_issues]

    now = datetime.datetime.now()
    messages = [
        'LAST_UPDATED_AT %s' % str(now),
        'NEXT_UPDATE_WILL_BE_AT %s' %
        (str(now + datetime.timedelta(seconds=DASHBOARD_REFRESH_INTERVAL)))
    ]
    lab_messages.extend(messages)
    _pretty_print(lab_messages)
    doc_messages.extend(messages)
    _pretty_print(doc_messages)
    other_messages.extend(messages)
    _pretty_print(other_messages)

    header = _find_header(device_data)
    logging.debug('writing the lab sheet')
    _populate_lab_sheet(wsheet1, lab_messages, header, device_data)
    logging.debug('writing the other sheet')
    # The 'OTHER ERRORS' sheet gets its own messages; it used to receive the
    # lab messages, so its "No Issues Found" line reflected the wrong sheet.
    _populate_lab_sheet(wsheet3,
                        other_messages,
                        header,
                        device_data,
                        error_field='issue_found')
    logging.debug('writing the document sheet')
    header = ['pool', 'host', 'model', 'Documentation errors']
    _populate_document_sheet(wsheet2, doc_messages, header, device_data)
    logging.debug('populate_dashboard_complete')
def dict_diff(s1, s2):
    """Report the differences between two dicts (debug helper).

    Keys present in only one of the dicts are printed to stdout. Values
    under shared keys are compared recursively when the first value is a
    dict; other differing values are logged at debug level. Nothing happens
    unless both arguments are plain dicts.
    """
    if not (type(s1) is dict and type(s2) is dict):
        return
    for key in s1:
        if key not in s2:
            print('key %s missing in second' % key)
    for key in s2:
        if key not in s1:
            print('key %s missing in first' % key)
    for key, left in s1.items():
        if key not in s2:
            continue
        right = s2[key]
        if type(left) is dict:
            dict_diff(left, right)
        elif left != right:
            logging.debug('value %s %s differs', left, right)
def debug_main():
    """Debug version of main().

    Runs the same pipeline as main() but waits for Enter after each stage,
    reads swarming data from a cached JSON file and round-trips the
    intermediate device data through JSON files in the working directory.
    """

    def _json_roundtrip(path, payload):
        # Persist the intermediate result and continue with what was stored.
        with open(path, 'w') as outfile:
            json.dump(payload, outfile)
        with open(path) as json_file:
            return json.load(json_file)

    devices = {}

    # Read the list of rpms to check
    rpms = get_rpm_list()
    update_rpm_data(devices, rpms)

    # Get dhcp data and update device data
    dhcp_hosts, dhcp_peers, dhcp_others = get_dhcp_data.get_data()
    update_dhcp_data(devices, dhcp_hosts, dhcp_peers, dhcp_others)
    logging.debug("After update_dhcp_data")
    _pretty_print(devices)
    input()

    # use this to debug the script without getting swarming data everytime
    with open('/tmp/skylab_hosts.json') as cached_swarming:
        swarming = json.load(cached_swarming)
    _pretty_print(swarming)
    # Get swarming data and update device_data
    #swarming_data = get_wificell_data.get_data()
    #_pretty_print(swarming_data)
    update_swarming_data(devices, swarming)
    _pretty_print(devices)
    logging.debug("After update_swarming_data")
    input()

    # Get data from g/cros_conn_device_lifecycle and update device data
    doc_rows = get_wifisheet_data.get_wifisheet_data()
    update_conn_doc_data(devices, doc_rows)
    _pretty_print(devices)
    logging.debug("After update_conn_data")
    input()

    devices = _json_roundtrip('data.txt', devices)
    check_connectivity(devices)
    logging.debug("After check_connectivity")
    input()

    logging.info('Waiting for 2 seconds before checking connectivity again')
    time.sleep(2)
    check_connectivity(devices, recheck=True)
    logging.debug("After check_connectivity recheck")
    input()

    devices = _json_roundtrip('data2.txt', devices)
    _pretty_print(devices)
    generate_dashboard(devices)
    logging.debug("After generate_dashboard")
    input()

    populate_dashboard(SPREADSHEET_ALL, devices)
    logging.debug("After populate_dashboard")
    input()
def main():
    """Run one full dashboard refresh.

    Collects device data from the rpm list, the dhcp file, swarming and the
    connectivity doc, probes the monitored devices twice (the second pass
    only re-probes devices that were not up) and publishes the result.
    """
    devices = {}

    # Read the list of rpms to check
    rpms = get_rpm_list()
    update_rpm_data(devices, rpms)

    # Get dhcp data and update device data
    dhcp_hosts, dhcp_peers, dhcp_others = get_dhcp_data.get_data()
    update_dhcp_data(devices, dhcp_hosts, dhcp_peers, dhcp_others)

    # Get swarming data and update device_data
    swarming = get_wificell_data.get_data()
    update_swarming_data(devices, swarming)

    # Get data from g/cros_conn_device_lifecycle and update device data
    doc_rows = get_wifisheet_data.get_wifisheet_data()
    update_conn_doc_data(devices, doc_rows)

    # Check connectivity of devices, then give down devices a second chance
    check_connectivity(devices)
    logging.info('Waiting for %s seconds before checking connectivity again',
                 CONNECTIVITY_RETEST_INTERVAL)
    time.sleep(CONNECTIVITY_RETEST_INTERVAL)
    check_connectivity(devices, recheck=True)

    generate_dashboard(devices)
    populate_dashboard(SPREADSHEET_ALL, devices)
    _pretty_print(devices)
if __name__ == '__main__':
    # The script relies on python3-only behaviour; refuse to run otherwise.
    if sys.version_info[0] != 3:
        print('Please invoke with python3')
        sys.exit()
    # Refresh the dashboard forever; Ctrl-C during a refresh or the regular
    # sleep ends the script.
    while True:
        try:
            logging.debug('Ctrl-C to stop')
            main()
            #debug_main()
            logging.debug('Sleeping for %s seconds',
                          DASHBOARD_REFRESH_INTERVAL)
            time.sleep(DASHBOARD_REFRESH_INTERVAL)
        except KeyboardInterrupt:
            sys.exit()
        except Exception as err:
            # Report the failure, wait for the operator, then retry after
            # the usual refresh interval.
            logging.error(
                'Exception %s while running script. Press any key to continue',
                str(err))
            input()
            logging.debug('Sleeping for %s seconds',
                          DASHBOARD_REFRESH_INTERVAL)
            time.sleep(DASHBOARD_REFRESH_INTERVAL)