Merge "Improve logging/stats around launch errors"

This commit is contained in:
Jenkins 2014-05-01 14:20:13 +00:00 committed by Gerrit Code Review
commit 7747eb461c
1 changed files with 106 additions and 36 deletions

View File

@ -51,6 +51,22 @@ DELETE_DELAY = 1 * MINS # Delay before deleting a node that has completed
# its job.
class LaunchNodepoolException(Exception):
statsd_key = 'error.nodepool'
class LaunchStatusException(Exception):
statsd_key = 'error.status'
class LaunchNetworkException(Exception):
statsd_key = 'error.network'
class LaunchAuthException(Exception):
statsd_key = 'error.auth'
class NodeCompleteThread(threading.Thread):
log = logging.getLogger("nodepool.NodeCompleteThread")
@ -310,16 +326,35 @@ class NodeLauncher(threading.Thread):
return
try:
self.launchNode(session)
start_time = time.time()
dt = self.launchNode(session)
failed = False
statsd_key = 'ready'
except Exception, e:
self.log.exception("%s launching node id: %s "
"in provider: %s error:" %
(e.__class__.__name__,
self.node_id, self.provider.name))
dt = int((time.time() - start_time) * 1000)
failed = True
if hasattr(e, 'statsd_key'):
statsd_key = e.statsd_key
else:
statsd_key = 'error.unknown'
try:
self.nodepool.launchStats(statsd_key, dt, self.image.name,
self.provider.name,
self.node_target_name)
except Exception:
self.log.exception("Exception launching node id: %s:" %
self.node_id)
self.log.exception("Exception reporting launch stats:")
if failed:
try:
self.nodepool.deleteNode(self.node_id)
except Exception:
self.log.exception("Exception deleting node id: %s:" %
self.node_id)
return
def launchNode(self, session):
start_time = time.time()
@ -333,8 +368,10 @@ class NodeLauncher(threading.Thread):
snap_image = session.getCurrentSnapshotImage(
self.provider.name, self.image.name)
if not snap_image:
raise Exception("Unable to find current snapshot image %s in %s" %
(self.image.name, self.provider.name))
raise LaunchNodepoolException("Unable to find current snapshot "
"image %s in %s" %
(self.image.name,
self.provider.name))
self.log.info("Creating server with hostname %s in %s from image %s "
"for node id: %s" % (hostname, self.provider.name,
@ -349,15 +386,17 @@ class NodeLauncher(threading.Thread):
(server_id, self.node.id))
server = self.manager.waitForServer(server_id, self.launch_timeout)
if server['status'] != 'ACTIVE':
raise Exception("Server %s for node id: %s status: %s" %
(server_id, self.node.id, server['status']))
raise LaunchStatusException("Server %s for node id: %s "
"status: %s" %
(server_id, self.node.id,
server['status']))
ip = server.get('public_v4')
if not ip and self.manager.hasExtension('os-floating-ips'):
ip = self.manager.addPublicIP(server_id,
pool=self.provider.pool)
if not ip:
raise Exception("Unable to find public IP of server")
raise LaunchNetworkException("Unable to find public IP of server")
self.node.ip = ip
self.log.debug("Node id: %s is running, ip: %s, testing ssh" %
@ -366,15 +405,10 @@ class NodeLauncher(threading.Thread):
if not utils.ssh_connect(ip, self.image.username,
connect_kwargs=connect_kwargs,
timeout=self.timeout):
raise Exception("Unable to connect via ssh")
raise LaunchAuthException("Unable to connect via ssh")
if statsd:
dt = int((time.time() - start_time) * 1000)
key = 'nodepool.launch.%s.%s.%s' % (self.image.name,
self.provider.name,
self.target.name)
statsd.timing(key, dt)
statsd.incr(key)
# Save the elapsed time for statsd
dt = int((time.time() - start_time) * 1000)
if self.label.subnodes:
self.log.info("Node id: %s is waiting on subnodes" % self.node.id)
@ -412,6 +446,8 @@ class NodeLauncher(threading.Thread):
self.createJenkinsNode()
self.log.info("Node id: %s added to jenkins" % self.node.id)
return dt
def createJenkinsNode(self):
jenkins = self.nodepool.getJenkinsManager(self.target)
@ -531,11 +567,29 @@ class SubNodeLauncher(threading.Thread):
return
try:
self.launchSubNode(session)
start_time = time.time()
dt = self.launchSubNode(session)
failed = False
statsd_key = 'ready'
except Exception, e:
self.log.exception("%s launching subnode id: %s "
"for node id: %s in provider: %s error:" %
(e.__class__.__name__, self.subnode_id,
self.node_id, self.provider.name))
dt = int((time.time() - start_time) * 1000)
failed = True
if hasattr(e, 'statsd_key'):
statsd_key = e.statsd_key
else:
statsd_key = 'error.unknown'
try:
self.nodepool.launchStats(statsd_key, dt, self.image.name,
self.provider.name, self.target.name)
except Exception:
self.log.exception("Exception launching subnode id: %s for "
"node id: %s:" %
(self.subnode_id, self.node_id))
self.log.exception("Exception reporting launch stats:")
if failed:
try:
self.nodepool.deleteSubNode(self.subnode, self.manager)
except Exception:
@ -555,8 +609,10 @@ class SubNodeLauncher(threading.Thread):
snap_image = session.getCurrentSnapshotImage(
self.provider.name, self.image.name)
if not snap_image:
raise Exception("Unable to find current snapshot image %s in %s" %
(self.image.name, self.provider.name))
raise LaunchNodepoolException("Unable to find current snapshot "
"image %s in %s" %
(self.image.name,
self.provider.name))
self.log.info("Creating server with hostname %s in %s from image %s "
"for subnode id: %s for node id: %s"
@ -573,17 +629,18 @@ class SubNodeLauncher(threading.Thread):
(server_id, self.subnode_id, self.node_id))
server = self.manager.waitForServer(server_id, self.launch_timeout)
if server['status'] != 'ACTIVE':
raise Exception("Server %s for subnode id: %s for node id: %s "
"status: %s" %
(server_id, self.subnode_id, self.node_id,
server['status']))
raise LaunchStatusException("Server %s for subnode id: "
"%s for node id: %s "
"status: %s" %
(server_id, self.subnode_id,
self.node_id, server['status']))
ip = server.get('public_v4')
if not ip and self.manager.hasExtension('os-floating-ips'):
ip = self.manager.addPublicIP(server_id,
pool=self.provider.pool)
if not ip:
raise Exception("Unable to find public IP of server")
raise LaunchNetworkException("Unable to find public IP of server")
self.subnode.ip = ip
self.log.debug("Subnode id: %s for node id: %s is running, ip: %s, "
@ -593,21 +650,18 @@ class SubNodeLauncher(threading.Thread):
if not utils.ssh_connect(ip, self.image.username,
connect_kwargs=connect_kwargs,
timeout=self.timeout):
raise Exception("Unable to connect via ssh")
raise LaunchAuthException("Unable to connect via ssh")
if statsd:
dt = int((time.time() - start_time) * 1000)
key = 'nodepool.launch.%s.%s.%s' % (self.image.name,
self.provider.name,
self.node_target_name)
statsd.timing(key, dt)
statsd.incr(key)
# Save the elapsed time for statsd
dt = int((time.time() - start_time) * 1000)
self.subnode.state = nodedb.READY
self.log.info("Subnode id: %s for node id: %s is ready"
% (self.subnode_id, self.node_id))
self.nodepool.updateStats(session, self.provider.name)
return dt
class ImageUpdater(threading.Thread):
log = logging.getLogger("nodepool.ImageUpdater")
@ -1711,3 +1765,19 @@ class NodePool(threading.Thread):
for provider in self.config.providers.values():
key = 'nodepool.provider.%s.max_servers' % provider.name
statsd.gauge(key, provider.max_servers)
def launchStats(self, subkey, dt, image_name, provider_name, target_name):
if not statsd:
return
#nodepool.launch.provider.PROVIDER.subkey
#nodepool.launch.image.IMAGE.subkey
#nodepool.launch.target.TARGET.subkey
#nodepool.launch.subkey
for key in [
'nodepool.launch.provider.%s.%s' % (provider_name, subkey),
'nodepool.launch.image.%s.%s' % (image_name, subkey),
'nodepool.launch.target.%s.%s' % (target_name, subkey),
'nodepool.launch.%s' % (subkey,),
]:
statsd.timing(key, dt)
statsd.incr(key)