diff --git a/nodepool/exceptions.py b/nodepool/exceptions.py index ef39fe649..5f4409d72 100644 --- a/nodepool/exceptions.py +++ b/nodepool/exceptions.py @@ -23,3 +23,19 @@ class BuilderInvalidCommandError(BuilderError): class DibFailedError(BuilderError): pass + + +class TimeoutException(Exception): + pass + + +class SSHTimeoutException(TimeoutException): + statsd_key = 'error.ssh' + + +class IPAddTimeoutException(TimeoutException): + statsd_key = 'error.ipadd' + + +class ServerDeleteException(TimeoutException): + statsd_key = 'error.serverdelete' diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index e054fffc2..d6c320436 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -34,6 +34,7 @@ import zmq import allocation import jenkins_manager import nodedb +import exceptions import nodeutils as utils import provider_manager import stats @@ -407,6 +408,19 @@ class NodeLauncher(threading.Thread): dt = self.launchNode(session) failed = False statsd_key = 'ready' + self.log.debug('Node %s ready in provider: %s' % + (self.node_id, self.provider.name)) + except exceptions.TimeoutException as e: + # Don't log exception for timeouts. Each one has + # a specific Exception, and we know it's a timeout, so + # the traceback in the log is just noise + self.log.error("Timeout launching node id: %s " + "in provider: %s error: %s" % + (self.node_id, self.provider.name, + str(e))) + dt = int((time.time() - start_time) * 1000) + failed = True + statsd_key = e.statsd_key except Exception as e: self.log.exception("%s launching node id: %s " "in provider: %s error:" % diff --git a/nodepool/nodeutils.py b/nodepool/nodeutils.py index 55b98da39..3c1217d06 100644 --- a/nodepool/nodeutils.py +++ b/nodepool/nodeutils.py @@ -25,6 +25,8 @@ from sshclient import SSHClient import fakeprovider import paramiko +import exceptions + log = logging.getLogger("nodepool.utils") @@ -32,14 +34,14 @@ ITERATE_INTERVAL = 2 # How long to sleep while waiting for something # in a loop -def iterate_timeout(max_seconds, purpose): +def iterate_timeout(max_seconds, exc, purpose): start = time.time() count = 0 while (time.time() < start + max_seconds): count += 1 yield count time.sleep(ITERATE_INTERVAL) - raise Exception("Timeout waiting for %s" % purpose) + raise exc("Timeout waiting for %s" % purpose) def ssh_connect(ip, username, connect_kwargs={}, timeout=60): @@ -47,7 +49,8 @@ def ssh_connect(ip, username, connect_kwargs={}, timeout=60): return fakeprovider.FakeSSHClient() # HPcloud may return ECONNREFUSED or EHOSTUNREACH # for about 30 seconds after adding the IP - for count in iterate_timeout(timeout, "ssh access"): + for count in iterate_timeout( + timeout, exceptions.SSHTimeoutException, "ssh access"): try: client = SSHClient(ip, username, **connect_kwargs) break diff --git a/nodepool/provider_manager.py b/nodepool/provider_manager.py index e177714e2..b08df4168 100644 --- a/nodepool/provider_manager.py +++ b/nodepool/provider_manager.py @@ -28,6 +28,7 @@ import sys import shade import novaclient +import exceptions from nodeutils import iterate_timeout from task_manager import Task, TaskManager, ManagerStoppedException @@ -36,6 +37,14 @@ SERVER_LIST_AGE = 5 # How long to keep a cached copy of the server list IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list +class ServerCreateException(exceptions.TimeoutException): + statsd_key = 'error.servertimeout' + + +class ImageCreateException(exceptions.TimeoutException): + statsd_key = 'error.imagetimeout' + + def get_public_ip(server, provider, version=4): for addr in server.addresses.get('public', []): if type(addr) == type(u''): # Rackspace/openstack 1.0 @@ -456,10 +465,12 @@ class ProviderManager(TaskManager): def _waitForResource(self, resource_type, resource_id, timeout): last_status = None - for count in iterate_timeout(timeout, - "%s %s in %s" % (resource_type, - resource_id, - self.provider.name)): + if resource_type == 'server': + exc = ServerCreateException + elif resource_type == 'image': + exc = ImageCreateException + for count in iterate_timeout(timeout, exc, + "%s creation" % resource_type): try: if resource_type == 'server': resource = self.getServerFromList(resource_id) @@ -497,8 +508,9 @@ class ProviderManager(TaskManager): return self._waitForResource('server', server_id, timeout) def waitForServerDeletion(self, server_id, timeout=600): - for count in iterate_timeout(600, "server %s deletion in %s" % - (server_id, self.provider.name)): + for count in iterate_timeout(600, + exceptions.ServerDeleteException, + "server %s deletion " % server_id): try: self.getServerFromList(server_id) except NotFound: @@ -528,8 +540,9 @@ class ProviderManager(TaskManager): # from removing this IP. self.deleteFloatingIP(ip['id']) raise - for count in iterate_timeout(600, "ip to be added to %s in %s" % - (server_id, self.provider.name)): + for count in iterate_timeout(600, + exceptions.IPAddTimeoutException, + "ip to be added to %s" % server_id): try: newip = self.getFloatingIP(ip['id']) except ManagerStoppedException: