Be more specific in logging timeout exceptions

At the moment, grepping through logs to determine what's happening with
timeouts on a provider is difficult because for some errors the cause of
the timeout is on a different line than the provider in question.

Give each timeout a specific named exception, and then when we catch the
exceptions, log them specifically with node id, provider and then the
additional descriptive text from the timeout exception. This should
allow for easy grepping through logs to find specific instances of
types of timeouts - or of all timeouts. Also add a corresponding success
debug log so that comparitive greps/counts are also easy.

Change-Id: I889bd9b5d92f77ce9ff86415c775fe1cd9545bbc
This commit is contained in:
Monty Taylor 2016-03-03 08:26:33 -06:00
parent 536f7feab0
commit eed395d637
4 changed files with 57 additions and 11 deletions

View File

@ -23,3 +23,19 @@ class BuilderInvalidCommandError(BuilderError):
class DibFailedError(BuilderError): class DibFailedError(BuilderError):
pass pass
class TimeoutException(Exception):
pass
class SSHTimeoutException(TimeoutException):
statsd_key = 'error.ssh'
class IPAddTimeoutException(TimeoutException):
statsd_key = 'error.ipadd'
class ServerDeleteException(TimeoutException):
statsd_key = 'error.serverdelete'

View File

@ -34,6 +34,7 @@ import zmq
import allocation import allocation
import jenkins_manager import jenkins_manager
import nodedb import nodedb
import exceptions
import nodeutils as utils import nodeutils as utils
import provider_manager import provider_manager
import stats import stats
@ -407,6 +408,19 @@ class NodeLauncher(threading.Thread):
dt = self.launchNode(session) dt = self.launchNode(session)
failed = False failed = False
statsd_key = 'ready' statsd_key = 'ready'
self.log.debug('Node %s ready in provider: %s' %
(self.node_id, self.provider.name))
except exceptions.TimeoutException as e:
# Don't log exception for timeouts. Each one has
# a specific Exception, and we know it's a timeout, so
# the traceback in the log is just noise
self.log.error("Timeout launching node id: %s "
"in provider: %s error: %s" %
(self.node_id, self.provider.name,
str(e)))
dt = int((time.time() - start_time) * 1000)
failed = True
statsd_key = e.statsd_key
except Exception as e: except Exception as e:
self.log.exception("%s launching node id: %s " self.log.exception("%s launching node id: %s "
"in provider: %s error:" % "in provider: %s error:" %

View File

@ -25,6 +25,8 @@ from sshclient import SSHClient
import fakeprovider import fakeprovider
import paramiko import paramiko
import exceptions
log = logging.getLogger("nodepool.utils") log = logging.getLogger("nodepool.utils")
@ -32,14 +34,14 @@ ITERATE_INTERVAL = 2 # How long to sleep while waiting for something
# in a loop # in a loop
def iterate_timeout(max_seconds, purpose): def iterate_timeout(max_seconds, exc, purpose):
start = time.time() start = time.time()
count = 0 count = 0
while (time.time() < start + max_seconds): while (time.time() < start + max_seconds):
count += 1 count += 1
yield count yield count
time.sleep(ITERATE_INTERVAL) time.sleep(ITERATE_INTERVAL)
raise Exception("Timeout waiting for %s" % purpose) raise exc("Timeout waiting for %s" % purpose)
def ssh_connect(ip, username, connect_kwargs={}, timeout=60): def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
@ -47,7 +49,8 @@ def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
return fakeprovider.FakeSSHClient() return fakeprovider.FakeSSHClient()
# HPcloud may return ECONNREFUSED or EHOSTUNREACH # HPcloud may return ECONNREFUSED or EHOSTUNREACH
# for about 30 seconds after adding the IP # for about 30 seconds after adding the IP
for count in iterate_timeout(timeout, "ssh access"): for count in iterate_timeout(
timeout, exceptions.SSHTimeoutException, "ssh access"):
try: try:
client = SSHClient(ip, username, **connect_kwargs) client = SSHClient(ip, username, **connect_kwargs)
break break

View File

@ -28,6 +28,7 @@ import sys
import shade import shade
import novaclient import novaclient
import exceptions
from nodeutils import iterate_timeout from nodeutils import iterate_timeout
from task_manager import Task, TaskManager, ManagerStoppedException from task_manager import Task, TaskManager, ManagerStoppedException
@ -36,6 +37,14 @@ SERVER_LIST_AGE = 5 # How long to keep a cached copy of the server list
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
class ServerCreateException(exceptions.TimeoutException):
statsd_key = 'error.servertimeout'
class ImageCreateException(exceptions.TimeoutException):
statsd_key = 'error.imagetimeout'
def get_public_ip(server, provider, version=4): def get_public_ip(server, provider, version=4):
for addr in server.addresses.get('public', []): for addr in server.addresses.get('public', []):
if type(addr) == type(u''): # Rackspace/openstack 1.0 if type(addr) == type(u''): # Rackspace/openstack 1.0
@ -456,10 +465,12 @@ class ProviderManager(TaskManager):
def _waitForResource(self, resource_type, resource_id, timeout): def _waitForResource(self, resource_type, resource_id, timeout):
last_status = None last_status = None
for count in iterate_timeout(timeout, if resource_type == 'server':
"%s %s in %s" % (resource_type, exc = ServerCreateException
resource_id, elif resource_type == 'image':
self.provider.name)): exc = ImageCreateException
for count in iterate_timeout(timeout, exc,
"%s creation" % resource_type):
try: try:
if resource_type == 'server': if resource_type == 'server':
resource = self.getServerFromList(resource_id) resource = self.getServerFromList(resource_id)
@ -497,8 +508,9 @@ class ProviderManager(TaskManager):
return self._waitForResource('server', server_id, timeout) return self._waitForResource('server', server_id, timeout)
def waitForServerDeletion(self, server_id, timeout=600): def waitForServerDeletion(self, server_id, timeout=600):
for count in iterate_timeout(600, "server %s deletion in %s" % for count in iterate_timeout(600,
(server_id, self.provider.name)): exceptions.ServerDeleteException,
"server %s deletion " % server_id):
try: try:
self.getServerFromList(server_id) self.getServerFromList(server_id)
except NotFound: except NotFound:
@ -528,8 +540,9 @@ class ProviderManager(TaskManager):
# from removing this IP. # from removing this IP.
self.deleteFloatingIP(ip['id']) self.deleteFloatingIP(ip['id'])
raise raise
for count in iterate_timeout(600, "ip to be added to %s in %s" % for count in iterate_timeout(600,
(server_id, self.provider.name)): exceptions.IPAddTimeoutException,
"ip to be added to %s" % server_id):
try: try:
newip = self.getFloatingIP(ip['id']) newip = self.getFloatingIP(ip['id'])
except ManagerStoppedException: except ManagerStoppedException: