Be more specific in logging timeout exceptions

At the moment, grepping through logs to determine what's happening with
timeouts on a provider is difficult because for some errors the cause of
the timeout is on a different line than the provider in question.

Give each timeout a specific named exception, and then when we catch the
exceptions, log them specifically with node id, provider and then the
additional descriptive text from the timeout exception. This should
allow for easy grepping through logs to find specific instances of
types of timeouts - or of all timeouts. Also add a corresponding success
debug log so that comparitive greps/counts are also easy.

Change-Id: I889bd9b5d92f77ce9ff86415c775fe1cd9545bbc
This commit is contained in:
Monty Taylor 2016-03-03 08:26:33 -06:00
parent 536f7feab0
commit eed395d637
4 changed files with 57 additions and 11 deletions

View File

@ -23,3 +23,19 @@ class BuilderInvalidCommandError(BuilderError):
class DibFailedError(BuilderError):
pass
class TimeoutException(Exception):
pass
class SSHTimeoutException(TimeoutException):
statsd_key = 'error.ssh'
class IPAddTimeoutException(TimeoutException):
statsd_key = 'error.ipadd'
class ServerDeleteException(TimeoutException):
statsd_key = 'error.serverdelete'

View File

@ -34,6 +34,7 @@ import zmq
import allocation
import jenkins_manager
import nodedb
import exceptions
import nodeutils as utils
import provider_manager
import stats
@ -407,6 +408,19 @@ class NodeLauncher(threading.Thread):
dt = self.launchNode(session)
failed = False
statsd_key = 'ready'
self.log.debug('Node %s ready in provider: %s' %
(self.node_id, self.provider.name))
except exceptions.TimeoutException as e:
# Don't log exception for timeouts. Each one has
# a specific Exception, and we know it's a timeout, so
# the traceback in the log is just noise
self.log.error("Timeout launching node id: %s "
"in provider: %s error: %s" %
(self.node_id, self.provider.name,
str(e)))
dt = int((time.time() - start_time) * 1000)
failed = True
statsd_key = e.statsd_key
except Exception as e:
self.log.exception("%s launching node id: %s "
"in provider: %s error:" %

View File

@ -25,6 +25,8 @@ from sshclient import SSHClient
import fakeprovider
import paramiko
import exceptions
log = logging.getLogger("nodepool.utils")
@ -32,14 +34,14 @@ ITERATE_INTERVAL = 2 # How long to sleep while waiting for something
# in a loop
def iterate_timeout(max_seconds, purpose):
def iterate_timeout(max_seconds, exc, purpose):
start = time.time()
count = 0
while (time.time() < start + max_seconds):
count += 1
yield count
time.sleep(ITERATE_INTERVAL)
raise Exception("Timeout waiting for %s" % purpose)
raise exc("Timeout waiting for %s" % purpose)
def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
@ -47,7 +49,8 @@ def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
return fakeprovider.FakeSSHClient()
# HPcloud may return ECONNREFUSED or EHOSTUNREACH
# for about 30 seconds after adding the IP
for count in iterate_timeout(timeout, "ssh access"):
for count in iterate_timeout(
timeout, exceptions.SSHTimeoutException, "ssh access"):
try:
client = SSHClient(ip, username, **connect_kwargs)
break

View File

@ -28,6 +28,7 @@ import sys
import shade
import novaclient
import exceptions
from nodeutils import iterate_timeout
from task_manager import Task, TaskManager, ManagerStoppedException
@ -36,6 +37,14 @@ SERVER_LIST_AGE = 5 # How long to keep a cached copy of the server list
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
class ServerCreateException(exceptions.TimeoutException):
statsd_key = 'error.servertimeout'
class ImageCreateException(exceptions.TimeoutException):
statsd_key = 'error.imagetimeout'
def get_public_ip(server, provider, version=4):
for addr in server.addresses.get('public', []):
if type(addr) == type(u''): # Rackspace/openstack 1.0
@ -456,10 +465,12 @@ class ProviderManager(TaskManager):
def _waitForResource(self, resource_type, resource_id, timeout):
last_status = None
for count in iterate_timeout(timeout,
"%s %s in %s" % (resource_type,
resource_id,
self.provider.name)):
if resource_type == 'server':
exc = ServerCreateException
elif resource_type == 'image':
exc = ImageCreateException
for count in iterate_timeout(timeout, exc,
"%s creation" % resource_type):
try:
if resource_type == 'server':
resource = self.getServerFromList(resource_id)
@ -497,8 +508,9 @@ class ProviderManager(TaskManager):
return self._waitForResource('server', server_id, timeout)
def waitForServerDeletion(self, server_id, timeout=600):
for count in iterate_timeout(600, "server %s deletion in %s" %
(server_id, self.provider.name)):
for count in iterate_timeout(600,
exceptions.ServerDeleteException,
"server %s deletion " % server_id):
try:
self.getServerFromList(server_id)
except NotFound:
@ -528,8 +540,9 @@ class ProviderManager(TaskManager):
# from removing this IP.
self.deleteFloatingIP(ip['id'])
raise
for count in iterate_timeout(600, "ip to be added to %s in %s" %
(server_id, self.provider.name)):
for count in iterate_timeout(600,
exceptions.IPAddTimeoutException,
"ip to be added to %s" % server_id):
try:
newip = self.getFloatingIP(ip['id'])
except ManagerStoppedException: