Be more specific in logging timeout exceptions
At the moment, grepping through logs to determine what's happening with timeouts on a provider is difficult because for some errors the cause of the timeout is on a different line than the provider in question. Give each timeout a specific named exception, and then when we catch the exceptions, log them specifically with node id, provider and then the additional descriptive text from the timeout exception. This should allow for easy grepping through logs to find specific instances of types of timeouts - or of all timeouts. Also add a corresponding success debug log so that comparitive greps/counts are also easy. Change-Id: I889bd9b5d92f77ce9ff86415c775fe1cd9545bbc
This commit is contained in:
parent
536f7feab0
commit
eed395d637
|
@ -23,3 +23,19 @@ class BuilderInvalidCommandError(BuilderError):
|
||||||
|
|
||||||
class DibFailedError(BuilderError):
|
class DibFailedError(BuilderError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TimeoutException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SSHTimeoutException(TimeoutException):
|
||||||
|
statsd_key = 'error.ssh'
|
||||||
|
|
||||||
|
|
||||||
|
class IPAddTimeoutException(TimeoutException):
|
||||||
|
statsd_key = 'error.ipadd'
|
||||||
|
|
||||||
|
|
||||||
|
class ServerDeleteException(TimeoutException):
|
||||||
|
statsd_key = 'error.serverdelete'
|
||||||
|
|
|
@ -34,6 +34,7 @@ import zmq
|
||||||
import allocation
|
import allocation
|
||||||
import jenkins_manager
|
import jenkins_manager
|
||||||
import nodedb
|
import nodedb
|
||||||
|
import exceptions
|
||||||
import nodeutils as utils
|
import nodeutils as utils
|
||||||
import provider_manager
|
import provider_manager
|
||||||
import stats
|
import stats
|
||||||
|
@ -407,6 +408,19 @@ class NodeLauncher(threading.Thread):
|
||||||
dt = self.launchNode(session)
|
dt = self.launchNode(session)
|
||||||
failed = False
|
failed = False
|
||||||
statsd_key = 'ready'
|
statsd_key = 'ready'
|
||||||
|
self.log.debug('Node %s ready in provider: %s' %
|
||||||
|
(self.node_id, self.provider.name))
|
||||||
|
except exceptions.TimeoutException as e:
|
||||||
|
# Don't log exception for timeouts. Each one has
|
||||||
|
# a specific Exception, and we know it's a timeout, so
|
||||||
|
# the traceback in the log is just noise
|
||||||
|
self.log.error("Timeout launching node id: %s "
|
||||||
|
"in provider: %s error: %s" %
|
||||||
|
(self.node_id, self.provider.name,
|
||||||
|
str(e)))
|
||||||
|
dt = int((time.time() - start_time) * 1000)
|
||||||
|
failed = True
|
||||||
|
statsd_key = e.statsd_key
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.exception("%s launching node id: %s "
|
self.log.exception("%s launching node id: %s "
|
||||||
"in provider: %s error:" %
|
"in provider: %s error:" %
|
||||||
|
|
|
@ -25,6 +25,8 @@ from sshclient import SSHClient
|
||||||
import fakeprovider
|
import fakeprovider
|
||||||
import paramiko
|
import paramiko
|
||||||
|
|
||||||
|
import exceptions
|
||||||
|
|
||||||
log = logging.getLogger("nodepool.utils")
|
log = logging.getLogger("nodepool.utils")
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,14 +34,14 @@ ITERATE_INTERVAL = 2 # How long to sleep while waiting for something
|
||||||
# in a loop
|
# in a loop
|
||||||
|
|
||||||
|
|
||||||
def iterate_timeout(max_seconds, purpose):
|
def iterate_timeout(max_seconds, exc, purpose):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
count = 0
|
count = 0
|
||||||
while (time.time() < start + max_seconds):
|
while (time.time() < start + max_seconds):
|
||||||
count += 1
|
count += 1
|
||||||
yield count
|
yield count
|
||||||
time.sleep(ITERATE_INTERVAL)
|
time.sleep(ITERATE_INTERVAL)
|
||||||
raise Exception("Timeout waiting for %s" % purpose)
|
raise exc("Timeout waiting for %s" % purpose)
|
||||||
|
|
||||||
|
|
||||||
def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
|
def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
|
||||||
|
@ -47,7 +49,8 @@ def ssh_connect(ip, username, connect_kwargs={}, timeout=60):
|
||||||
return fakeprovider.FakeSSHClient()
|
return fakeprovider.FakeSSHClient()
|
||||||
# HPcloud may return ECONNREFUSED or EHOSTUNREACH
|
# HPcloud may return ECONNREFUSED or EHOSTUNREACH
|
||||||
# for about 30 seconds after adding the IP
|
# for about 30 seconds after adding the IP
|
||||||
for count in iterate_timeout(timeout, "ssh access"):
|
for count in iterate_timeout(
|
||||||
|
timeout, exceptions.SSHTimeoutException, "ssh access"):
|
||||||
try:
|
try:
|
||||||
client = SSHClient(ip, username, **connect_kwargs)
|
client = SSHClient(ip, username, **connect_kwargs)
|
||||||
break
|
break
|
||||||
|
|
|
@ -28,6 +28,7 @@ import sys
|
||||||
import shade
|
import shade
|
||||||
import novaclient
|
import novaclient
|
||||||
|
|
||||||
|
import exceptions
|
||||||
from nodeutils import iterate_timeout
|
from nodeutils import iterate_timeout
|
||||||
from task_manager import Task, TaskManager, ManagerStoppedException
|
from task_manager import Task, TaskManager, ManagerStoppedException
|
||||||
|
|
||||||
|
@ -36,6 +37,14 @@ SERVER_LIST_AGE = 5 # How long to keep a cached copy of the server list
|
||||||
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
|
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
|
||||||
|
|
||||||
|
|
||||||
|
class ServerCreateException(exceptions.TimeoutException):
|
||||||
|
statsd_key = 'error.servertimeout'
|
||||||
|
|
||||||
|
|
||||||
|
class ImageCreateException(exceptions.TimeoutException):
|
||||||
|
statsd_key = 'error.imagetimeout'
|
||||||
|
|
||||||
|
|
||||||
def get_public_ip(server, provider, version=4):
|
def get_public_ip(server, provider, version=4):
|
||||||
for addr in server.addresses.get('public', []):
|
for addr in server.addresses.get('public', []):
|
||||||
if type(addr) == type(u''): # Rackspace/openstack 1.0
|
if type(addr) == type(u''): # Rackspace/openstack 1.0
|
||||||
|
@ -456,10 +465,12 @@ class ProviderManager(TaskManager):
|
||||||
|
|
||||||
def _waitForResource(self, resource_type, resource_id, timeout):
|
def _waitForResource(self, resource_type, resource_id, timeout):
|
||||||
last_status = None
|
last_status = None
|
||||||
for count in iterate_timeout(timeout,
|
if resource_type == 'server':
|
||||||
"%s %s in %s" % (resource_type,
|
exc = ServerCreateException
|
||||||
resource_id,
|
elif resource_type == 'image':
|
||||||
self.provider.name)):
|
exc = ImageCreateException
|
||||||
|
for count in iterate_timeout(timeout, exc,
|
||||||
|
"%s creation" % resource_type):
|
||||||
try:
|
try:
|
||||||
if resource_type == 'server':
|
if resource_type == 'server':
|
||||||
resource = self.getServerFromList(resource_id)
|
resource = self.getServerFromList(resource_id)
|
||||||
|
@ -497,8 +508,9 @@ class ProviderManager(TaskManager):
|
||||||
return self._waitForResource('server', server_id, timeout)
|
return self._waitForResource('server', server_id, timeout)
|
||||||
|
|
||||||
def waitForServerDeletion(self, server_id, timeout=600):
|
def waitForServerDeletion(self, server_id, timeout=600):
|
||||||
for count in iterate_timeout(600, "server %s deletion in %s" %
|
for count in iterate_timeout(600,
|
||||||
(server_id, self.provider.name)):
|
exceptions.ServerDeleteException,
|
||||||
|
"server %s deletion " % server_id):
|
||||||
try:
|
try:
|
||||||
self.getServerFromList(server_id)
|
self.getServerFromList(server_id)
|
||||||
except NotFound:
|
except NotFound:
|
||||||
|
@ -528,8 +540,9 @@ class ProviderManager(TaskManager):
|
||||||
# from removing this IP.
|
# from removing this IP.
|
||||||
self.deleteFloatingIP(ip['id'])
|
self.deleteFloatingIP(ip['id'])
|
||||||
raise
|
raise
|
||||||
for count in iterate_timeout(600, "ip to be added to %s in %s" %
|
for count in iterate_timeout(600,
|
||||||
(server_id, self.provider.name)):
|
exceptions.IPAddTimeoutException,
|
||||||
|
"ip to be added to %s" % server_id):
|
||||||
try:
|
try:
|
||||||
newip = self.getFloatingIP(ip['id'])
|
newip = self.getFloatingIP(ip['id'])
|
||||||
except ManagerStoppedException:
|
except ManagerStoppedException:
|
||||||
|
|
Loading…
Reference in New Issue