Demote launch/delete timeouts to warnings
If we hit the internal timeout while launching or deleting a server, we raise an exception and then log the traceback. This is a not-unexpected occurrence, and the traceback is not useful since it's just one stack frame within the same class, so instead, let's log these timeouts at warning level without the traceback. Change-Id: Id4806d8ea2d0a232504e5a75d69cec239bcac670
This commit is contained in:
parent
6a71686b6b
commit
64452f1a26
@ -128,6 +128,7 @@ class FakeOpenStackCloud(object):
|
||||
self.compute = FakeSession(self)
|
||||
self.pause_creates = False
|
||||
self._image_list = images
|
||||
self._create_server_timeout = 0
|
||||
if self._image_list is None:
|
||||
self._image_list = [
|
||||
Dummy(
|
||||
@ -265,10 +266,11 @@ class FakeOpenStackCloud(object):
|
||||
volumes=[],
|
||||
_kw=kw)
|
||||
instance_list.append(s)
|
||||
t = threading.Thread(target=self._finish,
|
||||
name='FakeProvider create',
|
||||
args=(s, 0.1, done_status))
|
||||
t.start()
|
||||
if not kw.get('_test_timeout'):
|
||||
t = threading.Thread(target=self._finish,
|
||||
name='FakeProvider create',
|
||||
args=(s, 0.1, done_status))
|
||||
t.start()
|
||||
return s.copy()
|
||||
|
||||
def _delete(self, name_or_id, instance_list):
|
||||
@ -330,6 +332,9 @@ class FakeOpenStackCloud(object):
|
||||
return server
|
||||
|
||||
def create_server(self, **kw):
|
||||
if self._create_server_timeout:
|
||||
self._create_server_timeout -= 1
|
||||
kw['_test_timeout'] = True
|
||||
return self._create(self._server_list, **kw)
|
||||
|
||||
def get_server(self, name_or_id):
|
||||
@ -472,10 +477,10 @@ class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
|
||||
|
||||
def delete_server(self, *args, **kwargs):
|
||||
if self.times_to_fail_delete is None:
|
||||
raise exceptions.ServerDeleteException("Test fail server delete.")
|
||||
raise Exception("Test fail server delete.")
|
||||
if self.times_failed_delete < self.times_to_fail_delete:
|
||||
self.times_failed_delete += 1
|
||||
raise exceptions.ServerDeleteException("Test fail server delete.")
|
||||
raise Exception("Test fail server delete.")
|
||||
else:
|
||||
self.delete_success = True
|
||||
return super().delete_server(*args, **kwargs)
|
||||
|
@ -273,7 +273,8 @@ class StateMachineNodeLauncher(stats.StatsReporter):
|
||||
now = time.monotonic()
|
||||
if (now - state_machine.start_time >
|
||||
self.manager.provider.launch_timeout):
|
||||
raise Exception("Timeout waiting for instance creation")
|
||||
raise exceptions.ServerCreateTimeoutException(
|
||||
"Timeout waiting for instance creation")
|
||||
|
||||
if not self.runDeleteStateMachine():
|
||||
return
|
||||
@ -321,8 +322,13 @@ class StateMachineNodeLauncher(stats.StatsReporter):
|
||||
self.manager.nodescan_worker.removeRequest(self.nodescan_request)
|
||||
self.nodescan_request = None
|
||||
except Exception as e:
|
||||
self.log.exception("Launch attempt %d/%d for node %s, failed:",
|
||||
self.attempts, self.retries, node.id)
|
||||
if isinstance(e, exceptions.TimeoutException):
|
||||
self.log.warning(
|
||||
"Launch attempt %d/%d for node %s, failed: %s",
|
||||
self.attempts, self.retries, node.id, str(e))
|
||||
else:
|
||||
self.log.exception("Launch attempt %d/%d for node %s, failed:",
|
||||
self.attempts, self.retries, node.id)
|
||||
if self.state_machine and self.state_machine.external_id:
|
||||
# If we're deleting, don't overwrite the node external
|
||||
# id, because we may make another delete state machine
|
||||
@ -429,7 +435,8 @@ class StateMachineNodeDeleter:
|
||||
try:
|
||||
now = time.monotonic()
|
||||
if now - state_machine.start_time > self.DELETE_TIMEOUT:
|
||||
raise Exception("Timeout waiting for instance deletion")
|
||||
raise exceptions.ServerDeleteTimeoutException(
|
||||
"Timeout waiting for instance deletion")
|
||||
|
||||
if state_machine.state == state_machine.START:
|
||||
node.state = zk.DELETING
|
||||
@ -451,9 +458,13 @@ class StateMachineNodeDeleter:
|
||||
except exceptions.NotFound:
|
||||
self.log.info("Instance %s not found in provider %s",
|
||||
node.external_id, node.provider)
|
||||
except Exception:
|
||||
self.log.exception("Exception deleting instance %s from %s:",
|
||||
node.external_id, node.provider)
|
||||
except Exception as e:
|
||||
if isinstance(e, exceptions.TimeoutException):
|
||||
self.log.warning("Failure deleting instance %s from %s: %s",
|
||||
node.external_id, node.provider, str(e))
|
||||
else:
|
||||
self.log.exception("Exception deleting instance %s from %s:",
|
||||
node.external_id, node.provider)
|
||||
# Don't delete the ZK node in this case, but do unlock it
|
||||
if node_exists:
|
||||
self.zk.unlockNode(node)
|
||||
@ -1321,7 +1332,7 @@ class NodescanRequest:
|
||||
def _checkTimeout(self):
|
||||
now = time.monotonic()
|
||||
if now - self.start_time > self.timeout:
|
||||
raise Exception(
|
||||
raise exceptions.ConnectionTimeoutException(
|
||||
f"Timeout connecting to {self.ip} on port {self.port}")
|
||||
|
||||
def _checkTransport(self):
|
||||
|
@ -69,8 +69,12 @@ class IPAddTimeoutException(TimeoutException):
|
||||
statsd_key = 'error.ipadd'
|
||||
|
||||
|
||||
class ServerDeleteException(TimeoutException):
|
||||
statsd_key = 'error.serverdelete'
|
||||
class ServerCreateTimeoutException(TimeoutException):
|
||||
statsd_key = 'error.servercreatetimeout'
|
||||
|
||||
|
||||
class ServerDeleteTimeoutException(TimeoutException):
|
||||
statsd_key = 'error.serverdeletetimeout'
|
||||
|
||||
|
||||
class ImageCreateException(TimeoutException):
|
||||
|
48
nodepool/tests/fixtures/node_launch_timeout.yaml
vendored
Normal file
48
nodepool/tests/fixtures/node_launch_timeout.yaml
vendored
Normal file
@ -0,0 +1,48 @@
|
||||
elements-dir: .
|
||||
images-dir: '{images_dir}'
|
||||
build-log-dir: '{build_log_dir}'
|
||||
|
||||
zookeeper-servers:
|
||||
- host: {zookeeper_host}
|
||||
port: {zookeeper_port}
|
||||
chroot: {zookeeper_chroot}
|
||||
|
||||
zookeeper-tls:
|
||||
ca: {zookeeper_ca}
|
||||
cert: {zookeeper_cert}
|
||||
key: {zookeeper_key}
|
||||
|
||||
labels:
|
||||
- name: fake-label
|
||||
min-ready: 0
|
||||
|
||||
providers:
|
||||
- name: fake-provider
|
||||
cloud: fake
|
||||
driver: fake
|
||||
region-name: fake-region
|
||||
launch-retries: 2
|
||||
launch-timeout: 1
|
||||
rate: 0.0001
|
||||
diskimages:
|
||||
- name: fake-image
|
||||
pools:
|
||||
- name: main
|
||||
max-servers: 96
|
||||
labels:
|
||||
- name: fake-label
|
||||
diskimage: fake-image
|
||||
min-ram: 8192
|
||||
|
||||
diskimages:
|
||||
- name: fake-image
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
dib-cmd: nodepool/tests/fake-image-create
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
@ -625,6 +625,30 @@ class TestLauncher(tests.DBTestCase):
|
||||
self.assertEqual(req.state, zk.FAILED)
|
||||
self.assertNotEqual(req.declined_by, [])
|
||||
|
||||
def test_fail_request_on_launch_timeout(self):
|
||||
'''
|
||||
Test that provider launch timeout fails the request.
|
||||
'''
|
||||
configfile = self.setup_config('node_launch_timeout.yaml')
|
||||
self.useBuilder(configfile)
|
||||
self.waitForImage('fake-provider', 'fake-image')
|
||||
|
||||
pool = self.useNodepool(configfile, watermark_sleep=1)
|
||||
self.startPool(pool)
|
||||
manager = pool.getProviderManager('fake-provider')
|
||||
client = pool.getProviderManager('fake-provider').adapter._getClient()
|
||||
client._create_server_timeout = 2
|
||||
|
||||
req = zk.NodeRequest()
|
||||
req.state = zk.REQUESTED
|
||||
req.node_types.append('fake-label')
|
||||
self.zk.storeNodeRequest(req)
|
||||
|
||||
req = self.waitForNodeRequest(req)
|
||||
self.assertEqual(0, manager.adapter.createServer_fails)
|
||||
self.assertEqual(req.state, zk.FAILED)
|
||||
self.assertNotEqual(req.declined_by, [])
|
||||
|
||||
def test_az_change_recover(self):
|
||||
'''
|
||||
Test that nodepool recovers from az change in the cloud.
|
||||
|
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nodepool import exceptions
|
||||
from nodepool import tests
|
||||
from nodepool.nodeutils import iterate_timeout
|
||||
from nodepool.zk.zookeeper import Node
|
||||
@ -143,7 +144,8 @@ class TestNodescanWorker(tests.BaseTestCase):
|
||||
for _ in iterate_timeout(30, Exception, 'nodescan'):
|
||||
if request.complete:
|
||||
break
|
||||
with testtools.ExpectedException(Exception):
|
||||
with testtools.ExpectedException(
|
||||
exceptions.ConnectionTimeoutException):
|
||||
request.result()
|
||||
worker.stop()
|
||||
worker.join()
|
||||
@ -170,7 +172,8 @@ class TestNodescanWorker(tests.BaseTestCase):
|
||||
for _ in iterate_timeout(30, Exception, 'nodescan'):
|
||||
if request.complete:
|
||||
break
|
||||
with testtools.ExpectedException(Exception):
|
||||
with testtools.ExpectedException(
|
||||
exceptions.ConnectionTimeoutException):
|
||||
request.result()
|
||||
worker.stop()
|
||||
worker.join()
|
||||
|
Loading…
Reference in New Issue
Block a user