Periodically checks for nodes being cleaned

This patch adds a periodic task that checks for nodes waiting for the
ramdisk callback while cleaning is being executed.

A new configuration option called "clean_callback_timeout" was added,
its value is the number of seconds that the Ironic conductor will wait
for the ramdisk doing the cleaning to contact Ironic back. Defaults
to 1800.

Closes-Bug: #1483120
Change-Id: Id7f9e9018b5cb2389bbe556171e7a9d46425afba
This commit is contained in:
Lucas Alvares Gomes 2015-08-20 09:54:02 +01:00
parent 3d45795abb
commit ea1b012e93
5 changed files with 113 additions and 55 deletions

View File

@ -1,5 +1,33 @@
[DEFAULT] [DEFAULT]
#
# Options defined in oslo.service.periodic_task
#
# Some periodic tasks can be run in a separate process. Should
# we run them here? (boolean value)
#run_external_periodic_tasks=true
#
# Options defined in oslo.service.service
#
# Enable eventlet backdoor. Acceptable values are 0, <port>,
# and <start>:<end>, where 0 results in listening on a random
# tcp port number; <port> results in listening on the
# specified port number (and not enabling backdoor if that
# port is in use); and <start>:<end> results in listening on
# the smallest unused port number within the specified range
# of port numbers. The chosen port is displayed in the
# service's log file. (string value)
#backdoor_port=<None>
# Enables or disables logging values of all registered options
# when starting a service (at DEBUG level). (boolean value)
#log_options=true
# #
# Options defined in oslo.messaging # Options defined in oslo.messaging
# #
@ -79,12 +107,12 @@
# #
# Print debugging output (set logging level to DEBUG instead # Print debugging output (set logging level to DEBUG instead
# of default WARNING level). (boolean value) # of default INFO level). (boolean value)
#debug=false #debug=false
# Print more verbose output (set logging level to INFO instead # If set to false, will disable INFO logging level, making
# of default WARNING level). (boolean value) # WARNING the default. (boolean value)
#verbose=false #verbose=true
# The name of a logging configuration file. This file is # The name of a logging configuration file. This file is
# appended to any existing logging configuration files. For # appended to any existing logging configuration files. For
@ -121,8 +149,8 @@
# (Optional) Enables or disables syslog rfc5424 format for # (Optional) Enables or disables syslog rfc5424 format for
# logging. If enabled, prefixes the MSG part of the syslog # logging. If enabled, prefixes the MSG part of the syslog
# message with APP-NAME (RFC5424). The format without the APP- # message with APP-NAME (RFC5424). The format without the APP-
# NAME is deprecated in K, and will be removed in M, along # NAME is deprecated in Kilo, and will be removed in Mitaka,
# with this option. (boolean value) # along with this option. (boolean value)
#use_syslog_rfc_format=true #use_syslog_rfc_format=true
# Syslog facility to receive log lines. (string value) # Syslog facility to receive log lines. (string value)
@ -167,34 +195,6 @@
#fatal_deprecations=false #fatal_deprecations=false
#
# Options defined in oslo.service.service
#
# Enable eventlet backdoor. Acceptable values are 0, <port>,
# and <start>:<end>, where 0 results in listening on a random
# tcp port number; <port> results in listening on the
# specified port number (and not enabling backdoor if that
# port is in use); and <start>:<end> results in listening on
# the smallest unused port number within the specified range
# of port numbers. The chosen port is displayed in the
# service's log file. (string value)
#backdoor_port=<None>
# Enables or disables logging values of all registered options
# when starting a service (at DEBUG level). (boolean value)
#log_options=true
#
# Options defined in oslo.service.periodic_task
#
# Some periodic tasks can be run in a separate process. Should
# we run them here? (boolean value)
#run_external_periodic_tasks=true
# #
# Options defined in ironic.netconf # Options defined in ironic.netconf
# #
@ -561,6 +561,12 @@
# option could be safely disabled. (boolean value) # option could be safely disabled. (boolean value)
#clean_nodes=true #clean_nodes=true
# Timeout (seconds) to wait for a callback from the ramdisk
# doing the cleaning. If the timeout is reached the node will
# be put in the "clean failed" provision state. Set to 0 to
# disable timeout. (integer value)
#clean_callback_timeout=1800
[console] [console]
@ -1370,12 +1376,12 @@
# the same time. This option provides such compatibility - it # the same time. This option provides such compatibility - it
# defaults to False in Liberty and can be turned on for early # defaults to False in Liberty and can be turned on for early
# adopters with a new installations or for testing. Please # adopters with a new installations or for testing. Please
# note, that this option will be removed in M release. # note, that this option will be removed in the Mitaka
# (boolean value) # release. (boolean value)
#send_single_reply=false #send_single_reply=false
# Qpid broker hostname. (string value) # Qpid broker hostname. (string value)
#qpid_hostname=ironic #qpid_hostname=localhost
# Qpid broker port. (integer value) # Qpid broker port. (integer value)
#qpid_port=5672 #qpid_port=5672
@ -1438,8 +1444,8 @@
# the same time. This option provides such compatibility - it # the same time. This option provides such compatibility - it
# defaults to False in Liberty and can be turned on for early # defaults to False in Liberty and can be turned on for early
# adopters with a new installations or for testing. Please # adopters with a new installations or for testing. Please
# note, that this option will be removed in M release. # note, that this option will be removed in the Mitaka
# (boolean value) # release. (boolean value)
#send_single_reply=false #send_single_reply=false
# SSL version to use (valid only if SSL enabled). Valid values # SSL version to use (valid only if SSL enabled). Valid values
@ -1468,7 +1474,7 @@
# The RabbitMQ broker address where a single node is used. # The RabbitMQ broker address where a single node is used.
# (string value) # (string value)
#rabbit_host=ironic #rabbit_host=localhost
# The RabbitMQ broker port where a single node is used. # The RabbitMQ broker port where a single node is used.
# (integer value) # (integer value)

View File

@ -184,6 +184,12 @@ conductor_opts = [
'longer. In an environment where all tenants are ' 'longer. In an environment where all tenants are '
'trusted (eg, because there is only one tenant), ' 'trusted (eg, because there is only one tenant), '
'this option could be safely disabled.')), 'this option could be safely disabled.')),
cfg.IntOpt('clean_callback_timeout',
default=1800,
help=_('Timeout (seconds) to wait for a callback from the '
'ramdisk doing the cleaning. If the timeout is reached '
'the node will be put in the "clean failed" provision '
'state. Set to 0 to disable timeout.')),
] ]
CONF = cfg.CONF CONF = cfg.CONF
CONF.register_opts(conductor_opts, 'conductor') CONF.register_opts(conductor_opts, 'conductor')
@ -1253,6 +1259,31 @@ class ConductorManager(periodic_task.PeriodicTasks):
task.node.conductor_affinity = self.conductor.id task.node.conductor_affinity = self.conductor.id
task.node.save() task.node.save()
@periodic_task.periodic_task(
spacing=CONF.conductor.check_provision_state_interval)
def _check_cleanwait_timeouts(self, context):
"""Periodically check for nodes stuck waiting on the cleaning ramdisk.

If a node doing cleaning is unresponsive (detected when it stops
heart beating), the operation should be aborted: the node is moved
to the "clean failed" provision state with an explanatory error.

:param context: request context.
"""
# A clean_callback_timeout of 0 disables this check entirely.
callback_timeout = CONF.conductor.clean_callback_timeout
if not callback_timeout:
return
# Select only unreserved, non-maintenance nodes in CLEANWAIT whose
# last provisioning update is older than the configured timeout.
filters = {'reserved': False,
'provision_state': states.CLEANWAIT,
'maintenance': False,
'provisioned_before': callback_timeout}
last_error = _("Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node.")
# 'provision_updated_at' is the timestamp field compared against the
# timeout when deciding whether the node has timed out.
self._fail_if_in_state(context, filters, states.CLEANWAIT,
'provision_updated_at',
last_error=last_error)
@periodic_task.periodic_task( @periodic_task.periodic_task(
spacing=CONF.conductor.sync_local_state_interval) spacing=CONF.conductor.sync_local_state_interval)
def _sync_local_state(self, context): def _sync_local_state(self, context):

View File

@ -267,19 +267,15 @@ class BaseAgentVendor(base.VendorInterface):
# with previous code, otherwise nodes in CLEANING when this # with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the Mitaka # is deployed would fail. Should be removed once the Mitaka
# release starts. # release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING) elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
and not node.clean_step): node.touch_provisioning()
# Agent booted from prepare_cleaning if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.', node.uuid) LOG.debug('Node %s just booted to start cleaning.',
manager.set_node_cleaning_steps(task) node.uuid)
self._notify_conductor_resume_clean(task) manager.set_node_cleaning_steps(task)
# TODO(lucasagomes): CLEANING here for backwards compat self._notify_conductor_resume_clean(task)
# with previous code, otherwise nodes in CLEANING when this else:
# is deployed would fail. Should be removed once the Mitaka self.continue_cleaning(task, **kwargs)
# release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
and node.clean_step):
self.continue_cleaning(task, **kwargs)
except Exception as e: except Exception as e:
err_info = {'node': node.uuid, 'msg': msg, 'e': e} err_info = {'node': node.uuid, 'msg': msg, 'e': e}

View File

@ -1321,6 +1321,22 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
mock_cleanup.assert_called_once_with(mock.ANY) mock_cleanup.assert_called_once_with(mock.ANY)
def test__check_cleanwait_timeouts(self):
"""A CLEANWAIT node past the callback timeout moves to CLEANFAIL."""
self._start_service()
# Use a 1-second timeout so the node below is guaranteed stale.
CONF.set_override('clean_callback_timeout', 1, group='conductor')
# provision_updated_at far in the past ensures the timeout is hit.
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANWAIT,
target_provision_state=states.AVAILABLE,
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
self.service._check_cleanwait_timeouts(self.context)
# Wait for the conductor's worker threads to finish the state change.
self.service._worker_pool.waitall()
node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state)
# target_provision_state is preserved; only provision_state changes.
self.assertEqual(states.AVAILABLE, node.target_provision_state)
self.assertIsNotNone(node.last_error)
def test_do_node_tear_down_invalid_state(self): def test_do_node_tear_down_invalid_state(self):
self._start_service() self._start_service()
# test node.provision_state is incorrect for tear_down # test node.provision_state is incorrect for tear_down

View File

@ -322,10 +322,12 @@ class TestBaseAgentVendor(db_base.DbTestCase):
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy ' '1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
'is done. exception: LlamaException') 'is done. exception: LlamaException')
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True) @mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor, @mock.patch.object(agent_base_vendor.BaseAgentVendor,
'_notify_conductor_resume_clean', autospec=True) '_notify_conductor_resume_clean', autospec=True)
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps): def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps,
mock_touch):
kwargs = { kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar' 'agent_url': 'http://127.0.0.1:9999/bar'
} }
@ -337,14 +339,18 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.context, self.node.uuid, shared=True) as task: self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs) self.passthru.heartbeat(task, **kwargs)
mock_touch.assert_called_once_with(mock.ANY)
mock_notify.assert_called_once_with(mock.ANY, task) mock_notify.assert_called_once_with(mock.ANY, task)
mock_set_steps.assert_called_once_with(task) mock_set_steps.assert_called_once_with(task)
# Reset mocks for the next interaction
mock_touch.reset_mock()
mock_notify.reset_mock() mock_notify.reset_mock()
mock_set_steps.reset_mock() mock_set_steps.reset_mock()
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor, @mock.patch.object(agent_base_vendor.BaseAgentVendor,
'continue_cleaning', autospec=True) 'continue_cleaning', autospec=True)
def test_heartbeat_continue_cleaning(self, mock_continue): def test_heartbeat_continue_cleaning(self, mock_continue, mock_touch):
kwargs = { kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar' 'agent_url': 'http://127.0.0.1:9999/bar'
} }
@ -361,7 +367,10 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.context, self.node.uuid, shared=True) as task: self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs) self.passthru.heartbeat(task, **kwargs)
mock_touch.assert_called_once_with(mock.ANY)
mock_continue.assert_called_once_with(mock.ANY, task, **kwargs) mock_continue.assert_called_once_with(mock.ANY, task, **kwargs)
# Reset mocks for the next interaction
mock_touch.reset_mock()
mock_continue.reset_mock() mock_continue.reset_mock()
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy', @mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy',