Fix the DEPLOYWAIT check for agent_* drivers

The patch [1] sets the node's provision_state to DEPLOYWAIT while the
agent is booting and writing the the image onto the local disk. But in
the ironic-conductor we have a periodic task that checks for nodes in
DEPLOYWAIT state and see if the deployment timed out based on the node's
provision_udpated_at field and CONF.conductor.deploy_callback_timeout
configuration option.

The problem is that prior to this patch the node's provision_updated_at
field was only updated when we actually changed the provision state of
the node. So, if we are deployment a image which takes a long time to
be downloaded and written to the local disk ironic-conductor could time
out the deployment even if the agent still deploying that image.

This patch fixes this problem by touching the node's provision_updated_at
field when the node is heartbeating, so that we can indicate to the
ironic-conductor that the deployment is still running and we shouldn't
time it out.

[1] f1929f0155

Closes-Bug: #1475672
Change-Id: I9373c42168cc1ffaae212f17b067a6e4b6d862fe
This commit is contained in:
Lucas Alvares Gomes
2015-07-17 15:30:12 +01:00
parent 854ab56013
commit 4c8bdc29ec
7 changed files with 76 additions and 1 deletions

View File

@@ -400,3 +400,14 @@ class Connection(object):
:returns: A list of conductor hostnames.
"""
@abc.abstractmethod
def touch_node_provisioning(self, node_id):
"""Mark the node's provisioning as running.
Mark the node's provisioning as running by updating its
'provision_updated_at' property.
:param node_id: The id of a node.
:raises: NodeNotFound
"""

View File

@@ -601,3 +601,11 @@ class Connection(api.Connection):
.filter(models.Conductor.updated_at < limit)
.all())
return [row['hostname'] for row in result]
def touch_node_provisioning(self, node_id):
with _session_for_write():
query = model_query(models.Node)
query = add_identity_filter(query, node_id)
count = query.update({'provision_updated_at': timeutils.utcnow()})
if count == 0:
raise exception.NodeNotFound(node_id)

View File

@@ -260,6 +260,9 @@ class BaseAgentVendor(base.VendorInterface):
self.deploy_is_done(task)):
msg = _('Node failed to move to active state.')
self.reboot_to_instance(task, **kwargs)
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_has_started(task)):
node.touch_provisioning()
elif (node.provision_state == states.CLEANING and
not node.clean_step):
# Agent booted from prepare_cleaning

View File

@@ -37,7 +37,8 @@ class Node(base.IronicObject):
# Version 1.10: Add name and get_by_name()
# Version 1.11: Add clean_step
# Version 1.12: Add raid_config and target_raid_config
VERSION = '1.12'
# Version 1.13: Add touch_provisioning()
VERSION = '1.13'
dbapi = db_api.get_instance()
@@ -285,3 +286,8 @@ class Node(base.IronicObject):
if (hasattr(self, base.get_attrname(field)) and
self[field] != current[field]):
self[field] = current[field]
@base.remotable
def touch_provisioning(self, context=None):
"""Touch the database record to mark the provisioning as alive."""
self.dbapi.touch_node_provisioning(self.id)

View File

@@ -501,3 +501,22 @@ class DbNodeTestCase(base.DbTestCase):
self.dbapi.release_node, 'fake', node.id)
self.assertRaises(exception.NodeNotLocked,
self.dbapi.release_node, 'fake', node.uuid)
@mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_touch_node_provisioning(self, mock_utcnow):
test_time = datetime.datetime(2000, 1, 1, 0, 0)
mock_utcnow.return_value = test_time
node = utils.create_test_node()
# assert provision_updated_at is None
self.assertIsNone(node.provision_updated_at)
self.dbapi.touch_node_provisioning(node.uuid)
node = self.dbapi.get_node_by_uuid(node.uuid)
# assert provision_updated_at has been updated
self.assertEqual(test_time,
timeutils.normalize_time(node.provision_updated_at))
def test_touch_node_provisioning_not_found(self):
self.assertRaises(
exception.NodeNotFound,
self.dbapi.touch_node_provisioning, uuidutils.generate_uuid())

View File

@@ -321,6 +321,24 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.assertEqual(0, rti_mock.call_count)
self.assertEqual(0, cd_mock.call_count)
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'deploy_has_started',
autospec=True)
def test_heartbeat_touch_provisioning(self, mock_deploy_started,
mock_touch):
mock_deploy_started.return_value = True
kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar'
}
self.node.provision_state = states.DEPLOYWAIT
self.node.save()
with task_manager.acquire(
self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs)
mock_touch.assert_called_once_with(mock.ANY)
def test_vendor_passthru_vendor_routes(self):
expected = ['heartbeat']
with task_manager.acquire(self.context, self.node.uuid,

View File

@@ -139,3 +139,13 @@ class TestNodeObject(base.DbTestCase):
self.assertRaises(exception.NodeNotFound,
objects.Node.release, self.context,
'fake-tag', node_id)
def test_touch_provisioning(self):
with mock.patch.object(self.dbapi, 'get_node_by_uuid',
autospec=True) as mock_get_node:
mock_get_node.return_value = self.fake_node
with mock.patch.object(self.dbapi, 'touch_node_provisioning',
autospec=True) as mock_touch:
node = objects.Node.get(self.context, self.fake_node['uuid'])
node.touch_provisioning()
mock_touch.assert_called_once_with(node.id)