Fix the DEPLOYWAIT check for agent_* drivers
The patch [1] sets the node's provision_state to DEPLOYWAIT while the
agent is booting and writing the image onto the local disk. But the
ironic-conductor has a periodic task that checks for nodes in the
DEPLOYWAIT state and decides whether the deployment has timed out based
on the node's provision_updated_at field and the
CONF.conductor.deploy_callback_timeout configuration option.
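For context, the conductor-side timeout decision boils down to a check
along the lines of the sketch below. This is a simplified illustration,
not the actual ironic-conductor code; the deploy_timed_out() helper, its
parameters and the way the timestamps are compared are assumptions made
for this sketch.

# Simplified, illustrative sketch of the deploy-timeout decision; not the
# real ironic-conductor implementation.
from datetime import datetime, timedelta


def deploy_timed_out(provision_updated_at, deploy_callback_timeout, now=None):
    """Return True if a DEPLOYWAIT node should be failed for inactivity.

    provision_updated_at: last time the node's provisioning was touched.
    deploy_callback_timeout: mirrors CONF.conductor.deploy_callback_timeout
    in seconds (0 disables the check).
    """
    if not deploy_callback_timeout:
        return False
    now = now or datetime.utcnow()
    deadline = provision_updated_at + timedelta(seconds=deploy_callback_timeout)
    return now > deadline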
The problem is that, prior to this patch, the node's provision_updated_at
field was only updated when the provision state of the node actually
changed. So, when deploying an image that takes a long time to be
downloaded and written to the local disk, the ironic-conductor could time
out the deployment even though the agent was still deploying that image.
This patch fixes the problem by touching the node's provision_updated_at
field whenever the node heartbeats, indicating to the ironic-conductor
that the deployment is still running and should not be timed out.
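To make the effect concrete, here is a small self-contained illustration;
the timestamps, the one-hour image write and the 1800-second timeout used
below are assumed example values, not part of this change.

# Illustration with assumed numbers: why touching provision_updated_at on
# each heartbeat prevents a false timeout during a slow image write.
from datetime import datetime, timedelta

deploy_callback_timeout = timedelta(seconds=1800)        # assumed example value
deploy_started = datetime(2015, 7, 17, 12, 0, 0)         # DEPLOYWAIT entered
image_write_done = deploy_started + timedelta(hours=1)   # slow image: 1 hour

# Before this patch: provision_updated_at stays at the moment DEPLOYWAIT was
# entered, so the conductor's deadline expires while the agent is still busy.
stale_deadline = deploy_started + deploy_callback_timeout
print(stale_deadline < image_write_done)    # True  -> deployment timed out

# After this patch: every agent heartbeat pushes provision_updated_at forward,
# so the deadline keeps moving while the agent keeps reporting in.
last_heartbeat = image_write_done - timedelta(seconds=30)
fresh_deadline = last_heartbeat + deploy_callback_timeout
print(fresh_deadline < image_write_done)    # False -> deployment still running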
[1] f1929f0155
Closes-Bug: #1475672
Change-Id: I9373c42168cc1ffaae212f17b067a6e4b6d862fe
@@ -400,3 +400,14 @@ class Connection(object):
 
         :returns: A list of conductor hostnames.
         """
+
+    @abc.abstractmethod
+    def touch_node_provisioning(self, node_id):
+        """Mark the node's provisioning as running.
+
+        Mark the node's provisioning as running by updating its
+        'provision_updated_at' property.
+
+        :param node_id: The id of a node.
+        :raises: NodeNotFound
+        """
@@ -601,3 +601,11 @@ class Connection(api.Connection):
                   .filter(models.Conductor.updated_at < limit)
                   .all())
         return [row['hostname'] for row in result]
+
+    def touch_node_provisioning(self, node_id):
+        with _session_for_write():
+            query = model_query(models.Node)
+            query = add_identity_filter(query, node_id)
+            count = query.update({'provision_updated_at': timeutils.utcnow()})
+            if count == 0:
+                raise exception.NodeNotFound(node_id)
@@ -260,6 +260,9 @@ class BaseAgentVendor(base.VendorInterface):
                   self.deploy_is_done(task)):
                 msg = _('Node failed to move to active state.')
                 self.reboot_to_instance(task, **kwargs)
+            elif (node.provision_state == states.DEPLOYWAIT and
+                  self.deploy_has_started(task)):
+                node.touch_provisioning()
             elif (node.provision_state == states.CLEANING and
                   not node.clean_step):
                 # Agent booted from prepare_cleaning
@@ -37,7 +37,8 @@ class Node(base.IronicObject):
     # Version 1.10: Add name and get_by_name()
     # Version 1.11: Add clean_step
     # Version 1.12: Add raid_config and target_raid_config
-    VERSION = '1.12'
+    # Version 1.13: Add touch_provisioning()
+    VERSION = '1.13'
 
     dbapi = db_api.get_instance()
 
@@ -285,3 +286,8 @@ class Node(base.IronicObject):
             if (hasattr(self, base.get_attrname(field)) and
                     self[field] != current[field]):
                 self[field] = current[field]
+
+    @base.remotable
+    def touch_provisioning(self, context=None):
+        """Touch the database record to mark the provisioning as alive."""
+        self.dbapi.touch_node_provisioning(self.id)
@@ -501,3 +501,22 @@ class DbNodeTestCase(base.DbTestCase):
                           self.dbapi.release_node, 'fake', node.id)
         self.assertRaises(exception.NodeNotLocked,
                           self.dbapi.release_node, 'fake', node.uuid)
+
+    @mock.patch.object(timeutils, 'utcnow', autospec=True)
+    def test_touch_node_provisioning(self, mock_utcnow):
+        test_time = datetime.datetime(2000, 1, 1, 0, 0)
+        mock_utcnow.return_value = test_time
+        node = utils.create_test_node()
+        # assert provision_updated_at is None
+        self.assertIsNone(node.provision_updated_at)
+
+        self.dbapi.touch_node_provisioning(node.uuid)
+        node = self.dbapi.get_node_by_uuid(node.uuid)
+        # assert provision_updated_at has been updated
+        self.assertEqual(test_time,
+                         timeutils.normalize_time(node.provision_updated_at))
+
+    def test_touch_node_provisioning_not_found(self):
+        self.assertRaises(
+            exception.NodeNotFound,
+            self.dbapi.touch_node_provisioning, uuidutils.generate_uuid())
@@ -321,6 +321,24 @@ class TestBaseAgentVendor(db_base.DbTestCase):
         self.assertEqual(0, rti_mock.call_count)
         self.assertEqual(0, cd_mock.call_count)
 
+    @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
+    @mock.patch.object(agent_base_vendor.BaseAgentVendor, 'deploy_has_started',
+                       autospec=True)
+    def test_heartbeat_touch_provisioning(self, mock_deploy_started,
+                                          mock_touch):
+        mock_deploy_started.return_value = True
+        kwargs = {
+            'agent_url': 'http://127.0.0.1:9999/bar'
+        }
+
+        self.node.provision_state = states.DEPLOYWAIT
+        self.node.save()
+        with task_manager.acquire(
+                self.context, self.node.uuid, shared=True) as task:
+            self.passthru.heartbeat(task, **kwargs)
+
+        mock_touch.assert_called_once_with(mock.ANY)
+
     def test_vendor_passthru_vendor_routes(self):
         expected = ['heartbeat']
         with task_manager.acquire(self.context, self.node.uuid,
@@ -139,3 +139,13 @@ class TestNodeObject(base.DbTestCase):
             self.assertRaises(exception.NodeNotFound,
                               objects.Node.release, self.context,
                               'fake-tag', node_id)
+
+    def test_touch_provisioning(self):
+        with mock.patch.object(self.dbapi, 'get_node_by_uuid',
+                               autospec=True) as mock_get_node:
+            mock_get_node.return_value = self.fake_node
+            with mock.patch.object(self.dbapi, 'touch_node_provisioning',
+                                   autospec=True) as mock_touch:
+                node = objects.Node.get(self.context, self.fake_node['uuid'])
+                node.touch_provisioning()
+                mock_touch.assert_called_once_with(node.id)