Fix race in undercloud cell_v2 host discovery

Ensure that the ironic nodes have been picked up by the nova resource tracker
before running nova-manage cell_v2 host discovery.

Also adds logging of the verbose command output to mistral engine log.

Change-Id: I4cc67935df8f37cdb2d8b0bfd96cf90eb7a6ce25
Closes-Bug: #1660160
This commit is contained in:
Oliver Walsh 2017-02-01 02:51:15 +00:00
parent 38feb02f5d
commit f6c286dbe8
7 changed files with 124 additions and 19 deletions

View File

@ -64,6 +64,7 @@ mistral.actions =
tripleo.baremetal.configure_root_device = tripleo_common.actions.baremetal:ConfigureRootDeviceAction tripleo.baremetal.configure_root_device = tripleo_common.actions.baremetal:ConfigureRootDeviceAction
tripleo.baremetal.register_or_update_nodes = tripleo_common.actions.baremetal:RegisterOrUpdateNodes tripleo.baremetal.register_or_update_nodes = tripleo_common.actions.baremetal:RegisterOrUpdateNodes
tripleo.baremetal.update_node_capability = tripleo_common.actions.baremetal:UpdateNodeCapability tripleo.baremetal.update_node_capability = tripleo_common.actions.baremetal:UpdateNodeCapability
tripleo.baremetal.cell_v2_discover_hosts = tripleo_common.actions.baremetal:CellV2DiscoverHostsAction
tripleo.deployment.config = tripleo_common.actions.deployment:OrchestrationDeployAction tripleo.deployment.config = tripleo_common.actions.deployment:OrchestrationDeployAction
tripleo.deployment.deploy = tripleo_common.actions.deployment:DeployStackAction tripleo.deployment.deploy = tripleo_common.actions.deployment:DeployStackAction
tripleo.deployment.overcloudrc = tripleo_common.actions.deployment:OvercloudRcAction tripleo.deployment.overcloudrc = tripleo_common.actions.deployment:OvercloudRcAction

View File

@ -4,5 +4,5 @@ Defaults:mistral !requiretty
mistral ALL = (validations) NOPASSWD:SETENV: /usr/bin/run-validation mistral ALL = (validations) NOPASSWD:SETENV: /usr/bin/run-validation
mistral ALL = NOPASSWD: /usr/bin/chown validations\: /tmp/validations_identity_* mistral ALL = NOPASSWD: /usr/bin/chown validations\: /tmp/validations_identity_*
mistral ALL = NOPASSWD: /usr/bin/rm -f /tmp/validations_identity_* mistral ALL = NOPASSWD: /usr/bin/rm -f /tmp/validations_identity_*
mistral ALL = NOPASSWD: /bin/nova-manage cell_v2 discover_hosts mistral ALL = NOPASSWD: /bin/nova-manage cell_v2 discover_hosts *
validations ALL = NOPASSWD: ALL validations ALL = NOPASSWD: ALL

View File

@ -288,3 +288,25 @@ class UpdateNodeCapability(base.TripleOAction):
return mistral_workflow_utils.Result( return mistral_workflow_utils.Result(
error="%s: %s" % (type(err).__name__, str(err)) error="%s: %s" % (type(err).__name__, str(err))
) )
class CellV2DiscoverHostsAction(base.TripleOAction):
"""Run cell_v2 host discovery
Runs cell_v2 host discovery to map any newly available ironic nodes.
"""
def run(self):
try:
result = nodes.run_nova_cell_v2_discovery()
LOG.info(
'Successfully ran cell_v2 discover_hosts\n'
'stdout: %(stdout)r\n',
{"stdout": result[0]}
)
except Exception as err:
LOG.exception("Error running cell_v2 discover_hosts")
return mistral_workflow_utils.Result(
error="%s: %s" % (type(err).__name__, str(err))
)

View File

@ -15,6 +15,7 @@ import mock
from glanceclient import exc as glance_exceptions from glanceclient import exc as glance_exceptions
import ironic_inspector_client import ironic_inspector_client
from oslo_concurrency import processutils
from oslo_utils import units from oslo_utils import units
from tripleo_common.actions import baremetal from tripleo_common.actions import baremetal
@ -330,3 +331,25 @@ class TestConfigureRootDeviceAction(base.TestCase):
"Cannot find a disk", "Cannot find a disk",
action.run) action.run)
self.assertEqual(self.ironic.node.update.call_count, 0) self.assertEqual(self.ironic.node.update.call_count, 0)
class TestCellV2DiscoverHostsAction(base.TestCase):
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery')
def test_run(self, mock_command):
action = baremetal.CellV2DiscoverHostsAction()
action.run()
mock_command.assert_called_once()
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery')
def test_failure(self, mock_command):
mock_command.side_effect = processutils.ProcessExecutionError(
exit_code=1,
stdout='captured stdout',
stderr='captured stderr',
cmd='command'
)
action = baremetal.CellV2DiscoverHostsAction()
result = action.run()
self.assertTrue(result.is_error())
mock_command.assert_called_once()

View File

@ -190,8 +190,7 @@ class NodesTest(base.TestCase):
'pm_password': 'random', 'pm_type': 'pxe_ssh', 'name': 'node1', 'pm_password': 'random', 'pm_type': 'pxe_ssh', 'name': 'node1',
'capabilities': 'num_nics:6'} 'capabilities': 'num_nics:6'}
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') def test_register_all_nodes_ironic_no_hw_stats(self):
def test_register_all_nodes_ironic_no_hw_stats(self, mock_discovery):
node_list = [self._get_node()] node_list = [self._get_node()]
# Remove the hardware stats from the node dictionary # Remove the hardware stats from the node dictionary
@ -219,10 +218,8 @@ class NodesTest(base.TestCase):
address='aaa') address='aaa')
ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.node.create.assert_has_calls([pxe_node, mock.ANY])
ironic.port.create.assert_has_calls([port_call]) ironic.port.create.assert_has_calls([port_call])
mock_discovery.assert_called_once()
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') def test_register_all_nodes(self):
def test_register_all_nodes(self, mock_discovery):
node_list = [self._get_node()] node_list = [self._get_node()]
node_properties = {"cpus": "1", node_properties = {"cpus": "1",
"memory_mb": "2048", "memory_mb": "2048",
@ -243,10 +240,8 @@ class NodesTest(base.TestCase):
address='aaa') address='aaa')
ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.node.create.assert_has_calls([pxe_node, mock.ANY])
ironic.port.create.assert_has_calls([port_call]) ironic.port.create.assert_has_calls([port_call])
mock_discovery.assert_called_once()
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') def test_register_all_nodes_kernel_ramdisk(self):
def test_register_all_nodes_kernel_ramdisk(self, mock_discovery):
node_list = [self._get_node()] node_list = [self._get_node()]
node_properties = {"cpus": "1", node_properties = {"cpus": "1",
"memory_mb": "2048", "memory_mb": "2048",
@ -275,10 +270,8 @@ class NodesTest(base.TestCase):
address='aaa') address='aaa')
ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.node.create.assert_has_calls([pxe_node, mock.ANY])
ironic.port.create.assert_has_calls([port_call]) ironic.port.create.assert_has_calls([port_call])
mock_discovery.assert_called_once()
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') def test_register_all_nodes_uuid(self):
def test_register_all_nodes_uuid(self, mock_discovery):
node_list = [self._get_node()] node_list = [self._get_node()]
node_list[0]['uuid'] = 'abcdef' node_list[0]['uuid'] = 'abcdef'
node_properties = {"cpus": "1", node_properties = {"cpus": "1",
@ -301,10 +294,8 @@ class NodesTest(base.TestCase):
address='aaa') address='aaa')
ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.node.create.assert_has_calls([pxe_node, mock.ANY])
ironic.port.create.assert_has_calls([port_call]) ironic.port.create.assert_has_calls([port_call])
mock_discovery.assert_called_once()
@mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') def test_register_all_nodes_caps_dict(self):
def test_register_all_nodes_caps_dict(self, mock_discovery):
node_list = [self._get_node()] node_list = [self._get_node()]
node_list[0]['capabilities'] = { node_list[0]['capabilities'] = {
'num_nics': 7 'num_nics': 7
@ -328,7 +319,6 @@ class NodesTest(base.TestCase):
address='aaa') address='aaa')
ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.node.create.assert_has_calls([pxe_node, mock.ANY])
ironic.port.create.assert_has_calls([port_call]) ironic.port.create.assert_has_calls([port_call])
mock_discovery.assert_called_once()
def test_register_update(self): def test_register_update(self):
node = self._get_node() node = self._get_node()

View File

@ -382,7 +382,6 @@ def register_all_nodes(nodes_list, client, remove=False, glance_client=None,
seen.append(node) seen.append(node)
_clean_up_extra_nodes(seen, client, remove=remove) _clean_up_extra_nodes(seen, client, remove=remove)
run_nova_cell_v2_discovery()
return seen return seen
@ -452,5 +451,6 @@ def run_nova_cell_v2_discovery():
'/usr/bin/sudo', '/usr/bin/sudo',
'/bin/nova-manage', '/bin/nova-manage',
'cell_v2', 'cell_v2',
'discover_hosts' 'discover_hosts',
'--verbose'
) )

View File

@ -193,7 +193,7 @@ workflows:
tasks: tasks:
set_nodes_available: set_nodes_available:
on-success: try_power_off on-success: wait_for_nova_resources
on-error: set_status_failed_nodes_available on-error: set_status_failed_nodes_available
with-items: uuid in <% $.node_uuids %> with-items: uuid in <% $.node_uuids %>
workflow: tripleo.baremetal.v1.set_node_state workflow: tripleo.baremetal.v1.set_node_state
@ -209,6 +209,36 @@ workflows:
status: FAILED status: FAILED
message: <% task(set_nodes_available).result %> message: <% task(set_nodes_available).result %>
wait_for_nova_resources:
on-success: cell_v2_discover_hosts
on-error: wait_for_nova_resources_failed
with-items: node_uuid in <% $.node_uuids %>
action: nova.hypervisors_find hypervisor_hostname=<% $.node_uuid %>
timeout: 900 #15 minutes
retry:
delay: 30
count: 30
wait_for_nova_resources_failed:
on-success: send_message
publish:
status: FAILED
message: <% task(wait_for_nova_resources).result %>
cell_v2_discover_hosts:
on-success: try_power_off
on-error: cell_v2_discover_hosts_failed
workflow: tripleo.baremetal.v1.cellv2_discovery
input:
node_uuids: <% $.node_uuids %>
queue_name: <% $.queue_name %>
cell_v2_discover_hosts_failed:
on-success: send_message
publish:
status: FAILED
message: <% task(cell_v2_discover_hosts).result %>
try_power_off: try_power_off:
on-success: send_message on-success: send_message
on-error: power_off_failed on-error: power_off_failed
@ -758,3 +788,42 @@ workflows:
fail_workflow: fail_workflow:
action: std.fail action: std.fail
cellv2_discovery:
description: Run cell_v2 host discovery
input:
- node_uuids
- queue_name: tripleo
tasks:
cell_v2_discover_hosts:
on-success: send_message
on-error: cell_v2_discover_hosts_failed
action: tripleo.baremetal.cell_v2_discover_hosts
cell_v2_discover_hosts_failed:
on-success: send_message
publish:
status: FAILED
message: <% task(cell_v2_discover_hosts).result %>
send_message:
action: zaqar.queue_post
retry: count=5 delay=1
input:
queue_name: <% $.queue_name %>
messages:
body:
type: tripleo.baremetal.v1.cellv2_discovery
payload:
status: <% $.get('status', 'SUCCESS') %>
message: <% $.get('message', '') %>
execution: <% execution() %>
on-success:
- fail_workflow: <% $.get('status') = "FAILED" %>
fail_workflow:
action: std.fail