Fix potential race condition on node power on and reboot
Currently there is the possibility that a configuration job does not transition to the correct state and start execution during a power on or reboot operation. If the boot device is being changed, the system might complete its POST before the job is ready, leaving the job in the queue, and the system will boot from the wrong device. This fix makes the following changes to the power control: (1) during the internal call to set the boot device, poll the iDRAC job queue for a configurable amount of time to ensure the job is in the correct state; (2) throw an exception on timeout if the job does not transition to the correct state; (3) proceed with normal logic as soon as the job is in the correct state. This will ensure the iDRAC is in a state to execute the job prior to the reboot starting, removing this race condition. Story: #2004909 Task: #29259 Change-Id: I5cc71fb3c9a7e0166aab5bd458bbd257cefa8f5b
This commit is contained in:
parent
10bf5d5c2a
commit
62c95a7c96
@ -21,7 +21,14 @@ opts = [
|
||||
min=1,
|
||||
help=_('Interval (in seconds) between periodic RAID job status '
|
||||
'checks to determine whether the asynchronous RAID '
|
||||
'configuration was successfully finished or not.'))
|
||||
'configuration was successfully finished or not.')),
|
||||
cfg.IntOpt('boot_device_job_status_timeout',
|
||||
default=30,
|
||||
min=1,
|
||||
help=_('Maximum amount of time (in seconds) to wait for '
|
||||
'the boot device configuration job to transition '
|
||||
'to the correct state to allow a reboot or power '
|
||||
'on to complete.'))
|
||||
]
|
||||
|
||||
|
||||
|
@ -20,6 +20,8 @@
|
||||
DRAC management interface
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
from ironic_lib import metrics_utils
|
||||
from oslo_log import log as logging
|
||||
from oslo_utils import importutils
|
||||
@ -28,10 +30,12 @@ from ironic.common import boot_devices
|
||||
from ironic.common import exception
|
||||
from ironic.common.i18n import _
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.conf import CONF
|
||||
from ironic.drivers import base
|
||||
from ironic.drivers.modules.drac import common as drac_common
|
||||
from ironic.drivers.modules.drac import job as drac_job
|
||||
|
||||
|
||||
drac_exceptions = importutils.try_import('dracclient.exceptions')
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -246,7 +250,28 @@ def set_boot_device(node, device, persistent=False):
|
||||
"'%(device)s' for node %(node_id)s.") %
|
||||
{'device': device, 'node_id': node.uuid})
|
||||
|
||||
client.commit_pending_bios_changes()
|
||||
job_id = client.commit_pending_bios_changes()
|
||||
job_entry = client.get_job(job_id)
|
||||
|
||||
timeout = CONF.drac.boot_device_job_status_timeout
|
||||
end_time = time.time() + timeout
|
||||
|
||||
LOG.debug('Waiting for BIOS configuration job %(job_id)s '
|
||||
'to be scheduled for node %(node)s',
|
||||
{'job_id': job_id,
|
||||
'node': node.uuid})
|
||||
|
||||
while job_entry.status != "Scheduled":
|
||||
if time.time() >= end_time:
|
||||
raise exception.DracOperationError(
|
||||
error=_(
|
||||
'Timed out waiting for BIOS configuration job '
|
||||
'%(job)s to reach Scheduled state. Job is still '
|
||||
'in %(status)s state.') %
|
||||
{'job': job_id, 'status': job_entry.status})
|
||||
time.sleep(3)
|
||||
job_entry = client.get_job(job_id)
|
||||
|
||||
except drac_exceptions.BaseClientException as exc:
|
||||
LOG.error('DRAC driver failed to change boot device order for '
|
||||
'node %(node_uuid)s. Reason: %(error)s.',
|
||||
|
@ -257,6 +257,10 @@ class DracManagementInternalMethodsTestCase(test_utils.BaseDracTest):
|
||||
mock_client = mock.Mock()
|
||||
mock_get_drac_client.return_value = mock_client
|
||||
mock_client.list_boot_devices.return_value = self.boot_devices['IPL']
|
||||
mock_job = mock.Mock()
|
||||
mock_job.status = "Scheduled"
|
||||
mock_client.get_job.return_value = mock_job
|
||||
|
||||
boot_device = {'boot_device': ironic.common.boot_devices.DISK,
|
||||
'persistent': True}
|
||||
mock__get_boot_device.return_value = boot_device
|
||||
@ -315,6 +319,10 @@ class DracManagementInternalMethodsTestCase(test_utils.BaseDracTest):
|
||||
mock_client = mock.Mock()
|
||||
mock_get_drac_client.return_value = mock_client
|
||||
mock_client.list_boot_devices.return_value = self.boot_devices['UEFI']
|
||||
|
||||
mock_job = mock.Mock()
|
||||
mock_job.status = "Scheduled"
|
||||
mock_client.get_job.return_value = mock_job
|
||||
boot_device = {'boot_device': ironic.common.boot_devices.PXE,
|
||||
'persistent': False}
|
||||
mock__get_boot_device.return_value = boot_device
|
||||
@ -445,6 +453,40 @@ class DracManagementInternalMethodsTestCase(test_utils.BaseDracTest):
|
||||
self.assertEqual(0, mock_client.set_bios_settings.call_count)
|
||||
self.assertEqual(0, mock_client.commit_pending_bios_changes.call_count)
|
||||
|
||||
@mock.patch('time.time')
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch.object(drac_mgmt, '_get_next_persistent_boot_mode',
|
||||
spec_set=True, autospec=True)
|
||||
@mock.patch.object(drac_mgmt, '_get_boot_device', spec_set=True,
|
||||
autospec=True)
|
||||
@mock.patch.object(drac_job, 'validate_job_queue', spec_set=True,
|
||||
autospec=True)
|
||||
def test_set_boot_device_job_not_scheduled(
|
||||
self,
|
||||
mock_validate_job_queue,
|
||||
mock__get_boot_device,
|
||||
mock__get_next_persistent_boot_mode,
|
||||
mock_sleep,
|
||||
mock_time,
|
||||
mock_get_drac_client):
|
||||
mock_client = mock.Mock()
|
||||
mock_get_drac_client.return_value = mock_client
|
||||
mock_client.list_boot_devices.return_value = self.boot_devices['IPL']
|
||||
mock_job = mock.Mock()
|
||||
mock_job.status = "New"
|
||||
mock_client.get_job.return_value = mock_job
|
||||
mock_time.side_effect = [10, 50]
|
||||
|
||||
boot_device = {'boot_device': ironic.common.boot_devices.DISK,
|
||||
'persistent': True}
|
||||
mock__get_boot_device.return_value = boot_device
|
||||
mock__get_next_persistent_boot_mode.return_value = 'IPL'
|
||||
|
||||
self.assertRaises(exception.DracOperationError,
|
||||
drac_mgmt.set_boot_device, self.node,
|
||||
ironic.common.boot_devices.PXE,
|
||||
persistent=True)
|
||||
|
||||
|
||||
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
|
||||
autospec=True)
|
||||
|
@ -0,0 +1,16 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Adds a new configuration option ``boot_device_job_status_timeout`` that
|
||||
specifies the maximum amount of time (in seconds) to wait for the boot
|
||||
device configuration job to transition to the scheduled state to allow a
|
||||
reboot or power on action to complete.
|
||||
fixes:
|
||||
- |
|
||||
Fixes an issue where a configuration job does not transition to the
|
||||
correct state and start execution during a power on or reboot operation. If
|
||||
the boot device is being changed, the system might complete its POST before
|
||||
the job is ready, leaving the job in the queue, and the system will boot
|
||||
from the wrong device.
|
||||
See bug `2004909 <https://storyboard.openstack.org/#!/story/2004909>`_ for
|
||||
details.
|
Loading…
Reference in New Issue
Block a user