Allow retrying PXE boot if it takes too long

PXE is inherently unreliable and sometimes times out without an
obvious reason. This happens particularly often in resource-constrained
environments, such as the CI. This change allows an operator to
set a timeout after which the boot is retried.

The _add_nodes_filters call had to be refactored to avoid hitting
the complexity limit.

Change-Id: I34a11f52e8e98e5b64f2d21f7190468a9e4b030d
Story: #2005167
Task: #29901
Dmitry Tantsur 2019-09-19 14:21:58 +02:00
parent 9fa39045da
commit b2834e6661
9 changed files with 279 additions and 56 deletions
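
Before the diff itself, a rough standalone sketch of the decision this change teaches the conductor (illustration only, not Ironic code; should_retry_boot and the in-memory node_info dict are made up here, and timestamps are simplified to datetime objects instead of the ISO strings the real code stores):

import datetime

def should_retry_boot(node_info, timeout, now):
    # Simplified stand-in for the _should_retry_boot() helper added below.
    for event in ('agent_last_heartbeat', 'last_power_state_change'):
        last = node_info.get(event)
        if last is not None and (now - last).total_seconds() < timeout:
            return False  # recent activity: the node is alive, just slow
    return True  # nothing happened within the timeout: power-cycle back to PXE

now = datetime.datetime.utcnow()
stale = {'agent_last_heartbeat': now - datetime.timedelta(seconds=900)}
print(should_retry_boot(stale, 600, now))  # True: 900s of silence > 600s timeout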

View File

@@ -88,9 +88,19 @@ IRONIC_USE_MOD_WSGI=$(trueorfalse $ENABLE_HTTPD_MOD_WSGI_SERVICES IRONIC_USE_MOD
# Defaults to the (now confusingly named) IRONIC_USE_MOD_WSGI for backward compat
IRONIC_USE_WSGI=$(trueorfalse $IRONIC_USE_MOD_WSGI IRONIC_USE_WSGI)
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# Deploy callback timeout can be changed from its default (1800), if required.
IRONIC_CALLBACK_TIMEOUT=${IRONIC_CALLBACK_TIMEOUT:-}
# Timeout before retrying PXE boot. Set low to help the CI.
if [[ "$IRONIC_IS_HARDWARE" == False ]]; then
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-600}
else
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-}
fi
# Ping timeout after the node becomes active
IRONIC_PING_TIMEOUT=${IRONIC_PING_TIMEOUT:-}
@@ -327,9 +337,6 @@ IRONIC_HTTP_PORT=${IRONIC_HTTP_PORT:-3928}
IRONIC_RPC_TRANSPORT=${IRONIC_RPC_TRANSPORT:-oslo}
IRONIC_JSON_RPC_PORT=${IRONIC_JSON_RPC_PORT:-8089}
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# The first port in the range to bind the Virtual BMCs. The number of
# ports that will be used depends on $IRONIC_VM_COUNT variable, e.g if
# $IRONIC_VM_COUNT=3 the ports 6230, 6231 and 6232 will be used for the
@@ -1475,6 +1482,9 @@ function configure_ironic_conductor {
iniset $IRONIC_CONF_FILE pxe tftp_server $IRONIC_TFTPSERVER_IP
iniset $IRONIC_CONF_FILE pxe tftp_root $IRONIC_TFTPBOOT_DIR
iniset $IRONIC_CONF_FILE pxe tftp_master_path $IRONIC_TFTPBOOT_DIR/master_images
if [[ -n "$IRONIC_PXE_BOOT_RETRY_TIMEOUT" ]]; then
iniset $IRONIC_CONF_FILE pxe boot_retry_timeout $IRONIC_PXE_BOOT_RETRY_TIMEOUT
fi
if [[ "$IRONIC_TERMINAL_SSL" == "True" ]]; then
# Make sure the cert directory exist
sudo mkdir -p $IRONIC_TERMINAL_CERT_DIR

View File

@@ -272,6 +272,10 @@ def node_power_action(task, new_state, timeout=None):
if node['target_power_state'] != target_state:
node['target_power_state'] = target_state
node['last_error'] = None
driver_internal_info = node.driver_internal_info
driver_internal_info['last_power_state_change'] = str(
timeutils.utcnow().isoformat())
node.driver_internal_info = driver_internal_info
node.save()
# take power action
@@ -866,6 +870,22 @@ def fast_track_able(task):
and task.node.last_error is None)
def value_within_timeout(value, timeout):
"""Checks if the time is within the previous timeout seconds from now.
:param value: a string representing date and time or None.
:param timeout: timeout in seconds.
"""
# use native datetime objects for conversion and compare
# slightly odd because of py2 compatibility :(
last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
"%Y-%m-%dT%H:%M:%S.%f")
# If we found nothing, we assume that the time is essentially epoch.
time_delta = datetime.timedelta(seconds=timeout)
last_valid = timeutils.utcnow() - time_delta
return last_valid <= last
def is_fast_track(task):
"""Checks a fast track is available.
@@ -882,19 +902,8 @@ def is_fast_track(task):
:returns: True if the last heartbeat that was recorded was within
the [deploy]fast_track_timeout setting.
"""
if not fast_track_able(task):
return False
# use native datetime objects for conversion and compare
# slightly odd because py2 compatability :(
last = datetime.datetime.strptime(
task.node.driver_internal_info.get(
'agent_last_heartbeat',
'1970-01-01T00:00:00.000000'),
"%Y-%m-%dT%H:%M:%S.%f")
# If we found nothing, we assume that the time is essentially epoch.
time_delta = datetime.timedelta(seconds=CONF.deploy.fast_track_timeout)
last_valid = timeutils.utcnow() - time_delta
# Checking the power state, because if we find the machine off due to
# any action, we can't actually fast track the node. :(
return (last_valid <= last
return (fast_track_able(task)
and value_within_timeout(
task.node.driver_internal_info.get('agent_last_heartbeat'),
CONF.deploy.fast_track_timeout)
and task.driver.power.get_power_state(task) == states.POWER_ON)
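
The value_within_timeout() helper above is reused by the PXE retry logic later in this change. A standalone sketch of its semantics, using plain datetime instead of oslo.utils so it runs on its own (not the Ironic function itself):

import datetime

def within_timeout(value, timeout):
    # A missing timestamp falls back to the epoch and is therefore never recent.
    last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
                                      '%Y-%m-%dT%H:%M:%S.%f')
    return datetime.datetime.utcnow() - datetime.timedelta(seconds=timeout) <= last

print(within_timeout(datetime.datetime.utcnow().isoformat(), 300))  # True: recent
print(within_timeout(None, 300))  # False: no record, treated as epoch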

View File

@@ -126,6 +126,16 @@ opts = [
default=0,
help=_('Timeout value (in seconds) for downloading an image '
'via iPXE. Defaults to 0 (no timeout)')),
cfg.IntOpt('boot_retry_timeout',
min=60,
help=_('Timeout (in seconds) after which PXE boot should be '
'retried. Must be less than [conductor]'
'deploy_callback_timeout. Disabled by default.')),
cfg.IntOpt('boot_retry_check_interval',
default=90, min=1,
help=_('Interval (in seconds) between periodic checks on PXE '
'boot retry. Has no effect if boot_retry_timeout '
'is not set.')),
cfg.StrOpt('ip_version',
default='4',
choices=[('4', _('IPv4')),

View File

@@ -279,20 +279,28 @@ def _zip_matching(a, b, key):
class Connection(api.Connection):
"""SqlAlchemy connection."""
_NODE_QUERY_FIELDS = {'console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'fault', 'conductor_group', 'owner'}
_NODE_IN_QUERY_FIELDS = {'%s_in' % field: field
for field in ('uuid', 'provision_state')}
_NODE_NON_NULL_FILTERS = {'associated': 'instance_uuid',
'reserved': 'reservation',
'with_power_state': 'power_state'}
_NODE_FILTERS = ({'chassis_uuid', 'reserved_by_any_of',
'provisioned_before', 'inspection_started_before',
'description_contains'}
| _NODE_QUERY_FIELDS
| set(_NODE_IN_QUERY_FIELDS)
| set(_NODE_NON_NULL_FILTERS))
def __init__(self):
pass
def _validate_nodes_filters(self, filters):
if filters is None:
filters = dict()
supported_filters = {'console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'chassis_uuid', 'associated', 'reserved',
'reserved_by_any_of', 'provisioned_before',
'inspection_started_before', 'fault',
'conductor_group', 'owner', 'uuid_in',
'with_power_state', 'description_contains'}
unsupported_filters = set(filters).difference(supported_filters)
unsupported_filters = set(filters).difference(self._NODE_FILTERS)
if unsupported_filters:
msg = _("SqlAlchemy API does not support "
"filtering by %s") % ', '.join(unsupported_filters)
@@ -301,26 +309,26 @@ class Connection(api.Connection):
def _add_nodes_filters(self, query, filters):
filters = self._validate_nodes_filters(filters)
for field in ['console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'fault', 'conductor_group', 'owner']:
for field in self._NODE_QUERY_FIELDS:
if field in filters:
query = query.filter_by(**{field: filters[field]})
for key, field in self._NODE_IN_QUERY_FIELDS.items():
if key in filters:
query = query.filter(
getattr(models.Node, field).in_(filters[key]))
for key, field in self._NODE_NON_NULL_FILTERS.items():
if key in filters:
column = getattr(models.Node, field)
if filters[key]:
query = query.filter(column != sql.null())
else:
query = query.filter(column == sql.null())
if 'chassis_uuid' in filters:
# get_chassis_by_uuid() will raise an exception if the chassis
# is not found
chassis_obj = self.get_chassis_by_uuid(filters['chassis_uuid'])
query = query.filter_by(chassis_id=chassis_obj.id)
if 'associated' in filters:
if filters['associated']:
query = query.filter(models.Node.instance_uuid != sql.null())
else:
query = query.filter(models.Node.instance_uuid == sql.null())
if 'reserved' in filters:
if filters['reserved']:
query = query.filter(models.Node.reservation != sql.null())
else:
query = query.filter(models.Node.reservation == sql.null())
if 'reserved_by_any_of' in filters:
query = query.filter(models.Node.reservation.in_(
filters['reserved_by_any_of']))
@@ -334,13 +342,6 @@
- (datetime.timedelta(
seconds=filters['inspection_started_before'])))
query = query.filter(models.Node.inspection_started_at < limit)
if 'uuid_in' in filters:
query = query.filter(models.Node.uuid.in_(filters['uuid_in']))
if 'with_power_state' in filters:
if filters['with_power_state']:
query = query.filter(models.Node.power_state != sql.null())
else:
query = query.filter(models.Node.power_state == sql.null())
if 'description_contains' in filters:
keyword = filters['description_contains']
if keyword is not None:
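
For readers less familiar with the pattern, here is a toy in-memory illustration of the data-driven filtering that replaces the long if-chain above (not the SQLAlchemy code; the field tables are trimmed down and plain dicts stand in for node rows):

QUERY_FIELDS = {'driver', 'provision_state'}
IN_QUERY_FIELDS = {'%s_in' % f: f for f in ('uuid', 'provision_state')}
NON_NULL_FILTERS = {'associated': 'instance_uuid', 'reserved': 'reservation'}

def apply_filters(rows, filters):
    # Equality filters: the field must match exactly.
    for field in QUERY_FIELDS:
        if field in filters:
            rows = [r for r in rows if r.get(field) == filters[field]]
    # "<field>_in" filters: the field must be one of the given values.
    for key, field in IN_QUERY_FIELDS.items():
        if key in filters:
            rows = [r for r in rows if r.get(field) in filters[key]]
    # Non-NULL filters: a boolean selects rows where the column is (not) set.
    for key, field in NON_NULL_FILTERS.items():
        if key in filters:
            wanted = bool(filters[key])
            rows = [r for r in rows if (r.get(field) is not None) == wanted]
    return rows

nodes = [{'uuid': 'a', 'driver': 'ipmi', 'provision_state': 'deploy wait',
          'reservation': None, 'instance_uuid': None},
         {'uuid': 'b', 'driver': 'ipmi', 'provision_state': 'active',
          'reservation': 'conductor-1', 'instance_uuid': 'inst-1'}]
print(apply_filters(nodes, {'driver': 'ipmi', 'reserved': False}))  # node 'a' only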

View File

@@ -17,7 +17,6 @@ PXE Boot Interface
from ironic_lib import metrics_utils
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import dhcp_factory
@@ -174,14 +173,7 @@ class PXEBoot(pxe_base.PXEBaseMixin, base.BootInterface):
pxe_config_template,
ipxe_enabled=CONF.pxe.ipxe_enabled)
persistent = False
value = node.driver_info.get('force_persistent_boot_device',
'Default')
if value in {'Always', 'Default', 'Never'}:
if value == 'Always':
persistent = True
else:
persistent = strutils.bool_from_string(value, False)
persistent = self._persistent_ramdisk_boot(node)
manager_utils.node_set_boot_device(task, boot_devices.PXE,
persistent=persistent)

View File

@@ -13,13 +13,24 @@
Base PXE Interface Methods
"""
from futurist import periodics
from ironic_lib import metrics_utils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import pxe_utils
from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.drivers.modules import deploy_utils
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
METRICS = metrics_utils.get_metrics_logger(__name__)
@@ -99,3 +110,82 @@ class PXEBaseMixin(object):
parameters
"""
pxe_utils.parse_driver_info(task.node, mode='rescue')
def _persistent_ramdisk_boot(self, node):
"""If the ramdisk should be configured as a persistent boot device."""
value = node.driver_info.get('force_persistent_boot_device', 'Default')
if value in {'Always', 'Default', 'Never'}:
return value == 'Always'
else:
return strutils.bool_from_string(value, False)
_RETRY_ALLOWED_STATES = {states.DEPLOYWAIT, states.CLEANWAIT,
states.RESCUEWAIT}
@METRICS.timer('PXEBaseMixin._check_boot_timeouts')
@periodics.periodic(spacing=CONF.pxe.boot_retry_check_interval,
enabled=bool(CONF.pxe.boot_retry_timeout))
def _check_boot_timeouts(self, manager, context):
"""Periodically checks whether boot has timed out and retry it.
:param manager: conductor manager.
:param context: request context.
"""
filters = {'provision_state_in': self._RETRY_ALLOWED_STATES,
'reserved': False,
'maintenance': False,
'provisioned_before': CONF.pxe.boot_retry_timeout}
node_iter = manager.iter_nodes(filters=filters)
for node_uuid, driver, conductor_group in node_iter:
try:
lock_purpose = 'checking PXE boot status'
with task_manager.acquire(context, node_uuid,
shared=True,
purpose=lock_purpose) as task:
self._check_boot_status(task)
except (exception.NodeLocked, exception.NodeNotFound):
continue
def _check_boot_status(self, task):
if not isinstance(task.driver.boot, PXEBaseMixin):
return
if not _should_retry_boot(task.node):
return
task.upgrade_lock(purpose='retrying PXE boot')
# Retry critical checks after acquiring the exclusive lock.
if (task.node.maintenance or task.node.provision_state
not in self._RETRY_ALLOWED_STATES
or not _should_retry_boot(task.node)):
return
LOG.info('Booting the ramdisk on node %(node)s is taking more than '
'%(timeout)d seconds, retrying boot',
{'node': task.node.uuid,
'timeout': CONF.pxe.boot_retry_timeout})
manager_utils.node_power_action(task, states.POWER_OFF)
# NOTE(dtantsur): retry even persistent boot setting in case it did not
# work for some reason.
persistent = self._persistent_ramdisk_boot(task.node)
manager_utils.node_set_boot_device(task, boot_devices.PXE,
persistent=persistent)
manager_utils.node_power_action(task, states.POWER_ON)
def _should_retry_boot(node):
# NOTE(dtantsur): this assumes IPA, do we need to make it generic?
for field in ('agent_last_heartbeat', 'last_power_state_change'):
if manager_utils.value_within_timeout(
node.driver_internal_info.get(field),
CONF.pxe.boot_retry_timeout):
# Alive and heartbeating, probably busy with something long
LOG.debug('Not retrying PXE boot for node %(node)s; its '
'%(event)s happened less than %(timeout)d seconds ago',
{'node': node.uuid, 'event': field,
'timeout': CONF.pxe.boot_retry_timeout})
return False
return True
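
The _check_boot_timeouts() method above leans on the futurist periodic-task machinery the conductor already uses: the decorator only marks the method, and a worker discovers and schedules it. A minimal sketch of that mechanism under those assumptions (toy spacing and body; BootChecker is made up and this is not the conductor wiring):

import threading
import time

from futurist import periodics

class BootChecker(object):

    @periodics.periodic(spacing=2, enabled=True)
    def check_boot(self):
        # The real task iterates over nodes and retries PXE boot for any node
        # past [pxe]boot_retry_timeout; here we only print a message.
        print('checking for timed-out PXE boots')

worker = periodics.PeriodicWorker.create([BootChecker()])
thread = threading.Thread(target=worker.start)
thread.start()
time.sleep(5)   # let the periodic fire a couple of times
worker.stop()
thread.join()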

View File

@@ -231,7 +231,8 @@ class DbNodeTestCase(base.DbTestCase):
# node with provision_updated timeout
node1 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
provision_updated_at=past)
provision_updated_at=past,
provision_state=states.DEPLOYING)
# node with None in provision_updated_at
node2 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
provision_state=states.DEPLOYWAIT)
@@ -247,6 +248,10 @@ states.DEPLOYWAIT})
states.DEPLOYWAIT})
self.assertEqual([node2.id], [r[0] for r in res])
res = self.dbapi.get_nodeinfo_list(
filters={'provision_state_in': [states.ACTIVE, states.DEPLOYING]})
self.assertEqual([node1.id], [r[0] for r in res])
@mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_nodeinfo_list_inspection(self, mock_utcnow):
past = datetime.datetime(2000, 1, 1, 0, 0)

View File

@@ -21,6 +21,7 @@ import tempfile
import mock
from oslo_config import cfg
from oslo_serialization import jsonutils as json
from oslo_utils import timeutils
from oslo_utils import uuidutils
from ironic.common import boot_devices
@@ -1162,3 +1163,98 @@ class PXEValidateRescueTestCase(db_base.DbTestCase):
self.assertRaisesRegex(exception.MissingParameterValue,
'Missing.*rescue_kernel',
task.driver.boot.validate_rescue, task)
@mock.patch.object(ipxe.iPXEBoot, '__init__', lambda self: None)
@mock.patch.object(pxe.PXEBoot, '__init__', lambda self: None)
@mock.patch.object(manager_utils, 'node_set_boot_device', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
class PXEBootRetryTestCase(db_base.DbTestCase):
boot_interface = 'pxe'
def setUp(self):
super(PXEBootRetryTestCase, self).setUp()
self.config(enabled_boot_interfaces=['pxe', 'ipxe', 'fake'])
self.config(boot_retry_timeout=300, group='pxe')
self.node = obj_utils.create_test_node(
self.context,
driver='fake-hardware',
boot_interface=self.boot_interface,
provision_state=states.DEPLOYWAIT)
@mock.patch.object(pxe.PXEBoot, '_check_boot_status', autospec=True)
def test_check_boot_timeouts(self, mock_check_status, mock_power,
mock_boot_dev):
def _side_effect(iface, task):
self.assertEqual(self.node.uuid, task.node.uuid)
mock_check_status.side_effect = _side_effect
manager = mock.Mock(spec=['iter_nodes'])
manager.iter_nodes.return_value = [
(uuidutils.generate_uuid(), 'fake-hardware', ''),
(self.node.uuid, self.node.driver, self.node.conductor_group)
]
iface = pxe.PXEBoot()
iface._check_boot_timeouts(manager, self.context)
mock_check_status.assert_called_once_with(iface, mock.ANY)
def test_check_boot_status_another_boot_interface(self, mock_power,
mock_boot_dev):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot = fake.FakeBoot()
pxe.PXEBoot()._check_boot_status(task)
self.assertTrue(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_recent_power_change(self, mock_power,
mock_boot_dev):
for field in ('agent_last_heartbeat', 'last_power_state_change'):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.node.driver_internal_info = {
field: str(timeutils.utcnow().isoformat())
}
task.driver.boot._check_boot_status(task)
self.assertTrue(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_maintenance(self, mock_power, mock_boot_dev):
self.node.maintenance = True
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_wrong_state(self, mock_power, mock_boot_dev):
self.node.provision_state = states.DEPLOYING
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_retry(self, mock_power, mock_boot_dev):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
mock_power.assert_has_calls([
mock.call(task, states.POWER_OFF),
mock.call(task, states.POWER_ON)
])
mock_boot_dev.assert_called_once_with(task, 'pxe',
persistent=False)
class iPXEBootRetryTestCase(PXEBootRetryTestCase):
boot_interface = 'ipxe'

View File

@@ -0,0 +1,10 @@
---
features:
- |
Allows retrying PXE/iPXE boot during deployment, cleaning and rescuing.
This feature is disabled by default and can be enabled by setting
``[pxe]boot_retry_timeout`` to the timeout (in seconds) after which
the boot should be retried.
The new option ``[pxe]boot_retry_check_interval`` defines how often to
check the nodes for timeout and defaults to 90 seconds.
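
As a quick illustration of how the options fit together, a hedged sketch using oslo.config directly (the option names, bounds and defaults come from this change; a real deployment would simply set the values in the [pxe] section of ironic.conf, as the DevStack change above does via iniset):

from oslo_config import cfg

CONF = cfg.ConfigOpts()
CONF.register_opts(
    [cfg.IntOpt('boot_retry_timeout', min=60),
     cfg.IntOpt('boot_retry_check_interval', default=90, min=1)],
    group='pxe')
CONF([])  # no command line or config files, defaults only

CONF.set_override('boot_retry_timeout', 600, group='pxe')
print(CONF.pxe.boot_retry_timeout)          # 600: retry the boot after 10 minutes
print(CONF.pxe.boot_retry_check_interval)   # 90: how often the check runs

With the CI values used above (a 600-second timeout checked every 90 seconds), a stuck node is power-cycled somewhere between 600 and 690 seconds after its last recorded activity, comfortably within the default 1800-second deploy callback timeout.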