Merge "Allow retrying PXE boot if it takes too long"

Zuul 2019-09-23 12:58:26 +00:00 committed by Gerrit Code Review
commit 0450c4f2fd
9 changed files with 279 additions and 56 deletions

View File

@@ -88,9 +88,19 @@ IRONIC_USE_MOD_WSGI=$(trueorfalse $ENABLE_HTTPD_MOD_WSGI_SERVICES IRONIC_USE_MOD
# Defaults to the (now confusingly named) IRONIC_USE_MOD_WSGI for backward compat
IRONIC_USE_WSGI=$(trueorfalse $IRONIC_USE_MOD_WSGI IRONIC_USE_WSGI)
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# Deploy callback timeout can be changed from its default (1800), if required.
IRONIC_CALLBACK_TIMEOUT=${IRONIC_CALLBACK_TIMEOUT:-}
# Timeout before retrying PXE boot. Set low to help the CI.
if [[ "$IRONIC_IS_HARDWARE" == False ]]; then
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-600}
else
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-}
fi
# Ping timeout after the node becomes active
IRONIC_PING_TIMEOUT=${IRONIC_PING_TIMEOUT:-}
@@ -327,9 +337,6 @@ IRONIC_HTTP_PORT=${IRONIC_HTTP_PORT:-3928}
IRONIC_RPC_TRANSPORT=${IRONIC_RPC_TRANSPORT:-oslo}
IRONIC_JSON_RPC_PORT=${IRONIC_JSON_RPC_PORT:-8089}
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# The first port in the range to bind the Virtual BMCs. The number of
# ports that will be used depends on $IRONIC_VM_COUNT variable, e.g if
# $IRONIC_VM_COUNT=3 the ports 6230, 6231 and 6232 will be used for the
@@ -1479,6 +1486,9 @@ function configure_ironic_conductor {
    iniset $IRONIC_CONF_FILE pxe tftp_server $IRONIC_TFTPSERVER_IP
    iniset $IRONIC_CONF_FILE pxe tftp_root $IRONIC_TFTPBOOT_DIR
    iniset $IRONIC_CONF_FILE pxe tftp_master_path $IRONIC_TFTPBOOT_DIR/master_images
    if [[ -n "$IRONIC_PXE_BOOT_RETRY_TIMEOUT" ]]; then
        iniset $IRONIC_CONF_FILE pxe boot_retry_timeout $IRONIC_PXE_BOOT_RETRY_TIMEOUT
    fi
    if [[ "$IRONIC_TERMINAL_SSL" == "True" ]]; then
        # Make sure the cert directory exists
        sudo mkdir -p $IRONIC_TERMINAL_CERT_DIR
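Taken together, the DevStack changes above default the retry timeout to 600 seconds on virtual (CI) setups and leave it unset, i.e. disabled, on real hardware. A minimal sketch of what this amounts to on the conductor side, using ironic's global oslo.config object; the values simply mirror the wiring above, nothing here is new API:

from ironic.conf import CONF

# with IRONIC_PXE_BOOT_RETRY_TIMEOUT=600 written into ironic.conf by iniset:
assert CONF.pxe.boot_retry_timeout == 600
# DevStack does not touch the check interval, so the option default applies:
assert CONF.pxe.boot_retry_check_interval == 90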

View File

@@ -272,6 +272,10 @@ def node_power_action(task, new_state, timeout=None):
    if node['target_power_state'] != target_state:
        node['target_power_state'] = target_state
        node['last_error'] = None
        driver_internal_info = node.driver_internal_info
        driver_internal_info['last_power_state_change'] = str(
            timeutils.utcnow().isoformat())
        node.driver_internal_info = driver_internal_info
        node.save()
    # take power action
@@ -872,6 +876,22 @@ def fast_track_able(task):
            and task.node.last_error is None)


def value_within_timeout(value, timeout):
    """Checks if the time is within the previous timeout seconds from now.

    :param value: a string representing date and time or None.
    :param timeout: timeout in seconds.
    """
    # use native datetime objects for conversion and compare
    # slightly odd because py2 compatibility :(
    last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
                                      "%Y-%m-%dT%H:%M:%S.%f")
    # If we found nothing, we assume that the time is essentially epoch.
    time_delta = datetime.timedelta(seconds=timeout)
    last_valid = timeutils.utcnow() - time_delta
    return last_valid <= last
def is_fast_track(task):
    """Checks a fast track is available.
@@ -888,19 +908,8 @@ def is_fast_track(task):
    :returns: True if the last heartbeat that was recorded was within
        the [deploy]fast_track_timeout setting.
    """
    if not fast_track_able(task):
        return False
    # use native datetime objects for conversion and compare
    # slightly odd because py2 compatibility :(
    last = datetime.datetime.strptime(
        task.node.driver_internal_info.get(
            'agent_last_heartbeat',
            '1970-01-01T00:00:00.000000'),
        "%Y-%m-%dT%H:%M:%S.%f")
    # If we found nothing, we assume that the time is essentially epoch.
    time_delta = datetime.timedelta(seconds=CONF.deploy.fast_track_timeout)
    last_valid = timeutils.utcnow() - time_delta
    # Checking the power state, because if we find the machine off due to
    # any action, we can't actually fast track the node. :(
    return (last_valid <= last
    return (fast_track_able(task)
            and value_within_timeout(
                task.node.driver_internal_info.get('agent_last_heartbeat'),
                CONF.deploy.fast_track_timeout)
            and task.driver.power.get_power_state(task) == states.POWER_ON)
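A quick usage sketch of the new helper (timestamps invented): the value is expected in the str(isoformat()) form that node_power_action records above, matching the "%Y-%m-%dT%H:%M:%S.%f" format.

import datetime

from oslo_utils import timeutils

from ironic.conductor import utils as manager_utils

# a heartbeat recorded 30 seconds ago falls within a 300-second window
# (microsecond precision, as the parser expects)
recent = str((timeutils.utcnow()
              - datetime.timedelta(seconds=30)).isoformat())
assert manager_utils.value_within_timeout(recent, 300)
# None is treated as epoch, i.e. never within the timeout
assert not manager_utils.value_within_timeout(None, 300)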

View File

@@ -126,6 +126,16 @@ opts = [
               default=0,
               help=_('Timeout value (in seconds) for downloading an image '
                      'via iPXE. Defaults to 0 (no timeout)')),
    cfg.IntOpt('boot_retry_timeout',
               min=60,
               help=_('Timeout (in seconds) after which PXE boot should be '
                      'retried. Must be less than [conductor]'
                      'deploy_callback_timeout. Disabled by default.')),
    cfg.IntOpt('boot_retry_check_interval',
               default=90, min=1,
               help=_('Interval (in seconds) between periodic checks on PXE '
                      'boot retry. Has no effect if boot_retry_timeout '
                      'is not set.')),
    cfg.StrOpt('ip_version',
               default='4',
               choices=[('4', _('IPv4')),
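For illustration, the same options can also be set programmatically with oslo.config, e.g. from a test; the values here are arbitrary, and ironic.conf is ironic's own global config module:

from ironic.conf import CONF

# retry PXE boot after 10 minutes, checking every 60 seconds
CONF.set_override('boot_retry_timeout', 600, group='pxe')
CONF.set_override('boot_retry_check_interval', 60, group='pxe')
# oslo.config rejects values below the declared minimum,
# so e.g. boot_retry_timeout=30 would fail against min=60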

View File

@@ -279,20 +279,28 @@ def _zip_matching(a, b, key):
class Connection(api.Connection):
    """SqlAlchemy connection."""

    _NODE_QUERY_FIELDS = {'console_enabled', 'maintenance', 'driver',
                          'resource_class', 'provision_state', 'uuid', 'id',
                          'fault', 'conductor_group', 'owner'}
    _NODE_IN_QUERY_FIELDS = {'%s_in' % field: field
                             for field in ('uuid', 'provision_state')}
    _NODE_NON_NULL_FILTERS = {'associated': 'instance_uuid',
                              'reserved': 'reservation',
                              'with_power_state': 'power_state'}
    _NODE_FILTERS = ({'chassis_uuid', 'reserved_by_any_of',
                      'provisioned_before', 'inspection_started_before',
                      'description_contains'}
                     | _NODE_QUERY_FIELDS
                     | set(_NODE_IN_QUERY_FIELDS)
                     | set(_NODE_NON_NULL_FILTERS))

    def __init__(self):
        pass

    def _validate_nodes_filters(self, filters):
        if filters is None:
            filters = dict()
        supported_filters = {'console_enabled', 'maintenance', 'driver',
                             'resource_class', 'provision_state', 'uuid', 'id',
                             'chassis_uuid', 'associated', 'reserved',
                             'reserved_by_any_of', 'provisioned_before',
                             'inspection_started_before', 'fault',
                             'conductor_group', 'owner', 'uuid_in',
                             'with_power_state', 'description_contains'}
        unsupported_filters = set(filters).difference(supported_filters)
        unsupported_filters = set(filters).difference(self._NODE_FILTERS)
        if unsupported_filters:
            msg = _("SqlAlchemy API does not support "
                    "filtering by %s") % ', '.join(unsupported_filters)
@@ -301,26 +309,26 @@ class Connection(api.Connection):
    def _add_nodes_filters(self, query, filters):
        filters = self._validate_nodes_filters(filters)
        for field in ['console_enabled', 'maintenance', 'driver',
                      'resource_class', 'provision_state', 'uuid', 'id',
                      'fault', 'conductor_group', 'owner']:
        for field in self._NODE_QUERY_FIELDS:
            if field in filters:
                query = query.filter_by(**{field: filters[field]})
        for key, field in self._NODE_IN_QUERY_FIELDS.items():
            if key in filters:
                query = query.filter(
                    getattr(models.Node, field).in_(filters[key]))
        for key, field in self._NODE_NON_NULL_FILTERS.items():
            if key in filters:
                column = getattr(models.Node, field)
                if filters[key]:
                    query = query.filter(column != sql.null())
                else:
                    query = query.filter(column == sql.null())
        if 'chassis_uuid' in filters:
            # get_chassis_by_uuid() raises an exception if the chassis
            # is not found
            chassis_obj = self.get_chassis_by_uuid(filters['chassis_uuid'])
            query = query.filter_by(chassis_id=chassis_obj.id)
        if 'associated' in filters:
            if filters['associated']:
                query = query.filter(models.Node.instance_uuid != sql.null())
            else:
                query = query.filter(models.Node.instance_uuid == sql.null())
        if 'reserved' in filters:
            if filters['reserved']:
                query = query.filter(models.Node.reservation != sql.null())
            else:
                query = query.filter(models.Node.reservation == sql.null())
        if 'reserved_by_any_of' in filters:
            query = query.filter(models.Node.reservation.in_(
                filters['reserved_by_any_of']))
@@ -334,13 +342,6 @@ class Connection(api.Connection):
                     - (datetime.timedelta(
                         seconds=filters['inspection_started_before'])))
            query = query.filter(models.Node.inspection_started_at < limit)
        if 'uuid_in' in filters:
            query = query.filter(models.Node.uuid.in_(filters['uuid_in']))
        if 'with_power_state' in filters:
            if filters['with_power_state']:
                query = query.filter(models.Node.power_state != sql.null())
            else:
                query = query.filter(models.Node.power_state == sql.null())
        if 'description_contains' in filters:
            keyword = filters['description_contains']
            if keyword is not None:
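The refactoring keeps the external behavior: callers still pass plain filter dicts, and any key outside _NODE_FILTERS is rejected with the "does not support filtering by" error above. A hedged sketch of a caller exercising the new *_in handling (dbapi bootstrapping elided; get_nodeinfo_list is the same call the tests below use):

from ironic.common import states
from ironic.db import api as db_api

dbapi = db_api.get_instance()
# 'provision_state_in' is matched by _NODE_IN_QUERY_FIELDS and becomes
# an SQL IN clause on Node.provision_state
nodes = dbapi.get_nodeinfo_list(
    filters={'provision_state_in': [states.DEPLOYWAIT, states.CLEANWAIT],
             'reserved': False,
             'maintenance': False})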

View File

@@ -17,7 +17,6 @@ PXE Boot Interface
from ironic_lib import metrics_utils
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import dhcp_factory
@@ -174,14 +173,7 @@ class PXEBoot(pxe_base.PXEBaseMixin, base.BootInterface):
                pxe_config_template,
                ipxe_enabled=CONF.pxe.ipxe_enabled)
            persistent = False
            value = node.driver_info.get('force_persistent_boot_device',
                                         'Default')
            if value in {'Always', 'Default', 'Never'}:
                if value == 'Always':
                    persistent = True
            else:
                persistent = strutils.bool_from_string(value, False)
            persistent = self._persistent_ramdisk_boot(node)
            manager_utils.node_set_boot_device(task, boot_devices.PXE,
                                               persistent=persistent)
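The inline persistent-boot logic above moves into the _persistent_ramdisk_boot helper shown in the next file. Its behavior for the recognized driver_info values, sketched against a mocked node (the pxe_base module path is assumed from this change):

from unittest import mock

from ironic.drivers.modules import pxe_base

iface = pxe_base.PXEBaseMixin()
node = mock.Mock()

node.driver_info = {'force_persistent_boot_device': 'Always'}
assert iface._persistent_ramdisk_boot(node)        # 'Always' -> persistent

node.driver_info = {}  # unset falls back to 'Default'
assert not iface._persistent_ramdisk_boot(node)    # 'Default'/'Never' -> one-time

node.driver_info = {'force_persistent_boot_device': 'on'}
assert iface._persistent_ramdisk_boot(node)        # other values parsed as boolean strings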

View File

@@ -13,13 +13,24 @@
Base PXE Interface Methods
"""
from futurist import periodics
from ironic_lib import metrics_utils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import pxe_utils
from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.drivers.modules import deploy_utils
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
METRICS = metrics_utils.get_metrics_logger(__name__)
@@ -99,3 +110,82 @@ class PXEBaseMixin(object):
        parameters
        """
        pxe_utils.parse_driver_info(task.node, mode='rescue')

    def _persistent_ramdisk_boot(self, node):
        """If the ramdisk should be configured as a persistent boot device."""
        value = node.driver_info.get('force_persistent_boot_device', 'Default')
        if value in {'Always', 'Default', 'Never'}:
            return value == 'Always'
        else:
            return strutils.bool_from_string(value, False)

    _RETRY_ALLOWED_STATES = {states.DEPLOYWAIT, states.CLEANWAIT,
                             states.RESCUEWAIT}

    @METRICS.timer('PXEBaseMixin._check_boot_timeouts')
    @periodics.periodic(spacing=CONF.pxe.boot_retry_check_interval,
                        enabled=bool(CONF.pxe.boot_retry_timeout))
    def _check_boot_timeouts(self, manager, context):
        """Periodically checks whether boot has timed out and retries it.

        :param manager: conductor manager.
        :param context: request context.
        """
        filters = {'provision_state_in': self._RETRY_ALLOWED_STATES,
                   'reserved': False,
                   'maintenance': False,
                   'provisioned_before': CONF.pxe.boot_retry_timeout}
        node_iter = manager.iter_nodes(filters=filters)
        for node_uuid, driver, conductor_group in node_iter:
            try:
                lock_purpose = 'checking PXE boot status'
                with task_manager.acquire(context, node_uuid,
                                          shared=True,
                                          purpose=lock_purpose) as task:
                    self._check_boot_status(task)
            except (exception.NodeLocked, exception.NodeNotFound):
                continue

    def _check_boot_status(self, task):
        if not isinstance(task.driver.boot, PXEBaseMixin):
            return

        if not _should_retry_boot(task.node):
            return

        task.upgrade_lock(purpose='retrying PXE boot')

        # Retry critical checks after acquiring the exclusive lock.
        if (task.node.maintenance or task.node.provision_state
                not in self._RETRY_ALLOWED_STATES
                or not _should_retry_boot(task.node)):
            return

        LOG.info('Booting the ramdisk on node %(node)s is taking more than '
                 '%(timeout)d seconds, retrying boot',
                 {'node': task.node.uuid,
                  'timeout': CONF.pxe.boot_retry_timeout})

        manager_utils.node_power_action(task, states.POWER_OFF)
        # NOTE(dtantsur): retry even the persistent boot setting in case it
        # did not work for some reason.
        persistent = self._persistent_ramdisk_boot(task.node)
        manager_utils.node_set_boot_device(task, boot_devices.PXE,
                                           persistent=persistent)
        manager_utils.node_power_action(task, states.POWER_ON)


def _should_retry_boot(node):
    # NOTE(dtantsur): this assumes IPA, do we need to make it generic?
    for field in ('agent_last_heartbeat', 'last_power_state_change'):
        if manager_utils.value_within_timeout(
                node.driver_internal_info.get(field),
                CONF.pxe.boot_retry_timeout):
            # Alive and heartbeating, probably busy with something long
            LOG.debug('Not retrying PXE boot for node %(node)s; its '
                      '%(event)s happened less than %(timeout)d seconds ago',
                      {'node': node.uuid, 'event': field,
                       'timeout': CONF.pxe.boot_retry_timeout})
            return False
    return True
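_check_boot_status uses a check-then-upgrade pattern: it tests cheaply under a shared lock, upgrades to an exclusive lock only when a retry looks needed, and then re-tests, since the node may have changed while waiting for the lock. What the _should_retry_boot decision amounts to, sketched with a mocked node (values invented, import path assumed from this change):

from unittest import mock

from oslo_utils import timeutils

from ironic.conf import CONF
from ironic.drivers.modules.pxe_base import _should_retry_boot

CONF.set_override('boot_retry_timeout', 300, group='pxe')

node = mock.Mock()
node.driver_internal_info = {
    'agent_last_heartbeat': str(timeutils.utcnow().isoformat())}
# a fresh heartbeat means the ramdisk is alive -> no retry
assert not _should_retry_boot(node)

node.driver_internal_info = {}
# no recent heartbeat and no recent power state change -> retry the boot
assert _should_retry_boot(node)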

View File

@@ -231,7 +231,8 @@ class DbNodeTestCase(base.DbTestCase):
        # node with provision_updated timeout
        node1 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
                                       provision_updated_at=past)
                                       provision_updated_at=past,
                                       provision_state=states.DEPLOYING)
        # node with None in provision_updated_at
        node2 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
                                       provision_state=states.DEPLOYWAIT)
@@ -247,6 +248,10 @@ class DbNodeTestCase(base.DbTestCase):
                      states.DEPLOYWAIT})
        self.assertEqual([node2.id], [r[0] for r in res])

        res = self.dbapi.get_nodeinfo_list(
            filters={'provision_state_in': [states.ACTIVE, states.DEPLOYING]})
        self.assertEqual([node1.id], [r[0] for r in res])

    @mock.patch.object(timeutils, 'utcnow', autospec=True)
    def test_get_nodeinfo_list_inspection(self, mock_utcnow):
        past = datetime.datetime(2000, 1, 1, 0, 0)

View File

@@ -21,6 +21,7 @@ import tempfile
import mock
from oslo_config import cfg
from oslo_serialization import jsonutils as json
from oslo_utils import timeutils
from oslo_utils import uuidutils
from ironic.common import boot_devices
@@ -1162,3 +1163,98 @@ class PXEValidateRescueTestCase(db_base.DbTestCase):
        self.assertRaisesRegex(exception.MissingParameterValue,
                               'Missing.*rescue_kernel',
                               task.driver.boot.validate_rescue, task)


@mock.patch.object(ipxe.iPXEBoot, '__init__', lambda self: None)
@mock.patch.object(pxe.PXEBoot, '__init__', lambda self: None)
@mock.patch.object(manager_utils, 'node_set_boot_device', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
class PXEBootRetryTestCase(db_base.DbTestCase):

    boot_interface = 'pxe'

    def setUp(self):
        super(PXEBootRetryTestCase, self).setUp()
        self.config(enabled_boot_interfaces=['pxe', 'ipxe', 'fake'])
        self.config(boot_retry_timeout=300, group='pxe')
        self.node = obj_utils.create_test_node(
            self.context,
            driver='fake-hardware',
            boot_interface=self.boot_interface,
            provision_state=states.DEPLOYWAIT)

    @mock.patch.object(pxe.PXEBoot, '_check_boot_status', autospec=True)
    def test_check_boot_timeouts(self, mock_check_status, mock_power,
                                 mock_boot_dev):
        def _side_effect(iface, task):
            self.assertEqual(self.node.uuid, task.node.uuid)

        mock_check_status.side_effect = _side_effect
        manager = mock.Mock(spec=['iter_nodes'])
        manager.iter_nodes.return_value = [
            (uuidutils.generate_uuid(), 'fake-hardware', ''),
            (self.node.uuid, self.node.driver, self.node.conductor_group)
        ]
        iface = pxe.PXEBoot()
        iface._check_boot_timeouts(manager, self.context)
        mock_check_status.assert_called_once_with(iface, mock.ANY)

    def test_check_boot_status_another_boot_interface(self, mock_power,
                                                      mock_boot_dev):
        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=True) as task:
            task.driver.boot = fake.FakeBoot()
            pxe.PXEBoot()._check_boot_status(task)
            self.assertTrue(task.shared)
        self.assertFalse(mock_power.called)
        self.assertFalse(mock_boot_dev.called)

    def test_check_boot_status_recent_power_change(self, mock_power,
                                                   mock_boot_dev):
        for field in ('agent_last_heartbeat', 'last_power_state_change'):
            with task_manager.acquire(self.context, self.node.uuid,
                                      shared=True) as task:
                task.node.driver_internal_info = {
                    field: str(timeutils.utcnow().isoformat())
                }
                task.driver.boot._check_boot_status(task)
                self.assertTrue(task.shared)
        self.assertFalse(mock_power.called)
        self.assertFalse(mock_boot_dev.called)

    def test_check_boot_status_maintenance(self, mock_power, mock_boot_dev):
        self.node.maintenance = True
        self.node.save()
        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=True) as task:
            task.driver.boot._check_boot_status(task)
            self.assertFalse(task.shared)
        self.assertFalse(mock_power.called)
        self.assertFalse(mock_boot_dev.called)

    def test_check_boot_status_wrong_state(self, mock_power, mock_boot_dev):
        self.node.provision_state = states.DEPLOYING
        self.node.save()
        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=True) as task:
            task.driver.boot._check_boot_status(task)
            self.assertFalse(task.shared)
        self.assertFalse(mock_power.called)
        self.assertFalse(mock_boot_dev.called)

    def test_check_boot_status_retry(self, mock_power, mock_boot_dev):
        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=True) as task:
            task.driver.boot._check_boot_status(task)
            self.assertFalse(task.shared)
            mock_power.assert_has_calls([
                mock.call(task, states.POWER_OFF),
                mock.call(task, states.POWER_ON)
            ])
            mock_boot_dev.assert_called_once_with(task, 'pxe',
                                                  persistent=False)


class iPXEBootRetryTestCase(PXEBootRetryTestCase):

    boot_interface = 'ipxe'

View File

@@ -0,0 +1,10 @@
---
features:
  - |
    Allows retrying PXE/iPXE boot during deployment, cleaning and rescuing.
    This feature is disabled by default and can be enabled by setting
    ``[pxe]boot_retry_timeout`` to the timeout (in seconds) after which
    the boot should be retried.

    The new option ``[pxe]boot_retry_check_interval`` defines how often to
    check the nodes for timeout and defaults to 90 seconds.