Allow retrying PXE boot if it takes too long

PXE is inherently unreliable and sometimes times out without an
obvious reason. This happens particularly often in resource-constrained
environments, such as the CI. This change allows an operator to
set a timeout after which the boot is retried.

The _add_nodes_filters call had to be refactored to avoid hitting
the complexity limit.

Change-Id: I34a11f52e8e98e5b64f2d21f7190468a9e4b030d
Story: #2005167
Task: #29901
Dmitry Tantsur 2019-09-19 14:21:58 +02:00
parent 9fa39045da
commit b2834e6661
9 changed files with 279 additions and 56 deletions
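
Before the diff itself, a rough standalone sketch of the decision this change teaches the conductor (illustration only, not Ironic code; should_retry_boot and the in-memory node_info dict are made up here, and timestamps are simplified to datetime objects instead of the ISO strings the real code stores):

import datetime

def should_retry_boot(node_info, timeout, now):
    # Simplified stand-in for the _should_retry_boot() helper added below.
    for event in ('agent_last_heartbeat', 'last_power_state_change'):
        last = node_info.get(event)
        if last is not None and (now - last).total_seconds() < timeout:
            return False  # recent activity: the node is alive, just slow
    return True  # nothing happened within the timeout: power-cycle back to PXE

now = datetime.datetime.utcnow()
stale = {'agent_last_heartbeat': now - datetime.timedelta(seconds=900)}
print(should_retry_boot(stale, 600, now))  # True: 900s of silence > 600s timeout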

View File

@@ -88,9 +88,19 @@ IRONIC_USE_MOD_WSGI=$(trueorfalse $ENABLE_HTTPD_MOD_WSGI_SERVICES IRONIC_USE_MOD
# Defaults to the (now confusingly named) IRONIC_USE_MOD_WSGI for backward compat
IRONIC_USE_WSGI=$(trueorfalse $IRONIC_USE_MOD_WSGI IRONIC_USE_WSGI)
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# Deploy callback timeout can be changed from its default (1800), if required.
IRONIC_CALLBACK_TIMEOUT=${IRONIC_CALLBACK_TIMEOUT:-}
# Timeout before retrying PXE boot. Set low to help the CI.
if [[ "$IRONIC_IS_HARDWARE" == False ]]; then
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-600}
else
IRONIC_PXE_BOOT_RETRY_TIMEOUT=${IRONIC_PXE_BOOT_RETRY_TIMEOUT:-}
fi
# Ping timeout after the node becomes active
IRONIC_PING_TIMEOUT=${IRONIC_PING_TIMEOUT:-}
@@ -327,9 +337,6 @@ IRONIC_HTTP_PORT=${IRONIC_HTTP_PORT:-3928}
IRONIC_RPC_TRANSPORT=${IRONIC_RPC_TRANSPORT:-oslo}
IRONIC_JSON_RPC_PORT=${IRONIC_JSON_RPC_PORT:-8089}
# Whether DevStack will be setup for bare metal or VMs
IRONIC_IS_HARDWARE=$(trueorfalse False IRONIC_IS_HARDWARE)
# The first port in the range to bind the Virtual BMCs. The number of
# ports that will be used depends on $IRONIC_VM_COUNT variable, e.g if
# $IRONIC_VM_COUNT=3 the ports 6230, 6231 and 6232 will be used for the
@@ -1475,6 +1482,9 @@ function configure_ironic_conductor {
iniset $IRONIC_CONF_FILE pxe tftp_server $IRONIC_TFTPSERVER_IP
iniset $IRONIC_CONF_FILE pxe tftp_root $IRONIC_TFTPBOOT_DIR
iniset $IRONIC_CONF_FILE pxe tftp_master_path $IRONIC_TFTPBOOT_DIR/master_images
if [[ -n "$IRONIC_PXE_BOOT_RETRY_TIMEOUT" ]]; then
iniset $IRONIC_CONF_FILE pxe boot_retry_timeout $IRONIC_PXE_BOOT_RETRY_TIMEOUT
fi
if [[ "$IRONIC_TERMINAL_SSL" == "True" ]]; then
# Make sure the cert directory exist
sudo mkdir -p $IRONIC_TERMINAL_CERT_DIR

View File

@@ -272,6 +272,10 @@ def node_power_action(task, new_state, timeout=None):
if node['target_power_state'] != target_state:
node['target_power_state'] = target_state
node['last_error'] = None
driver_internal_info = node.driver_internal_info
driver_internal_info['last_power_state_change'] = str(
timeutils.utcnow().isoformat())
node.driver_internal_info = driver_internal_info
node.save()
# take power action
@@ -866,6 +870,22 @@ def fast_track_able(task):
and task.node.last_error is None)
def value_within_timeout(value, timeout):
"""Checks if the time is within the previous timeout seconds from now.
:param value: a string representing date and time or None.
:param timeout: timeout in seconds.
"""
# use native datetime objects for conversion and compare
# slightly odd because of py2 compatibility :(
last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
"%Y-%m-%dT%H:%M:%S.%f")
# If we found nothing, we assume that the time is essentially epoch.
time_delta = datetime.timedelta(seconds=timeout)
last_valid = timeutils.utcnow() - time_delta
return last_valid <= last
def is_fast_track(task):
"""Checks a fast track is available.
@@ -882,19 +902,8 @@ def is_fast_track(task):
:returns: True if the last heartbeat that was recorded was within
the [deploy]fast_track_timeout setting.
"""
if not fast_track_able(task):
return False
# use native datetime objects for conversion and compare
# slightly odd because py2 compatability :(
last = datetime.datetime.strptime(
task.node.driver_internal_info.get(
'agent_last_heartbeat',
'1970-01-01T00:00:00.000000'),
"%Y-%m-%dT%H:%M:%S.%f")
# If we found nothing, we assume that the time is essentially epoch.
time_delta = datetime.timedelta(seconds=CONF.deploy.fast_track_timeout)
last_valid = timeutils.utcnow() - time_delta
# Checking the power state, because if we find the machine off due to
# any action, we can't actually fast track the node. :(
return (last_valid <= last
return (fast_track_able(task)
and value_within_timeout(
task.node.driver_internal_info.get('agent_last_heartbeat'),
CONF.deploy.fast_track_timeout)
and task.driver.power.get_power_state(task) == states.POWER_ON)
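
The value_within_timeout() helper above is reused by the PXE retry logic later in this change. A standalone sketch of its semantics, using plain datetime instead of oslo.utils so it runs on its own (not the Ironic function itself):

import datetime

def within_timeout(value, timeout):
    # A missing timestamp falls back to the epoch and is therefore never recent.
    last = datetime.datetime.strptime(value or '1970-01-01T00:00:00.000000',
                                      '%Y-%m-%dT%H:%M:%S.%f')
    return datetime.datetime.utcnow() - datetime.timedelta(seconds=timeout) <= last

print(within_timeout(datetime.datetime.utcnow().isoformat(), 300))  # True: recent
print(within_timeout(None, 300))  # False: no record, treated as epoch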

View File

@@ -126,6 +126,16 @@ opts = [
default=0,
help=_('Timeout value (in seconds) for downloading an image '
'via iPXE. Defaults to 0 (no timeout)')),
cfg.IntOpt('boot_retry_timeout',
min=60,
help=_('Timeout (in seconds) after which PXE boot should be '
'retried. Must be less than [conductor]'
'deploy_callback_timeout. Disabled by default.')),
cfg.IntOpt('boot_retry_check_interval',
default=90, min=1,
help=_('Interval (in seconds) between periodic checks on PXE '
'boot retry. Has no effect if boot_retry_timeout '
'is not set.')),
cfg.StrOpt('ip_version',
default='4',
choices=[('4', _('IPv4')),

View File

@@ -279,20 +279,28 @@ def _zip_matching(a, b, key):
class Connection(api.Connection):
"""SqlAlchemy connection."""
_NODE_QUERY_FIELDS = {'console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'fault', 'conductor_group', 'owner'}
_NODE_IN_QUERY_FIELDS = {'%s_in' % field: field
for field in ('uuid', 'provision_state')}
_NODE_NON_NULL_FILTERS = {'associated': 'instance_uuid',
'reserved': 'reservation',
'with_power_state': 'power_state'}
_NODE_FILTERS = ({'chassis_uuid', 'reserved_by_any_of',
'provisioned_before', 'inspection_started_before',
'description_contains'}
| _NODE_QUERY_FIELDS
| set(_NODE_IN_QUERY_FIELDS)
| set(_NODE_NON_NULL_FILTERS))
def __init__(self):
pass
def _validate_nodes_filters(self, filters):
if filters is None:
filters = dict()
supported_filters = {'console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'chassis_uuid', 'associated', 'reserved',
'reserved_by_any_of', 'provisioned_before',
'inspection_started_before', 'fault',
'conductor_group', 'owner', 'uuid_in',
'with_power_state', 'description_contains'}
unsupported_filters = set(filters).difference(supported_filters)
unsupported_filters = set(filters).difference(self._NODE_FILTERS)
if unsupported_filters:
msg = _("SqlAlchemy API does not support "
"filtering by %s") % ', '.join(unsupported_filters)
@@ -301,26 +309,26 @@ class Connection(api.Connection):
def _add_nodes_filters(self, query, filters):
filters = self._validate_nodes_filters(filters)
for field in ['console_enabled', 'maintenance', 'driver',
'resource_class', 'provision_state', 'uuid', 'id',
'fault', 'conductor_group', 'owner']:
for field in self._NODE_QUERY_FIELDS:
if field in filters:
query = query.filter_by(**{field: filters[field]})
for key, field in self._NODE_IN_QUERY_FIELDS.items():
if key in filters:
query = query.filter(
getattr(models.Node, field).in_(filters[key]))
for key, field in self._NODE_NON_NULL_FILTERS.items():
if key in filters:
column = getattr(models.Node, field)
if filters[key]:
query = query.filter(column != sql.null())
else:
query = query.filter(column == sql.null())
if 'chassis_uuid' in filters:
# get_chassis_by_uuid() will raise an exception if the chassis
# is not found
chassis_obj = self.get_chassis_by_uuid(filters['chassis_uuid'])
query = query.filter_by(chassis_id=chassis_obj.id)
if 'associated' in filters:
if filters['associated']:
query = query.filter(models.Node.instance_uuid != sql.null())
else:
query = query.filter(models.Node.instance_uuid == sql.null())
if 'reserved' in filters:
if filters['reserved']:
query = query.filter(models.Node.reservation != sql.null())
else:
query = query.filter(models.Node.reservation == sql.null())
if 'reserved_by_any_of' in filters:
query = query.filter(models.Node.reservation.in_(
filters['reserved_by_any_of']))
@@ -334,13 +342,6 @@
- (datetime.timedelta(
seconds=filters['inspection_started_before'])))
query = query.filter(models.Node.inspection_started_at < limit)
if 'uuid_in' in filters:
query = query.filter(models.Node.uuid.in_(filters['uuid_in']))
if 'with_power_state' in filters:
if filters['with_power_state']:
query = query.filter(models.Node.power_state != sql.null())
else:
query = query.filter(models.Node.power_state == sql.null())
if 'description_contains' in filters:
keyword = filters['description_contains']
if keyword is not None:
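
For readers less familiar with the pattern, here is a toy in-memory illustration of the data-driven filtering that replaces the long if-chain above (not the SQLAlchemy code; the field tables are trimmed down and plain dicts stand in for node rows):

QUERY_FIELDS = {'driver', 'provision_state'}
IN_QUERY_FIELDS = {'%s_in' % f: f for f in ('uuid', 'provision_state')}
NON_NULL_FILTERS = {'associated': 'instance_uuid', 'reserved': 'reservation'}

def apply_filters(rows, filters):
    # Equality filters: the field must match exactly.
    for field in QUERY_FIELDS:
        if field in filters:
            rows = [r for r in rows if r.get(field) == filters[field]]
    # "<field>_in" filters: the field must be one of the given values.
    for key, field in IN_QUERY_FIELDS.items():
        if key in filters:
            rows = [r for r in rows if r.get(field) in filters[key]]
    # Non-NULL filters: a boolean selects rows where the column is (not) set.
    for key, field in NON_NULL_FILTERS.items():
        if key in filters:
            wanted = bool(filters[key])
            rows = [r for r in rows if (r.get(field) is not None) == wanted]
    return rows

nodes = [{'uuid': 'a', 'driver': 'ipmi', 'provision_state': 'deploy wait',
          'reservation': None, 'instance_uuid': None},
         {'uuid': 'b', 'driver': 'ipmi', 'provision_state': 'active',
          'reservation': 'conductor-1', 'instance_uuid': 'inst-1'}]
print(apply_filters(nodes, {'driver': 'ipmi', 'reserved': False}))  # node 'a' only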

View File

@@ -17,7 +17,6 @@ PXE Boot Interface
from ironic_lib import metrics_utils
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import dhcp_factory
@@ -174,14 +173,7 @@ class PXEBoot(pxe_base.PXEBaseMixin, base.BootInterface):
pxe_config_template,
ipxe_enabled=CONF.pxe.ipxe_enabled)
persistent = False
value = node.driver_info.get('force_persistent_boot_device',
'Default')
if value in {'Always', 'Default', 'Never'}:
if value == 'Always':
persistent = True
else:
persistent = strutils.bool_from_string(value, False)
persistent = self._persistent_ramdisk_boot(node)
manager_utils.node_set_boot_device(task, boot_devices.PXE,
persistent=persistent)

View File

@@ -13,13 +13,24 @@
Base PXE Interface Methods
"""
from futurist import periodics
from ironic_lib import metrics_utils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import strutils
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import pxe_utils
from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.drivers.modules import deploy_utils
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
METRICS = metrics_utils.get_metrics_logger(__name__)
@@ -99,3 +110,82 @@ class PXEBaseMixin(object):
parameters
"""
pxe_utils.parse_driver_info(task.node, mode='rescue')
def _persistent_ramdisk_boot(self, node):
"""If the ramdisk should be configured as a persistent boot device."""
value = node.driver_info.get('force_persistent_boot_device', 'Default')
if value in {'Always', 'Default', 'Never'}:
return value == 'Always'
else:
return strutils.bool_from_string(value, False)
_RETRY_ALLOWED_STATES = {states.DEPLOYWAIT, states.CLEANWAIT,
states.RESCUEWAIT}
@METRICS.timer('PXEBaseMixin._check_boot_timeouts')
@periodics.periodic(spacing=CONF.pxe.boot_retry_check_interval,
enabled=bool(CONF.pxe.boot_retry_timeout))
def _check_boot_timeouts(self, manager, context):
"""Periodically checks whether boot has timed out and retry it.
:param manager: conductor manager.
:param context: request context.
"""
filters = {'provision_state_in': self._RETRY_ALLOWED_STATES,
'reserved': False,
'maintenance': False,
'provisioned_before': CONF.pxe.boot_retry_timeout}
node_iter = manager.iter_nodes(filters=filters)
for node_uuid, driver, conductor_group in node_iter:
try:
lock_purpose = 'checking PXE boot status'
with task_manager.acquire(context, node_uuid,
shared=True,
purpose=lock_purpose) as task:
self._check_boot_status(task)
except (exception.NodeLocked, exception.NodeNotFound):
continue
def _check_boot_status(self, task):
if not isinstance(task.driver.boot, PXEBaseMixin):
return
if not _should_retry_boot(task.node):
return
task.upgrade_lock(purpose='retrying PXE boot')
# Retry critical checks after acquiring the exclusive lock.
if (task.node.maintenance or task.node.provision_state
not in self._RETRY_ALLOWED_STATES
or not _should_retry_boot(task.node)):
return
LOG.info('Booting the ramdisk on node %(node)s is taking more than '
'%(timeout)d seconds, retrying boot',
{'node': task.node.uuid,
'timeout': CONF.pxe.boot_retry_timeout})
manager_utils.node_power_action(task, states.POWER_OFF)
# NOTE(dtantsur): retry even persistent boot setting in case it did not
# work for some reason.
persistent = self._persistent_ramdisk_boot(task.node)
manager_utils.node_set_boot_device(task, boot_devices.PXE,
persistent=persistent)
manager_utils.node_power_action(task, states.POWER_ON)
def _should_retry_boot(node):
# NOTE(dtantsur): this assumes IPA, do we need to make it generic?
for field in ('agent_last_heartbeat', 'last_power_state_change'):
if manager_utils.value_within_timeout(
node.driver_internal_info.get(field),
CONF.pxe.boot_retry_timeout):
# Alive and heartbeating, probably busy with something long
LOG.debug('Not retrying PXE boot for node %(node)s; its '
'%(event)s happened less than %(timeout)d seconds ago',
{'node': node.uuid, 'event': field,
'timeout': CONF.pxe.boot_retry_timeout})
return False
return True
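
The _check_boot_timeouts() method above leans on the futurist periodic-task machinery the conductor already uses: the decorator only marks the method, and a worker discovers and schedules it. A minimal sketch of that mechanism under those assumptions (toy spacing and body; BootChecker is made up and this is not the conductor wiring):

import threading
import time

from futurist import periodics

class BootChecker(object):

    @periodics.periodic(spacing=2, enabled=True)
    def check_boot(self):
        # The real task iterates over nodes and retries PXE boot for any node
        # past [pxe]boot_retry_timeout; here we only print a message.
        print('checking for timed-out PXE boots')

worker = periodics.PeriodicWorker.create([BootChecker()])
thread = threading.Thread(target=worker.start)
thread.start()
time.sleep(5)   # let the periodic fire a couple of times
worker.stop()
thread.join()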

View File

@@ -231,7 +231,8 @@ class DbNodeTestCase(base.DbTestCase):
# node with provision_updated timeout
node1 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
provision_updated_at=past)
provision_updated_at=past,
provision_state=states.DEPLOYING)
# node with None in provision_updated_at
node2 = utils.create_test_node(uuid=uuidutils.generate_uuid(),
provision_state=states.DEPLOYWAIT)
@@ -247,6 +248,10 @@ states.DEPLOYWAIT})
states.DEPLOYWAIT})
self.assertEqual([node2.id], [r[0] for r in res])
res = self.dbapi.get_nodeinfo_list(
filters={'provision_state_in': [states.ACTIVE, states.DEPLOYING]})
self.assertEqual([node1.id], [r[0] for r in res])
@mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_nodeinfo_list_inspection(self, mock_utcnow):
past = datetime.datetime(2000, 1, 1, 0, 0)

View File

@@ -21,6 +21,7 @@ import tempfile
import mock
from oslo_config import cfg
from oslo_serialization import jsonutils as json
from oslo_utils import timeutils
from oslo_utils import uuidutils
from ironic.common import boot_devices
@@ -1162,3 +1163,98 @@ class PXEValidateRescueTestCase(db_base.DbTestCase):
self.assertRaisesRegex(exception.MissingParameterValue,
'Missing.*rescue_kernel',
task.driver.boot.validate_rescue, task)
@mock.patch.object(ipxe.iPXEBoot, '__init__', lambda self: None)
@mock.patch.object(pxe.PXEBoot, '__init__', lambda self: None)
@mock.patch.object(manager_utils, 'node_set_boot_device', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
class PXEBootRetryTestCase(db_base.DbTestCase):
boot_interface = 'pxe'
def setUp(self):
super(PXEBootRetryTestCase, self).setUp()
self.config(enabled_boot_interfaces=['pxe', 'ipxe', 'fake'])
self.config(boot_retry_timeout=300, group='pxe')
self.node = obj_utils.create_test_node(
self.context,
driver='fake-hardware',
boot_interface=self.boot_interface,
provision_state=states.DEPLOYWAIT)
@mock.patch.object(pxe.PXEBoot, '_check_boot_status', autospec=True)
def test_check_boot_timeouts(self, mock_check_status, mock_power,
mock_boot_dev):
def _side_effect(iface, task):
self.assertEqual(self.node.uuid, task.node.uuid)
mock_check_status.side_effect = _side_effect
manager = mock.Mock(spec=['iter_nodes'])
manager.iter_nodes.return_value = [
(uuidutils.generate_uuid(), 'fake-hardware', ''),
(self.node.uuid, self.node.driver, self.node.conductor_group)
]
iface = pxe.PXEBoot()
iface._check_boot_timeouts(manager, self.context)
mock_check_status.assert_called_once_with(iface, mock.ANY)
def test_check_boot_status_another_boot_interface(self, mock_power,
mock_boot_dev):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot = fake.FakeBoot()
pxe.PXEBoot()._check_boot_status(task)
self.assertTrue(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_recent_power_change(self, mock_power,
mock_boot_dev):
for field in ('agent_last_heartbeat', 'last_power_state_change'):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.node.driver_internal_info = {
field: str(timeutils.utcnow().isoformat())
}
task.driver.boot._check_boot_status(task)
self.assertTrue(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_maintenance(self, mock_power, mock_boot_dev):
self.node.maintenance = True
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_wrong_state(self, mock_power, mock_boot_dev):
self.node.provision_state = states.DEPLOYING
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
self.assertFalse(mock_power.called)
self.assertFalse(mock_boot_dev.called)
def test_check_boot_status_retry(self, mock_power, mock_boot_dev):
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.driver.boot._check_boot_status(task)
self.assertFalse(task.shared)
mock_power.assert_has_calls([
mock.call(task, states.POWER_OFF),
mock.call(task, states.POWER_ON)
])
mock_boot_dev.assert_called_once_with(task, 'pxe',
persistent=False)
class iPXEBootRetryTestCase(PXEBootRetryTestCase):
boot_interface = 'ipxe'

View File

@@ -0,0 +1,10 @@
---
features:
- |
Allows retrying PXE/iPXE boot during deployment, cleaning and rescuing.
This feature is disabled by default and can be enabled by setting
``[pxe]boot_retry_timeout`` to the timeout (in seconds) after which
the boot should be retried.
The new option ``[pxe]boot_retry_check_interval`` defines how often to
check the nodes for timeout and defaults to 90 seconds.
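
As a quick illustration of how the options fit together, a hedged sketch using oslo.config directly (the option names, bounds and defaults come from this change; a real deployment would simply set the values in the [pxe] section of ironic.conf, as the DevStack change above does via iniset):

from oslo_config import cfg

CONF = cfg.ConfigOpts()
CONF.register_opts(
    [cfg.IntOpt('boot_retry_timeout', min=60),
     cfg.IntOpt('boot_retry_check_interval', default=90, min=1)],
    group='pxe')
CONF([])  # no command line or config files, defaults only

CONF.set_override('boot_retry_timeout', 600, group='pxe')
print(CONF.pxe.boot_retry_timeout)          # 600: retry the boot after 10 minutes
print(CONF.pxe.boot_retry_check_interval)   # 90: how often the check runs

With the CI values used above (a 600-second timeout checked every 90 seconds), a stuck node is power-cycled somewhere between 600 and 690 seconds after its last recorded activity, comfortably within the default 1800-second deploy callback timeout.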