Fix take over for ACTIVE nodes in PXEBoot

when taking over an ACTIVE netboot-ed node with driver based on PXEBoot,
the `PXEBoot.prepare_instance` method fails as our current logic of
generating a 'service' mode PXE config relies on editing PXE env files
that must already exist on disk but are missing from the new conductor host
in this case.

This patch forces this method to first re-create the PXE env from scratch
for ACTIVE nodes when PXE config file is missing before attempting
to change it.
It does not try to re-download the deploy ramdisk and generate options for it
though.
It also does not try to re-set boot device for ACTIVE nodes as that
should have already have been set earlier before actual transition to
ACTIVE.

In order to make the logic cleaner, this patch also refactors private methods
for building pxe options to be deploy-vs-instance oriented instead of
pxe-vs-ipxe oriented.

Change-Id: I264d5477523f57552aadf2809021a9a63fee2730
Partial-Bug: #1646038
Closes-Bug: #1516816
This commit is contained in:
Pavlo Shchelokovskyy 2016-12-08 16:37:22 +02:00
parent 8704d47eb9
commit 035ad6a60b
3 changed files with 157 additions and 68 deletions

View File

@ -129,54 +129,70 @@ def _get_deploy_image_info(node):
return pxe_utils.get_deploy_kr_info(node.uuid, d_info)
def _get_pxe_kernel_ramdisk(pxe_info):
pxe_opts = {}
pxe_opts['deployment_aki_path'] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info['deploy_kernel'][1])
pxe_opts['deployment_ari_path'] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info['deploy_ramdisk'][1])
# It is possible that we don't have kernel/ramdisk or even
# image_source to determine if it's a whole disk image or not.
# For example, when transitioning to 'available' state for first
# time from 'manage' state.
if 'kernel' in pxe_info:
pxe_opts['aki_path'] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info['kernel'][1])
if 'ramdisk' in pxe_info:
pxe_opts['ari_path'] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info['ramdisk'][1])
return pxe_opts
def _get_ipxe_kernel_ramdisk(task, pxe_info):
def _build_deploy_pxe_options(task, pxe_info):
pxe_opts = {}
node = task.node
for label, option in (('deploy_kernel', 'deployment_aki_path'),
('deploy_ramdisk', 'deployment_ari_path')):
image_href = pxe_info[label][0]
if (CONF.pxe.ipxe_use_swift and
service_utils.is_glance_image(image_href)):
pxe_opts[option] = images.get_temp_url_for_glance_image(
task.context, image_href)
if CONF.pxe.ipxe_enabled:
image_href = pxe_info[label][0]
if (CONF.pxe.ipxe_use_swift and
service_utils.is_glance_image(image_href)):
pxe_opts[option] = images.get_temp_url_for_glance_image(
task.context, image_href)
else:
pxe_opts[option] = '/'.join([CONF.deploy.http_url, node.uuid,
label])
else:
pxe_opts[option] = '/'.join([CONF.deploy.http_url, node.uuid,
label])
# NOTE(pas-ha) do not use Swift TempURLs for kernel and ramdisk
# of user image when boot_option is not local,
# as this will break instance reboot later when temp urls have timed out.
if 'kernel' in pxe_info:
pxe_opts['aki_path'] = '/'.join(
[CONF.deploy.http_url, node.uuid, 'kernel'])
if 'ramdisk' in pxe_info:
pxe_opts['ari_path'] = '/'.join(
[CONF.deploy.http_url, node.uuid, 'ramdisk'])
pxe_opts[option] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info[label][1])
return pxe_opts
def _build_instance_pxe_options(task, pxe_info):
pxe_opts = {}
node = task.node
for label, option in (('kernel', 'aki_path'),
('ramdisk', 'ari_path')):
if label in pxe_info:
if CONF.pxe.ipxe_enabled:
# NOTE(pas-ha) do not use Swift TempURLs for kernel and
# ramdisk of user image when boot_option is not local,
# as this breaks instance reboot later when temp urls
# have timed out.
pxe_opts[option] = '/'.join(
[CONF.deploy.http_url, node.uuid, label])
else:
# It is possible that we don't have kernel/ramdisk or even
# image_source to determine if it's a whole disk image or not.
# For example, when transitioning to 'available' state
# for first time from 'manage' state.
pxe_opts[option] = pxe_utils.get_path_relative_to_tftp_root(
pxe_info[label][1])
# These are dummy values to satisfy elilo.
# image and initrd fields in elilo config cannot be blank.
pxe_opts.setdefault('aki_path', 'no_kernel')
pxe_opts.setdefault('ari_path', 'no_ramdisk')
return pxe_opts
def _build_pxe_config_options(task, pxe_info):
def _build_extra_pxe_options():
# Enable debug in IPA according to CONF.debug if it was not
# specified yet
pxe_append_params = CONF.pxe.pxe_append_params
if CONF.debug and 'ipa-debug' not in pxe_append_params:
pxe_append_params += ' ipa-debug=1'
return {'pxe_append_params': pxe_append_params,
'tftp_server': CONF.pxe.tftp_server,
'ipxe_timeout': CONF.pxe.ipxe_timeout * 1000}
def _build_pxe_config_options(task, pxe_info, service=False):
"""Build the PXE config options for a node
This method builds the PXE boot options for a node,
@ -187,34 +203,48 @@ def _build_pxe_config_options(task, pxe_info):
:param task: A TaskManager object
:param pxe_info: a dict of values to set on the configuration file
:param service: if True, build "service mode" pxe config for netboot-ed
user image and skip adding deployment image kernel and ramdisk info
to PXE options.
:returns: A dictionary of pxe options to be used in the pxe bootfile
template.
"""
if CONF.pxe.ipxe_enabled:
pxe_options = _get_ipxe_kernel_ramdisk(task, pxe_info)
if service:
pxe_options = {}
else:
pxe_options = _get_pxe_kernel_ramdisk(pxe_info)
pxe_options = _build_deploy_pxe_options(task, pxe_info)
# These are dummy values to satisfy elilo.
# image and initrd fields in elilo config cannot be blank.
pxe_options.setdefault('aki_path', 'no_kernel')
pxe_options.setdefault('ari_path', 'no_ramdisk')
# Enable debug in IPA according to CONF.debug if it was not
# specified yet
pxe_append_params = CONF.pxe.pxe_append_params
if CONF.debug and 'ipa-debug' not in pxe_append_params:
pxe_append_params += ' ipa-debug=1'
pxe_options.update({
'pxe_append_params': pxe_append_params,
'tftp_server': CONF.pxe.tftp_server,
'ipxe_timeout': CONF.pxe.ipxe_timeout * 1000
})
# NOTE(pas-ha) we still must always add user image kernel and ramdisk info
# as later during switching PXE config to service mode the template
# will not be regenerated anew, but instead edited as-is.
# This can be changed later if/when switching PXE config will also use
# proper templating instead of editing existing files on disk.
pxe_options.update(_build_instance_pxe_options(task, pxe_info))
pxe_options.update(_build_extra_pxe_options())
return pxe_options
def _build_service_pxe_config(task, instance_image_info,
root_uuid_or_disk_id):
node = task.node
pxe_config_path = pxe_utils.get_pxe_config_file_path(node.uuid)
# NOTE(pas-ha) if it is takeover of ACTIVE node,
# first ensure that basic PXE configs and links
# are in place before switching pxe config
if (node.provision_state == states.ACTIVE and
not os.path.isfile(pxe_config_path)):
pxe_options = _build_pxe_config_options(task, instance_image_info,
service=True)
pxe_config_template = deploy_utils.get_pxe_config_template(node)
pxe_utils.create_pxe_config(task, pxe_options, pxe_config_template)
iwdi = node.driver_internal_info.get('is_whole_disk_image')
deploy_utils.switch_pxe_config(
pxe_config_path, root_uuid_or_disk_id,
deploy_utils.get_boot_mode_for_deploy(node),
iwdi, deploy_utils.is_trusted_boot_requested(node))
@METRICS.timer('validate_boot_option_for_uefi')
def validate_boot_option_for_uefi(node):
"""In uefi boot mode, validate if the boot option is compatible.
@ -463,6 +493,7 @@ class PXEBoot(base.BootInterface):
"""
node = task.node
boot_option = deploy_utils.get_boot_option(node)
boot_device = None
if boot_option != "local":
# Make sure that the instance kernel/ramdisk is cached.
@ -496,23 +527,21 @@ class PXEBoot(base.BootInterface):
"for node %(node)s"),
{"node": task.node.uuid})
else:
pxe_config_path = pxe_utils.get_pxe_config_file_path(
task.node.uuid)
deploy_utils.switch_pxe_config(
pxe_config_path, root_uuid_or_disk_id,
deploy_utils.get_boot_mode_for_deploy(node),
iwdi, deploy_utils.is_trusted_boot_requested(node))
# In case boot mode changes from bios to uefi, boot device
# order may get lost in some platforms. Better to re-apply
# boot device.
deploy_utils.try_set_boot_device(task, boot_devices.PXE)
_build_service_pxe_config(task, instance_image_info,
root_uuid_or_disk_id)
boot_device = boot_devices.PXE
else:
# If it's going to boot from the local disk, we don't need
# PXE config files. They still need to be generated as part
# of the prepare() because the deployment does PXE boot the
# deploy ramdisk
pxe_utils.clean_up_pxe_config(task)
deploy_utils.try_set_boot_device(task, boot_devices.DISK)
boot_device = boot_devices.DISK
# NOTE(pas-ha) do not re-set boot device on ACTIVE nodes
# during takeover
if boot_device and task.node.provision_state != states.ACTIVE:
deploy_utils.try_set_boot_device(task, boot_device)
@METRICS.timer('PXEBoot.clean_up_instance')
def clean_up_instance(self, task):

View File

@ -933,6 +933,47 @@ class PXEBootTestCase(db_base.DbTestCase):
set_boot_device_mock.assert_called_once_with(task,
boot_devices.PXE)
@mock.patch('os.path.isfile', return_value=False)
@mock.patch.object(pxe_utils, 'create_pxe_config', autospec=True)
@mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True)
@mock.patch.object(deploy_utils, 'switch_pxe_config', autospec=True)
@mock.patch.object(dhcp_factory, 'DHCPFactory', autospec=True)
@mock.patch.object(pxe, '_cache_ramdisk_kernel', autospec=True)
@mock.patch.object(pxe, '_get_instance_image_info', autospec=True)
def test_prepare_instance_netboot_active(
self, get_image_info_mock, cache_mock,
dhcp_factory_mock, switch_pxe_config_mock,
set_boot_device_mock, create_pxe_config_mock, isfile_mock):
provider_mock = mock.MagicMock()
dhcp_factory_mock.return_value = provider_mock
image_info = {'kernel': ('', '/path/to/kernel'),
'ramdisk': ('', '/path/to/ramdisk')}
get_image_info_mock.return_value = image_info
self.node.provision_state = states.ACTIVE
self.node.save()
with task_manager.acquire(self.context, self.node.uuid) as task:
dhcp_opts = pxe_utils.dhcp_options_for_instance(task)
pxe_config_path = pxe_utils.get_pxe_config_file_path(
task.node.uuid)
task.node.properties['capabilities'] = 'boot_mode:bios'
task.node.driver_internal_info['root_uuid_or_disk_id'] = (
"30212642-09d3-467f-8e09-21685826ab50")
task.node.driver_internal_info['is_whole_disk_image'] = False
task.driver.boot.prepare_instance(task)
get_image_info_mock.assert_called_once_with(
task.node, task.context)
cache_mock.assert_called_once_with(
task.context, task.node, image_info)
provider_mock.update_dhcp.assert_called_once_with(task, dhcp_opts)
create_pxe_config_mock.assert_called_once_with(
task, mock.ANY, CONF.pxe.pxe_config_template)
switch_pxe_config_mock.assert_called_once_with(
pxe_config_path, "30212642-09d3-467f-8e09-21685826ab50",
'bios', False, False)
self.assertFalse(set_boot_device_mock.called)
@mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True)
@mock.patch.object(deploy_utils, 'switch_pxe_config', autospec=True)
@mock.patch.object(dhcp_factory, 'DHCPFactory')
@ -973,6 +1014,18 @@ class PXEBootTestCase(db_base.DbTestCase):
set_boot_device_mock.assert_called_once_with(task,
boot_devices.DISK)
@mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True)
@mock.patch.object(pxe_utils, 'clean_up_pxe_config', autospec=True)
def test_prepare_instance_localboot_active(self, clean_up_pxe_config_mock,
set_boot_device_mock):
self.node.provision_state = states.ACTIVE
self.node.save()
with task_manager.acquire(self.context, self.node.uuid) as task:
task.node.instance_info['capabilities'] = {'boot_option': 'local'}
task.driver.boot.prepare_instance(task)
clean_up_pxe_config_mock.assert_called_once_with(task)
self.assertFalse(set_boot_device_mock.called)
@mock.patch.object(pxe, '_clean_up_pxe_env', autospec=True)
@mock.patch.object(pxe, '_get_instance_image_info', autospec=True)
def test_clean_up_instance(self, get_image_info_mock,

View File

@ -0,0 +1,7 @@
---
fixes:
- |
PXEBoot driver interface now correctly supports node take-over
for netboot-ed nodes in ACTIVE state.
During take-over, the PXE environment is first created anew before
attempting to switch it to "service mode".