preserve/handle config drives on 4k block devices

When an underlying block device (or driver) only supports 4KB IO,
this can cause some issues with aspects like using an ISO9660 filesystem
which can only support a maximum of 2KB IO.

The agent will now attempt to mount the filesystem *before* deleting the
supplied file, and should that fail it will mount the configuration drive
file from the ramdisk utilizing a loopback, and then extract the contents
of the ramdisk into a newly created VFAT filesystem which supports 4KB
block IO.

Closes-Bug: #2028002
Change-Id: I336acb8e8eb5a02dde2f5e24c258e23797d200ee
This commit is contained in:
Julia Kreger 2023-07-18 11:01:22 -07:00
parent 5ed520df89
commit b6c263a5dc
4 changed files with 340 additions and 4 deletions

View File

@ -338,6 +338,11 @@ cli_opts = [
help='Time in seconds to wait for an HTTP request TCP socket ' help='Time in seconds to wait for an HTTP request TCP socket '
'used by an API request to a remote service to enter ' 'used by an API request to a remote service to enter '
'a state where a request can be transmitted.'), 'a state where a request can be transmitted.'),
cfg.BoolOpt('config_drive_rebuild',
default=False,
help='If the agent should rebuild the configuration drive '
'using a local filesystem, instead of letting Ironic '
'determine if this action is necessary.'),
] ]
CONF.register_cli_opts(cli_opts) CONF.register_cli_opts(cli_opts)

View File

@ -465,12 +465,27 @@ def create_config_drive_partition(node_uuid, device, configdrive):
{'part': config_drive_part, 'node': node_uuid}) {'part': config_drive_part, 'node': node_uuid})
utils.execute('test', '-e', config_drive_part, attempts=15, utils.execute('test', '-e', config_drive_part, attempts=15,
delay_on_retry=True) delay_on_retry=True)
if not CONF.config_drive_rebuild:
disk_utils.dd(confdrive_file, config_drive_part) disk_utils.dd(confdrive_file, config_drive_part)
if not _does_config_drive_work(config_drive_part):
# If we have reached this point, we might have an
# invalid configuration drive, OR the block device
# layer doesn't support 2K block Logical IO (iso9660)
_try_build_fat32_config_drive(config_drive_part,
confdrive_file)
else:
LOG.info('Extracting configuration drive to write copy to disk.')
_try_build_fat32_config_drive(config_drive_part, confdrive_file)
LOG.info("Configdrive for node %(node)s successfully " LOG.info("Configdrive for node %(node)s successfully "
"copied onto partition %(part)s", "copied onto partition %(part)s",
{'node': node_uuid, 'part': config_drive_part}) {'node': node_uuid, 'part': config_drive_part})
except exception.InstanceDeployFailure:
# Since we no longer have a final action on the decorator, we need
# to catch the failure, and still perform the cleanup.
if confdrive_file:
utils.unlink_without_raise(confdrive_file)
raise
except (processutils.UnknownArgumentError, except (processutils.UnknownArgumentError,
processutils.ProcessExecutionError, OSError) as e: processutils.ProcessExecutionError, OSError) as e:
msg = ('Failed to create config drive on disk %(disk)s ' msg = ('Failed to create config drive on disk %(disk)s '
@ -478,13 +493,90 @@ def create_config_drive_partition(node_uuid, device, configdrive):
{'disk': device, 'node': node_uuid, 'error': e}) {'disk': device, 'node': node_uuid, 'error': e})
LOG.error(msg) LOG.error(msg)
raise exception.InstanceDeployFailure(msg) raise exception.InstanceDeployFailure(msg)
finally:
# If the configdrive was requested make sure we delete the file # If the configdrive was requested make sure we delete the file
# after copying the content to the partition # after copying the content to the partition
finally:
if confdrive_file: if confdrive_file:
utils.unlink_without_raise(confdrive_file) utils.unlink_without_raise(confdrive_file)
def _does_config_drive_work(config_drive_part):
"""Attempts to mount the config drive to validate it works.
:param config_drive_part: The partition to which the configuration drive
was written.
:returns: True if we were able to mount the configuration drive partition.
"""
temp_folder = tempfile.mkdtemp()
try:
# Why: If the filesystem is ISO9660 or vfat, and the logical sector
# size which is supported is *not* something which supports 512 bytes,
# i.e. a 4k Block size, then ISO9660 just will not work. Vfat also
# will not work because the logical size needs to match the logical
# size which is usable. If the underlying driver cannot use that size,
# then the filesystem will not work and cannot be updated because
# structurally it is incompaible with the block device driver.
utils.execute('mount', '-o', 'ro', '-t', 'auto', config_drive_part,
temp_folder)
utils.execute('umount', temp_folder)
except (processutils.ProcessExecutionError, OSError) as e:
LOG.error('Encountered issue attempting to validate the '
'supplied configuration drive. Error: %s', e)
return False
finally:
utils.unlink_without_raise(temp_folder)
return True
def _try_build_fat32_config_drive(partition, confdrive_file):
conf_drive_temp = tempfile.mkdtemp()
try:
utils.execute('mount', '-o', 'loop,ro', '-t', 'auto',
confdrive_file, conf_drive_temp)
except (processutils.ProcessExecutionError, OSError) as e:
# Config drive is invalid, at least to our point of view.
# Bailing.
LOG.warning('We were unable to examine the configuration drive, '
'bypassing. Error: %s', e)
return
new_drive_temp = tempfile.mkdtemp()
try:
# While creating a config drive file from scratch or on
# a loopback will likely result in a 512 byte sector size,
# the underlying fat filesystem utilities *automatically*
# check the device block sector sizing. This *will* break
# above 4k blocks, or at least might. Officially, 4k is the
# *maximum* in the FAT standard. See:
# https://github.com/dosfstools/dosfstools/blame/c483196dd46eab22abba756cef511d36f5f42070/src/mkfs.fat.c#L1987
utils.mkfs(fs='vfat', path=partition, label='CONFIG-2')
utils.execute('mount', '-t', 'auto', partition, new_drive_temp)
# copytree, using copy2, copies everything in the source folder
# into the destination folder, so we should be good, and metadata
# is attempted to be preserved.
shutil.copytree(conf_drive_temp, new_drive_temp, dirs_exist_ok=True)
except (processutils.ProcessExecutionError, OSError) as e:
# We failed to make the filesystem :(
# This is a fairly hard error as we could not use the
# config drive, nor could we recover the state.
LOG.error('We were unable to make a new filesystem for the '
'configuration drive. Error: %s', e)
msg = ('A failure occured while attempting to format, copy, and '
're-create the configuration drive in a structure which '
'is compatible with the underlying hardware and Operating '
'System. Due to the nature of configuration drive, it could '
'have been incorrectly formatted. Operator investigation is '
'required. Error: {}'.format(str(e)))
raise exception.InstanceDeployFailure(msg)
finally:
utils.execute('umount', conf_drive_temp)
utils.execute('umount', new_drive_temp)
utils.unlink_without_raise(new_drive_temp)
utils.unlink_without_raise(conf_drive_temp)
def _is_disk_larger_than_max_size(device, node_uuid): def _is_disk_larger_than_max_size(device, node_uuid):
"""Check if total disk size exceeds 2TB msdos limit """Check if total disk size exceeds 2TB msdos limit

View File

@ -20,6 +20,7 @@ from ironic_lib import disk_utils
from ironic_lib import exception from ironic_lib import exception
from ironic_lib import utils from ironic_lib import utils
from oslo_concurrency import processutils from oslo_concurrency import processutils
from oslo_config import cfg
import requests import requests
from ironic_python_agent import errors from ironic_python_agent import errors
@ -28,6 +29,9 @@ from ironic_python_agent import partition_utils
from ironic_python_agent.tests.unit import base from ironic_python_agent.tests.unit import base
CONF = cfg.CONF
@mock.patch.object(shutil, 'copyfileobj', autospec=True) @mock.patch.object(shutil, 'copyfileobj', autospec=True)
@mock.patch.object(requests, 'get', autospec=True) @mock.patch.object(requests, 'get', autospec=True)
class GetConfigdriveTestCase(base.IronicAgentTest): class GetConfigdriveTestCase(base.IronicAgentTest):
@ -649,7 +653,10 @@ class CreateConfigDriveTestCases(base.IronicAgentTest):
self.config_part_label, self.config_part_label,
self.node_uuid) self.node_uuid)
self.assertFalse(mock_list_partitions.called) self.assertFalse(mock_list_partitions.called)
self.assertFalse(mock_execute.called) mock_execute.assert_has_calls([
mock.call('mount', '-o', 'ro', '-t', 'auto',
'/dev/fake-part1', mock.ANY),
mock.call('umount', mock.ANY)])
self.assertFalse(mock_table_type.called) self.assertFalse(mock_table_type.called)
mock_dd.assert_called_with(configdrive_file, configdrive_part) mock_dd.assert_called_with(configdrive_file, configdrive_part)
mock_unlink.assert_called_with(configdrive_file) mock_unlink.assert_called_with(configdrive_file)
@ -706,6 +713,135 @@ class CreateConfigDriveTestCases(base.IronicAgentTest):
mock_dd.assert_called_with(configdrive_file, expected_part) mock_dd.assert_called_with(configdrive_file, expected_part)
mock_unlink.assert_called_with(configdrive_file) mock_unlink.assert_called_with(configdrive_file)
@mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid')
@mock.patch.object(partition_utils, '_try_build_fat32_config_drive',
autospec=True)
@mock.patch.object(partition_utils, '_does_config_drive_work',
autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(utils, 'unlink_without_raise',
autospec=True)
@mock.patch.object(disk_utils, 'dd',
autospec=True)
@mock.patch.object(disk_utils, 'fix_gpt_partition',
autospec=True)
@mock.patch.object(disk_utils, 'get_partition_table_type',
autospec=True)
@mock.patch.object(partition_utils, 'get_partition',
autospec=True)
@mock.patch.object(partition_utils, 'get_labelled_partition',
autospec=True)
@mock.patch.object(partition_utils, 'get_configdrive',
autospec=True)
def test_create_partition_gpt_with_fallback(
self, mock_get_configdrive,
mock_get_labelled_partition,
mock_get_partition_by_uuid,
mock_table_type,
mock_fix_gpt_partition,
mock_dd, mock_unlink, mock_execute,
mock_config_drive_work,
mock_rebuild_config_drive):
config_url = 'http://1.2.3.4/cd'
configdrive_file = '/tmp/xyz'
configdrive_mb = 10
mock_get_configdrive.return_value = (configdrive_mb, configdrive_file)
mock_get_labelled_partition.return_value = None
mock_table_type.return_value = 'gpt'
expected_part = '/dev/fake4'
mock_get_partition_by_uuid.return_value = expected_part
mock_config_drive_work.return_value = False
partition_utils.create_config_drive_partition(self.node_uuid, self.dev,
config_url)
mock_execute.assert_has_calls([
mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid',
self.dev, run_as_root=True),
mock.call('sync'),
mock.call('udevadm', 'settle'),
mock.call('partprobe', self.dev, attempts=10, run_as_root=True),
mock.call('sgdisk', '-v', self.dev, run_as_root=True),
mock.call('udevadm', 'settle'),
mock.call('test', '-e', expected_part, attempts=15,
delay_on_retry=True)
])
mock_table_type.assert_called_with(self.dev)
mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid)
mock_dd.assert_called_with(configdrive_file, expected_part)
mock_unlink.assert_called_with(configdrive_file)
mock_config_drive_work.assert_called_once_with(expected_part)
mock_rebuild_config_drive.assert_called_once_with(expected_part,
configdrive_file)
@mock.patch('oslo_utils.uuidutils.generate_uuid', lambda: 'fake-uuid')
@mock.patch.object(partition_utils, '_try_build_fat32_config_drive',
autospec=True)
@mock.patch.object(partition_utils, '_does_config_drive_work',
autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(utils, 'unlink_without_raise',
autospec=True)
@mock.patch.object(disk_utils, 'dd',
autospec=True)
@mock.patch.object(disk_utils, 'fix_gpt_partition',
autospec=True)
@mock.patch.object(disk_utils, 'get_partition_table_type',
autospec=True)
@mock.patch.object(partition_utils, 'get_partition',
autospec=True)
@mock.patch.object(partition_utils, 'get_labelled_partition',
autospec=True)
@mock.patch.object(partition_utils, 'get_configdrive',
autospec=True)
def test_create_partition_gpt_use_vfat(
self, mock_get_configdrive,
mock_get_labelled_partition,
mock_get_partition_by_uuid,
mock_table_type,
mock_fix_gpt_partition,
mock_dd, mock_unlink, mock_execute,
mock_config_drive_work,
mock_rebuild_config_drive):
config_url = 'http://1.2.3.4/cd'
configdrive_file = '/tmp/xyz'
configdrive_mb = 10
CONF.set_override('config_drive_rebuild', True)
mock_get_configdrive.return_value = (configdrive_mb, configdrive_file)
mock_get_labelled_partition.return_value = None
mock_table_type.return_value = 'gpt'
expected_part = '/dev/fake4'
mock_get_partition_by_uuid.return_value = expected_part
mock_config_drive_work.return_value = True
partition_utils.create_config_drive_partition(self.node_uuid, self.dev,
config_url)
mock_execute.assert_has_calls([
mock.call('sgdisk', '-n', '0:-64MB:0', '-u', '0:fake-uuid',
self.dev, run_as_root=True),
mock.call('sync'),
mock.call('udevadm', 'settle'),
mock.call('partprobe', self.dev, attempts=10, run_as_root=True),
mock.call('sgdisk', '-v', self.dev, run_as_root=True),
mock.call('udevadm', 'settle'),
mock.call('test', '-e', expected_part, attempts=15,
delay_on_retry=True)
])
mock_table_type.assert_called_with(self.dev)
mock_fix_gpt_partition.assert_called_with(self.dev, self.node_uuid)
mock_dd.assert_not_called()
mock_unlink.assert_called_with(configdrive_file)
mock_config_drive_work.assert_not_called()
mock_rebuild_config_drive.assert_called_once_with(expected_part,
configdrive_file)
@mock.patch.object(disk_utils, 'count_mbr_partitions', autospec=True) @mock.patch.object(disk_utils, 'count_mbr_partitions', autospec=True)
@mock.patch.object(utils, 'execute', autospec=True) @mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(partition_utils.LOG, 'warning', autospec=True) @mock.patch.object(partition_utils.LOG, 'warning', autospec=True)
@ -1288,3 +1424,94 @@ class TestGetPartition(base.IronicAgentTest):
mock.call('lsblk', '-PbioKNAME,UUID,PARTUUID,TYPE,LABEL', mock.call('lsblk', '-PbioKNAME,UUID,PARTUUID,TYPE,LABEL',
self.fake_dev)] self.fake_dev)]
mock_execute.assert_has_calls(expected) mock_execute.assert_has_calls(expected)
@mock.patch.object(utils, 'execute', autospec=True)
class TestConfigDriveTestRecovery(base.IronicAgentTest):
fake_dev = '/dev/fake'
configdrive_file = '/tmp/config-drive'
def test__does_config_drive_work(self, mock_execute):
self.assertTrue(partition_utils._does_config_drive_work(self.fake_dev))
mock_execute.assert_has_calls([
mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev,
mock.ANY),
mock.call('umount', mock.ANY)])
def test__does_config_drive_failed(self, mock_execute):
mock_execute.side_effect = processutils.ProcessExecutionError('boom')
self.assertFalse(
partition_utils._does_config_drive_work(self.fake_dev)
)
mock_execute.assert_has_calls([
mock.call('mount', '-o', 'ro', '-t', 'auto', self.fake_dev,
mock.ANY)])
@mock.patch.object(shutil, 'copytree', autospec=True)
@mock.patch.object(utils, 'mkfs', autospec=True)
def test__try_build_fat32_config_drive(self,
mock_mkfs,
mock_copy,
mock_execute):
partition_utils._try_build_fat32_config_drive(self.fake_dev,
self.configdrive_file)
mock_execute.assert_has_calls([
mock.call('mount', '-o', 'loop,ro', '-t', 'auto',
self.configdrive_file, mock.ANY),
mock.call('mount', '-t', 'auto', self.fake_dev, mock.ANY),
mock.call('umount', mock.ANY),
mock.call('umount', mock.ANY),
])
mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev,
label='CONFIG-2')
# Validate we called copy as we expect, both source and destination
# are temporary folders.
mock_copy.assert_called_once_with(mock.ANY, mock.ANY,
dirs_exist_ok=True)
@mock.patch.object(shutil, 'copytree', autospec=True)
@mock.patch.object(utils, 'mkfs', autospec=True)
def test__try_build_fat32_config_drive_graceful_fail(
self,
mock_mkfs,
mock_copy,
mock_execute):
mock_execute.side_effect = processutils.ProcessExecutionError('boom')
self.assertIsNone(
partition_utils._try_build_fat32_config_drive(
self.fake_dev,
self.configdrive_file)
)
mock_execute.assert_called_once_with(
'mount', '-o', 'loop,ro', '-t', 'auto',
self.configdrive_file, mock.ANY)
mock_mkfs.assert_not_called()
# Validate we called copy as we expect, both source and destination
# are temporary folders.
mock_copy.assert_not_called()
@mock.patch.object(shutil, 'copytree', autospec=True)
@mock.patch.object(utils, 'mkfs', autospec=True)
def test__try_build_fat32_config_drive_fails_once_invalid(
self,
mock_mkfs,
mock_copy,
mock_execute):
mock_mkfs.side_effect = processutils.ProcessExecutionError('boom')
self.assertRaisesRegex(
exception.InstanceDeployFailure,
'A failure occured while attempting to format.*',
partition_utils._try_build_fat32_config_drive,
self.fake_dev,
self.configdrive_file)
mock_execute.assert_has_calls([
mock.call('mount', '-o', 'loop,ro', '-t', 'auto',
self.configdrive_file, mock.ANY),
mock.call('umount', mock.ANY),
mock.call('umount', mock.ANY),
])
mock_mkfs.assert_called_once_with(fs='vfat', path=self.fake_dev,
label='CONFIG-2')
mock_copy.assert_not_called()

View File

@ -0,0 +1,12 @@
---
fixes:
- |
Fixes a failure case where a deployed instance may be unable to access
the configuration drive post-deployment. This can occur when block
devices only support 4KB IO interactions. When 4KB block IO sizes
are in use, the ISO9660 filesystem driver in Linux cannot be used
as it is modeled around a 2KB block. We now attempt to verify, and
rebuild the configuration drive on a FAT filesystem when we cannot
mount the supplied configuration drive. Operators can force the agent
to write configuration drives using the FAT filesystem using the
``[DEFAULT]config_drive_rebuild`` option.