Unmount config drives

If this seems like deja vu, that is because it is. We had this
very same issue with the original CoreOS ramdisk. Since we don't
control the whole OS of the ramdisk, it only made sense to teach
the agent to umount the folder.

The folder is referenced already, and the agent does have safeguards
in place, but unfortunately this issue led to a rebuild breaking where
cloud-init, glean, and the agent were all trying do the right thing
as they thought, and there were just multiple /mnt/config folders
present in the OS. These are separate issues we also need to try and
remedy.

What happens is when the device is locked via a mount, the partition
table is never updated to the running OS as the mount creates a lock.
So the agent ends up thinking, in the case of a rebuild, that everything
including creating a configuration drive on that device has been
successful, but when you reboot, there is no partition table entry
for the new partition as the change was not successfully written.
This state prevented the workload from rebooting properly.
This change eliminates that possibility moving forward by attempting
to ensure that the cloud configuration folder is no longer mounted.

Change-Id: I4399dd0934361003cca9ff95a7e3e3ae9bba3dab
This commit is contained in:
Julia Kreger 2024-04-28 18:57:49 -07:00
parent 28053644cd
commit 6ac3f350c0
3 changed files with 73 additions and 6 deletions

View File

@ -22,6 +22,7 @@ import shutil
import subprocess
import tarfile
import tempfile
import time
from unittest import mock
from ironic_lib import utils as ironic_utils
@ -877,6 +878,7 @@ class TestClockSyncUtils(ironic_agent_base.IronicAgentTest):
self.assertEqual(0, mock_execute.call_count)
@mock.patch.object(utils, '_unmount_any_config_drives', autospec=True)
@mock.patch.object(utils, '_booted_from_vmedia', autospec=True)
@mock.patch.object(utils, '_check_vmedia_device', autospec=True)
@mock.patch.object(utils, '_find_vmedia_device_by_labels', autospec=True)
@ -887,7 +889,8 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
def test_vmedia_found_not_booted_from_vmedia(
self, mock_execute, mock_mount, mock_copy,
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia):
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = False
mock_find_device.return_value = '/dev/fake'
utils.copy_config_from_vmedia()
@ -896,10 +899,12 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
mock_copy.assert_not_called()
mock_check_vmedia.assert_not_called()
self.assertTrue(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
def test_no_vmedia(
self, mock_execute, mock_mount, mock_copy,
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia):
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = True
mock_find_device.return_value = None
utils.copy_config_from_vmedia()
@ -908,10 +913,12 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
mock_copy.assert_not_called()
mock_check_vmedia.assert_not_called()
self.assertFalse(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
def test_no_files(
self, mock_execute, mock_mount, mock_copy,
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia):
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = True
temp_path = tempfile.mkdtemp()
self.addCleanup(lambda: shutil.rmtree(temp_path))
@ -925,10 +932,12 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
'/dev/something')
mock_copy.assert_not_called()
self.assertTrue(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
def test_mounted_no_files(
self, mock_execute, mock_mount, mock_copy,
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia):
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = True
mock_execute.return_value = '/some/path', ''
@ -939,11 +948,13 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
mock_copy.assert_not_called()
mock_mount.assert_not_called()
self.assertTrue(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
@mock.patch.object(os, 'makedirs', autospec=True)
def test_copy(
self, mock_makedirs, mock_execute, mock_mount, mock_copy,
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia):
mock_find_device, mock_check_vmedia, mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = True
mock_find_device.return_value = '/dev/something'
@ -982,12 +993,14 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
mock.call(mock.ANY, '/etc/ironic-python-agent.d/ironic.conf'),
], any_order=True)
self.assertTrue(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
@mock.patch.object(os, 'makedirs', autospec=True)
def test_copy_mounted(
self, mock_makedirs, mock_execute, mock_mount,
mock_copy, mock_find_device, mock_check_vmedia,
mock_booted_from_vmedia):
mock_booted_from_vmedia,
mock_unmount_config):
mock_booted_from_vmedia.return_value = True
mock_find_device.return_value = '/dev/something'
path = tempfile.mkdtemp()
@ -1023,6 +1036,7 @@ class TestCopyConfigFromVmedia(testtools.TestCase):
], any_order=True)
mock_mount.assert_not_called()
self.assertTrue(mock_booted_from_vmedia.called)
self.assertTrue(mock_unmount_config.called)
@mock.patch.object(requests, 'get', autospec=True)
@ -1143,3 +1157,20 @@ class TestCheckEarlyLogging(ironic_agent_base.IronicAgentTest):
expected_calls = [mock.call('Early logging: %s', 'line 1.'),
mock.call('Early logging: %s', 'line 2 message')]
info.assert_has_calls(expected_calls)
class TestUnmountOfConfig(ironic_agent_base.IronicAgentTest):
@mock.patch.object(utils, '_early_log', autospec=True)
@mock.patch.object(os.path, 'ismount', autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
def test__unmount_any_config_drives(self, mock_sleep, mock_exec,
mock_ismount, mock_log,):
mock_ismount.side_effect = iter([True, True, False])
utils._unmount_any_config_drives()
self.assertEqual(2, mock_sleep.call_count)
self.assertEqual(2, mock_log.call_count)
mock_exec.assert_has_calls([
mock.call('umount', '/mnt/config'),
mock.call('umount', '/mnt/config')])

View File

@ -311,18 +311,27 @@ def copy_config_from_vmedia():
['config-2', 'vmedia_boot_iso'])
if not vmedia_device_file:
_early_log('No virtual media device detected')
_unmount_any_config_drives()
return
if not _booted_from_vmedia():
_early_log('Cannot use configuration from virtual media as the '
'agent was not booted from virtual media.')
_unmount_any_config_drives()
return
# Determine the device
mounted = _find_mount_point(vmedia_device_file)
if mounted:
# In this case a utility like a configuration drive tool
# has *already* mounted the device we believe to be the
# configuration drive.
_copy_config_from(mounted)
else:
with ironic_utils.mounted(vmedia_device_file) as vmedia_mount_point:
# In this case, we use a temporary folder and extract the contents
# for our configuration.
_copy_config_from(vmedia_mount_point)
# As a last act, just make sure there is nothing else mounted.
_unmount_any_config_drives()
def _get_cached_params():
@ -944,3 +953,22 @@ def find_in_lshw(lshw, by_id=None, by_class=None, recursive=False, **fields):
yield child
if recursive:
yield from find_in_lshw(child, by_id, recursive=True, **fields)
def _unmount_any_config_drives():
"""Umount anything mounted to /mnt/config
As part of the configuration drive model, utilities like cloud-init
and glean leverage a folder at /mnt/config to convey configuration
to a booting OS.
The possibility exists that one of the utilties mounted one or multiple
such folders, even if the configuration was not used, and this can
result in locked devices which can prevent rebuild operations from
completing successfully as as long as the folder is mounted, it is
a "locked" device to the operating system.
"""
while os.path.ismount('/mnt/config'):
_early_log('Issuing an umount command for /mnt/config...')
execute('umount', '/mnt/config')
time.sleep(1)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue where configuration drive volumes which are mounted
by the operating system could remain mounted and cause a lock to be
held, which may conflict with actions such as ``rebuild``.
The agent now always makes sure the folder used by Glean and Cloud-init
is not mounted.