Software RAID: Re-add missing devices

Upon md device creation, component devices are sometimes removed
immediately again due to a "disk failure". The disks seem healthy,
though. This patch re-adds component devices in such cases to
prevent the md device from remaining in a degraded state (which
would cause issues later, e.g. during ESP creation).

Story: #2008164
Task: #40914

Change-Id: I2ac7cb4a546de84686d5c3435e850c14b3f6c1d7
This commit is contained in:
Arne Wiebalck 2020-10-02 21:36:46 +02:00
parent 3ddca46131
commit 253b4887d5
3 changed files with 146 additions and 10 deletions

View File

@ -203,6 +203,36 @@ def _get_component_devices(raid_device):
return component_devices
def _get_actual_component_devices(raid_device):
"""Get the component devices of a Software RAID device.
Examine an md device and return its constituent devices.
:param raid_device: A Software RAID block device name.
:returns: A list of the component devices.
"""
if not raid_device:
return []
try:
out, _ = utils.execute('mdadm', '--detail', raid_device,
use_standard_locale=True)
except processutils.ProcessExecutionError as e:
msg = ('Could not get component devices of %(dev)s: %(err)s' %
{'dev': raid_device, 'err': e})
LOG.warning(msg)
return []
component_devices = []
lines = out.splitlines()
# the first line contains the md device itself
for line in lines[1:]:
device = re.findall(r'/dev/\w+', line)
component_devices += device
return component_devices
def _calc_memory(sys_dict):
physical = 0
for sys_child in sys_dict['children']:
@ -1866,6 +1896,20 @@ class GenericHardwareManager(HardwareManager):
md_device, ' '.join(component_devices), e)
raise errors.SoftwareRAIDError(msg)
# check for missing devices and re-add them
actual_components = _get_actual_component_devices(md_device)
missing = list(set(component_devices) - set(actual_components))
for dev in missing:
try:
LOG.warning('Found %s to be missing from %s '
'... re-adding!', dev, md_device)
utils.execute('mdadm', '--add', md_device, dev,
attempts=3, delay_on_retry=True)
except processutils.ProcessExecutionError as e:
msg = "Failed re-add {} to {}: {}".format(
dev, md_device, e)
raise errors.SoftwareRAIDError(msg)
LOG.info("Successfully created Software RAID")
return raid_config

View File

@ -2960,11 +2960,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
mocked_create.assert_called_once_with(self.hardware, self.node, [],
raid_config)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=False)
def test_create_configuration(self, mocked_os_path_isdir, mocked_execute,
mock_list_parts):
mock_list_parts, mocked_actual_comp):
node = self.node
raid_config = {
@ -3003,6 +3005,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
result = self.hardware.create_configuration(node, [])
mocked_os_path_isdir.assert_has_calls([
mock.call('/sys/firmware/efi')
@ -3037,12 +3044,14 @@ class TestGenericHardwareManager(base.IronicAgentTest):
mock.call(x) for x in ['/dev/sda', '/dev/sdb']
])
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(utils, 'get_node_boot_mode', lambda node: 'bios')
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
def test_create_configuration_raid_5(self, mocked_execute,
mock_list_parts):
mock_list_parts, mocked_actual_comp):
node = self.node
raid_config = {
"logical_disks": [
@ -3082,6 +3091,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1', '/dev/sdc1'),
('/dev/sda2', '/dev/sdb2', '/dev/sdc2'),
]
result = self.hardware.create_configuration(node, [])
mocked_execute.assert_has_calls([
@ -3120,12 +3134,14 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2', '/dev/sdc2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(utils, 'get_node_boot_mode', lambda node: 'bios')
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
def test_create_configuration_raid_6(self, mocked_execute,
mock_list_parts):
mock_list_parts, mocked_actual_comp):
node = self.node
raid_config = {
"logical_disks": [
@ -3170,6 +3186,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1', '/dev/sdc1', '/dev/sdd1'),
('/dev/sda2', '/dev/sdb2', '/dev/sdc2', '/dev/sdd2'),
]
result = self.hardware.create_configuration(node, [])
mocked_execute.assert_has_calls([
@ -3217,12 +3238,15 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2', '/dev/sdc2', '/dev/sdd2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=True)
def test_create_configuration_efi(self, mocked_os_path_isdir,
mocked_execute, mock_list_parts):
mocked_execute, mock_list_parts,
mocked_actual_comp):
node = self.node
raid_config = {
@ -3255,6 +3279,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
result = self.hardware.create_configuration(node, [])
mocked_os_path_isdir.assert_has_calls([
mock.call('/sys/firmware/efi')
@ -3282,12 +3311,15 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=False)
def test_create_configuration_force_gpt_with_disk_label(
self, mocked_os_path_isdir, mocked_execute, mock_list_part):
self, mocked_os_path_isdir, mocked_execute, mock_list_part,
mocked_actual_comp):
node = self.node
raid_config = {
@ -3326,6 +3358,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
result = self.hardware.create_configuration(node, [])
mocked_os_path_isdir.assert_has_calls([
mock.call('/sys/firmware/efi')
@ -3353,12 +3390,14 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=False)
def test_create_configuration_no_max(self, _mocked_isdir, mocked_execute,
mock_list_parts):
mock_list_parts, mocked_actual_comp):
node = self.node
raid_config = {
"logical_disks": [
@ -3381,6 +3420,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.hardware.list_block_devices = mock.Mock()
self.hardware.list_block_devices.return_value = [device1, device2]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
mocked_execute.side_effect = [
None, # mklabel sda
('42', None), # sgdisk -F sda
@ -3390,7 +3434,7 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None, # parted + partx sdb
None, None, # parted + partx sda
None, None, # parted + partx sdb
None, None # mdadms
None, None, # mdadms
]
result = self.hardware.create_configuration(node, [])
@ -3420,13 +3464,16 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=False)
def test_create_configuration_max_is_first_logical(self, _mocked_isdir,
mocked_execute,
mock_list_parts):
mock_list_parts,
mocked_actual_comp):
node = self.node
raid_config = {
"logical_disks": [
@ -3461,6 +3508,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
result = self.hardware.create_configuration(node, [])
mocked_execute.assert_has_calls([
@ -3488,12 +3540,15 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'/dev/sda2', '/dev/sdb2')])
self.assertEqual(raid_config, result)
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(utils, 'get_node_boot_mode', lambda node: 'bios')
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
def test_create_configuration_with_hints(self, mocked_execute,
mock_list_parts):
mock_list_parts,
mocked_actual_comp):
node = self.node
raid_config = {
"logical_disks": [
@ -3538,6 +3593,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/sda1', '/dev/sdb1'),
('/dev/sda2', '/dev/sdb2'),
]
result = self.hardware.create_configuration(node, [])
mocked_execute.assert_has_calls([
@ -3818,9 +3878,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.hardware.list_block_devices.side_effect = [
[device1, device2, device3],
[device1, device2, device3]]
# pre-creation validation fails as an insufficient number of devices is found
error_regex = ("Software RAID configuration is not possible for "
"RAID level 6 with only 3 block devices found.")
# Execute is actually called for listing_block_devices
self.assertFalse(mocked_execute.called)
self.assertRaisesRegex(errors.SoftwareRAIDError, error_regex,
@ -3832,12 +3894,15 @@ class TestGenericHardwareManager(base.IronicAgentTest):
result = self.hardware.create_configuration(self.node, [])
self.assertEqual(result, {})
@mock.patch.object(hardware, '_get_actual_component_devices',
autospec=True)
@mock.patch.object(disk_utils, 'list_partitions', autospec=True,
return_value=[])
@mock.patch.object(utils, 'execute', autospec=True)
@mock.patch.object(os.path, 'isdir', autospec=True, return_value=True)
def test_create_configuration_with_nvme(self, mocked_os_path_isdir,
mocked_execute, mock_list_parts):
mocked_execute, mock_list_parts,
mocked_actual_comp):
raid_config = {
"logical_disks": [
{
@ -3870,6 +3935,11 @@ class TestGenericHardwareManager(base.IronicAgentTest):
None, None # mdadms
]
mocked_actual_comp.side_effect = [
('/dev/nvme0n1p1', '/dev/nvme1n1p1'),
('/dev/nvme0n1p2', '/dev/nvme1n1p2'),
]
result = self.hardware.create_configuration(self.node, [])
mocked_execute.assert_has_calls([
@ -3965,6 +4035,20 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.hardware.create_configuration,
self.node, [])
@mock.patch.object(utils, 'execute', autospec=True)
def test__get_actual_component_devices(self, mocked_execute):
    """Component devices are extracted from MDADM_DETAIL_OUTPUT."""
    # One canned (stdout, stderr) reply for the single mdadm invocation.
    mocked_execute.side_effect = [(MDADM_DETAIL_OUTPUT, '')]
    expected = ['/dev/vde1', '/dev/vdf1']
    actual = hardware._get_actual_component_devices('/dev/md0')
    self.assertEqual(expected, actual)
@mock.patch.object(utils, 'execute', autospec=True)
def test__get_actual_component_devices_broken_raid0(self, mocked_execute):
    """Only /dev/sda2 is reported for MDADM_DETAIL_OUTPUT_BROKEN_RAID0."""
    # One canned (stdout, stderr) reply for the single mdadm invocation.
    mocked_execute.side_effect = [(MDADM_DETAIL_OUTPUT_BROKEN_RAID0, '')]
    expected = ['/dev/sda2']
    actual = hardware._get_actual_component_devices('/dev/md126')
    self.assertEqual(expected, actual)
@mock.patch.object(utils, 'execute', autospec=True)
def test__get_md_uuid(self, mocked_execute):
mocked_execute.side_effect = [(MDADM_DETAIL_OUTPUT, '')]

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Upon the creation of Software RAID devices, component devices are
sometimes kicked out immediately (for no apparent reason). This
fix re-adds devices in such cases in order to prevent the component
from being missing the next time the device is assembled, which, for
instance, may prevent the UEFI ESPs from being installed properly.