Improve efficiency of storage cleaning in mixed-media environments

https://storyboard.openstack.org/#!/story/2008290 added support
for NVMe-native storage cleaning, greatly improving cleaning
times on NVMe-based nodes and reducing device wear.

This is a follow-up change which aims to further improve
cleaning efficiency in mixed NVMe-HDD environments. This is
achieved by combining NVMe-native cleaning methods on NVMe
devices with traditional metadata cleaning on non-NVMe devices.
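
The new clean step is registered with priority 0, so it does not run
automatically; it has to be requested explicitly, for example through
manual cleaning. A minimal sketch of such a request (the step and
interface names come from this change; the clean-steps structure is
the standard Ironic one):

    # Hedged example: request only the express erase step during
    # manual cleaning of a node.
    clean_steps = [{
        'interface': 'deploy',
        'step': 'erase_devices_express',
    }]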

Story: 2009264
Task: 43498
Change-Id: I445d8f4aaa6cd191d2e540032aed3148fdbff341
Jacob Anders 2021-11-22 15:59:50 +10:00
parent cf46443120
commit c5f7f18bcb
3 changed files with 126 additions and 18 deletions


@@ -1347,6 +1347,30 @@ class GenericHardwareManager(HardwareManager):
             LOG.error(msg)
             raise errors.IncompatibleHardwareMethodError(msg)

+    def _list_erasable_devices(self):
+        block_devices = self.list_block_devices(include_partitions=True)
+        # NOTE(coreywright): Reverse sort by device name so a partition (eg
+        # sda1) is processed before it disappears when its associated disk (eg
+        # sda) has its partition table erased and the kernel notified.
+        block_devices.sort(key=lambda dev: dev.name, reverse=True)
+        erasable_devices = []
+        for dev in block_devices:
+            if self._is_virtual_media_device(dev):
+                LOG.info("Skipping erasure of virtual media device %s",
+                         dev.name)
+                continue
+            if self._is_linux_raid_member(dev):
+                LOG.info("Skipping erasure of RAID member device %s",
+                         dev.name)
+                continue
+            if self._is_read_only_device(dev):
+                LOG.info("Skipping erasure of read-only device %s",
+                         dev.name)
+                continue
+            erasable_devices.append(dev)
+        return erasable_devices
+
     def erase_devices_metadata(self, node, ports):
         """Attempt to erase the disk devices metadata.
@@ -1361,20 +1385,7 @@ class GenericHardwareManager(HardwareManager):
         # sda) has its partition table erased and the kernel notified.
         block_devices.sort(key=lambda dev: dev.name, reverse=True)
         erase_errors = {}
-        for dev in block_devices:
-            if self._is_virtual_media_device(dev):
-                LOG.info("Skipping metadata erase of virtual media device %s",
-                         dev.name)
-                continue
-            if self._is_linux_raid_member(dev):
-                LOG.info("Skipping metadata erase of RAID member device %s",
-                         dev.name)
-                continue
-            if self._is_read_only_device(dev):
-                LOG.info("Skipping metadata erase of read-only device %s",
-                         dev.name)
-                continue
+        for dev in self._list_erasable_devices():
             try:
                 disk_utils.destroy_disk_metadata(dev.name, node['uuid'])
             except processutils.ProcessExecutionError as e:
@@ -1388,6 +1399,55 @@ class GenericHardwareManager(HardwareManager):
                                       for k, v in erase_errors.items()]))
             raise errors.BlockDeviceEraseError(excpt_msg)

+    def erase_devices_express(self, node, ports):
+        """Attempt to perform time-optimised disk erasure:
+        for NVMe devices, perform NVMe Secure Erase if supported. For other
+        devices, perform metadata erasure.
+
+        :param node: Ironic node object
+        :param ports: list of Ironic port objects
+        :raises BlockDeviceEraseError: when there's an error erasing the
+                block device
+        """
+        erase_errors = {}
+        info = node.get('driver_internal_info', {})
+        if not self._list_erasable_devices():
+            LOG.debug("No erasable devices have been found.")
+            return
+        for dev in self._list_erasable_devices():
+            # Track a secure erase failure so that a subsequent metadata
+            # erase failure can report both errors for this device.
+            secure_erase_error = None
+            try:
+                if self._is_nvme(dev):
+                    execute_nvme_erase = info.get(
+                        'agent_enable_nvme_secure_erase', True)
+                    if execute_nvme_erase and self._nvme_erase(dev):
+                        continue
+            except errors.BlockDeviceEraseError as e:
+                LOG.error('Failed to securely erase device "%(dev)s". '
+                          'Error: %(error)s, falling back to metadata '
+                          'clean', {'dev': dev.name, 'error': e})
+                secure_erase_error = e
+            try:
+                disk_utils.destroy_disk_metadata(dev.name, node['uuid'])
+            except processutils.ProcessExecutionError as e:
+                LOG.error('Failed to erase the metadata on device '
+                          '"%(dev)s". Error: %(error)s',
+                          {'dev': dev.name, 'error': e})
+                if secure_erase_error:
+                    erase_errors[dev.name] = (
+                        "Secure erase failed: %s. "
+                        "Fallback to metadata erase also failed: %s."
+                        % (secure_erase_error, e))
+                else:
+                    erase_errors[dev.name] = e
+        if erase_errors:
+            excpt_msg = ('Failed to conduct an express erase on '
+                         'the device(s): %s' % '\n'.join('"%s": %s' % item
+                                                         for item in
+                                                         erase_errors.items()))
+            raise errors.BlockDeviceEraseError(excpt_msg)
+
     def _find_pstore_mount_point(self):
         """Find the pstore mount point by scanning /proc/mounts.
@@ -1940,6 +2000,13 @@ class GenericHardwareManager(HardwareManager):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'erase_devices_express',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'erase_pstore',
                 'priority': 0,


@@ -129,6 +129,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'erase_devices_express',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'erase_pstore',
                 'priority': 0,
@@ -2095,6 +2102,32 @@ class TestGenericHardwareManager(base.IronicAgentTest):
             mock.call('/sys/fs/pstore/' + arg) for arg in pstore_entries
         ])

+    @mock.patch.object(il_utils, 'execute', autospec=True)
+    @mock.patch.object(disk_utils, 'destroy_disk_metadata', autospec=True)
+    @mock.patch.object(hardware.GenericHardwareManager,
+                       '_nvme_erase', autospec=True)
+    @mock.patch.object(hardware.GenericHardwareManager,
+                       '_list_erasable_devices', autospec=True)
+    def test_erase_devices_express(
+            self, mock_list_erasable_devices, mock_nvme_erase,
+            mock_destroy_disk_metadata, mock_execute):
+        block_devices = [
+            hardware.BlockDevice('/dev/sda', 'sata', 65535, False),
+            hardware.BlockDevice('/dev/md0', 'raid-device', 32767, False),
+            hardware.BlockDevice('/dev/nvme0n1', 'nvme', 32767, False),
+            hardware.BlockDevice('/dev/nvme1n1', 'nvme', 32767, False)
+        ]
+        mock_list_erasable_devices.return_value = list(block_devices)
+        self.hardware.erase_devices_express(self.node, [])
+        self.assertEqual([mock.call(self.hardware, block_devices[2]),
+                          mock.call(self.hardware, block_devices[3])],
+                         mock_nvme_erase.call_args_list)
+        self.assertEqual([mock.call('/dev/sda', self.node['uuid']),
+                          mock.call('/dev/md0', self.node['uuid'])],
+                         mock_destroy_disk_metadata.call_args_list)
+        mock_list_erasable_devices.assert_called_with(self.hardware)
+
     @mock.patch.object(il_utils, 'execute', autospec=True)
     @mock.patch.object(hardware.GenericHardwareManager,
                        '_is_virtual_media_device', autospec=True)
@@ -2128,7 +2161,7 @@ class TestGenericHardwareManager(base.IronicAgentTest):
                           mock.call('/dev/sda', self.node['uuid']),
                           mock.call('/dev/md0', self.node['uuid'])],
                          mock_metadata.call_args_list)
-        mock_list_devs.assert_called_once_with(self.hardware,
-                                               include_partitions=True)
+        mock_list_devs.assert_called_with(self.hardware,
+                                          include_partitions=True)
         self.assertEqual([mock.call(self.hardware, block_devices[0]),
                           mock.call(self.hardware, block_devices[1]),
@@ -2176,7 +2209,7 @@ class TestGenericHardwareManager(base.IronicAgentTest):
         self.assertEqual([mock.call('/dev/sdb', self.node['uuid']),
                           mock.call('/dev/sda', self.node['uuid'])],
                          mock_metadata.call_args_list)
-        mock_list_devs.assert_called_once_with(self.hardware,
-                                               include_partitions=True)
+        mock_list_devs.assert_called_with(self.hardware,
+                                          include_partitions=True)
         self.assertEqual([mock.call(self.hardware, block_devices[1]),
                           mock.call(self.hardware, block_devices[0])],


@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Adds support for an express cleaning mode in which hardware-assisted,
+    fast and secure data erasure is performed on NVMe devices that support
+    it, while other devices fall back to erase_devices_metadata.
+    The goal of this feature is to enable express node cleaning in
+    environments with hybrid storage configurations (e.g. NVMe + HDD).
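
Note: NVMe secure erase within the new step is gated by a per-node flag
in driver_internal_info and defaults to enabled. A minimal sketch of the
lookup the new code performs (the flag name is taken from the diff above;
the node dict shape is illustrative only):

    # Hedged illustration, not part of the change: erase_devices_express
    # reads this flag from driver_internal_info, defaulting to True.
    node = {'driver_internal_info': {'agent_enable_nvme_secure_erase': False}}
    info = node.get('driver_internal_info', {})
    execute_nvme_erase = info.get('agent_enable_nvme_secure_erase', True)
    # With the flag set to False, NVMe devices fall back to metadata erase.
    assert execute_nvme_erase is False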