From c5f7f18bcb2e1876b0712c691996346620ffc920 Mon Sep 17 00:00:00 2001
From: Jacob Anders
Date: Mon, 22 Nov 2021 15:59:50 +1000
Subject: [PATCH] Improve efficiency of storage cleaning in mixed media envs

Story https://storyboard.openstack.org/#!/story/2008290 added support
for NVMe-native storage cleaning, greatly improving clean times on
NVMe-based nodes as well as reducing device wear.

This follow-up change further improves cleaning efficiency in mixed
NVMe-HDD environments by combining NVMe-native cleaning on NVMe
devices with traditional metadata cleaning on all other devices.
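The new erase_devices_express clean step is registered with priority 0
(disabled by default) and honours the agent_enable_nvme_secure_erase
flag in the node's driver_internal_info (defaulting to true). As an
illustrative sketch only (client syntax may differ between releases),
the step could be requested through standard manual cleaning:

  baremetal node clean $NODE_UUID \
      --clean-steps '[{"interface": "deploy",
                       "step": "erase_devices_express"}]'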
Story: 2009264
Task: 43498

Change-Id: I445d8f4aaa6cd191d2e540032aed3148fdbff341
---
 ironic_python_agent/hardware.py               | 99 ++++++++++++++++---
 .../tests/unit/test_hardware.py               | 41 +++++++-
 ...rase-devices-express-1df107c75f2b3627.yaml |  8 ++
 3 files changed, 130 insertions(+), 18 deletions(-)
 create mode 100644 releasenotes/notes/add-erase-devices-express-1df107c75f2b3627.yaml

diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py
index ceafe31e3..d02620c67 100644
--- a/ironic_python_agent/hardware.py
+++ b/ironic_python_agent/hardware.py
@@ -1347,6 +1347,30 @@ class GenericHardwareManager(HardwareManager):
         LOG.error(msg)
         raise errors.IncompatibleHardwareMethodError(msg)
 
+    def _list_erasable_devices(self):
+        block_devices = self.list_block_devices(include_partitions=True)
+        # NOTE(coreywright): Reverse sort by device name so a partition (eg
+        # sda1) is processed before it disappears when its associated disk (eg
+        # sda) has its partition table erased and the kernel notified.
+        block_devices.sort(key=lambda dev: dev.name, reverse=True)
+        erasable_devices = []
+        for dev in block_devices:
+            if self._is_virtual_media_device(dev):
+                LOG.info("Skipping erasure of virtual media device %s",
+                         dev.name)
+                continue
+            if self._is_linux_raid_member(dev):
+                LOG.info("Skipping erasure of RAID member device %s",
+                         dev.name)
+                continue
+            if self._is_read_only_device(dev):
+                LOG.info("Skipping erasure of read-only device %s",
+                         dev.name)
+                continue
+            erasable_devices.append(dev)
+
+        return erasable_devices
+
     def erase_devices_metadata(self, node, ports):
         """Attempt to erase the disk devices metadata.
 
@@ -1361,20 +1385,7 @@ class GenericHardwareManager(HardwareManager):
         # sda) has its partition table erased and the kernel notified.
         block_devices.sort(key=lambda dev: dev.name, reverse=True)
         erase_errors = {}
-        for dev in block_devices:
-            if self._is_virtual_media_device(dev):
-                LOG.info("Skipping metadata erase of virtual media device %s",
-                         dev.name)
-                continue
-            if self._is_linux_raid_member(dev):
-                LOG.info("Skipping metadata erase of RAID member device %s",
-                         dev.name)
-                continue
-            if self._is_read_only_device(dev):
-                LOG.info("Skipping metadata erase of read-only device %s",
-                         dev.name)
-                continue
-
+        for dev in self._list_erasable_devices():
             try:
                 disk_utils.destroy_disk_metadata(dev.name, node['uuid'])
             except processutils.ProcessExecutionError as e:
@@ -1388,6 +1399,59 @@ class GenericHardwareManager(HardwareManager):
                          for k, v in erase_errors.items()]))
             raise errors.BlockDeviceEraseError(excpt_msg)
 
+    def erase_devices_express(self, node, ports):
+        """Attempt to perform time-optimised disk erasure.
+
+        For NVMe devices, perform NVMe Secure Erase if supported. For
+        other devices, fall back to metadata erasure.
+
+        :param node: Ironic node object
+        :param ports: list of Ironic port objects
+        :raises BlockDeviceEraseError: when there's an error erasing the
+                block device
+        """
+        erase_errors = {}
+        info = node.get('driver_internal_info', {})
+        if not self._list_erasable_devices():
+            LOG.debug("No erasable devices have been found.")
+            return
+        for dev in self._list_erasable_devices():
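+            # Try an NVMe-native secure erase first: if the device is
+            # NVMe and secure erase is not disabled via the node's
+            # driver_internal_info, a successful _nvme_erase() lets us
+            # skip the metadata erase for this device entirely.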
+            secure_erase_error = None
+            try:
+                if self._is_nvme(dev):
+                    execute_nvme_erase = info.get(
+                        'agent_enable_nvme_secure_erase', True)
+                    if execute_nvme_erase and self._nvme_erase(dev):
+                        continue
+            except errors.BlockDeviceEraseError as e:
+                LOG.error('Failed to securely erase device "%(dev)s". '
+                          'Error: %(error)s, falling back to metadata '
+                          'clean', {'dev': dev.name, 'error': e})
+                secure_erase_error = e
+            try:
+                disk_utils.destroy_disk_metadata(dev.name, node['uuid'])
+            except processutils.ProcessExecutionError as e:
+                LOG.error('Failed to erase the metadata on device '
+                          '"%(dev)s". Error: %(error)s',
+                          {'dev': dev.name, 'error': e})
+                if secure_erase_error:
+                    erase_errors[dev.name] = (
+                        "Secure erase failed: %s. "
+                        "Fallback to metadata erase also failed: %s."
+                        % (secure_erase_error, e))
+                else:
+                    erase_errors[dev.name] = e
+
+        if erase_errors:
+            excpt_msg = ('Failed to conduct an express erase on '
+                         'the device(s): %s' % '\n'.join('"%s": %s' % item
+                                                         for item in
+                                                         erase_errors.items()))
+            raise errors.BlockDeviceEraseError(excpt_msg)
+
     def _find_pstore_mount_point(self):
         """Find the pstore mount point by scanning /proc/mounts.
 
@@ -1940,6 +2004,13 @@ class GenericHardwareManager(HardwareManager):
             'reboot_requested': False,
             'abortable': True
         },
+        {
+            'step': 'erase_devices_express',
+            'priority': 0,
+            'interface': 'deploy',
+            'reboot_requested': False,
+            'abortable': True
+        },
         {
             'step': 'erase_pstore',
             'priority': 0,
diff --git a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py
index 7b19931b3..e02c908fe 100644
--- a/ironic_python_agent/tests/unit/test_hardware.py
+++ b/ironic_python_agent/tests/unit/test_hardware.py
@@ -129,6 +129,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
             'reboot_requested': False,
             'abortable': True
         },
+        {
+            'step': 'erase_devices_express',
+            'priority': 0,
+            'interface': 'deploy',
+            'reboot_requested': False,
+            'abortable': True
+        },
         {
             'step': 'erase_pstore',
             'priority': 0,
@@ -2095,6 +2102,32 @@ class TestGenericHardwareManager(base.IronicAgentTest):
             mock.call('/sys/fs/pstore/' + arg) for arg in pstore_entries
         ])
 
+    @mock.patch.object(il_utils, 'execute', autospec=True)
+    @mock.patch.object(disk_utils, 'destroy_disk_metadata', autospec=True)
+    @mock.patch.object(hardware.GenericHardwareManager,
+                       '_nvme_erase', autospec=True)
+    @mock.patch.object(hardware.GenericHardwareManager,
+                       '_list_erasable_devices', autospec=True)
+    def test_erase_devices_express(
+            self, mock_list_erasable_devices, mock_nvme_erase,
+            mock_destroy_disk_metadata, mock_execute):
+        block_devices = [
+            hardware.BlockDevice('/dev/sda', 'sata', 65535, False),
+            hardware.BlockDevice('/dev/md0', 'raid-device', 32767, False),
+            hardware.BlockDevice('/dev/nvme0n1', 'nvme', 32767, False),
+            hardware.BlockDevice('/dev/nvme1n1', 'nvme', 32767, False)
+        ]
+        mock_list_erasable_devices.return_value = list(block_devices)
+
+        self.hardware.erase_devices_express(self.node, [])
+        self.assertEqual([mock.call(self.hardware, block_devices[2]),
+                          mock.call(self.hardware, block_devices[3])],
+                         mock_nvme_erase.call_args_list)
+        self.assertEqual([mock.call('/dev/sda', self.node['uuid']),
+                          mock.call('/dev/md0', self.node['uuid'])],
+                         mock_destroy_disk_metadata.call_args_list)
+        mock_list_erasable_devices.assert_called_with(self.hardware)
+
     @mock.patch.object(il_utils, 'execute', autospec=True)
     @mock.patch.object(hardware.GenericHardwareManager,
                        '_is_virtual_media_device', autospec=True)
@@ -2128,8 +2161,8 @@ class TestGenericHardwareManager(base.IronicAgentTest):
                           mock.call('/dev/sda', self.node['uuid']),
                           mock.call('/dev/md0', self.node['uuid'])],
                          mock_metadata.call_args_list)
-        mock_list_devs.assert_called_once_with(self.hardware,
-                                               include_partitions=True)
+        mock_list_devs.assert_called_with(self.hardware,
+                                          include_partitions=True)
         self.assertEqual([mock.call(self.hardware, block_devices[0]),
                          mock.call(self.hardware, block_devices[1]),
                          mock.call(self.hardware, block_devices[4]),
@@ -2176,8 +2209,8 @@ class TestGenericHardwareManager(base.IronicAgentTest):
         self.assertEqual([mock.call('/dev/sdb', self.node['uuid']),
                           mock.call('/dev/sda', self.node['uuid'])],
                          mock_metadata.call_args_list)
-        mock_list_devs.assert_called_once_with(self.hardware,
-                                               include_partitions=True)
+        mock_list_devs.assert_called_with(self.hardware,
+                                          include_partitions=True)
         self.assertEqual([mock.call(self.hardware, block_devices[1]),
                           mock.call(self.hardware, block_devices[0])],
                          mock__is_vmedia.call_args_list)
diff --git a/releasenotes/notes/add-erase-devices-express-1df107c75f2b3627.yaml b/releasenotes/notes/add-erase-devices-express-1df107c75f2b3627.yaml
new file mode 100644
index 000000000..790d1189f
--- /dev/null
+++ b/releasenotes/notes/add-erase-devices-express-1df107c75f2b3627.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Adds support for an express cleaning mode in which fast, secure,
+    hardware-assisted data erasure is performed on NVMe devices that
+    support it, while other devices fall back to metadata erasure.
+    The goal of this feature is to enable express node cleaning in
+    environments with hybrid storage configurations (e.g. NVMe + HDD).