diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py index c97eea965..841dbfb11 100644 --- a/ironic_python_agent/burnin.py +++ b/ironic_python_agent/burnin.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import time from ironic_lib import utils @@ -84,6 +85,83 @@ def stress_ng_vm(node): raise errors.CommandExecutionError(error_msg) +def _smart_test_status(device): + """Get the SMART test status of a device + + :param device: The device to check. + :raises: CommandExecutionError if the execution of smartctl fails. + :returns: A string with the SMART test status of the device and + None if the status is not available. + """ + args = ['smartctl', '-ja', device.name] + try: + out, _ = utils.execute(*args) + smart_info = json.loads(out) + if smart_info: + return smart_info['ata_smart_data'][ + 'self_test']['status']['string'] + except (processutils.ProcessExecutionError, OSError, KeyError) as e: + LOG.error('SMART test on %(device)s failed with ' + '%(err)s', {'device': device.name, 'err': e}) + return None + + +def _run_smart_test(devices): + """Launch a SMART test on the passed devices + + :param devices: A list of device objects to check. + :raises: CommandExecutionError if the execution of smartctl fails. + :raises: CleaningError if the SMART test on any of the devices fails. + """ + failed_devices = [] + for device in devices: + args = ['smartctl', '-t', 'long', device.name] + LOG.info('SMART self test command: %s', + ' '.join(map(str, args))) + try: + utils.execute(*args) + except (processutils.ProcessExecutionError, OSError) as e: + LOG.error("Starting SMART test on %(device)s failed with: " + "%(err)s", {'device': device.name, 'err': e}) + failed_devices.append(device.name) + if failed_devices: + error_msg = ("fio (disk) failed to start SMART self test on %s", + ', '.join(failed_devices)) + raise errors.CleaningError(error_msg) + + # wait for the test to finish and report the test results + failed_devices = [] + while True: + for device in list(devices): + status = _smart_test_status(device) + if status is None: + devices.remove(device) + continue + if "in progress" in status: + msg = "SMART test still running on %s ..." % device.name + LOG.debug(msg) + continue + if "completed without error" in status: + msg = "%s passed SMART test" % device.name + LOG.info(msg) + devices.remove(device) + continue + failed_devices.append(device.name) + LOG.warning("%(device)s failed SMART test with: %(err)s", + {'device': device.name, 'err': status}) + devices.remove(device) + if not devices: + break + LOG.info("SMART tests still running ...") + time.sleep(30) + + # fail the clean step if the SMART test has failed + if failed_devices: + msg = ('fio (disk) SMART test failed for %s' % ' '.join( + map(str, failed_devices))) + raise errors.CleaningError(msg) + + def fio_disk(node): """Burn-in the disks with fio @@ -118,6 +196,12 @@ def fio_disk(node): LOG.error(error_msg) raise errors.CommandExecutionError(error_msg) + # if configured, run a smart self test on all devices and fail the + # step if any of the devices reports an error + smart_test = info.get('agent_burnin_fio_disk_smart_test', False) + if smart_test: + _run_smart_test(devices) + def _do_fio_network(writer, runtime, partner): diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py index 2258352ec..16aea3753 100644 --- a/ironic_python_agent/tests/unit/test_burnin.py +++ b/ironic_python_agent/tests/unit/test_burnin.py @@ -21,6 +21,39 @@ from ironic_python_agent import hardware from ironic_python_agent.tests.unit import base +SMART_OUTPUT_JSON_COMPLETED = (""" +{ + "ata_smart_data": { + "self_test": { + "status": { + "value": 0, + "string": "completed without error", + "passed": true + }, + "polling_minutes": { + "short": 1, + "extended": 2, + "conveyance": 2 + } + } + } +} +""") + +SMART_OUTPUT_JSON_MISSING = (""" +{ + "ata_smart_data": { + "self_test": { + "status": { + "value": 0, + "passed": true + } + } + } +} +""") + + @mock.patch.object(utils, 'execute', autospec=True) class TestBurnin(base.IronicAgentTest): @@ -133,6 +166,50 @@ class TestBurnin(base.IronicAgentTest): '--loops', 5, '--runtime', 600, '--time_based', '--name', '/dev/sdj', '--name', '/dev/hdaa') + def test__smart_test_status(self, mock_execute): + device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True) + mock_execute.return_value = ([SMART_OUTPUT_JSON_COMPLETED, 'err']) + + status = burnin._smart_test_status(device) + + mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj') + self.assertEqual(status, "completed without error") + + def test__smart_test_status_missing(self, mock_execute): + device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True) + mock_execute.return_value = ([SMART_OUTPUT_JSON_MISSING, 'err']) + + status = burnin._smart_test_status(device) + + mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj') + self.assertIsNone(status) + + @mock.patch.object(burnin, '_smart_test_status', autospec=True) + @mock.patch.object(hardware, 'list_all_block_devices', autospec=True) + def test_fio_disk_smart_test(self, mock_list, mock_status, mock_execute): + + node = {'driver_info': {'agent_burnin_fio_disk_smart_test': True}} + + mock_list.return_value = [ + hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True), + hardware.BlockDevice('/dev/hdaa', 'small', 65535, False), + ] + mock_status.return_value = "completed without error" + mock_execute.return_value = (['out', 'err']) + + burnin.fio_disk(node) + + expected_calls = [ + mock.call('fio', '--rw', 'readwrite', '--bs', '4k', '--direct', 1, + '--ioengine', 'libaio', '--iodepth', '32', '--verify', + 'crc32c', '--verify_dump', 1, '--continue_on_error', + 'verify', '--loops', 4, '--runtime', 0, '--time_based', + '--name', '/dev/sdj', '--name', '/dev/hdaa'), + mock.call('smartctl', '-t', 'long', '/dev/sdj'), + mock.call('smartctl', '-t', 'long', '/dev/hdaa') + ] + mock_execute.assert_has_calls(expected_calls) + @mock.patch.object(hardware, 'list_all_block_devices', autospec=True) def test_fio_disk_no_fio(self, mock_list, mock_execute): diff --git a/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml b/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml new file mode 100644 index 000000000..0e95b911f --- /dev/null +++ b/releasenotes/notes/add-smart-test-to-disk-burnin-d02d31e23e5efa9a.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add 'agent_burnin_fio_disk_smart_test' option in driver-info for disk + burn-in. If set to True, this option will launch a parallel SMART self + test on all devices after the disk burn-in and fail the disk burn-in + clean step if any of the tests fail.