Burn-in: Add SMART self test to disk burn-in
Add the option to run a SMART self test right after the disk burn-in. The disk burn-in step will fail if the SMART test on any of the disk fails. Story: #2007523 Task: #43383 Change-Id: I1312d5b71bedd044581a136af0b4c43769d21877
This commit is contained in:
parent
bcf2846553
commit
c6b1cb1c32
@ -10,6 +10,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from ironic_lib import utils
|
from ironic_lib import utils
|
||||||
@ -84,6 +85,83 @@ def stress_ng_vm(node):
|
|||||||
raise errors.CommandExecutionError(error_msg)
|
raise errors.CommandExecutionError(error_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _smart_test_status(device):
|
||||||
|
"""Get the SMART test status of a device
|
||||||
|
|
||||||
|
:param device: The device to check.
|
||||||
|
:raises: CommandExecutionError if the execution of smartctl fails.
|
||||||
|
:returns: A string with the SMART test status of the device and
|
||||||
|
None if the status is not available.
|
||||||
|
"""
|
||||||
|
args = ['smartctl', '-ja', device.name]
|
||||||
|
try:
|
||||||
|
out, _ = utils.execute(*args)
|
||||||
|
smart_info = json.loads(out)
|
||||||
|
if smart_info:
|
||||||
|
return smart_info['ata_smart_data'][
|
||||||
|
'self_test']['status']['string']
|
||||||
|
except (processutils.ProcessExecutionError, OSError, KeyError) as e:
|
||||||
|
LOG.error('SMART test on %(device)s failed with '
|
||||||
|
'%(err)s', {'device': device.name, 'err': e})
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _run_smart_test(devices):
|
||||||
|
"""Launch a SMART test on the passed devices
|
||||||
|
|
||||||
|
:param devices: A list of device objects to check.
|
||||||
|
:raises: CommandExecutionError if the execution of smartctl fails.
|
||||||
|
:raises: CleaningError if the SMART test on any of the devices fails.
|
||||||
|
"""
|
||||||
|
failed_devices = []
|
||||||
|
for device in devices:
|
||||||
|
args = ['smartctl', '-t', 'long', device.name]
|
||||||
|
LOG.info('SMART self test command: %s',
|
||||||
|
' '.join(map(str, args)))
|
||||||
|
try:
|
||||||
|
utils.execute(*args)
|
||||||
|
except (processutils.ProcessExecutionError, OSError) as e:
|
||||||
|
LOG.error("Starting SMART test on %(device)s failed with: "
|
||||||
|
"%(err)s", {'device': device.name, 'err': e})
|
||||||
|
failed_devices.append(device.name)
|
||||||
|
if failed_devices:
|
||||||
|
error_msg = ("fio (disk) failed to start SMART self test on %s",
|
||||||
|
', '.join(failed_devices))
|
||||||
|
raise errors.CleaningError(error_msg)
|
||||||
|
|
||||||
|
# wait for the test to finish and report the test results
|
||||||
|
failed_devices = []
|
||||||
|
while True:
|
||||||
|
for device in list(devices):
|
||||||
|
status = _smart_test_status(device)
|
||||||
|
if status is None:
|
||||||
|
devices.remove(device)
|
||||||
|
continue
|
||||||
|
if "in progress" in status:
|
||||||
|
msg = "SMART test still running on %s ..." % device.name
|
||||||
|
LOG.debug(msg)
|
||||||
|
continue
|
||||||
|
if "completed without error" in status:
|
||||||
|
msg = "%s passed SMART test" % device.name
|
||||||
|
LOG.info(msg)
|
||||||
|
devices.remove(device)
|
||||||
|
continue
|
||||||
|
failed_devices.append(device.name)
|
||||||
|
LOG.warning("%(device)s failed SMART test with: %(err)s",
|
||||||
|
{'device': device.name, 'err': status})
|
||||||
|
devices.remove(device)
|
||||||
|
if not devices:
|
||||||
|
break
|
||||||
|
LOG.info("SMART tests still running ...")
|
||||||
|
time.sleep(30)
|
||||||
|
|
||||||
|
# fail the clean step if the SMART test has failed
|
||||||
|
if failed_devices:
|
||||||
|
msg = ('fio (disk) SMART test failed for %s' % ' '.join(
|
||||||
|
map(str, failed_devices)))
|
||||||
|
raise errors.CleaningError(msg)
|
||||||
|
|
||||||
|
|
||||||
def fio_disk(node):
|
def fio_disk(node):
|
||||||
"""Burn-in the disks with fio
|
"""Burn-in the disks with fio
|
||||||
|
|
||||||
@ -118,6 +196,12 @@ def fio_disk(node):
|
|||||||
LOG.error(error_msg)
|
LOG.error(error_msg)
|
||||||
raise errors.CommandExecutionError(error_msg)
|
raise errors.CommandExecutionError(error_msg)
|
||||||
|
|
||||||
|
# if configured, run a smart self test on all devices and fail the
|
||||||
|
# step if any of the devices reports an error
|
||||||
|
smart_test = info.get('agent_burnin_fio_disk_smart_test', False)
|
||||||
|
if smart_test:
|
||||||
|
_run_smart_test(devices)
|
||||||
|
|
||||||
|
|
||||||
def _do_fio_network(writer, runtime, partner):
|
def _do_fio_network(writer, runtime, partner):
|
||||||
|
|
||||||
|
@ -21,6 +21,39 @@ from ironic_python_agent import hardware
|
|||||||
from ironic_python_agent.tests.unit import base
|
from ironic_python_agent.tests.unit import base
|
||||||
|
|
||||||
|
|
||||||
|
SMART_OUTPUT_JSON_COMPLETED = ("""
|
||||||
|
{
|
||||||
|
"ata_smart_data": {
|
||||||
|
"self_test": {
|
||||||
|
"status": {
|
||||||
|
"value": 0,
|
||||||
|
"string": "completed without error",
|
||||||
|
"passed": true
|
||||||
|
},
|
||||||
|
"polling_minutes": {
|
||||||
|
"short": 1,
|
||||||
|
"extended": 2,
|
||||||
|
"conveyance": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
|
||||||
|
SMART_OUTPUT_JSON_MISSING = ("""
|
||||||
|
{
|
||||||
|
"ata_smart_data": {
|
||||||
|
"self_test": {
|
||||||
|
"status": {
|
||||||
|
"value": 0,
|
||||||
|
"passed": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
@mock.patch.object(utils, 'execute', autospec=True)
|
@mock.patch.object(utils, 'execute', autospec=True)
|
||||||
class TestBurnin(base.IronicAgentTest):
|
class TestBurnin(base.IronicAgentTest):
|
||||||
|
|
||||||
@ -133,6 +166,50 @@ class TestBurnin(base.IronicAgentTest):
|
|||||||
'--loops', 5, '--runtime', 600, '--time_based', '--name',
|
'--loops', 5, '--runtime', 600, '--time_based', '--name',
|
||||||
'/dev/sdj', '--name', '/dev/hdaa')
|
'/dev/sdj', '--name', '/dev/hdaa')
|
||||||
|
|
||||||
|
def test__smart_test_status(self, mock_execute):
|
||||||
|
device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True)
|
||||||
|
mock_execute.return_value = ([SMART_OUTPUT_JSON_COMPLETED, 'err'])
|
||||||
|
|
||||||
|
status = burnin._smart_test_status(device)
|
||||||
|
|
||||||
|
mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj')
|
||||||
|
self.assertEqual(status, "completed without error")
|
||||||
|
|
||||||
|
def test__smart_test_status_missing(self, mock_execute):
|
||||||
|
device = hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True)
|
||||||
|
mock_execute.return_value = ([SMART_OUTPUT_JSON_MISSING, 'err'])
|
||||||
|
|
||||||
|
status = burnin._smart_test_status(device)
|
||||||
|
|
||||||
|
mock_execute.assert_called_once_with('smartctl', '-ja', '/dev/sdj')
|
||||||
|
self.assertIsNone(status)
|
||||||
|
|
||||||
|
@mock.patch.object(burnin, '_smart_test_status', autospec=True)
|
||||||
|
@mock.patch.object(hardware, 'list_all_block_devices', autospec=True)
|
||||||
|
def test_fio_disk_smart_test(self, mock_list, mock_status, mock_execute):
|
||||||
|
|
||||||
|
node = {'driver_info': {'agent_burnin_fio_disk_smart_test': True}}
|
||||||
|
|
||||||
|
mock_list.return_value = [
|
||||||
|
hardware.BlockDevice('/dev/sdj', 'big', 1073741824, True),
|
||||||
|
hardware.BlockDevice('/dev/hdaa', 'small', 65535, False),
|
||||||
|
]
|
||||||
|
mock_status.return_value = "completed without error"
|
||||||
|
mock_execute.return_value = (['out', 'err'])
|
||||||
|
|
||||||
|
burnin.fio_disk(node)
|
||||||
|
|
||||||
|
expected_calls = [
|
||||||
|
mock.call('fio', '--rw', 'readwrite', '--bs', '4k', '--direct', 1,
|
||||||
|
'--ioengine', 'libaio', '--iodepth', '32', '--verify',
|
||||||
|
'crc32c', '--verify_dump', 1, '--continue_on_error',
|
||||||
|
'verify', '--loops', 4, '--runtime', 0, '--time_based',
|
||||||
|
'--name', '/dev/sdj', '--name', '/dev/hdaa'),
|
||||||
|
mock.call('smartctl', '-t', 'long', '/dev/sdj'),
|
||||||
|
mock.call('smartctl', '-t', 'long', '/dev/hdaa')
|
||||||
|
]
|
||||||
|
mock_execute.assert_has_calls(expected_calls)
|
||||||
|
|
||||||
@mock.patch.object(hardware, 'list_all_block_devices', autospec=True)
|
@mock.patch.object(hardware, 'list_all_block_devices', autospec=True)
|
||||||
def test_fio_disk_no_fio(self, mock_list, mock_execute):
|
def test_fio_disk_no_fio(self, mock_list, mock_execute):
|
||||||
|
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Add 'agent_burnin_fio_disk_smart_test' option in driver-info for disk
|
||||||
|
burn-in. If set to True, this option will launch a parallel SMART self
|
||||||
|
test on all devices after the disk burn-in and fail the disk burn-in
|
||||||
|
clean step if any of the tests fail.
|
Loading…
Reference in New Issue
Block a user