diff --git a/fuel_agent/fuel_agent/errors.py b/fuel_agent/fuel_agent/errors.py index 05ce7d751d..b531ba6311 100644 --- a/fuel_agent/fuel_agent/errors.py +++ b/fuel_agent/fuel_agent/errors.py @@ -87,6 +87,10 @@ class MDWrongSpecError(BaseError): pass +class MDRemovingError(BaseError): + pass + + class WrongConfigDriveDataError(BaseError): pass diff --git a/fuel_agent/fuel_agent/tests/test_md_utils.py b/fuel_agent/fuel_agent/tests/test_md_utils.py index 04742de358..35a70e4cbd 100644 --- a/fuel_agent/fuel_agent/tests/test_md_utils.py +++ b/fuel_agent/fuel_agent/tests/test_md_utils.py @@ -119,6 +119,10 @@ localhost.localdomain) {'device': '/dev/fake2'}] mu.mdcreate('/dev/md0', 'mirror', '/dev/fake1', '/dev/fake2') + mock_mdclean_expected_calls = [mock.call('/dev/fake1'), + mock.call('/dev/fake2')] + self.assertEqual(mock_mdclean_expected_calls, + mock_mdclean.call_args_list) mock_exec.assert_called_once_with( 'mdadm', '--create', '--force', '/dev/md0', '-e0.90', '--level=mirror', @@ -182,6 +186,32 @@ localhost.localdomain) expected_calls = [mock.call('/dev/fake1'), mock.call('/dev/fake2')] self.assertEqual(mock_mdclean.call_args_list, expected_calls) + @mock.patch.object(mu, 'mdclean') + @mock.patch.object(mu, 'mdremove') + @mock.patch.object(mu, 'mddisplay') + def test_mdclean_all(self, mock_mddisplay, mock_mdremove, mock_mdclean): + mock_mddisplay.side_effect = [ + [{'name': '/dev/md10', 'devices': ['/dev/fake10']}, + {'name': '/dev/md11'}], + [{'name': '/dev/md11'}], + [] + ] + mu.mdclean_all() + mock_mdremove_expected_calls = [ + mock.call('/dev/md10'), mock.call('/dev/md11'), + mock.call('/dev/md11')] + mock_mdclean.assert_called_once_with('/dev/fake10') + self.assertEqual(mock_mdremove.call_args_list, + mock_mdremove_expected_calls) + + @mock.patch.object(mu, 'mdclean') + @mock.patch.object(mu, 'mdremove') + @mock.patch.object(mu, 'mddisplay') + def test_mdclean_all_fail(self, mock_mddisplay, mock_mdremove, + mock_mdclean): + mock_mddisplay.return_value = 
[{'name': '/dev/md11'}] + self.assertRaises(errors.MDRemovingError, mu.mdclean_all) + @mock.patch.object(utils, 'execute') @mock.patch.object(mu, 'get_mdnames') def test_mdremove_ok(self, mock_get_mdn, mock_exec): @@ -189,6 +219,7 @@ localhost.localdomain) # should run mdadm command to remove md device mock_get_mdn.return_value = ['/dev/md0'] expected_calls = [ + mock.call('udevadm', 'settle', '--quiet', check_exit_code=[0]), mock.call('mdadm', '--stop', '/dev/md0', check_exit_code=[0]), mock.call('mdadm', '--remove', '/dev/md0', check_exit_code=[0, 1]) ] diff --git a/fuel_agent/fuel_agent/utils/md_utils.py b/fuel_agent/fuel_agent/utils/md_utils.py index cf058d83a0..0b4aef825e 100644 --- a/fuel_agent/fuel_agent/utils/md_utils.py +++ b/fuel_agent/fuel_agent/utils/md_utils.py @@ -100,7 +100,8 @@ def mdcreate(mdname, level, device, *args): 'Error while creating md: at least one of devices is ' 'already in belongs to some md') - # cleaning md metadata from devices + #FIXME: mdadm will ask user to continue creating if any device appears to + # be a part of raid array. Superblock zeroing helps to avoid that. map(mdclean, devices) utils.execute('mdadm', '--create', '--force', mdname, '-e0.90', '--level=%s' % level, @@ -113,6 +114,31 @@ def mdremove(mdname): if mdname not in get_mdnames(): raise errors.MDNotFoundError( 'Error while removing md: md %s not found' % mdname) + #FIXME: The issue faced was quite hard to reproduce and to figure out the + # root cause. For unknown reason already removed md device is + # unexpectedly returning back after a while from time to time making + # new md device creation to fail. + # Still the actual reason of its failure is unknown, but after + # searching the web a mention was found about a race in udev + # http://dev.bizo.com/2012/07/mdadm-device-or-resource-busy.html + # The article recommends to disable udev's queue entirely during md + # device manipulation which sounds rather inappropriate for our case.
+ # And the link to original post on mailing list suggests to execute + # `udevadm settle` before removing the md device. + # here -> http://permalink.gmane.org/gmane.linux.raid/34027 + # So, what was done. `udevadm settle` calls were placed just + # before any of `mdadm` calls and analysis of the logs was started. + # According to the manual `settle` is an option that "Watches the + # udev event queue, and exits if all current events are handled". + # That means it will wait for udev's finishing of processing the + # events. According to the logs noticeable delay had been recognized + # between `udevadm settle` and the next `mdadm` call. + # The delay was about 150-200ms or even bigger. It appeared + # right before the `mdadm --stop` call. That just means that udev was + # too busy with events when we start to modify md devices hard. + # Thus `udevadm settle` is helping to avoid the later failure and + # to prevent strange behaviour of md device. + utils.execute('udevadm', 'settle', '--quiet', check_exit_code=[0]) utils.execute('mdadm', '--stop', mdname, check_exit_code=[0]) utils.execute('mdadm', '--remove', mdname, check_exit_code=[0, 1]) @@ -129,3 +155,10 @@ def mdclean_all(): mdremove(md['name']) for dev in md.get('devices', []): mdclean(dev) + # second attempt, remove stale inactive devices + for md in mddisplay(): + mdremove(md['name']) + mds = mddisplay() + if len(mds) > 0: + raise errors.MDRemovingError( + 'Error while removing mds: few devices still presented %s' % mds)