Wait for at least one suitable disk to appear on start up

Some kernel modules take substantial time to initialize. For example,
with the mpt2sas RAID driver, inspection and deployment randomly fail
because IPA starts before the driver finishes initialization.

This problem is probably impossible to solve in the generic case, as
modern Linux environments do not have a notion of a "hardware is fully
initialized" moment. All hardware is essentially hotplug.

To solve it at least for the simplest case, this patch adds a loop on
start up that waits for at least one suitable disk to appear in the
inventory. Note that root device hints are not considered, as the node
might not be known at that moment yet.

Change-Id: Id163ca28f7c140c302ea04947ded3f3c58b284de
Partial-Bug: #1582797
This commit is contained in:
Dmitry Tantsur 2016-05-24 10:04:12 +02:00
parent 015fad6054
commit c15ed6a48e
4 changed files with 71 additions and 1 deletions

View File

@ -16,6 +16,7 @@ import abc
import functools
import os
import shlex
import time
import netifaces
from oslo_concurrency import processutils
@ -38,6 +39,9 @@ UNIT_CONVERTER = pint.UnitRegistry(filename=None)
UNIT_CONVERTER.define('MB = []')
UNIT_CONVERTER.define('GB = 1024 MB')
# Number of times start-up polls for a suitable disk before giving up.
_DISK_WAIT_ATTEMPTS = 10
# Pause between two consecutive polls, in seconds (passed to time.sleep).
_DISK_WAIT_DELAY = 3
def _get_device_vendor(dev):
"""Get the vendor name of a given device."""
@ -394,8 +398,27 @@ class GenericHardwareManager(HardwareManager):
self.sys_path = '/sys'
def evaluate_hardware_support(self):
    """Report generic hardware support once start-up waiting is done.

    The disk wait runs first so that this manager is not declared ready
    before ``_wait_for_disks`` has returned.

    :returns: ``HardwareSupport.GENERIC``.
    """
    self._wait_for_disks()
    return HardwareSupport.GENERIC
def _wait_for_disks(self):
    """Poll until at least one suitable disk shows up, or give up.

    Without a usable disk neither inspection nor deployment has any
    chance to succeed, so block devices are listed and handed to
    ``utils.guess_root_disk`` up to ``_DISK_WAIT_ATTEMPTS`` times,
    sleeping ``_DISK_WAIT_DELAY`` seconds after every attempt that ends
    in ``errors.DeviceNotFound``. Running out of attempts is only logged
    as a warning; nothing is raised to the caller.
    """
    attempts_made = 0
    while attempts_made < _DISK_WAIT_ATTEMPTS:
        attempts_made += 1
        try:
            utils.guess_root_disk(self.list_block_devices())
        except errors.DeviceNotFound:
            LOG.debug('Still waiting for at least one disk to appear, '
                      'attempt %d of %d', attempts_made, _DISK_WAIT_ATTEMPTS)
            time.sleep(_DISK_WAIT_DELAY)
            continue
        # A disk was found: nothing more to wait for.
        return

    # Every attempt failed; report the total time spent waiting.
    LOG.warning('No disks detected in %d seconds',
                _DISK_WAIT_DELAY * _DISK_WAIT_ATTEMPTS)
def _get_interface_info(self, interface_name):
addr_path = '{0}/class/net/{1}/address'.format(self.sys_path,
interface_name)

View File

@ -127,6 +127,8 @@ class TestHeartbeater(test_base.BaseTestCase):
self.assertEqual(2.7, self.heartbeater.error_delay)
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
lambda self: None)
class TestBaseAgent(test_base.BaseTestCase):
def setUp(self):
@ -294,6 +296,8 @@ class TestBaseAgent(test_base.BaseTestCase):
self.agent.get_node_uuid)
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
lambda self: None)
class TestAgentStandalone(test_base.BaseTestCase):
def setUp(self):
@ -338,6 +342,8 @@ class TestAgentStandalone(test_base.BaseTestCase):
self.assertFalse(self.agent.api_client.lookup_node.called)
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
lambda self: None)
@mock.patch.object(socket, 'gethostbyname', autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)
class TestAdvertiseAddress(test_base.BaseTestCase):

View File

@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import mock
import netifaces
import os
from oslo_concurrency import processutils
from oslo_utils import units
from oslotest import base as test_base
@ -1084,6 +1086,40 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
self.assertEqual('NEC',
self.hardware.get_system_vendor_info().manufacturer)
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
                   autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_waits_for_disks(self, mocked_root_dev, mocked_sleep,
                                     mocked_block_dev):
    """A single failed probe leads to one sleep and one retry."""
    # The first probe finds no disk, the second one succeeds.
    not_found = errors.DeviceNotFound('boom')
    mocked_root_dev.side_effect = [not_found, None]

    support = self.hardware.evaluate_hardware_support()

    mocked_sleep.assert_called_once_with(hardware._DISK_WAIT_DELAY)
    self.assertEqual(2, mocked_root_dev.call_count)
    mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
    self.assertEqual(hardware.HardwareSupport.GENERIC, support)
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
                   autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_disks_timeout(self, mocked_root_dev, mocked_sleep,
                                   mocked_block_dev):
    """Generic support is still reported when no disk ever appears."""
    # Every probe fails, so all attempts are used up.
    mocked_root_dev.side_effect = errors.DeviceNotFound('boom')

    support = self.hardware.evaluate_hardware_support()

    mocked_sleep.assert_called_with(hardware._DISK_WAIT_DELAY)
    self.assertEqual(hardware._DISK_WAIT_ATTEMPTS,
                     mocked_root_dev.call_count)
    mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
    self.assertEqual(hardware.HardwareSupport.GENERIC, support)
@mock.patch.object(utils, 'execute', autospec=True)
class TestModuleFunctions(test_base.BaseTestCase):

View File

@ -0,0 +1,5 @@
---
fixes:
- On start up wait up to 30 seconds for the first disk device suitable for
deployment to appear. This is to fix both inspection and deployment on
hardware that takes long to initialize (e.g. some RAID devices).