Insert artificial delay between sending virtual nodes on introspection

KVM PXE code seems to be broken in an interesting way when you try to PXE
boot too many nodes at once. This change makes inspector sleep for a
configurable amount of time between powering on nodes with a *_ssh driver.

The workaround in devstack/exercise.sh is no longer needed and is dropped.

Note that this change is not HA-aware (the time of the last introspection is
kept in process memory only), so we might revisit it in the future.

Change-Id: I9b16592f9b5130e90c02fce1b421887f451e397b
Closes-Bug: #1473024
This commit is contained in:
Dmitry Tantsur 2015-07-17 15:55:34 +02:00
parent d3bd8f41d7
commit f15aee4a7f
5 changed files with 110 additions and 6 deletions

View File

@ -36,10 +36,6 @@ done
for uuid in $nodes; do for uuid in $nodes; do
ironic node-set-provision-state $uuid inspect ironic node-set-provision-state $uuid inspect
# FIXME(dtantsur): virtual machines PXE often behaves weirdly when a lot of
# machines DHCP at the same time, inserting sleep helps. It does not affect
# bare metal environment AFAIK.
sleep 5
done done
current_nodes=$nodes current_nodes=$nodes

View File

@ -56,6 +56,13 @@
# The green thread pool size. (integer value) # The green thread pool size. (integer value)
#max_concurrency = 1000 #max_concurrency = 1000
# Delay (in seconds) between two introspections. (integer value)
#introspection_delay = 5
# Only node with drivers matching this regular expression will be
# affected by introspection_delay setting. (string value)
#introspection_delay_drivers = ^.*_ssh$
[database] [database]

View File

@ -210,7 +210,14 @@ SERVICE_OPTS = [
help='Path to SSL key'), help='Path to SSL key'),
cfg.IntOpt('max_concurrency', cfg.IntOpt('max_concurrency',
default=1000, default=1000,
help='The green thread pool size.') help='The green thread pool size.'),
cfg.IntOpt('introspection_delay',
default=5,
help='Delay (in seconds) between two introspections.'),
cfg.StrOpt('introspection_delay_drivers',
default='^.*_ssh$',
help='Only node with drivers matching this regular expression '
'will be affected by introspection_delay setting.'),
] ]

View File

@ -14,8 +14,11 @@
"""Handling introspection request.""" """Handling introspection request."""
import logging import logging
import re
import string import string
import time
from eventlet import semaphore
from ironicclient import exceptions from ironicclient import exceptions
from oslo_config import cfg from oslo_config import cfg
@ -31,6 +34,10 @@ LOG = logging.getLogger("ironic_inspector.introspect")
PASSWORD_ACCEPTED_CHARS = set(string.ascii_letters + string.digits) PASSWORD_ACCEPTED_CHARS = set(string.ascii_letters + string.digits)
PASSWORD_MAX_LENGTH = 20 # IPMI v2.0 PASSWORD_MAX_LENGTH = 20 # IPMI v2.0
_LAST_INTROSPECTION_TIME = 0
_LAST_INTROSPECTION_LOCK = semaphore.BoundedSemaphore()
_LAST_INTROSPECTION_RE = re.compile(CONF.introspection_delay_drivers)
def _validate_ipmi_credentials(node, new_ipmi_credentials): def _validate_ipmi_credentials(node, new_ipmi_credentials):
if not CONF.processing.enable_setting_ipmi_credentials: if not CONF.processing.enable_setting_ipmi_credentials:
@ -112,6 +119,8 @@ def introspect(uuid, new_ipmi_credentials=None):
def _background_introspect(ironic, node_info): def _background_introspect(ironic, node_info):
global _LAST_INTROSPECTION_TIME
# TODO(dtantsur): pagination # TODO(dtantsur): pagination
macs = list(node_info.ports(ironic)) macs = list(node_info.ports(ironic))
if macs: if macs:
@ -130,6 +139,17 @@ def _background_introspect(ironic, node_info):
' node %(node)s: %(exc)s') % ' node %(node)s: %(exc)s') %
{'node': node_info.uuid, 'exc': exc}) {'node': node_info.uuid, 'exc': exc})
if _LAST_INTROSPECTION_RE.match(node_info.node().driver):
LOG.debug('Attempting to acquire lock on last introspection time')
with _LAST_INTROSPECTION_LOCK:
delay = (_LAST_INTROSPECTION_TIME - time.time()
+ CONF.introspection_delay)
if delay > 0:
LOG.debug('Waiting %d seconds before sending the next '
'node on introspection', delay)
time.sleep(delay)
_LAST_INTROSPECTION_TIME = time.time()
try: try:
utils.retry_on_conflict(ironic.node.set_power_state, utils.retry_on_conflict(ironic.node.set_power_state,
node_info.uuid, 'reboot') node_info.uuid, 'reboot')

View File

@ -12,6 +12,7 @@
# limitations under the License. # limitations under the License.
import collections import collections
import time
import eventlet import eventlet
from ironicclient import exceptions from ironicclient import exceptions
@ -30,6 +31,7 @@ CONF = cfg.CONF
class BaseTest(test_base.NodeTest): class BaseTest(test_base.NodeTest):
def setUp(self): def setUp(self):
super(BaseTest, self).setUp() super(BaseTest, self).setUp()
introspect._LAST_INTROSPECTION_TIME = 0
self.node.power_state = 'power off' self.node.power_state = 'power off'
self.node_compat = mock.Mock(driver='pxe_ssh', self.node_compat = mock.Mock(driver='pxe_ssh',
uuid='uuid_compat', uuid='uuid_compat',
@ -43,6 +45,7 @@ class BaseTest(test_base.NodeTest):
for p in self.ports) for p in self.ports)
self.node_info = mock.Mock(uuid=self.uuid, options={}) self.node_info = mock.Mock(uuid=self.uuid, options={})
self.node_info.ports.return_value = self.ports_dict self.node_info.ports.return_value = self.ports_dict
self.node_info.node.return_value = self.node
def _prepare(self, client_mock): def _prepare(self, client_mock):
cli = client_mock.return_value cli = client_mock.return_value
@ -157,7 +160,8 @@ class TestIntrospect(BaseTest):
cli.node.get.return_value = self.node_compat cli.node.get.return_value = self.node_compat
cli.node.validate.return_value = mock.Mock(power={'result': True}) cli.node.validate.return_value = mock.Mock(power={'result': True})
add_mock.return_value = mock.Mock(uuid=self.node_compat.uuid, add_mock.return_value = mock.Mock(uuid=self.node_compat.uuid,
options={}) options={},
**{'node.return_value': self.node})
add_mock.return_value.ports.return_value = collections.OrderedDict( add_mock.return_value.ports.return_value = collections.OrderedDict(
(p.address, p) for p in self.ports) (p.address, p) for p in self.ports)
@ -247,6 +251,76 @@ class TestIntrospect(BaseTest):
self.assertEqual(0, cli.node.set_power_state.call_count) self.assertEqual(0, cli.node.set_power_state.call_count)
self.assertFalse(add_mock.called) self.assertFalse(add_mock.called)
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_sleep_no_pxe_ssh(self, time_mock, sleep_mock, client_mock,
add_mock, filters_mock):
self.node.driver = 'pxe_ipmitool'
time_mock.return_value = 42
introspect._LAST_INTROSPECTION_TIME = 40
CONF.set_override('introspection_delay', 10)
cli = self._prepare(client_mock)
add_mock.return_value = self.node_info
introspect.introspect(self.uuid)
self.assertFalse(sleep_mock.called)
cli.node.set_boot_device.assert_called_once_with(self.uuid,
'pxe',
persistent=False)
cli.node.set_power_state.assert_called_once_with(self.uuid,
'reboot')
# not changed
self.assertEqual(40, introspect._LAST_INTROSPECTION_TIME)
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_sleep_with_pxe_ssh(self, time_mock, sleep_mock, client_mock,
add_mock, filters_mock):
self.node.driver = 'pxe_ssh'
time_mock.return_value = 42
introspect._LAST_INTROSPECTION_TIME = 40
CONF.set_override('introspection_delay', 10)
cli = self._prepare(client_mock)
add_mock.return_value = self.node_info
introspect.introspect(self.uuid)
sleep_mock.assert_called_once_with(8)
cli.node.set_boot_device.assert_called_once_with(self.uuid,
'pxe',
persistent=False)
cli.node.set_power_state.assert_called_once_with(self.uuid,
'reboot')
# updated to the current time.time()
self.assertEqual(42, introspect._LAST_INTROSPECTION_TIME)
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_sleep_not_needed_with_pxe_ssh(self, time_mock, sleep_mock,
client_mock, add_mock,
filters_mock):
self.node.driver = 'agent_ssh'
time_mock.return_value = 100
introspect._LAST_INTROSPECTION_TIME = 40
CONF.set_override('introspection_delay', 10)
cli = self._prepare(client_mock)
add_mock.return_value = self.node_info
introspect.introspect(self.uuid)
self.assertFalse(sleep_mock.called)
cli.node.set_boot_device.assert_called_once_with(self.uuid,
'pxe',
persistent=False)
cli.node.set_power_state.assert_called_once_with(self.uuid,
'reboot')
# updated to the current time.time()
self.assertEqual(100, introspect._LAST_INTROSPECTION_TIME)
@mock.patch.object(utils, 'spawn_n', @mock.patch.object(utils, 'spawn_n',
lambda f, *a, **kw: f(*a, **kw) and None) lambda f, *a, **kw: f(*a, **kw) and None)