Add service version check workaround for FFU

We recently added a hard failure to nova service startup for the case
where computes were more than one version old (as indicated by their
service record). This helps to prevent starting up new control
services when a very old compute is still running. However, during a
Fast-Forward Upgrade (FFU), control services that have skipped multiple
versions will be started, find the older compute service records (which
could not be updated yet because the computes rely on the control
services being up), and refuse to start. This creates a
cross-dependency which is not resolvable without hacking the database.

This patch adds a workaround flag that turns that hard failure into a
warning, so that service startup can proceed past the issue. This
less-than-ideal solution is simple and backportable, but perhaps a
better solution can be implemented in the future.

Related-Bug: #1958883

Change-Id: Iddbc9b2a13f19cea9a996aeadfe891f4ef3b0264
This commit is contained in:
Dan Smith 2022-01-21 12:51:35 -08:00
parent 52b974acb7
commit 7d2e481589
5 changed files with 64 additions and 2 deletions

View File

@ -47,7 +47,13 @@ def _get_config_files(env=None):
def _setup_service(host, name):
utils.raise_if_old_compute()
try:
utils.raise_if_old_compute()
except exception.TooOldComputeService as e:
if CONF.workarounds.disable_compute_service_check_for_ffu:
LOG.warning(str(e))
else:
raise
binary = name if name.startswith('nova-') else "nova-%s" % name

View File

@ -369,6 +369,16 @@ to update network switches in the post live migration phase on the destination.
Related options:
* :oslo.config:option:`DEFAULT.compute_driver` (libvirt)
"""),
cfg.BoolOpt('disable_compute_service_check_for_ffu',
default=False,
help="""
If this is set, the normal safety check for old compute services will be
treated as a warning instead of an error. This is only to be enabled to
facilitate a Fast-Forward upgrade where new control services are being started
before compute nodes have been able to update their service record. In an FFU,
the service records in the database will be more than one version old until
the compute nodes start up, but control services need to be online first.
"""),
]

View File

@ -261,7 +261,13 @@ class Service(service.Service):
# up before it allows the service to be created. The
# raise_if_old_compute() depends on the RPC to be up and does not
# implement its own retry mechanism to connect to the conductor.
utils.raise_if_old_compute()
try:
utils.raise_if_old_compute()
except exception.TooOldComputeService as e:
if CONF.workarounds.disable_compute_service_check_for_ffu:
LOG.warning(str(e))
else:
raise
return service_obj

View File

@ -18,6 +18,7 @@ from oslo_config import fixture as config_fixture
from oslotest import base
from nova.api.openstack import wsgi_app
from nova import exception
from nova import test
from nova.tests import fixtures as nova_fixtures
@ -87,3 +88,19 @@ document_root = /tmp
wsgi_app.init_application('nova-api')
self.assertIn('Global data already initialized, not re-initializing.',
self.stdlog.logger.output)
@mock.patch('nova.objects.Service.get_by_host_and_binary')
@mock.patch('nova.utils.raise_if_old_compute')
def test_setup_service_version_workaround(self, mock_check_old, mock_get):
    """Check _setup_service with and without the FFU workaround flag.

    With raise_if_old_compute mocked to raise TooOldComputeService,
    _setup_service must propagate the exception by default and must
    return without raising once
    [workarounds]disable_compute_service_check_for_ffu is overridden
    to True.
    """
    # mock_get only replaces Service.get_by_host_and_binary; it is not
    # inspected by this test.
    too_old = exception.TooOldComputeService(
        oldest_supported_version='2',
        scope='scope',
        min_service_level=2,
        oldest_supported_service=1)
    mock_check_old.side_effect = too_old

    # Default configuration: the version check failure is fatal.
    with self.assertRaises(exception.TooOldComputeService):
        wsgi_app._setup_service('myhost', 'api')

    # Workaround enabled: the same failure no longer propagates.
    wsgi_app.CONF.set_override(
        'disable_compute_service_check_for_ffu', True,
        group='workarounds')
    wsgi_app._setup_service('myhost', 'api')

View File

@ -287,6 +287,29 @@ class ServiceTestCase(test.NoDBTestCase):
mock_check_old.assert_called_once_with()
mock_wait.assert_called_once_with(mock.ANY)
@mock.patch('nova.utils.raise_if_old_compute')
def test_old_compute_version_check_workaround(
        self, mock_check_old):
    """Check Service.create with and without the FFU workaround flag.

    With raise_if_old_compute mocked to raise TooOldComputeService,
    Service.create must propagate the exception by default and must
    succeed once [workarounds]disable_compute_service_check_for_ffu is
    overridden to True. The version check is expected to have been
    invoked for both attempts.
    """
    mock_check_old.side_effect = exception.TooOldComputeService(
        oldest_supported_version='2',
        scope='scope',
        min_service_level=2,
        oldest_supported_service=1)
    # Both create() attempts use exactly the same positional arguments.
    create_args = (
        self.host,
        'nova-conductor',
        self.topic,
        'nova.tests.unit.test_service.FakeManager',
    )

    # Default configuration: the version check failure is fatal.
    with self.assertRaises(exception.TooOldComputeService):
        service.Service.create(*create_args)

    # Workaround enabled: the same failure no longer propagates.
    CONF.set_override('disable_compute_service_check_for_ffu', True,
                      group='workarounds')
    service.Service.create(*create_args)

    mock_check_old.assert_has_calls([mock.call(), mock.call()])
class TestWSGIService(test.NoDBTestCase):