Fix inactive session error in compute node creation

In the fix for bug 1839560 [1][2], soft-deleted compute nodes may be
restored, to ensure we can reuse ironic node UUIDs as compute node
UUIDs. While this seems to largely work, it results in some nasty errors
being generated [3]:

    InvalidRequestError This session is in 'inactive' state, due to the
    SQL transaction being rolled back; no further SQL can be emitted
    within this transaction.

This happens because compute_node_create is decorated with
pick_context_manager_writer, which begins a transaction. While
_compute_node_get_and_update_deleted claims that calling a second
pick_context_manager_writer decorated function will begin a new
subtransaction, this does not appear to be the case.

This change removes pick_context_manager_writer from the
compute_node_create function, and adds a new _compute_node_create
function which ensures the transaction is finished if
_compute_node_get_and_update_deleted is called.

The new unit test added here fails without this change.

This change marks the removal of the final FIXME from the functional
test added in [4].

[1] https://bugs.launchpad.net/nova/+bug/1839560
[2] https://git.openstack.org/cgit/openstack/nova/commit/?id=89dd74ac7f1028daadf86cb18948e27fe9d1d411
[3] http://paste.openstack.org/show/786350/
[4] https://review.opendev.org/#/c/695012/

Conflicts:
    nova/db/sqlalchemy/api.py

NOTE(melwitt): The conflict is because change
I9f414cf831316b624132d9e06192f1ecbbd3dd78 (db: Copy docs from
'nova.db.*' to 'nova.db.sqlalchemy.*') is not in Wallaby.

NOTE(melwitt): Difference from the cherry picked change from calling
nova.db.api => nova.db.sqlalchemy.api directly are due to the alembic
migration in Xena which looks to have made the nova.db.api interface
obsolete.

Change-Id: Iae119ea8776bc7f2e5dbe2e502a743217beded73
Closes-Bug: #1853159
Related-Bug: #1853009
(cherry picked from commit 2383cbb4a5)
This commit is contained in:
Mark Goddard 2019-11-20 12:01:33 +00:00 committed by melanie witt
parent cbbca58504
commit 665c053315
3 changed files with 31 additions and 35 deletions

View File

@ -688,7 +688,7 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):
@pick_context_manager_writer
def compute_node_create(context, values):
def _compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data.
"""
@ -696,8 +696,21 @@ def compute_node_create(context, values):
compute_node_ref = models.ComputeNode()
compute_node_ref.update(values)
compute_node_ref.save(context.session)
return compute_node_ref
# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
# so that we get a separate transaction in the exception handler. This avoids
# an error message about inactive DB sessions during a transaction rollback.
# See https://bugs.launchpad.net/nova/+bug/1853159.
def compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data. Will restore a soft deleted compute node if a
UUID has been explicitly requested.
"""
try:
compute_node_ref.save(context.session)
compute_node_ref = _compute_node_create(context, values)
except db_exc.DBDuplicateEntry:
with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
# Check to see if we have a (soft) deleted ComputeNode with the

View File

@ -85,10 +85,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
# host_b[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
# FIXME(mgoddard): This shows a traceback with SQL rollback due to
# soft-deleted node. The create seems to succeed but breaks the RT
# update for this node. See
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# There should only be one resource provider (fake-node).
@ -164,41 +160,12 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.ctxt, cn, cascade=True)
# host_b[3]: Should recreate compute node and resource provider.
# FIXME(mgoddard): Resource provider not recreated here, due to
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node was recreated.
self._assert_hypervisor_api(self.nodename, 'host_b')
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
# node is not cached in the RT.
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is no RP.
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
# But the RP exists in the provider tree.
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
# host_b[1]: Should add compute node to RT cache and recreate resource
# provider.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node still exists.
self._assert_hypervisor_api(self.nodename, 'host_b')
# And it is now in the RT cache.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])
# This fails due to the lack of a resource provider.
self.assertIn(
'Skipping removal of allocations for deleted instances',
self.stdlog.logger.output)

View File

@ -5257,6 +5257,22 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
self.assertRaises(db_exc.DBDuplicateEntry,
db.compute_node_create, self.ctxt, other_node)
def test_compute_node_create_duplicate_uuid(self):
"""Tests to make sure that no exception is raised when trying to create
a compute node with the same host, hypervisor_hostname and uuid values
as another compute node that was previously soft-deleted.
"""
# Prior to fixing https://bugs.launchpad.net/nova/+bug/1853159, this
# raised the following error:
# sqlalchemy.exc.InvalidRequestError: This session is in 'inactive'
# state, due to the SQL transaction being rolled back; no further SQL
# can be emitted within this transaction.
constraint = db.constraint(host=db.equal_any(self.item['host']))
sqlalchemy_api.compute_node_delete(
self.ctxt, self.item['id'], constraint=constraint)
new_node = db.compute_node_create(self.ctxt, self.compute_node_dict)
self.assertEqual(self.item['uuid'], new_node['uuid'])
def test_compute_node_get_all(self):
nodes = db.compute_node_get_all(self.ctxt)
self.assertEqual(1, len(nodes))