Add support for DBDeadlock handling

Wrap all DB write operations with the oslo_db retry-on-deadlock decorator.
The DBDeadlock exception can be raised quite often on Galera clusters under
load. This also fixes an issue that caused the conductor's periodic tasks to
stop executing.

Closes-Bug: #1637210
Closes-Bug: #1639338
Co-Authored-By: Vladyslav Drok <vdrok@mirantis.com>
Co-Authored-By: Joanna Taryma <joanna.taryma@intel.com>
Change-Id: I61db83637adfd98a5394d1f570f3de4302c93497
parent 5071b99835 · commit 3428cb74f0
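For context, the behaviour this change relies on: oslo_db's retry_on_deadlock decorator re-invokes the wrapped callable when a DBDeadlock is raised, up to the configured retry limit. A minimal standalone sketch, illustrative only and not part of the change (assumes oslo_db is installed):

    from oslo_db import api as oslo_db_api
    from oslo_db import exception as db_exc

    ATTEMPTS = []

    @oslo_db_api.retry_on_deadlock
    def flaky_write():
        # Simulate a Galera-style deadlock on the first attempt only.
        ATTEMPTS.append(1)
        if len(ATTEMPTS) == 1:
            raise db_exc.DBDeadlock()
        return 'written'

    print(flaky_write())   # oslo_db retries transparently and returns 'written'
    print(len(ATTEMPTS))   # 2 -- one failed attempt plus one successful retry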
@@ -1208,7 +1208,7 @@
 # Maximum retries in case of connection error or deadlock
 # error before error is raised. Set to -1 to specify an
 # infinite retry count. (integer value)
-#db_max_retries = 20
+#db_max_retries = 5


 [deploy]
@@ -26,3 +26,6 @@ opts = [

 def register_opts(conf):
     conf.register_opts(opts, group='database')
+    # Change the oslo_db side default to 5
+    conf.import_opt('db_max_retries', 'ironic.db.api', group='database')
+    conf.set_default('db_max_retries', 5, group='database')
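Operators who need a different retry budget can still override the new default in ironic.conf; the value below is illustrative only:

    [database]
    # Maximum retries on deadlock/connection errors before the error is raised
    # (this change sets the default to 5; -1 means retry forever).
    db_max_retries = 10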
@@ -20,6 +20,7 @@ import collections
 import datetime
 import threading

+from oslo_db import api as oslo_db_api
 from oslo_db import exception as db_exc
 from oslo_db.sqlalchemy import enginefacade
 from oslo_db.sqlalchemy import utils as db_utils
@@ -55,6 +56,9 @@ def _session_for_read():
     return enginefacade.reader.using(_CONTEXT)


+# Please add @oslo_db_api.retry_on_deadlock decorator to all methods using
+# _session_for_write (as deadlocks happen on write), so that oslo_db is able
+# to retry in case of deadlocks.
 def _session_for_write():
     return enginefacade.writer.using(_CONTEXT)

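The comment above states a convention rather than enforcing it mechanically; a brand-new write method on Connection would be expected to follow the pattern sketched below. The method and models.Widget are hypothetical, shown only for illustration, and the test_retry_on_deadlock unit test added later in this change flags any write method that omits the decorator.

    @oslo_db_api.retry_on_deadlock
    def update_widget(self, widget_id, values):
        # Hypothetical method: anything that opens a write session carries the
        # retry decorator so oslo_db can re-run it after a DBDeadlock.
        with _session_for_write():
            query = model_query(models.Widget).filter_by(id=widget_id)
            query.update(values)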
@@ -258,6 +262,7 @@ class Connection(api.Connection):
         return _paginate_query(models.Node, limit, marker,
                                sort_key, sort_dir, query)

+    @oslo_db_api.retry_on_deadlock
     def reserve_node(self, tag, node_id):
         with _session_for_write():
             query = _get_node_query_with_tags()
@@ -276,6 +281,7 @@
             except NoResultFound:
                 raise exception.NodeNotFound(node_id)

+    @oslo_db_api.retry_on_deadlock
     def release_node(self, tag, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -294,6 +300,7 @@
             except NoResultFound:
                 raise exception.NodeNotFound(node_id)

+    @oslo_db_api.retry_on_deadlock
     def create_node(self, values):
         # ensure defaults are present for new nodes
         if 'uuid' not in values:
@@ -364,6 +371,7 @@

         return result

+    @oslo_db_api.retry_on_deadlock
     def destroy_node(self, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -422,6 +430,7 @@
             else:
                 raise

+    @oslo_db_api.retry_on_deadlock
     def _do_update_node(self, node_id, values):
         with _session_for_write():
             query = model_query(models.Node)
@@ -492,6 +501,7 @@
         return _paginate_query(models.Port, limit, marker,
                                sort_key, sort_dir, query)

+    @oslo_db_api.retry_on_deadlock
     def create_port(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -508,6 +518,7 @@
                 raise exception.PortAlreadyExists(uuid=values['uuid'])
         return port

+    @oslo_db_api.retry_on_deadlock
     def update_port(self, port_id, values):
         # NOTE(dtantsur): this can lead to very strange errors
         if 'uuid' in values:
@@ -527,6 +538,7 @@
                 raise exception.MACAlreadyExists(mac=values['address'])
         return ref

+    @oslo_db_api.retry_on_deadlock
     def destroy_port(self, port_id):
         with _session_for_write():
             query = model_query(models.Port)
@@ -575,6 +587,7 @@
         return _paginate_query(models.Portgroup, limit, marker,
                                sort_key, sort_dir, query)

+    @oslo_db_api.retry_on_deadlock
     def create_portgroup(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -596,6 +609,7 @@
                 raise exception.PortgroupAlreadyExists(uuid=values['uuid'])
         return portgroup

+    @oslo_db_api.retry_on_deadlock
     def update_portgroup(self, portgroup_id, values):
         if 'uuid' in values:
             msg = _("Cannot overwrite UUID for an existing portgroup.")
@@ -620,6 +634,7 @@
                 raise
         return ref

+    @oslo_db_api.retry_on_deadlock
     def destroy_portgroup(self, portgroup_id):
         def portgroup_not_empty(session):
             """Checks whether the portgroup does not have ports."""
@@ -659,6 +674,7 @@
         return _paginate_query(models.Chassis, limit, marker,
                                sort_key, sort_dir)

+    @oslo_db_api.retry_on_deadlock
     def create_chassis(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -673,6 +689,7 @@
                 raise exception.ChassisAlreadyExists(uuid=values['uuid'])
         return chassis

+    @oslo_db_api.retry_on_deadlock
     def update_chassis(self, chassis_id, values):
         # NOTE(dtantsur): this can lead to very strange errors
         if 'uuid' in values:
@@ -689,6 +706,7 @@
             ref = query.one()
         return ref

+    @oslo_db_api.retry_on_deadlock
     def destroy_chassis(self, chassis_id):
         def chassis_not_empty():
             """Checks whether the chassis does not have nodes."""
@@ -709,6 +727,7 @@
             if count != 1:
                 raise exception.ChassisNotFound(chassis=chassis_id)

+    @oslo_db_api.retry_on_deadlock
     def register_conductor(self, values, update_existing=False):
         with _session_for_write() as session:
             query = (model_query(models.Conductor)
@@ -736,6 +755,7 @@
         except NoResultFound:
             raise exception.ConductorNotFound(conductor=hostname)

+    @oslo_db_api.retry_on_deadlock
     def unregister_conductor(self, hostname):
         with _session_for_write():
             query = (model_query(models.Conductor)
@@ -744,6 +764,7 @@
             if count == 0:
                 raise exception.ConductorNotFound(conductor=hostname)

+    @oslo_db_api.retry_on_deadlock
     def touch_conductor(self, hostname):
         with _session_for_write():
             query = (model_query(models.Conductor)
@@ -755,6 +776,7 @@
             if count == 0:
                 raise exception.ConductorNotFound(conductor=hostname)

+    @oslo_db_api.retry_on_deadlock
     def clear_node_reservations_for_conductor(self, hostname):
         nodes = []
         with _session_for_write():
@@ -769,6 +791,7 @@
                 _LW('Cleared reservations held by %(hostname)s: '
                     '%(nodes)s'), {'hostname': hostname, 'nodes': nodes})

+    @oslo_db_api.retry_on_deadlock
     def clear_node_target_power_state(self, hostname):
         nodes = []
         with _session_for_write():
@@ -831,6 +854,7 @@
         query = _filter_active_conductors(query)
         return query.all()

+    @oslo_db_api.retry_on_deadlock
     def register_conductor_hardware_interfaces(self, conductor_id,
                                                hardware_type, interface_type,
                                                interfaces, default_interface):
@@ -852,12 +876,14 @@
                 interface_type=interface_type,
                 interfaces=interfaces)

+    @oslo_db_api.retry_on_deadlock
     def unregister_conductor_hardware_interfaces(self, conductor_id):
         with _session_for_write():
             query = (model_query(models.ConductorHardwareInterfaces)
                      .filter_by(conductor_id=conductor_id))
             query.delete()

+    @oslo_db_api.retry_on_deadlock
     def touch_node_provisioning(self, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -870,6 +896,7 @@
         if not model_query(models.Node).filter_by(id=node_id).scalar():
             raise exception.NodeNotFound(node=node_id)

+    @oslo_db_api.retry_on_deadlock
     def set_node_tags(self, node_id, tags):
         # remove duplicate tags
         tags = set(tags)
@@ -883,6 +910,7 @@

         return node_tags

+    @oslo_db_api.retry_on_deadlock
     def unset_node_tags(self, node_id):
         self._check_node_exists(node_id)
         with _session_for_write():
@@ -895,6 +923,7 @@
                   .all())
         return result

+    @oslo_db_api.retry_on_deadlock
     def add_node_tag(self, node_id, tag):
         node_tag = models.NodeTag(tag=tag, node_id=node_id)

@@ -909,6 +938,7 @@

         return node_tag

+    @oslo_db_api.retry_on_deadlock
     def delete_node_tag(self, node_id, tag):
         self._check_node_exists(node_id)
         with _session_for_write():
@@ -964,6 +994,7 @@
         return _paginate_query(models.VolumeConnector, limit, marker,
                                sort_key, sort_dir, query)

+    @oslo_db_api.retry_on_deadlock
     def create_volume_connector(self, connector_info):
         if 'uuid' not in connector_info:
             connector_info['uuid'] = uuidutils.generate_uuid()
@@ -983,6 +1014,7 @@
                     uuid=connector_info['uuid'])
         return connector

+    @oslo_db_api.retry_on_deadlock
     def update_volume_connector(self, ident, connector_info):
         if 'uuid' in connector_info:
             msg = _("Cannot overwrite UUID for an existing Volume Connector.")
@@ -1006,6 +1038,7 @@
                 raise exception.VolumeConnectorNotFound(connector=ident)
         return ref

+    @oslo_db_api.retry_on_deadlock
     def destroy_volume_connector(self, ident):
         with _session_for_write():
             query = model_query(models.VolumeConnector)
@@ -1039,6 +1072,7 @@
         return _paginate_query(models.VolumeTarget, limit, marker, sort_key,
                                sort_dir, query)

+    @oslo_db_api.retry_on_deadlock
     def create_volume_target(self, target_info):
         if 'uuid' not in target_info:
             target_info['uuid'] = uuidutils.generate_uuid()
@@ -1057,6 +1091,7 @@
                     uuid=target_info['uuid'])
         return target

+    @oslo_db_api.retry_on_deadlock
     def update_volume_target(self, ident, target_info):
         if 'uuid' in target_info:
             msg = _("Cannot overwrite UUID for an existing Volume Target.")
@@ -1077,6 +1112,7 @@
                 raise exception.VolumeTargetNotFound(target=ident)
         return ref

+    @oslo_db_api.retry_on_deadlock
     def destroy_volume_target(self, ident):
         with _session_for_write():
             query = model_query(models.VolumeTarget)
ironic/tests/unit/db/sqlalchemy/test_api.py  (new file, +32 lines)
@@ -0,0 +1,32 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import inspect
+
+from ironic.db.sqlalchemy import api as sqlalchemy_api
+from ironic.tests import base as test_base
+
+
+class TestDBWriteMethodsRetryOnDeadlock(test_base.TestCase):
+
+    def test_retry_on_deadlock(self):
+        # This test ensures that every dbapi method doing database write is
+        # wrapped with retry_on_deadlock decorator
+        for name, method in inspect.getmembers(sqlalchemy_api.Connection,
+                                               predicate=inspect.ismethod):
+            src = inspect.getsource(method)
+            if 'with _session_for_write()' in src:
+                self.assertIn(
+                    '@oslo_db_api.retry_on_deadlock', src,
+                    'oslo_db\'s retry_on_deadlock decorator not '
+                    'applied to method ironic.db.sqlalchemy.api.Connection.%s '
+                    'doing database write' % name)
@@ -18,6 +18,8 @@
 import datetime

 import mock
+from oslo_db import exception as db_exc
+from oslo_db import sqlalchemy
 from oslo_utils import timeutils

 from ironic.common import exception
@@ -129,6 +131,13 @@ class DbConductorTestCase(base.DbTestCase):
         c = self.dbapi.get_conductor(c.hostname)
         self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))

+    @mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True)
+    def test_touch_conductor_deadlock(self, mock_update):
+        mock_update.side_effect = [db_exc.DBDeadlock(), None]
+        c = self._create_test_cdr()
+        self.dbapi.touch_conductor(c.hostname)
+        self.assertEqual(2, mock_update.call_count)
+
     def test_touch_conductor_not_found(self):
         # A conductor's heartbeat will not create a new record,
         # it will only update existing ones
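The same simulation technique can be reused when adding deadlock coverage for other write paths. The sketch below is not part of this change; it assumes clear_node_reservations_for_conductor also performs its write through Query.update, as touch_conductor does.

    @mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True)
    def test_clear_node_reservations_deadlock(self, mock_update):
        # Assumed mock target; check the method's actual write call before use.
        mock_update.side_effect = [db_exc.DBDeadlock(), None]
        c = self._create_test_cdr()
        self.dbapi.clear_node_reservations_for_conductor(c.hostname)
        self.assertEqual(2, mock_update.call_count)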
@@ -0,0 +1,12 @@
+---
+fixes:
+  - Fixes an issue which caused conductor's periodic tasks to stop executing.
+    See https://bugs.launchpad.net/ironic/+bug/1637210
+features:
+  - Adds DBDeadlock handling which may improve stability when using Galera.
+    See https://bugs.launchpad.net/ironic/+bug/1639338
+upgrade:
+  - All DB API methods doing database writes now retry on deadlock. The
+    ``[database]db_max_retries`` configuration option specifies the maximum
+    number of times to retry, and can be customised if necessary. It is 5 by
+    default.