Add support for DBDeadlock handling

Wrap all DB write operations with the oslo_db decorator to retry on
deadlock. The DBDeadlock exception can be raised quite often on Galera
clusters under load.
This also fixes an issue that caused the conductor's periodic
tasks to stop executing.

Closes-Bug: #1637210
Closes-Bug: #1639338
Co-Authored-By: Vladyslav Drok <vdrok@mirantis.com>
Co-Authored-By: Joanna Taryma <joanna.taryma@intel.com>
Change-Id: I61db83637adfd98a5394d1f570f3de4302c93497
Joanna Taryma 2017-02-27 13:39:37 -08:00
parent 5071b99835
commit 3428cb74f0
6 changed files with 93 additions and 1 deletion
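
For context, the sketch below shows the general retry-on-deadlock pattern this change applies. It is a simplified, self-contained stand-in for illustration only, not oslo_db's implementation: the real @oslo_db_api.retry_on_deadlock decorator also honours the [database] retry options (such as db_max_retries) and applies its own retry interval, and the DBDeadlock class, touch_record function and simulated failures below are made up.

# Simplified sketch of the retry-on-deadlock pattern (illustration only).
import functools
import time


class DBDeadlock(Exception):
    """Stand-in for oslo_db.exception.DBDeadlock."""


def retry_on_deadlock(max_retries=5, interval=0.1):
    """Re-run the wrapped DB write when it raises DBDeadlock."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except DBDeadlock:
                    if attempt == max_retries:
                        raise
                    time.sleep(interval)
        return wrapper
    return decorator


_calls = {'n': 0}


@retry_on_deadlock(max_retries=5)
def touch_record(store, key):
    # Simulate Galera reporting a deadlock on the first two attempts.
    _calls['n'] += 1
    if _calls['n'] < 3:
        raise DBDeadlock()
    store[key] = 'updated'


store = {}
touch_record(store, 'node-1')
print(store)  # {'node-1': 'updated'} after two retried deadlocks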


@ -1208,7 +1208,7 @@
# Maximum retries in case of connection error or deadlock
# error before error is raised. Set to -1 to specify an
# infinite retry count. (integer value)
#db_max_retries = 20
#db_max_retries = 5
[deploy]


@ -26,3 +26,6 @@ opts = [
def register_opts(conf):
conf.register_opts(opts, group='database')
# Change the oslo_db side default to 5
conf.import_opt('db_max_retries', 'ironic.db.api', group='database')
conf.set_default('db_max_retries', 5, group='database')
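
The hunk above changes only the default; operators can still override db_max_retries in their configuration. Below is a minimal, self-contained sketch of how such an oslo.config default override behaves; the option is re-registered locally with oslo_db's library default of 20 purely so the snippet runs on its own.

from oslo_config import cfg

CONF = cfg.ConfigOpts()
# Register the option with the library-side default (20), as oslo_db does.
CONF.register_opts([cfg.IntOpt('db_max_retries', default=20)],
                   group='database')
# Project-side override of the default, mirroring the hunk above.
CONF.set_default('db_max_retries', 5, group='database')
CONF([])  # parse (empty) CLI/config input so options become readable
print(CONF.database.db_max_retries)  # -> 5, unless set in a config file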


@ -20,6 +20,7 @@ import collections
import datetime
import threading
from oslo_db import api as oslo_db_api
from oslo_db import exception as db_exc
from oslo_db.sqlalchemy import enginefacade
from oslo_db.sqlalchemy import utils as db_utils
@ -55,6 +56,9 @@ def _session_for_read():
return enginefacade.reader.using(_CONTEXT)
# Please add the @oslo_db_api.retry_on_deadlock decorator to all methods using
# _session_for_write (as deadlocks happen on writes), so that oslo_db is able
# to retry them in case of deadlocks.
def _session_for_write():
return enginefacade.writer.using(_CONTEXT)
@ -258,6 +262,7 @@ class Connection(api.Connection):
return _paginate_query(models.Node, limit, marker,
sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def reserve_node(self, tag, node_id):
with _session_for_write():
query = _get_node_query_with_tags()
@ -276,6 +281,7 @@ class Connection(api.Connection):
except NoResultFound:
raise exception.NodeNotFound(node_id)
@oslo_db_api.retry_on_deadlock
def release_node(self, tag, node_id):
with _session_for_write():
query = model_query(models.Node)
@ -294,6 +300,7 @@ class Connection(api.Connection):
except NoResultFound:
raise exception.NodeNotFound(node_id)
@oslo_db_api.retry_on_deadlock
def create_node(self, values):
# ensure defaults are present for new nodes
if 'uuid' not in values:
@ -364,6 +371,7 @@ class Connection(api.Connection):
return result
@oslo_db_api.retry_on_deadlock
def destroy_node(self, node_id):
with _session_for_write():
query = model_query(models.Node)
@ -422,6 +430,7 @@ class Connection(api.Connection):
else:
raise
@oslo_db_api.retry_on_deadlock
def _do_update_node(self, node_id, values):
with _session_for_write():
query = model_query(models.Node)
@ -492,6 +501,7 @@ class Connection(api.Connection):
return _paginate_query(models.Port, limit, marker,
sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_port(self, values):
if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid()
@ -508,6 +518,7 @@ class Connection(api.Connection):
raise exception.PortAlreadyExists(uuid=values['uuid'])
return port
@oslo_db_api.retry_on_deadlock
def update_port(self, port_id, values):
# NOTE(dtantsur): this can lead to very strange errors
if 'uuid' in values:
@ -527,6 +538,7 @@ class Connection(api.Connection):
raise exception.MACAlreadyExists(mac=values['address'])
return ref
@oslo_db_api.retry_on_deadlock
def destroy_port(self, port_id):
with _session_for_write():
query = model_query(models.Port)
@ -575,6 +587,7 @@ class Connection(api.Connection):
return _paginate_query(models.Portgroup, limit, marker,
sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_portgroup(self, values):
if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid()
@ -596,6 +609,7 @@ class Connection(api.Connection):
raise exception.PortgroupAlreadyExists(uuid=values['uuid'])
return portgroup
@oslo_db_api.retry_on_deadlock
def update_portgroup(self, portgroup_id, values):
if 'uuid' in values:
msg = _("Cannot overwrite UUID for an existing portgroup.")
@ -620,6 +634,7 @@ class Connection(api.Connection):
raise
return ref
@oslo_db_api.retry_on_deadlock
def destroy_portgroup(self, portgroup_id):
def portgroup_not_empty(session):
"""Checks whether the portgroup does not have ports."""
@ -659,6 +674,7 @@ class Connection(api.Connection):
return _paginate_query(models.Chassis, limit, marker,
sort_key, sort_dir)
@oslo_db_api.retry_on_deadlock
def create_chassis(self, values):
if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid()
@ -673,6 +689,7 @@ class Connection(api.Connection):
raise exception.ChassisAlreadyExists(uuid=values['uuid'])
return chassis
@oslo_db_api.retry_on_deadlock
def update_chassis(self, chassis_id, values):
# NOTE(dtantsur): this can lead to very strange errors
if 'uuid' in values:
@ -689,6 +706,7 @@ class Connection(api.Connection):
ref = query.one()
return ref
@oslo_db_api.retry_on_deadlock
def destroy_chassis(self, chassis_id):
def chassis_not_empty():
"""Checks whether the chassis does not have nodes."""
@ -709,6 +727,7 @@ class Connection(api.Connection):
if count != 1:
raise exception.ChassisNotFound(chassis=chassis_id)
@oslo_db_api.retry_on_deadlock
def register_conductor(self, values, update_existing=False):
with _session_for_write() as session:
query = (model_query(models.Conductor)
@ -736,6 +755,7 @@ class Connection(api.Connection):
except NoResultFound:
raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def unregister_conductor(self, hostname):
with _session_for_write():
query = (model_query(models.Conductor)
@ -744,6 +764,7 @@ class Connection(api.Connection):
if count == 0:
raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def touch_conductor(self, hostname):
with _session_for_write():
query = (model_query(models.Conductor)
@ -755,6 +776,7 @@ class Connection(api.Connection):
if count == 0:
raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def clear_node_reservations_for_conductor(self, hostname):
nodes = []
with _session_for_write():
@ -769,6 +791,7 @@ class Connection(api.Connection):
_LW('Cleared reservations held by %(hostname)s: '
'%(nodes)s'), {'hostname': hostname, 'nodes': nodes})
@oslo_db_api.retry_on_deadlock
def clear_node_target_power_state(self, hostname):
nodes = []
with _session_for_write():
@ -831,6 +854,7 @@ class Connection(api.Connection):
query = _filter_active_conductors(query)
return query.all()
@oslo_db_api.retry_on_deadlock
def register_conductor_hardware_interfaces(self, conductor_id,
hardware_type, interface_type,
interfaces, default_interface):
@ -852,12 +876,14 @@ class Connection(api.Connection):
interface_type=interface_type,
interfaces=interfaces)
@oslo_db_api.retry_on_deadlock
def unregister_conductor_hardware_interfaces(self, conductor_id):
with _session_for_write():
query = (model_query(models.ConductorHardwareInterfaces)
.filter_by(conductor_id=conductor_id))
query.delete()
@oslo_db_api.retry_on_deadlock
def touch_node_provisioning(self, node_id):
with _session_for_write():
query = model_query(models.Node)
@ -870,6 +896,7 @@ class Connection(api.Connection):
if not model_query(models.Node).filter_by(id=node_id).scalar():
raise exception.NodeNotFound(node=node_id)
@oslo_db_api.retry_on_deadlock
def set_node_tags(self, node_id, tags):
# remove duplicate tags
tags = set(tags)
@ -883,6 +910,7 @@ class Connection(api.Connection):
return node_tags
@oslo_db_api.retry_on_deadlock
def unset_node_tags(self, node_id):
self._check_node_exists(node_id)
with _session_for_write():
@ -895,6 +923,7 @@ class Connection(api.Connection):
.all())
return result
@oslo_db_api.retry_on_deadlock
def add_node_tag(self, node_id, tag):
node_tag = models.NodeTag(tag=tag, node_id=node_id)
@ -909,6 +938,7 @@ class Connection(api.Connection):
return node_tag
@oslo_db_api.retry_on_deadlock
def delete_node_tag(self, node_id, tag):
self._check_node_exists(node_id)
with _session_for_write():
@ -964,6 +994,7 @@ class Connection(api.Connection):
return _paginate_query(models.VolumeConnector, limit, marker,
sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_volume_connector(self, connector_info):
if 'uuid' not in connector_info:
connector_info['uuid'] = uuidutils.generate_uuid()
@ -983,6 +1014,7 @@ class Connection(api.Connection):
uuid=connector_info['uuid'])
return connector
@oslo_db_api.retry_on_deadlock
def update_volume_connector(self, ident, connector_info):
if 'uuid' in connector_info:
msg = _("Cannot overwrite UUID for an existing Volume Connector.")
@ -1006,6 +1038,7 @@ class Connection(api.Connection):
raise exception.VolumeConnectorNotFound(connector=ident)
return ref
@oslo_db_api.retry_on_deadlock
def destroy_volume_connector(self, ident):
with _session_for_write():
query = model_query(models.VolumeConnector)
@ -1039,6 +1072,7 @@ class Connection(api.Connection):
return _paginate_query(models.VolumeTarget, limit, marker, sort_key,
sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_volume_target(self, target_info):
if 'uuid' not in target_info:
target_info['uuid'] = uuidutils.generate_uuid()
@ -1057,6 +1091,7 @@ class Connection(api.Connection):
uuid=target_info['uuid'])
return target
@oslo_db_api.retry_on_deadlock
def update_volume_target(self, ident, target_info):
if 'uuid' in target_info:
msg = _("Cannot overwrite UUID for an existing Volume Target.")
@ -1077,6 +1112,7 @@ class Connection(api.Connection):
raise exception.VolumeTargetNotFound(target=ident)
return ref
@oslo_db_api.retry_on_deadlock
def destroy_volume_target(self, ident):
with _session_for_write():
query = model_query(models.VolumeTarget)


@ -0,0 +1,32 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import inspect
from ironic.db.sqlalchemy import api as sqlalchemy_api
from ironic.tests import base as test_base
class TestDBWriteMethodsRetryOnDeadlock(test_base.TestCase):
def test_retry_on_deadlock(self):
# This test ensures that every dbapi method doing a database write is
# wrapped with the retry_on_deadlock decorator
for name, method in inspect.getmembers(sqlalchemy_api.Connection,
predicate=inspect.ismethod):
src = inspect.getsource(method)
if 'with _session_for_write()' in src:
self.assertIn(
'@oslo_db_api.retry_on_deadlock', src,
'oslo_db\'s retry_on_deadlock decorator not '
'applied to method ironic.db.sqlalchemy.api.Connection.%s '
'doing database write' % name)


@ -18,6 +18,8 @@
import datetime
import mock
from oslo_db import exception as db_exc
from oslo_db import sqlalchemy
from oslo_utils import timeutils
from ironic.common import exception
@ -129,6 +131,13 @@ class DbConductorTestCase(base.DbTestCase):
c = self.dbapi.get_conductor(c.hostname)
self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))
@mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True)
def test_touch_conductor_deadlock(self, mock_update):
mock_update.side_effect = [db_exc.DBDeadlock(), None]
c = self._create_test_cdr()
self.dbapi.touch_conductor(c.hostname)
self.assertEqual(2, mock_update.call_count)
def test_touch_conductor_not_found(self):
# A conductor's heartbeat will not create a new record,
# it will only update existing ones


@ -0,0 +1,12 @@
---
fixes:
- Fixes an issue which caused the conductor's periodic tasks to stop executing.
See https://bugs.launchpad.net/ironic/+bug/1637210
features:
- Adds DBDeadlock handling which may improve stability when using Galera.
See https://bugs.launchpad.net/ironic/+bug/1639338
upgrade:
- All DB API methods doing database writes now retry on deadlock. The
``[database]db_max_retries`` configuration option specifies the maximum
number of times to retry, and can be customised if necessary. It is 5 by
default.
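
As a purely illustrative example (the value 10 is arbitrary, not a recommendation), an operator who wants more retries than the new default could set the following in ironic.conf:

[database]
# Retry DB write operations up to 10 times on deadlock before raising.
db_max_retries = 10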