Add support for DBDeadlock handling

Wrap all DB write operations with the oslo_db retry_on_deadlock
decorator so that they are retried on deadlock. The DBDeadlock
exception can be raised quite often on Galera clusters under load.
This also fixes an issue which caused the conductor's periodic
tasks to stop executing.

Closes-Bug: #1637210
Closes-Bug: #1639338
Co-Authored-By: Vladyslav Drok <vdrok@mirantis.com>
Co-Authored-By: Joanna Taryma <joanna.taryma@intel.com>
Change-Id: I61db83637adfd98a5394d1f570f3de4302c93497
This commit is contained in:
Joanna Taryma 2017-02-27 13:39:37 -08:00
parent 5071b99835
commit 3428cb74f0
6 changed files with 93 additions and 1 deletions

View File

@ -1208,7 +1208,7 @@
# Maximum retries in case of connection error or deadlock # Maximum retries in case of connection error or deadlock
# error before error is raised. Set to -1 to specify an # error before error is raised. Set to -1 to specify an
# infinite retry count. (integer value) # infinite retry count. (integer value)
#db_max_retries = 20 #db_max_retries = 5
[deploy] [deploy]

View File

@ -26,3 +26,6 @@ opts = [
def register_opts(conf): def register_opts(conf):
conf.register_opts(opts, group='database') conf.register_opts(opts, group='database')
# Change the oslo_db side default to 5
conf.import_opt('db_max_retries', 'ironic.db.api', group='database')
conf.set_default('db_max_retries', 5, group='database')

View File

@ -20,6 +20,7 @@ import collections
import datetime import datetime
import threading import threading
from oslo_db import api as oslo_db_api
from oslo_db import exception as db_exc from oslo_db import exception as db_exc
from oslo_db.sqlalchemy import enginefacade from oslo_db.sqlalchemy import enginefacade
from oslo_db.sqlalchemy import utils as db_utils from oslo_db.sqlalchemy import utils as db_utils
@ -55,6 +56,9 @@ def _session_for_read():
return enginefacade.reader.using(_CONTEXT) return enginefacade.reader.using(_CONTEXT)
# Please add @oslo_db_api.retry_on_deadlock decorator to all methods using
# _session_for_write (as deadlocks happen on write), so that oslo_db is able
# to retry in case of deadlocks.
def _session_for_write(): def _session_for_write():
return enginefacade.writer.using(_CONTEXT) return enginefacade.writer.using(_CONTEXT)
@ -258,6 +262,7 @@ class Connection(api.Connection):
return _paginate_query(models.Node, limit, marker, return _paginate_query(models.Node, limit, marker,
sort_key, sort_dir, query) sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def reserve_node(self, tag, node_id): def reserve_node(self, tag, node_id):
with _session_for_write(): with _session_for_write():
query = _get_node_query_with_tags() query = _get_node_query_with_tags()
@ -276,6 +281,7 @@ class Connection(api.Connection):
except NoResultFound: except NoResultFound:
raise exception.NodeNotFound(node_id) raise exception.NodeNotFound(node_id)
@oslo_db_api.retry_on_deadlock
def release_node(self, tag, node_id): def release_node(self, tag, node_id):
with _session_for_write(): with _session_for_write():
query = model_query(models.Node) query = model_query(models.Node)
@ -294,6 +300,7 @@ class Connection(api.Connection):
except NoResultFound: except NoResultFound:
raise exception.NodeNotFound(node_id) raise exception.NodeNotFound(node_id)
@oslo_db_api.retry_on_deadlock
def create_node(self, values): def create_node(self, values):
# ensure defaults are present for new nodes # ensure defaults are present for new nodes
if 'uuid' not in values: if 'uuid' not in values:
@ -364,6 +371,7 @@ class Connection(api.Connection):
return result return result
@oslo_db_api.retry_on_deadlock
def destroy_node(self, node_id): def destroy_node(self, node_id):
with _session_for_write(): with _session_for_write():
query = model_query(models.Node) query = model_query(models.Node)
@ -422,6 +430,7 @@ class Connection(api.Connection):
else: else:
raise raise
@oslo_db_api.retry_on_deadlock
def _do_update_node(self, node_id, values): def _do_update_node(self, node_id, values):
with _session_for_write(): with _session_for_write():
query = model_query(models.Node) query = model_query(models.Node)
@ -492,6 +501,7 @@ class Connection(api.Connection):
return _paginate_query(models.Port, limit, marker, return _paginate_query(models.Port, limit, marker,
sort_key, sort_dir, query) sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_port(self, values): def create_port(self, values):
if not values.get('uuid'): if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid() values['uuid'] = uuidutils.generate_uuid()
@ -508,6 +518,7 @@ class Connection(api.Connection):
raise exception.PortAlreadyExists(uuid=values['uuid']) raise exception.PortAlreadyExists(uuid=values['uuid'])
return port return port
@oslo_db_api.retry_on_deadlock
def update_port(self, port_id, values): def update_port(self, port_id, values):
# NOTE(dtantsur): this can lead to very strange errors # NOTE(dtantsur): this can lead to very strange errors
if 'uuid' in values: if 'uuid' in values:
@ -527,6 +538,7 @@ class Connection(api.Connection):
raise exception.MACAlreadyExists(mac=values['address']) raise exception.MACAlreadyExists(mac=values['address'])
return ref return ref
@oslo_db_api.retry_on_deadlock
def destroy_port(self, port_id): def destroy_port(self, port_id):
with _session_for_write(): with _session_for_write():
query = model_query(models.Port) query = model_query(models.Port)
@ -575,6 +587,7 @@ class Connection(api.Connection):
return _paginate_query(models.Portgroup, limit, marker, return _paginate_query(models.Portgroup, limit, marker,
sort_key, sort_dir, query) sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_portgroup(self, values): def create_portgroup(self, values):
if not values.get('uuid'): if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid() values['uuid'] = uuidutils.generate_uuid()
@ -596,6 +609,7 @@ class Connection(api.Connection):
raise exception.PortgroupAlreadyExists(uuid=values['uuid']) raise exception.PortgroupAlreadyExists(uuid=values['uuid'])
return portgroup return portgroup
@oslo_db_api.retry_on_deadlock
def update_portgroup(self, portgroup_id, values): def update_portgroup(self, portgroup_id, values):
if 'uuid' in values: if 'uuid' in values:
msg = _("Cannot overwrite UUID for an existing portgroup.") msg = _("Cannot overwrite UUID for an existing portgroup.")
@ -620,6 +634,7 @@ class Connection(api.Connection):
raise raise
return ref return ref
@oslo_db_api.retry_on_deadlock
def destroy_portgroup(self, portgroup_id): def destroy_portgroup(self, portgroup_id):
def portgroup_not_empty(session): def portgroup_not_empty(session):
"""Checks whether the portgroup does not have ports.""" """Checks whether the portgroup does not have ports."""
@ -659,6 +674,7 @@ class Connection(api.Connection):
return _paginate_query(models.Chassis, limit, marker, return _paginate_query(models.Chassis, limit, marker,
sort_key, sort_dir) sort_key, sort_dir)
@oslo_db_api.retry_on_deadlock
def create_chassis(self, values): def create_chassis(self, values):
if not values.get('uuid'): if not values.get('uuid'):
values['uuid'] = uuidutils.generate_uuid() values['uuid'] = uuidutils.generate_uuid()
@ -673,6 +689,7 @@ class Connection(api.Connection):
raise exception.ChassisAlreadyExists(uuid=values['uuid']) raise exception.ChassisAlreadyExists(uuid=values['uuid'])
return chassis return chassis
@oslo_db_api.retry_on_deadlock
def update_chassis(self, chassis_id, values): def update_chassis(self, chassis_id, values):
# NOTE(dtantsur): this can lead to very strange errors # NOTE(dtantsur): this can lead to very strange errors
if 'uuid' in values: if 'uuid' in values:
@ -689,6 +706,7 @@ class Connection(api.Connection):
ref = query.one() ref = query.one()
return ref return ref
@oslo_db_api.retry_on_deadlock
def destroy_chassis(self, chassis_id): def destroy_chassis(self, chassis_id):
def chassis_not_empty(): def chassis_not_empty():
"""Checks whether the chassis does not have nodes.""" """Checks whether the chassis does not have nodes."""
@ -709,6 +727,7 @@ class Connection(api.Connection):
if count != 1: if count != 1:
raise exception.ChassisNotFound(chassis=chassis_id) raise exception.ChassisNotFound(chassis=chassis_id)
@oslo_db_api.retry_on_deadlock
def register_conductor(self, values, update_existing=False): def register_conductor(self, values, update_existing=False):
with _session_for_write() as session: with _session_for_write() as session:
query = (model_query(models.Conductor) query = (model_query(models.Conductor)
@ -736,6 +755,7 @@ class Connection(api.Connection):
except NoResultFound: except NoResultFound:
raise exception.ConductorNotFound(conductor=hostname) raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def unregister_conductor(self, hostname): def unregister_conductor(self, hostname):
with _session_for_write(): with _session_for_write():
query = (model_query(models.Conductor) query = (model_query(models.Conductor)
@ -744,6 +764,7 @@ class Connection(api.Connection):
if count == 0: if count == 0:
raise exception.ConductorNotFound(conductor=hostname) raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def touch_conductor(self, hostname): def touch_conductor(self, hostname):
with _session_for_write(): with _session_for_write():
query = (model_query(models.Conductor) query = (model_query(models.Conductor)
@ -755,6 +776,7 @@ class Connection(api.Connection):
if count == 0: if count == 0:
raise exception.ConductorNotFound(conductor=hostname) raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def clear_node_reservations_for_conductor(self, hostname): def clear_node_reservations_for_conductor(self, hostname):
nodes = [] nodes = []
with _session_for_write(): with _session_for_write():
@ -769,6 +791,7 @@ class Connection(api.Connection):
_LW('Cleared reservations held by %(hostname)s: ' _LW('Cleared reservations held by %(hostname)s: '
'%(nodes)s'), {'hostname': hostname, 'nodes': nodes}) '%(nodes)s'), {'hostname': hostname, 'nodes': nodes})
@oslo_db_api.retry_on_deadlock
def clear_node_target_power_state(self, hostname): def clear_node_target_power_state(self, hostname):
nodes = [] nodes = []
with _session_for_write(): with _session_for_write():
@ -831,6 +854,7 @@ class Connection(api.Connection):
query = _filter_active_conductors(query) query = _filter_active_conductors(query)
return query.all() return query.all()
@oslo_db_api.retry_on_deadlock
def register_conductor_hardware_interfaces(self, conductor_id, def register_conductor_hardware_interfaces(self, conductor_id,
hardware_type, interface_type, hardware_type, interface_type,
interfaces, default_interface): interfaces, default_interface):
@ -852,12 +876,14 @@ class Connection(api.Connection):
interface_type=interface_type, interface_type=interface_type,
interfaces=interfaces) interfaces=interfaces)
@oslo_db_api.retry_on_deadlock
def unregister_conductor_hardware_interfaces(self, conductor_id): def unregister_conductor_hardware_interfaces(self, conductor_id):
with _session_for_write(): with _session_for_write():
query = (model_query(models.ConductorHardwareInterfaces) query = (model_query(models.ConductorHardwareInterfaces)
.filter_by(conductor_id=conductor_id)) .filter_by(conductor_id=conductor_id))
query.delete() query.delete()
@oslo_db_api.retry_on_deadlock
def touch_node_provisioning(self, node_id): def touch_node_provisioning(self, node_id):
with _session_for_write(): with _session_for_write():
query = model_query(models.Node) query = model_query(models.Node)
@ -870,6 +896,7 @@ class Connection(api.Connection):
if not model_query(models.Node).filter_by(id=node_id).scalar(): if not model_query(models.Node).filter_by(id=node_id).scalar():
raise exception.NodeNotFound(node=node_id) raise exception.NodeNotFound(node=node_id)
@oslo_db_api.retry_on_deadlock
def set_node_tags(self, node_id, tags): def set_node_tags(self, node_id, tags):
# remove duplicate tags # remove duplicate tags
tags = set(tags) tags = set(tags)
@ -883,6 +910,7 @@ class Connection(api.Connection):
return node_tags return node_tags
@oslo_db_api.retry_on_deadlock
def unset_node_tags(self, node_id): def unset_node_tags(self, node_id):
self._check_node_exists(node_id) self._check_node_exists(node_id)
with _session_for_write(): with _session_for_write():
@ -895,6 +923,7 @@ class Connection(api.Connection):
.all()) .all())
return result return result
@oslo_db_api.retry_on_deadlock
def add_node_tag(self, node_id, tag): def add_node_tag(self, node_id, tag):
node_tag = models.NodeTag(tag=tag, node_id=node_id) node_tag = models.NodeTag(tag=tag, node_id=node_id)
@ -909,6 +938,7 @@ class Connection(api.Connection):
return node_tag return node_tag
@oslo_db_api.retry_on_deadlock
def delete_node_tag(self, node_id, tag): def delete_node_tag(self, node_id, tag):
self._check_node_exists(node_id) self._check_node_exists(node_id)
with _session_for_write(): with _session_for_write():
@ -964,6 +994,7 @@ class Connection(api.Connection):
return _paginate_query(models.VolumeConnector, limit, marker, return _paginate_query(models.VolumeConnector, limit, marker,
sort_key, sort_dir, query) sort_key, sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_volume_connector(self, connector_info): def create_volume_connector(self, connector_info):
if 'uuid' not in connector_info: if 'uuid' not in connector_info:
connector_info['uuid'] = uuidutils.generate_uuid() connector_info['uuid'] = uuidutils.generate_uuid()
@ -983,6 +1014,7 @@ class Connection(api.Connection):
uuid=connector_info['uuid']) uuid=connector_info['uuid'])
return connector return connector
@oslo_db_api.retry_on_deadlock
def update_volume_connector(self, ident, connector_info): def update_volume_connector(self, ident, connector_info):
if 'uuid' in connector_info: if 'uuid' in connector_info:
msg = _("Cannot overwrite UUID for an existing Volume Connector.") msg = _("Cannot overwrite UUID for an existing Volume Connector.")
@ -1006,6 +1038,7 @@ class Connection(api.Connection):
raise exception.VolumeConnectorNotFound(connector=ident) raise exception.VolumeConnectorNotFound(connector=ident)
return ref return ref
@oslo_db_api.retry_on_deadlock
def destroy_volume_connector(self, ident): def destroy_volume_connector(self, ident):
with _session_for_write(): with _session_for_write():
query = model_query(models.VolumeConnector) query = model_query(models.VolumeConnector)
@ -1039,6 +1072,7 @@ class Connection(api.Connection):
return _paginate_query(models.VolumeTarget, limit, marker, sort_key, return _paginate_query(models.VolumeTarget, limit, marker, sort_key,
sort_dir, query) sort_dir, query)
@oslo_db_api.retry_on_deadlock
def create_volume_target(self, target_info): def create_volume_target(self, target_info):
if 'uuid' not in target_info: if 'uuid' not in target_info:
target_info['uuid'] = uuidutils.generate_uuid() target_info['uuid'] = uuidutils.generate_uuid()
@ -1057,6 +1091,7 @@ class Connection(api.Connection):
uuid=target_info['uuid']) uuid=target_info['uuid'])
return target return target
@oslo_db_api.retry_on_deadlock
def update_volume_target(self, ident, target_info): def update_volume_target(self, ident, target_info):
if 'uuid' in target_info: if 'uuid' in target_info:
msg = _("Cannot overwrite UUID for an existing Volume Target.") msg = _("Cannot overwrite UUID for an existing Volume Target.")
@ -1077,6 +1112,7 @@ class Connection(api.Connection):
raise exception.VolumeTargetNotFound(target=ident) raise exception.VolumeTargetNotFound(target=ident)
return ref return ref
@oslo_db_api.retry_on_deadlock
def destroy_volume_target(self, ident): def destroy_volume_target(self, ident):
with _session_for_write(): with _session_for_write():
query = model_query(models.VolumeTarget) query = model_query(models.VolumeTarget)

View File

@ -0,0 +1,32 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import inspect
from ironic.db.sqlalchemy import api as sqlalchemy_api
from ironic.tests import base as test_base
class TestDBWriteMethodsRetryOnDeadlock(test_base.TestCase):
    """Checks that DB API write methods carry the deadlock-retry decorator."""

    def test_retry_on_deadlock(self):
        # This test ensures that every dbapi method doing database write is
        # wrapped with retry_on_deadlock decorator

        def _is_method_or_function(member):
            # On Python 3 a function looked up on the class is a plain
            # function (inspect.ismethod is False), while on Python 2 it is
            # an unbound method. Accept both so that the loop below actually
            # inspects the Connection methods on either interpreter.
            return inspect.ismethod(member) or inspect.isfunction(member)

        checked = []
        for name, method in inspect.getmembers(
                sqlalchemy_api.Connection,
                predicate=_is_method_or_function):
            src = inspect.getsource(method)
            if 'with _session_for_write()' not in src:
                continue
            checked.append(name)
            self.assertIn(
                '@oslo_db_api.retry_on_deadlock', src,
                'oslo_db\'s retry_on_deadlock decorator not '
                'applied to method ironic.db.sqlalchemy.api.Connection.%s '
                'doing database write' % name)

        # Guard against the test passing vacuously, i.e. when no write
        # method was found to inspect at all.
        self.assertTrue(
            checked,
            'no ironic.db.sqlalchemy.api.Connection method using '
            '_session_for_write() was found to check')

View File

@ -18,6 +18,8 @@
import datetime import datetime
import mock import mock
from oslo_db import exception as db_exc
from oslo_db import sqlalchemy
from oslo_utils import timeutils from oslo_utils import timeutils
from ironic.common import exception from ironic.common import exception
@ -129,6 +131,13 @@ class DbConductorTestCase(base.DbTestCase):
c = self.dbapi.get_conductor(c.hostname) c = self.dbapi.get_conductor(c.hostname)
self.assertEqual(test_time, timeutils.normalize_time(c.updated_at)) self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))
@mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True)
def test_touch_conductor_deadlock(self, mock_update):
# Simulate a transient deadlock: the first Query.update call raises
# DBDeadlock, the second call returns normally (None).
mock_update.side_effect = [db_exc.DBDeadlock(), None]
c = self._create_test_cdr()
self.dbapi.touch_conductor(c.hostname)
# Two update calls are expected: the failed attempt plus one retry.
self.assertEqual(2, mock_update.call_count)
def test_touch_conductor_not_found(self): def test_touch_conductor_not_found(self):
# A conductor's heartbeat will not create a new record, # A conductor's heartbeat will not create a new record,
# it will only update existing ones # it will only update existing ones

View File

@ -0,0 +1,12 @@
---
fixes:
- Fixes an issue which caused conductor's periodic tasks to stop executing.
See https://bugs.launchpad.net/ironic/+bug/1637210
features:
- Adds DBDeadlock handling which may improve stability when using Galera.
See https://bugs.launchpad.net/ironic/+bug/1639338
upgrade:
- All DB API methods doing database writes now retry on deadlock. The
``[database]db_max_retries`` configuration option specifies the maximum
number of times to retry, and can be customised if necessary. It is 5 by
default.