From 3428cb74f0a6fe05c23cf00dc66da4c1d9766409 Mon Sep 17 00:00:00 2001
From: Joanna Taryma
Date: Mon, 27 Feb 2017 13:39:37 -0800
Subject: [PATCH] Add support for DBDeadlock handling

Wrap all DB write operations with oslo_db decorator to retry on deadlock.
DBDeadlock exception can be raised quite often on Galera clusters under
load. This also fixes an issue which causes conductor's periodic tasks to
stop executing.

Closes-Bug: #1637210
Closes-Bug: #1639338
Co-Authored-By: Vladyslav Drok
Co-Authored-By: Joanna Taryma
Change-Id: I61db83637adfd98a5394d1f570f3de4302c93497
---
 etc/ironic/ironic.conf.sample                 |  2 +-
 ironic/conf/database.py                       |  3 ++
 ironic/db/sqlalchemy/api.py                   | 36 +++++++++++++++++++
 ironic/tests/unit/db/sqlalchemy/test_api.py   | 32 +++++++++++++++++
 ironic/tests/unit/db/test_conductor.py        |  9 +++++
 ...db-deadlock-handling-6bc10076537f3727.yaml | 12 +++++++
 6 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 ironic/tests/unit/db/sqlalchemy/test_api.py
 create mode 100644 releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml

diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample
index 405295b5fa..48d78c9520 100644
--- a/etc/ironic/ironic.conf.sample
+++ b/etc/ironic/ironic.conf.sample
@@ -1208,7 +1208,7 @@
 # Maximum retries in case of connection error or deadlock
 # error before error is raised. Set to -1 to specify an
 # infinite retry count. (integer value)
-#db_max_retries = 20
+#db_max_retries = 5
 
 [deploy]
 
diff --git a/ironic/conf/database.py b/ironic/conf/database.py
index fccf2f437d..b4aa6224b2 100644
--- a/ironic/conf/database.py
+++ b/ironic/conf/database.py
@@ -26,3 +26,6 @@ opts = [
 
 def register_opts(conf):
     conf.register_opts(opts, group='database')
+    # Change the oslo_db side default to 5
+    conf.import_opt('db_max_retries', 'ironic.db.api', group='database')
+    conf.set_default('db_max_retries', 5, group='database')
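The two hunks above lower the effective default of ``[database]db_max_retries`` from oslo.db's 20 down to 5. A minimal standalone sketch of the same oslo.config pattern, assuming only oslo.config and oslo.db are installed (the private ConfigOpts instance is illustrative, not Ironic's global CONF):

# Sketch: override a library-registered option's default at registration
# time, mirroring the ironic/conf/database.py hunk above. Illustrative only.
from oslo_config import cfg
from oslo_db import options as db_options

CONF = cfg.ConfigOpts()
# Register oslo.db's [database] options, which include db_max_retries
# (library default: 20).
db_options.set_defaults(CONF)
# Lower the retry ceiling for this service; deployers can still override it.
CONF.set_default('db_max_retries', 5, group='database')

assert CONF.database.db_max_retries == 5

Ironic reaches the same result through conf.import_opt() followed by conf.set_default(), so deployments that never set the option explicitly now get 5 retries instead of oslo.db's 20.

diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py
index 3d134f5bff..62741d0247 100644
--- a/ironic/db/sqlalchemy/api.py
+++ b/ironic/db/sqlalchemy/api.py
@@ -20,6 +20,7 @@ import collections
 import datetime
 import threading
 
+from oslo_db import api as oslo_db_api
 from oslo_db import exception as db_exc
 from oslo_db.sqlalchemy import enginefacade
 from oslo_db.sqlalchemy import utils as db_utils
@@ -55,6 +56,9 @@ def _session_for_read():
     return enginefacade.reader.using(_CONTEXT)
 
 
+# Please add @oslo_db_api.retry_on_deadlock decorator to all methods using
+# _session_for_write (as deadlocks happen on write), so that oslo_db is able
+# to retry in case of deadlocks.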
 def _session_for_write():
     return enginefacade.writer.using(_CONTEXT)
 
@@ -258,6 +262,7 @@ class Connection(api.Connection):
         return _paginate_query(models.Node, limit, marker, sort_key,
                                sort_dir, query)
 
+    @oslo_db_api.retry_on_deadlock
     def reserve_node(self, tag, node_id):
         with _session_for_write():
             query = _get_node_query_with_tags()
@@ -276,6 +281,7 @@ class Connection(api.Connection):
             except NoResultFound:
                 raise exception.NodeNotFound(node_id)
 
+    @oslo_db_api.retry_on_deadlock
     def release_node(self, tag, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -294,6 +300,7 @@ class Connection(api.Connection):
             except NoResultFound:
                 raise exception.NodeNotFound(node_id)
 
+    @oslo_db_api.retry_on_deadlock
     def create_node(self, values):
         # ensure defaults are present for new nodes
         if 'uuid' not in values:
@@ -364,6 +371,7 @@ class Connection(api.Connection):
 
         return result
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_node(self, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -422,6 +430,7 @@ class Connection(api.Connection):
             else:
                 raise
 
+    @oslo_db_api.retry_on_deadlock
     def _do_update_node(self, node_id, values):
         with _session_for_write():
             query = model_query(models.Node)
@@ -492,6 +501,7 @@ class Connection(api.Connection):
         return _paginate_query(models.Port, limit, marker, sort_key,
                                sort_dir, query)
 
+    @oslo_db_api.retry_on_deadlock
     def create_port(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -508,6 +518,7 @@ class Connection(api.Connection):
                 raise exception.PortAlreadyExists(uuid=values['uuid'])
             return port
 
+    @oslo_db_api.retry_on_deadlock
     def update_port(self, port_id, values):
         # NOTE(dtantsur): this can lead to very strange errors
         if 'uuid' in values:
@@ -527,6 +538,7 @@ class Connection(api.Connection):
             raise exception.MACAlreadyExists(mac=values['address'])
         return ref
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_port(self, port_id):
         with _session_for_write():
             query = model_query(models.Port)
@@ -575,6 +587,7 @@ class Connection(api.Connection):
         return _paginate_query(models.Portgroup, limit, marker, sort_key,
                                sort_dir, query)
 
+    @oslo_db_api.retry_on_deadlock
     def create_portgroup(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -596,6 +609,7 @@ class Connection(api.Connection):
                 raise exception.PortgroupAlreadyExists(uuid=values['uuid'])
             return portgroup
 
+    @oslo_db_api.retry_on_deadlock
     def update_portgroup(self, portgroup_id, values):
         if 'uuid' in values:
             msg = _("Cannot overwrite UUID for an existing portgroup.")
@@ -620,6 +634,7 @@ class Connection(api.Connection):
                 raise
         return ref
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_portgroup(self, portgroup_id):
         def portgroup_not_empty(session):
             """Checks whether the portgroup does not have ports."""
@@ -659,6 +674,7 @@ class Connection(api.Connection):
         return _paginate_query(models.Chassis, limit, marker, sort_key,
                                sort_dir)
 
+    @oslo_db_api.retry_on_deadlock
     def create_chassis(self, values):
         if not values.get('uuid'):
             values['uuid'] = uuidutils.generate_uuid()
@@ -673,6 +689,7 @@ class Connection(api.Connection):
                 raise exception.ChassisAlreadyExists(uuid=values['uuid'])
             return chassis
 
+    @oslo_db_api.retry_on_deadlock
     def update_chassis(self, chassis_id, values):
         # NOTE(dtantsur): this can lead to very strange errors
         if 'uuid' in values:
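Every api.py hunk in this patch applies the same pattern: ``@oslo_db_api.retry_on_deadlock`` sits outside the method, so the whole ``with _session_for_write()`` transaction is re-executed if oslo.db raises DBDeadlock inside it. A self-contained sketch of that behaviour (FakeConnection and update_record are hypothetical, not Ironic code; oslo.db sleeps briefly between attempts):

from oslo_db import api as oslo_db_api
from oslo_db import exception as db_exc


class FakeConnection(object):
    """Hypothetical stand-in for ironic.db.sqlalchemy.api.Connection."""

    def __init__(self):
        self.attempts = 0

    @oslo_db_api.retry_on_deadlock
    def update_record(self, values):
        # The decorator re-invokes the whole method, i.e. a fresh
        # transaction, whenever a DBDeadlock escapes it.
        self.attempts += 1
        if self.attempts == 1:
            # Simulate Galera reporting a deadlock on the first attempt.
            raise db_exc.DBDeadlock()
        return values


conn = FakeConnection()
assert conn.update_record({'name': 'node-1'}) == {'name': 'node-1'}
assert conn.attempts == 2  # one deadlock, one successful retry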
@@ -689,6 +706,7 @@ class Connection(api.Connection):
             ref = query.one()
         return ref
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_chassis(self, chassis_id):
         def chassis_not_empty():
             """Checks whether the chassis does not have nodes."""
@@ -709,6 +727,7 @@ class Connection(api.Connection):
             if count != 1:
                 raise exception.ChassisNotFound(chassis=chassis_id)
 
+    @oslo_db_api.retry_on_deadlock
     def register_conductor(self, values, update_existing=False):
         with _session_for_write() as session:
             query = (model_query(models.Conductor)
@@ -736,6 +755,7 @@ class Connection(api.Connection):
         except NoResultFound:
             raise exception.ConductorNotFound(conductor=hostname)
 
+    @oslo_db_api.retry_on_deadlock
     def unregister_conductor(self, hostname):
         with _session_for_write():
             query = (model_query(models.Conductor)
@@ -744,6 +764,7 @@ class Connection(api.Connection):
             if count == 0:
                 raise exception.ConductorNotFound(conductor=hostname)
 
+    @oslo_db_api.retry_on_deadlock
     def touch_conductor(self, hostname):
         with _session_for_write():
             query = (model_query(models.Conductor)
@@ -755,6 +776,7 @@ class Connection(api.Connection):
             if count == 0:
                 raise exception.ConductorNotFound(conductor=hostname)
 
+    @oslo_db_api.retry_on_deadlock
     def clear_node_reservations_for_conductor(self, hostname):
         nodes = []
         with _session_for_write():
@@ -769,6 +791,7 @@ class Connection(api.Connection):
                 _LW('Cleared reservations held by %(hostname)s: '
                     '%(nodes)s'), {'hostname': hostname, 'nodes': nodes})
 
+    @oslo_db_api.retry_on_deadlock
     def clear_node_target_power_state(self, hostname):
         nodes = []
         with _session_for_write():
@@ -831,6 +854,7 @@ class Connection(api.Connection):
             query = _filter_active_conductors(query)
         return query.all()
 
+    @oslo_db_api.retry_on_deadlock
     def register_conductor_hardware_interfaces(self, conductor_id,
                                                hardware_type, interface_type,
                                                interfaces, default_interface):
@@ -852,12 +876,14 @@ class Connection(api.Connection):
                 interface_type=interface_type,
                 interfaces=interfaces)
 
+    @oslo_db_api.retry_on_deadlock
     def unregister_conductor_hardware_interfaces(self, conductor_id):
         with _session_for_write():
             query = (model_query(models.ConductorHardwareInterfaces)
                      .filter_by(conductor_id=conductor_id))
             query.delete()
 
+    @oslo_db_api.retry_on_deadlock
     def touch_node_provisioning(self, node_id):
         with _session_for_write():
             query = model_query(models.Node)
@@ -870,6 +896,7 @@ class Connection(api.Connection):
         if not model_query(models.Node).filter_by(id=node_id).scalar():
             raise exception.NodeNotFound(node=node_id)
 
+    @oslo_db_api.retry_on_deadlock
     def set_node_tags(self, node_id, tags):
         # remove duplicate tags
         tags = set(tags)
@@ -883,6 +910,7 @@ class Connection(api.Connection):
 
         return node_tags
 
+    @oslo_db_api.retry_on_deadlock
     def unset_node_tags(self, node_id):
         self._check_node_exists(node_id)
         with _session_for_write():
@@ -895,6 +923,7 @@ class Connection(api.Connection):
                       .all())
         return result
 
+    @oslo_db_api.retry_on_deadlock
     def add_node_tag(self, node_id, tag):
         node_tag = models.NodeTag(tag=tag, node_id=node_id)
 
@@ -909,6 +938,7 @@ class Connection(api.Connection):
 
         return node_tag
 
+    @oslo_db_api.retry_on_deadlock
     def delete_node_tag(self, node_id, tag):
         self._check_node_exists(node_id)
         with _session_for_write():
@@ -964,6 +994,7 @@ class Connection(api.Connection):
         return _paginate_query(models.VolumeConnector, limit, marker,
                                sort_key, sort_dir, query)
 
+    @oslo_db_api.retry_on_deadlock
     def create_volume_connector(self, connector_info):
         if 'uuid' not in connector_info:
             connector_info['uuid'] = uuidutils.generate_uuid()
@@ -983,6 +1014,7 @@ class Connection(api.Connection):
                     uuid=connector_info['uuid'])
         return connector
 
+    @oslo_db_api.retry_on_deadlock
     def update_volume_connector(self, ident, connector_info):
         if 'uuid' in connector_info:
             msg = _("Cannot overwrite UUID for an existing Volume Connector.")
@@ -1006,6 +1038,7 @@ class Connection(api.Connection):
             raise exception.VolumeConnectorNotFound(connector=ident)
         return ref
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_volume_connector(self, ident):
         with _session_for_write():
             query = model_query(models.VolumeConnector)
@@ -1039,6 +1072,7 @@ class Connection(api.Connection):
         return _paginate_query(models.VolumeTarget, limit, marker,
                                sort_key, sort_dir, query)
 
+    @oslo_db_api.retry_on_deadlock
     def create_volume_target(self, target_info):
         if 'uuid' not in target_info:
             target_info['uuid'] = uuidutils.generate_uuid()
@@ -1057,6 +1091,7 @@ class Connection(api.Connection):
                 uuid=target_info['uuid'])
         return target
 
+    @oslo_db_api.retry_on_deadlock
     def update_volume_target(self, ident, target_info):
         if 'uuid' in target_info:
             msg = _("Cannot overwrite UUID for an existing Volume Target.")
@@ -1077,6 +1112,7 @@ class Connection(api.Connection):
             raise exception.VolumeTargetNotFound(target=ident)
         return ref
 
+    @oslo_db_api.retry_on_deadlock
     def destroy_volume_target(self, ident):
         with _session_for_write():
             query = model_query(models.VolumeTarget)
diff --git a/ironic/tests/unit/db/sqlalchemy/test_api.py b/ironic/tests/unit/db/sqlalchemy/test_api.py
new file mode 100644
index 0000000000..bd620e86c7
--- /dev/null
+++ b/ironic/tests/unit/db/sqlalchemy/test_api.py
@@ -0,0 +1,32 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import inspect
+
+from ironic.db.sqlalchemy import api as sqlalchemy_api
+from ironic.tests import base as test_base
+
+
+class TestDBWriteMethodsRetryOnDeadlock(test_base.TestCase):
+
+    def test_retry_on_deadlock(self):
+        # This test ensures that every dbapi method doing database write is
+        # wrapped with retry_on_deadlock decorator
+        for name, method in inspect.getmembers(sqlalchemy_api.Connection,
+                                               predicate=inspect.ismethod):
+            src = inspect.getsource(method)
+            if 'with _session_for_write()' in src:
+                self.assertIn(
+                    '@oslo_db_api.retry_on_deadlock', src,
+                    'oslo_db\'s retry_on_deadlock decorator not '
+                    'applied to method ironic.db.sqlalchemy.api.Connection.%s '
+                    'doing database write' % name)
diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py
index 62293f5c0f..abe57390c9 100644
--- a/ironic/tests/unit/db/test_conductor.py
+++ b/ironic/tests/unit/db/test_conductor.py
@@ -18,6 +18,8 @@ import datetime
 
 import mock
+from oslo_db import exception as db_exc
+from oslo_db import sqlalchemy
 from oslo_utils import timeutils
 
 from ironic.common import exception
@@ -129,6 +131,13 @@ class DbConductorTestCase(base.DbTestCase):
         c = self.dbapi.get_conductor(c.hostname)
         self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))
 
+    @mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True)
+    def test_touch_conductor_deadlock(self, mock_update):
+        mock_update.side_effect = [db_exc.DBDeadlock(), None]
+        c = self._create_test_cdr()
+        self.dbapi.touch_conductor(c.hostname)
+        self.assertEqual(2, mock_update.call_count)
+
     def test_touch_conductor_not_found(self):
         # A conductor's heartbeat will not create a new record,
         # it will only update existing ones
diff --git a/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml b/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml
new file mode 100644
index 0000000000..d1536407a9
--- /dev/null
+++ b/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml
@@ -0,0 +1,12 @@
+---
+fixes:
+  - Fixes an issue which caused conductor's periodic tasks to stop executing.
+    See https://bugs.launchpad.net/ironic/+bug/1637210
+features:
+  - Adds DBDeadlock handling which may improve stability when using Galera.
+    See https://bugs.launchpad.net/ironic/+bug/1639338
+upgrade:
+  - All DB API methods doing database writes now retry on deadlock. The
+    ``[database]db_max_retries`` configuration option specifies the maximum
+    number of times to retry, and can be customised if necessary. It is 5 by
+    default.
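The new test_touch_conductor_deadlock test above verifies the retry by having the first Query.update() call raise DBDeadlock. A standalone sketch of the same mock pattern, using hypothetical names rather than Ironic's dbapi (the real test patches sqlalchemy.orm.Query.update instead of a helper method):

import mock
from oslo_db import api as oslo_db_api
from oslo_db import exception as db_exc


class Conductors(object):
    """Hypothetical conductor-table wrapper, not Ironic code."""

    def _update_row(self, hostname):
        # Stand-in for the SQLAlchemy Query.update() call.
        return 1

    @oslo_db_api.retry_on_deadlock
    def touch(self, hostname):
        return self._update_row(hostname)


with mock.patch.object(Conductors, '_update_row', autospec=True,
                       side_effect=[db_exc.DBDeadlock(), 1]) as mock_update:
    assert Conductors().touch('conductor-1') == 1
    # First call deadlocked, the decorator retried, the caller saw no error.
    assert mock_update.call_count == 2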