diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index 405295b5fa..48d78c9520 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -1208,7 +1208,7 @@ # Maximum retries in case of connection error or deadlock # error before error is raised. Set to -1 to specify an # infinite retry count. (integer value) -#db_max_retries = 20 +#db_max_retries = 5 [deploy] diff --git a/ironic/conf/database.py b/ironic/conf/database.py index fccf2f437d..b4aa6224b2 100644 --- a/ironic/conf/database.py +++ b/ironic/conf/database.py @@ -26,3 +26,6 @@ opts = [ def register_opts(conf): conf.register_opts(opts, group='database') + # Change the oslo_db side default to 5 + conf.import_opt('db_max_retries', 'ironic.db.api', group='database') + conf.set_default('db_max_retries', 5, group='database') diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 3d134f5bff..62741d0247 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -20,6 +20,7 @@ import collections import datetime import threading +from oslo_db import api as oslo_db_api from oslo_db import exception as db_exc from oslo_db.sqlalchemy import enginefacade from oslo_db.sqlalchemy import utils as db_utils @@ -55,6 +56,9 @@ def _session_for_read(): return enginefacade.reader.using(_CONTEXT) +# Please add @oslo_db_api.retry_on_deadlock decorator to all methods using +# _session_for_write (as deadlocks happen on write), so that oslo_db is able +# to retry in case of deadlocks. def _session_for_write(): return enginefacade.writer.using(_CONTEXT) @@ -258,6 +262,7 @@ class Connection(api.Connection): return _paginate_query(models.Node, limit, marker, sort_key, sort_dir, query) + @oslo_db_api.retry_on_deadlock def reserve_node(self, tag, node_id): with _session_for_write(): query = _get_node_query_with_tags() @@ -276,6 +281,7 @@ class Connection(api.Connection): except NoResultFound: raise exception.NodeNotFound(node_id) + @oslo_db_api.retry_on_deadlock def release_node(self, tag, node_id): with _session_for_write(): query = model_query(models.Node) @@ -294,6 +300,7 @@ class Connection(api.Connection): except NoResultFound: raise exception.NodeNotFound(node_id) + @oslo_db_api.retry_on_deadlock def create_node(self, values): # ensure defaults are present for new nodes if 'uuid' not in values: @@ -364,6 +371,7 @@ class Connection(api.Connection): return result + @oslo_db_api.retry_on_deadlock def destroy_node(self, node_id): with _session_for_write(): query = model_query(models.Node) @@ -422,6 +430,7 @@ class Connection(api.Connection): else: raise + @oslo_db_api.retry_on_deadlock def _do_update_node(self, node_id, values): with _session_for_write(): query = model_query(models.Node) @@ -492,6 +501,7 @@ class Connection(api.Connection): return _paginate_query(models.Port, limit, marker, sort_key, sort_dir, query) + @oslo_db_api.retry_on_deadlock def create_port(self, values): if not values.get('uuid'): values['uuid'] = uuidutils.generate_uuid() @@ -508,6 +518,7 @@ class Connection(api.Connection): raise exception.PortAlreadyExists(uuid=values['uuid']) return port + @oslo_db_api.retry_on_deadlock def update_port(self, port_id, values): # NOTE(dtantsur): this can lead to very strange errors if 'uuid' in values: @@ -527,6 +538,7 @@ class Connection(api.Connection): raise exception.MACAlreadyExists(mac=values['address']) return ref + @oslo_db_api.retry_on_deadlock def destroy_port(self, port_id): with _session_for_write(): query = model_query(models.Port) @@ -575,6 +587,7 @@ class Connection(api.Connection): return _paginate_query(models.Portgroup, limit, marker, sort_key, sort_dir, query) + @oslo_db_api.retry_on_deadlock def create_portgroup(self, values): if not values.get('uuid'): values['uuid'] = uuidutils.generate_uuid() @@ -596,6 +609,7 @@ class Connection(api.Connection): raise exception.PortgroupAlreadyExists(uuid=values['uuid']) return portgroup + @oslo_db_api.retry_on_deadlock def update_portgroup(self, portgroup_id, values): if 'uuid' in values: msg = _("Cannot overwrite UUID for an existing portgroup.") @@ -620,6 +634,7 @@ class Connection(api.Connection): raise return ref + @oslo_db_api.retry_on_deadlock def destroy_portgroup(self, portgroup_id): def portgroup_not_empty(session): """Checks whether the portgroup does not have ports.""" @@ -659,6 +674,7 @@ class Connection(api.Connection): return _paginate_query(models.Chassis, limit, marker, sort_key, sort_dir) + @oslo_db_api.retry_on_deadlock def create_chassis(self, values): if not values.get('uuid'): values['uuid'] = uuidutils.generate_uuid() @@ -673,6 +689,7 @@ class Connection(api.Connection): raise exception.ChassisAlreadyExists(uuid=values['uuid']) return chassis + @oslo_db_api.retry_on_deadlock def update_chassis(self, chassis_id, values): # NOTE(dtantsur): this can lead to very strange errors if 'uuid' in values: @@ -689,6 +706,7 @@ class Connection(api.Connection): ref = query.one() return ref + @oslo_db_api.retry_on_deadlock def destroy_chassis(self, chassis_id): def chassis_not_empty(): """Checks whether the chassis does not have nodes.""" @@ -709,6 +727,7 @@ class Connection(api.Connection): if count != 1: raise exception.ChassisNotFound(chassis=chassis_id) + @oslo_db_api.retry_on_deadlock def register_conductor(self, values, update_existing=False): with _session_for_write() as session: query = (model_query(models.Conductor) @@ -736,6 +755,7 @@ class Connection(api.Connection): except NoResultFound: raise exception.ConductorNotFound(conductor=hostname) + @oslo_db_api.retry_on_deadlock def unregister_conductor(self, hostname): with _session_for_write(): query = (model_query(models.Conductor) @@ -744,6 +764,7 @@ class Connection(api.Connection): if count == 0: raise exception.ConductorNotFound(conductor=hostname) + @oslo_db_api.retry_on_deadlock def touch_conductor(self, hostname): with _session_for_write(): query = (model_query(models.Conductor) @@ -755,6 +776,7 @@ class Connection(api.Connection): if count == 0: raise exception.ConductorNotFound(conductor=hostname) + @oslo_db_api.retry_on_deadlock def clear_node_reservations_for_conductor(self, hostname): nodes = [] with _session_for_write(): @@ -769,6 +791,7 @@ class Connection(api.Connection): _LW('Cleared reservations held by %(hostname)s: ' '%(nodes)s'), {'hostname': hostname, 'nodes': nodes}) + @oslo_db_api.retry_on_deadlock def clear_node_target_power_state(self, hostname): nodes = [] with _session_for_write(): @@ -831,6 +854,7 @@ class Connection(api.Connection): query = _filter_active_conductors(query) return query.all() + @oslo_db_api.retry_on_deadlock def register_conductor_hardware_interfaces(self, conductor_id, hardware_type, interface_type, interfaces, default_interface): @@ -852,12 +876,14 @@ class Connection(api.Connection): interface_type=interface_type, interfaces=interfaces) + @oslo_db_api.retry_on_deadlock def unregister_conductor_hardware_interfaces(self, conductor_id): with _session_for_write(): query = (model_query(models.ConductorHardwareInterfaces) .filter_by(conductor_id=conductor_id)) query.delete() + @oslo_db_api.retry_on_deadlock def touch_node_provisioning(self, node_id): with _session_for_write(): query = model_query(models.Node) @@ -870,6 +896,7 @@ class Connection(api.Connection): if not model_query(models.Node).filter_by(id=node_id).scalar(): raise exception.NodeNotFound(node=node_id) + @oslo_db_api.retry_on_deadlock def set_node_tags(self, node_id, tags): # remove duplicate tags tags = set(tags) @@ -883,6 +910,7 @@ class Connection(api.Connection): return node_tags + @oslo_db_api.retry_on_deadlock def unset_node_tags(self, node_id): self._check_node_exists(node_id) with _session_for_write(): @@ -895,6 +923,7 @@ class Connection(api.Connection): .all()) return result + @oslo_db_api.retry_on_deadlock def add_node_tag(self, node_id, tag): node_tag = models.NodeTag(tag=tag, node_id=node_id) @@ -909,6 +938,7 @@ class Connection(api.Connection): return node_tag + @oslo_db_api.retry_on_deadlock def delete_node_tag(self, node_id, tag): self._check_node_exists(node_id) with _session_for_write(): @@ -964,6 +994,7 @@ class Connection(api.Connection): return _paginate_query(models.VolumeConnector, limit, marker, sort_key, sort_dir, query) + @oslo_db_api.retry_on_deadlock def create_volume_connector(self, connector_info): if 'uuid' not in connector_info: connector_info['uuid'] = uuidutils.generate_uuid() @@ -983,6 +1014,7 @@ class Connection(api.Connection): uuid=connector_info['uuid']) return connector + @oslo_db_api.retry_on_deadlock def update_volume_connector(self, ident, connector_info): if 'uuid' in connector_info: msg = _("Cannot overwrite UUID for an existing Volume Connector.") @@ -1006,6 +1038,7 @@ class Connection(api.Connection): raise exception.VolumeConnectorNotFound(connector=ident) return ref + @oslo_db_api.retry_on_deadlock def destroy_volume_connector(self, ident): with _session_for_write(): query = model_query(models.VolumeConnector) @@ -1039,6 +1072,7 @@ class Connection(api.Connection): return _paginate_query(models.VolumeTarget, limit, marker, sort_key, sort_dir, query) + @oslo_db_api.retry_on_deadlock def create_volume_target(self, target_info): if 'uuid' not in target_info: target_info['uuid'] = uuidutils.generate_uuid() @@ -1057,6 +1091,7 @@ class Connection(api.Connection): uuid=target_info['uuid']) return target + @oslo_db_api.retry_on_deadlock def update_volume_target(self, ident, target_info): if 'uuid' in target_info: msg = _("Cannot overwrite UUID for an existing Volume Target.") @@ -1077,6 +1112,7 @@ class Connection(api.Connection): raise exception.VolumeTargetNotFound(target=ident) return ref + @oslo_db_api.retry_on_deadlock def destroy_volume_target(self, ident): with _session_for_write(): query = model_query(models.VolumeTarget) diff --git a/ironic/tests/unit/db/sqlalchemy/test_api.py b/ironic/tests/unit/db/sqlalchemy/test_api.py new file mode 100644 index 0000000000..bd620e86c7 --- /dev/null +++ b/ironic/tests/unit/db/sqlalchemy/test_api.py @@ -0,0 +1,32 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import inspect + +from ironic.db.sqlalchemy import api as sqlalchemy_api +from ironic.tests import base as test_base + + +class TestDBWriteMethodsRetryOnDeadlock(test_base.TestCase): + + def test_retry_on_deadlock(self): + # This test ensures that every dbapi method doing database write is + # wrapped with retry_on_deadlock decorator + for name, method in inspect.getmembers(sqlalchemy_api.Connection, + predicate=inspect.ismethod): + src = inspect.getsource(method) + if 'with _session_for_write()' in src: + self.assertIn( + '@oslo_db_api.retry_on_deadlock', src, + 'oslo_db\'s retry_on_deadlock decorator not ' + 'applied to method ironic.db.sqlalchemy.api.Connection.%s ' + 'doing database write' % name) diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py index 62293f5c0f..abe57390c9 100644 --- a/ironic/tests/unit/db/test_conductor.py +++ b/ironic/tests/unit/db/test_conductor.py @@ -18,6 +18,8 @@ import datetime import mock +from oslo_db import exception as db_exc +from oslo_db import sqlalchemy from oslo_utils import timeutils from ironic.common import exception @@ -129,6 +131,13 @@ class DbConductorTestCase(base.DbTestCase): c = self.dbapi.get_conductor(c.hostname) self.assertEqual(test_time, timeutils.normalize_time(c.updated_at)) + @mock.patch.object(sqlalchemy.orm.Query, 'update', autospec=True) + def test_touch_conductor_deadlock(self, mock_update): + mock_update.side_effect = [db_exc.DBDeadlock(), None] + c = self._create_test_cdr() + self.dbapi.touch_conductor(c.hostname) + self.assertEqual(2, mock_update.call_count) + def test_touch_conductor_not_found(self): # A conductor's heartbeat will not create a new record, # it will only update existing ones diff --git a/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml b/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml new file mode 100644 index 0000000000..d1536407a9 --- /dev/null +++ b/releasenotes/notes/add-db-deadlock-handling-6bc10076537f3727.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - Fixes an issue which caused conductor's periodic tasks to stop executing. + See https://bugs.launchpad.net/ironic/+bug/1637210 +features: + - Adds DBDeadlock handling which may improve stability when using Galera. + See https://bugs.launchpad.net/ironic/+bug/1639338 +upgrade: + - All DB API methods doing database writes now retry on deadlock. The + ``[database]db_max_retries`` configuration option specifies the maximum + number of times to retry, and can be customised if necessary. It is 5 by + default.