diff --git a/ironic/conf/database.py b/ironic/conf/database.py index fccf2f437d..37986e1cd3 100644 --- a/ironic/conf/database.py +++ b/ironic/conf/database.py @@ -20,7 +20,15 @@ from ironic.common.i18n import _ opts = [ cfg.StrOpt('mysql_engine', default='InnoDB', - help=_('MySQL engine to use.')) + help=_('MySQL engine to use.')), + cfg.BoolOpt('sqlite_retries', + default=True, + help=_('If SQLite database operation retry logic is enabled ' + 'or not. Enabled by default.')), + cfg.IntOpt('sqlite_max_wait_for_retry', + default=30, + help=_('Maximum number of seconds to retry SQLite database ' + 'locks.')), ] diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index cbacf32024..5c976e8152 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -16,7 +16,9 @@ import collections import datetime +import functools import json +import logging import threading from oslo_concurrency import lockutils @@ -37,6 +39,7 @@ from sqlalchemy.exc import NoResultFound, MultipleResultsFound from sqlalchemy.orm import Load from sqlalchemy.orm import selectinload from sqlalchemy import sql +import tenacity from ironic.common import exception from ironic.common.i18n import _ @@ -63,6 +66,33 @@ synchronized = lockutils.synchronized_with_prefix('ironic-') MAX_TRAITS_PER_NODE = 50 +def wrap_sqlite_retry(f): + + @functools.wraps(f) + def wrapper(*args, **kwargs): + if (CONF.database.sqlite_retries + and not utils.is_ironic_using_sqlite()): + return f(*args, **kwargs) + else: + for attempt in tenacity.Retrying( + retry=( + tenacity.retry_if_exception_type( + sa.exc.OperationalError) + & tenacity.retry_if_exception( + lambda e: 'database is locked' in str(e)) + ), + wait=tenacity.wait_full_jitter( + multiplier=0.25, + max=CONF.database.sqlite_max_wait_for_retry), + before_sleep=( + tenacity.before_sleep_log(LOG, logging.debug) + ), + reraise=True): + with attempt: + return f(*args, **kwargs) + return wrapper + + def get_backend(): """The backend is this 
module itself.""" return Connection() @@ -653,6 +683,7 @@ class Connection(api.Connection): return mapping @synchronized(RESERVATION_SEMAPHORE, fair=True) + @wrap_sqlite_retry def _reserve_node_place_lock(self, tag, node_id, node): try: # NOTE(TheJulia): We explicitly do *not* synch the session @@ -705,6 +736,7 @@ class Connection(api.Connection): # Return a node object as that is the contract for this method. return self.get_node_by_id(node.id) + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def release_node(self, tag, node_id): with _session_for_read() as session: @@ -734,6 +766,7 @@ class Connection(api.Connection): except NoResultFound: raise exception.NodeNotFound(node=node_id) + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def create_node(self, values): # ensure defaults are present for new nodes @@ -839,6 +872,7 @@ class Connection(api.Connection): raise exception.InstanceNotFound(instance_uuid=instance) return res + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def destroy_node(self, node_id): with _session_for_write() as session: @@ -914,6 +948,7 @@ class Connection(api.Connection): query.delete() + @wrap_sqlite_retry def update_node(self, node_id, values): # NOTE(dtantsur): this can lead to very strange errors if 'uuid' in values: @@ -1059,6 +1094,7 @@ class Connection(api.Connection): return _paginate_query(models.Port, limit, marker, sort_key, sort_dir, query) + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def create_port(self, values): if not values.get('uuid'): @@ -1076,6 +1112,7 @@ class Connection(api.Connection): raise exception.PortAlreadyExists(uuid=values['uuid']) return port + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def update_port(self, port_id, values): # NOTE(dtantsur): this can lead to very strange errors @@ -1098,6 +1135,7 @@ class Connection(api.Connection): raise exception.MACAlreadyExists(mac=values['address']) return ref + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def destroy_port(self, port_id): 
with _session_for_write() as session: @@ -1498,6 +1536,7 @@ class Connection(api.Connection): .filter_by(conductor_id=conductor_id)) query.delete() + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def touch_node_provisioning(self, node_id): with _session_for_write() as session: @@ -2049,6 +2088,7 @@ class Connection(api.Connection): models.NodeTrait).filter_by(node_id=node_id, trait=trait) return session.query(q.exists()).scalar() + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def create_bios_setting_list(self, node_id, settings, version): bios_settings = [] @@ -2078,6 +2118,7 @@ class Connection(api.Connection): node=node_id, name=setting['name']) return bios_settings + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def update_bios_setting_list(self, node_id, settings, version): bios_settings = [] @@ -2109,6 +2150,7 @@ class Connection(api.Connection): node=node_id, name=setting['name']) return bios_settings + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def delete_bios_setting_list(self, node_id, names): missing_bios_settings = [] @@ -2635,6 +2677,7 @@ class Connection(api.Connection): # ordered ascending originally. return final_set + @wrap_sqlite_retry def bulk_delete_node_history_records(self, entries): with _session_for_write() as session: # Uses input entry list, selects entries matching those ids @@ -2672,6 +2715,7 @@ class Connection(api.Connection): ) return res + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def create_node_inventory(self, values): inventory = models.NodeInventory() @@ -2686,6 +2730,7 @@ class Connection(api.Connection): session.flush() return inventory + @wrap_sqlite_retry @oslo_db_api.retry_on_deadlock def destroy_node_inventory_by_node_id(self, node_id): with _session_for_write() as session: @@ -2748,6 +2793,7 @@ class Connection(api.Connection): return shard_list + @wrap_sqlite_retry def create_firmware_component(self, values): """Create a FirmwareComponent record for a given node. 
@@ -2779,6 +2825,7 @@ class Connection(api.Connection): name=values['component'], node=values['node_id']) return fw_component + @wrap_sqlite_retry def update_firmware_component(self, node_id, component, values): """Update a FirmwareComponent record. diff --git a/ironic/tests/unit/db/base.py b/ironic/tests/unit/db/base.py index 8541e53642..e251b067e7 100644 --- a/ironic/tests/unit/db/base.py +++ b/ironic/tests/unit/db/base.py @@ -43,6 +43,8 @@ class Database(fixtures.Fixture): self.post_migrations() self._DB = "".join(line for line in conn.connection.iterdump()) self.engine.dispose() + # Disable retry logic for unit testing, to minimize overhead. + CONF.set_override('sqlite_retries', False, group='database') def setup_sqlite(self, db_migrate): if db_migrate.version(): diff --git a/ironic/tests/unit/db/test_nodes.py b/ironic/tests/unit/db/test_nodes.py index e8c0fa6e48..ba93ec7122 100644 --- a/ironic/tests/unit/db/test_nodes.py +++ b/ironic/tests/unit/db/test_nodes.py @@ -18,17 +18,27 @@ import datetime from unittest import mock +from oslo_config import cfg from oslo_utils import timeutils from oslo_utils import uuidutils -from sqlalchemy.orm import exc as sa_exc +from sqlalchemy import exc as sa_exc +# NOTE(TheJulia): At some point, we should drop the line below, +# as I understand the ORM exceptions are going to move completely +# after SQLAlchemy 2.0 to just the base exception class. 
+from sqlalchemy.orm import exc as sa_orm_exc from ironic.common import exception from ironic.common import states +from ironic.db.sqlalchemy import api as dbapi +from ironic.db.sqlalchemy.api import Connection as db_conn from ironic.db.sqlalchemy.models import NodeInventory from ironic.tests.unit.db import base from ironic.tests.unit.db import utils +CONF = cfg.CONF + + class DbNodeTestCase(base.DbTestCase): def test_create_node(self): @@ -535,7 +545,7 @@ class DbNodeTestCase(base.DbTestCase): 'deploy_interface', 'boot_interface', 'driver', 'extra']: try: - self.assertRaises(sa_exc.DetachedInstanceError, + self.assertRaises(sa_orm_exc.DetachedInstanceError, _attempt_field_access, r, field) except AttributeError: pass @@ -592,7 +602,7 @@ class DbNodeTestCase(base.DbTestCase): 'driver', 'extra', 'power_state', 'traits']: try: - self.assertRaises(sa_exc.DetachedInstanceError, + self.assertRaises(sa_orm_exc.DetachedInstanceError, _attempt_field_access, r, field) except AttributeError: # We expect an AttributeError, in addition to @@ -847,6 +857,28 @@ class DbNodeTestCase(base.DbTestCase): self.assertRaises(exception.NodeNotFound, self.dbapi.update_node, node_uuid, {'extra': new_extra}) + @mock.patch.object(dbapi, 'LOG', autospec=True) + def test_update_node_retries(self, log_mock): + """Test retry logic to ensure it works.""" + node = utils.create_test_node() + CONF.set_override('sqlite_retries', True, group='database') + # NOTE(TheJulia): Update is an ideal place to test retries + # as the underlying work is done by _do_update_node. 
+ with mock.patch.object(db_conn, '_do_update_node', + autospec=True) as mock_update: + sa_err = sa_exc.OperationalError( + statement=None, + params=None, + orig=Exception('database is locked')) + mock_update.side_effect = [ + sa_err, + sa_err, + node + ] + self.dbapi.update_node(node.id, {'extra': {'foo': 'bar'}}) + self.assertEqual(3, mock_update.call_count) + self.assertEqual(2, log_mock.log.call_count) + def test_update_node_uuid(self): node = utils.create_test_node() self.assertRaises(exception.InvalidParameterValue, diff --git a/ironic/tests/unit/objects/test_bios.py b/ironic/tests/unit/objects/test_bios.py index 5f4d8b9993..9f7f4e7465 100644 --- a/ironic/tests/unit/objects/test_bios.py +++ b/ironic/tests/unit/objects/test_bios.py @@ -15,6 +15,7 @@ from unittest import mock from ironic.common import context from ironic.db import api as dbapi +from ironic.db.sqlalchemy.api import Connection as db_conn from ironic import objects from ironic.tests.unit.db import base as db_base from ironic.tests.unit.db import utils as db_utils @@ -71,7 +72,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): self.assertEqual(bios_setting2['name'], bios_obj_list[1].name) self.assertEqual(bios_setting2['value'], bios_obj_list[1].value) - @mock.patch.object(dbapi.IMPL, 'create_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'create_bios_setting_list', autospec=True) def test_create(self, mock_create_list): fake_call_args = {'node_id': self.bios_setting['node_id'], 'name': self.bios_setting['name'], @@ -97,7 +98,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): mock_create_list.return_value = [self.bios_setting] mock_create_list.call_args bios_obj.create() - mock_create_list.assert_called_once_with(self.bios_setting['node_id'], + mock_create_list.assert_called_once_with(mock.ANY, + self.bios_setting['node_id'], setting, self.bios_setting['version']) self.assertEqual(self.bios_setting['node_id'], 
bios_obj.node_id) @@ -113,7 +115,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): bios_obj.reset_required) self.assertEqual(self.bios_setting['unique'], bios_obj.unique) - @mock.patch.object(dbapi.IMPL, 'update_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'update_bios_setting_list', autospec=True) def test_save(self, mock_update_list): fake_call_args = {'node_id': self.bios_setting['node_id'], 'name': self.bios_setting['name'], @@ -131,14 +133,15 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): mock_update_list.return_value = [self.bios_setting] mock_update_list.call_args bios_obj.save() - mock_update_list.assert_called_once_with(self.bios_setting['node_id'], + mock_update_list.assert_called_once_with(mock.ANY, + self.bios_setting['node_id'], setting, self.bios_setting['version']) self.assertEqual(self.bios_setting['node_id'], bios_obj.node_id) self.assertEqual(self.bios_setting['name'], bios_obj.name) self.assertEqual(self.bios_setting['value'], bios_obj.value) - @mock.patch.object(dbapi.IMPL, 'create_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'create_bios_setting_list', autospec=True) def test_list_create(self, mock_create_list): bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread', value='enabled') @@ -147,7 +150,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): bios_obj_list = objects.BIOSSettingList.create( self.context, self.node_id, settings) - mock_create_list.assert_called_once_with(self.node_id, settings, '1.1') + mock_create_list.assert_called_once_with(mock.ANY, self.node_id, + settings, '1.1') self.assertEqual(self.context, bios_obj_list._context) self.assertEqual(2, len(bios_obj_list)) self.assertEqual(self.bios_setting['node_id'], @@ -157,7 +161,7 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): self.assertEqual(bios_setting2['node_id'], bios_obj_list[1].node_id) 
self.assertEqual(bios_setting2['name'], bios_obj_list[1].name) - @mock.patch.object(dbapi.IMPL, 'update_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'update_bios_setting_list', autospec=True) def test_list_save(self, mock_update_list): bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread', value='enabled') @@ -166,7 +170,8 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): bios_obj_list = objects.BIOSSettingList.save( self.context, self.node_id, settings) - mock_update_list.assert_called_once_with(self.node_id, settings, '1.1') + mock_update_list.assert_called_once_with(mock.ANY, self.node_id, + settings, '1.1') self.assertEqual(self.context, bios_obj_list._context) self.assertEqual(2, len(bios_obj_list)) self.assertEqual(self.bios_setting['node_id'], @@ -177,19 +182,20 @@ class TestBIOSSettingObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): self.assertEqual(bios_setting2['name'], bios_obj_list[1].name) self.assertEqual(bios_setting2['value'], bios_obj_list[1].value) - @mock.patch.object(dbapi.IMPL, 'delete_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'delete_bios_setting_list', autospec=True) def test_delete(self, mock_delete): objects.BIOSSetting.delete(self.context, self.node_id, self.bios_setting['name']) - mock_delete.assert_called_once_with(self.node_id, + mock_delete.assert_called_once_with(mock.ANY, + self.node_id, [self.bios_setting['name']]) - @mock.patch.object(dbapi.IMPL, 'delete_bios_setting_list', autospec=True) + @mock.patch.object(db_conn, 'delete_bios_setting_list', autospec=True) def test_list_delete(self, mock_delete): bios_setting2 = db_utils.get_test_bios_setting(name='hyperthread') name_list = [self.bios_setting['name'], bios_setting2['name']] objects.BIOSSettingList.delete(self.context, self.node_id, name_list) - mock_delete.assert_called_once_with(self.node_id, name_list) + mock_delete.assert_called_once_with(mock.ANY, self.node_id, name_list) 
@mock.patch('ironic.objects.bios.BIOSSettingList.get_by_node_id', spec_set=types.FunctionType) diff --git a/ironic/tests/unit/objects/test_node.py b/ironic/tests/unit/objects/test_node.py index 05793e3a61..b2322a7a4f 100644 --- a/ironic/tests/unit/objects/test_node.py +++ b/ironic/tests/unit/objects/test_node.py @@ -22,6 +22,7 @@ from testtools import matchers from ironic.common import context from ironic.common import exception +from ironic.db.sqlalchemy.api import Connection as db_conn from ironic import objects from ironic.objects import node as node_objects from ironic.tests.unit.db import base as db_base @@ -479,15 +480,15 @@ class TestNodeObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): node_id) def test_release(self): - with mock.patch.object(self.dbapi, 'release_node', + with mock.patch.object(db_conn, 'release_node', autospec=True) as mock_release: node_id = self.fake_node['id'] fake_tag = 'fake-tag' objects.Node.release(self.context, fake_tag, node_id) - mock_release.assert_called_once_with(fake_tag, node_id) + mock_release.assert_called_once_with(mock.ANY, fake_tag, node_id) def test_release_node_not_found(self): - with mock.patch.object(self.dbapi, 'release_node', + with mock.patch.object(db_conn, 'release_node', autospec=True) as mock_release: node_id = 'non-existent' mock_release.side_effect = exception.NodeNotFound(node=node_id) @@ -496,25 +497,25 @@ class TestNodeObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): 'fake-tag', node_id) def test_touch_provisioning(self): - with mock.patch.object(self.dbapi, 'get_node_by_uuid', + with mock.patch.object(db_conn, 'get_node_by_uuid', autospec=True) as mock_get_node: mock_get_node.return_value = self.fake_node - with mock.patch.object(self.dbapi, 'touch_node_provisioning', + with mock.patch.object(db_conn, 'touch_node_provisioning', autospec=True) as mock_touch: node = objects.Node.get(self.context, self.fake_node['uuid']) node.touch_provisioning() - mock_touch.assert_called_once_with(node.id) + 
mock_touch.assert_called_once_with(mock.ANY, node.id) def test_create(self): node = obj_utils.get_test_node(self.ctxt, **self.fake_node) - with mock.patch.object(self.dbapi, 'create_node', + with mock.patch.object(db_conn, 'create_node', autospec=True) as mock_create_node: mock_create_node.return_value = db_utils.get_test_node() node.create() args, _kwargs = mock_create_node.call_args - self.assertEqual(objects.Node.VERSION, args[0]['version']) + self.assertEqual(objects.Node.VERSION, args[1]['version']) self.assertEqual(1, mock_create_node.call_count) def test_create_with_invalid_properties(self): diff --git a/ironic/tests/unit/objects/test_node_inventory.py b/ironic/tests/unit/objects/test_node_inventory.py index 9eee77a160..be16aa159a 100644 --- a/ironic/tests/unit/objects/test_node_inventory.py +++ b/ironic/tests/unit/objects/test_node_inventory.py @@ -12,6 +12,7 @@ from unittest import mock +from ironic.db.sqlalchemy.api import Connection as db_conn from ironic import objects from ironic.tests.unit.db import base as db_base from ironic.tests.unit.db import utils as db_utils @@ -25,25 +26,26 @@ class TestNodeInventoryObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): self.fake_inventory = db_utils.get_test_inventory() def test_create(self): - with mock.patch.object(self.dbapi, 'create_node_inventory', + with mock.patch.object(db_conn, 'create_node_inventory', autospec=True) as mock_db_create: mock_db_create.return_value = self.fake_inventory new_inventory = objects.NodeInventory( self.context, **self.fake_inventory) new_inventory.create() - mock_db_create.assert_called_once_with(self.fake_inventory) + mock_db_create.assert_called_once_with(mock.ANY, + self.fake_inventory) def test_destroy(self): node_id = self.fake_inventory['node_id'] with mock.patch.object(self.dbapi, 'get_node_inventory_by_node_id', autospec=True) as mock_get: mock_get.return_value = self.fake_inventory - with mock.patch.object(self.dbapi, + with mock.patch.object(db_conn, 
'destroy_node_inventory_by_node_id', autospec=True) as mock_db_destroy: inventory = objects.NodeInventory.get_by_node_id(self.context, node_id) inventory.destroy() - mock_db_destroy.assert_called_once_with(node_id) + mock_db_destroy.assert_called_once_with(mock.ANY, node_id) diff --git a/ironic/tests/unit/objects/test_port.py b/ironic/tests/unit/objects/test_port.py index 4c7280216f..2f43785764 100644 --- a/ironic/tests/unit/objects/test_port.py +++ b/ironic/tests/unit/objects/test_port.py @@ -21,6 +21,7 @@ from oslo_config import cfg from testtools import matchers from ironic.common import exception +from ironic.db.sqlalchemy.api import Connection as db_conn from ironic import objects from ironic.objects import base as obj_base from ironic.tests.unit.db import base as db_base @@ -86,9 +87,9 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): def test_create(self): port = objects.Port(self.context, **self.fake_port) - with mock.patch.object(self.dbapi, 'create_port', + with mock.patch.object(db_conn, 'create_port', autospec=True) as mock_create_port: - with mock.patch.object(self.dbapi, 'get_port_by_id', + with mock.patch.object(db_conn, 'get_port_by_id', autospec=True) as mock_get_port: test_port = db_utils.get_test_port() mock_create_port.return_value = test_port @@ -97,7 +98,7 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): port.create() args, _kwargs = mock_create_port.call_args - self.assertEqual(objects.Port.VERSION, args[0]['version']) + self.assertEqual(objects.Port.VERSION, args[1]['version']) def test_save(self): uuid = self.fake_port['uuid'] @@ -106,7 +107,7 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): with mock.patch.object(self.dbapi, 'get_port_by_uuid', autospec=True) as mock_get_port: mock_get_port.return_value = self.fake_port - with mock.patch.object(self.dbapi, 'update_port', + with mock.patch.object(db_conn, 'update_port', autospec=True) as mock_update_port: 
mock_update_port.return_value = ( db_utils.get_test_port(address=address, @@ -117,8 +118,10 @@ class TestPortObject(db_base.DbTestCase, obj_utils.SchemasTestMixIn): mock_get_port.assert_called_once_with(uuid) mock_update_port.assert_called_once_with( - uuid, {'version': objects.Port.VERSION, - 'address': "b2:54:00:cf:2d:40"}) + mock.ANY, + uuid, + {'version': objects.Port.VERSION, + 'address': "b2:54:00:cf:2d:40"}) self.assertEqual(self.context, p._context) res_updated_at = (p.updated_at).replace(tzinfo=None) self.assertEqual(test_time, res_updated_at) diff --git a/releasenotes/notes/add-sqlite-db-retries-f493d5d7aa6db78b.yaml b/releasenotes/notes/add-sqlite-db-retries-f493d5d7aa6db78b.yaml new file mode 100644 index 0000000000..4c5f116cc9 --- /dev/null +++ b/releasenotes/notes/add-sqlite-db-retries-f493d5d7aa6db78b.yaml @@ -0,0 +1,25 @@ +--- +fixes: + - | + Adds a database write retry decorator for SQLite failures reporting + "database is locked". By default, through the new configuration + parameter ``[database]sqlite_max_wait_for_retry``, retries will + be performed on failing write operations for up to *30* seconds. + + This value can be tuned, but be warned it is an exponential + backoff retry model, and HTTP requests can give up if no + response is received in a reasonable time, thus *30* seconds + was deemed a reasonable default. + + The retry logic can be disabled using the + ``[database]sqlite_retries`` option, which defaults to + *True*. Users of other, multi-threaded/concurrent-write database + platforms are not impacted by this change, as the retry logic + recognizes if another database is in use and bypasses the retry + logic in that case. A similar retry logic concept already exists + with other databases in the form of a "Database Deadlock" retry + where two writers conflict on the same row or table. The database + abstraction layer already handles such deadlock conditions. 
+ The SQLite file based locking issue is unfortunately more common + with file based write locking, as the entire file, in other words + the entire database, must be locked to perform the write operation.