From 0099d1812d0c018be65c47c5f1eb6e98b56cbf99 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Thu, 13 Jul 2023 14:35:49 -0700 Subject: [PATCH] Don't actually heartbeat with sqlite Disables internal heartbeat mechanism when ironic has been configured to utilize a SQLite database backend. This is done to lessen the possibility of a "database is locked" error, which can occur when two distinct threads attempt to write to the database at the same time with open writers. The process keepalive heartbeat process was identified as a major source of these write operations as it was writing every ten seconds by default, which would also collide with periodic tasks. Change-Id: I7b6d7a78ba2910f22673ad8e72e255f321d3fdff --- ironic/cmd/api.py | 12 ++++ ironic/cmd/conductor.py | 17 +++++ ironic/common/hash_ring.py | 7 +- ironic/common/utils.py | 14 ++++ ironic/conductor/base_manager.py | 5 ++ ironic/db/sqlalchemy/api.py | 26 ++++--- ironic/tests/unit/common/test_hash_ring.py | 53 +++++++++++++- .../tests/unit/conductor/test_base_manager.py | 24 ++++++- ironic/tests/unit/db/test_conductor.py | 70 +++++++++++++++++-- ...onductor-with-sqlite-7cfc71e717d69bc5.yaml | 13 ++++ 10 files changed, 223 insertions(+), 18 deletions(-) create mode 100644 releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml diff --git a/ironic/cmd/api.py b/ironic/cmd/api.py index f75aa9ee7d..0ec925168e 100644 --- a/ironic/cmd/api.py +++ b/ironic/cmd/api.py @@ -30,6 +30,17 @@ CONF = cfg.CONF LOG = log.getLogger(__name__) +def _error_about_sqlite_usage(): + if 'sqlite' in CONF.database.connection.lower(): + # TODO(TheJulia): Make this a hard error in C* + LOG.error('We have detected the API is being launched with a SQLite ' + 'database backend. This is unsupported, and will be a hard ' + 'error in the future. This is becaues multi-process use of ' + 'a single SQLite database is problematic in terms of ' + 'locking. A single process ironic model exists for use with ' + 'SQLite.') + + def main(): # Parse config file and command line options, then start logging ironic_service.prepare_service('ironic_api', sys.argv) @@ -43,4 +54,5 @@ def main(): if __name__ == '__main__': + _error_about_sqlite_usage() sys.exit(main()) diff --git a/ironic/cmd/conductor.py b/ironic/cmd/conductor.py index a932cb22fb..36330c9949 100644 --- a/ironic/cmd/conductor.py +++ b/ironic/cmd/conductor.py @@ -27,6 +27,7 @@ from oslo_service import service from ironic.common import rpc_service from ironic.common import service as ironic_service +from ironic.common import utils CONF = cfg.CONF @@ -43,8 +44,24 @@ def warn_about_unsafe_shred_parameters(conf): 'Secure Erase. This is a possible SECURITY ISSUE!') +def warn_about_sqlite(): + # We are intentionally calling the helper here to ensure it caches + # for all future calls. + if utils.is_ironic_using_sqlite(): + LOG.warning('Ironic has been configured to utilize SQLite. ' + 'This has some restrictions and impacts. You must run ' + 'as as a single combined ironic process, and some ' + 'internal mechanisms do not execute such as the hash ' + 'ring will remain static and the conductor\'s ' + '``last_updated`` field will also not update. This is ' + 'in order to minimize database locking issues present ' + 'as a result of SQLAlchemy 2.0 and the removal of ' + 'autocommit support.') + + def issue_startup_warnings(conf): warn_about_unsafe_shred_parameters(conf) + warn_about_sqlite() def main(): diff --git a/ironic/common/hash_ring.py b/ironic/common/hash_ring.py index 8986d5bd25..fab102b0e7 100644 --- a/ironic/common/hash_ring.py +++ b/ironic/common/hash_ring.py @@ -21,6 +21,7 @@ from tooz import hashring from ironic.common import exception from ironic.common.i18n import _ +from ironic.common import utils from ironic.conf import CONF from ironic.db import api as dbapi @@ -48,7 +49,11 @@ class HashRingManager(object): # Hot path, no lock. Using a local variable to avoid races with code # changing the class variable. hash_rings, updated_at = self.__class__._hash_rings - if hash_rings is not None and updated_at >= limit: + if (hash_rings is not None + and (updated_at >= limit + or utils.is_ironic_using_sqlite())): + # Returning the hash ring for us, if it is still valid, + # or if we're using sqlite. return hash_rings with self._lock: diff --git a/ironic/common/utils.py b/ironic/common/utils.py index 793b4b501d..6bae4e7e88 100644 --- a/ironic/common/utils.py +++ b/ironic/common/utils.py @@ -57,6 +57,8 @@ TZ_RE = r'((?P[+-])(?P\d{2}):(?P\d{2}))' + \ DATETIME_RE = re.compile( '%sT%s(%s)?' % (DATE_RE, TIME_RE, TZ_RE)) +USING_SQLITE = None + def _get_root_helper(): # NOTE(jlvillal): This function has been moved to ironic-lib. And is @@ -724,3 +726,15 @@ def parse_kernel_params(params): else: result[key] = value return result + + +def is_ironic_using_sqlite(): + """Return True if Ironic is configured to use SQLite""" + global USING_SQLITE + if USING_SQLITE is not None: + return USING_SQLITE + + # We're being called for the first time, lets cache and + # return the value. + USING_SQLITE = 'sqlite' in CONF.database.connection.lower() + return USING_SQLITE diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 544411e1d6..c5c677ab38 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -34,6 +34,7 @@ from ironic.common.i18n import _ from ironic.common import release_mappings as versions from ironic.common import rpc from ironic.common import states +from ironic.common import utils as common_utils from ironic.conductor import allocations from ironic.conductor import notification_utils as notify_utils from ironic.conductor import task_manager @@ -440,6 +441,10 @@ class BaseConductorManager(object): raise exception.NoFreeConductorWorker() def _conductor_service_record_keepalive(self): + if common_utils.is_ironic_using_sqlite(): + # Exit this keepalive heartbeats are disabled and not + # considered. + return while not self._keepalive_evt.is_set(): try: self.conductor.touch() diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 6e3daf1422..cbacf32024 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -355,10 +355,14 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None, def _filter_active_conductors(query, interval=None): if interval is None: interval = CONF.conductor.heartbeat_timeout - limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) - - query = (query.filter(models.Conductor.online.is_(True)) - .filter(models.Conductor.updated_at >= limit)) + if not utils.is_ironic_using_sqlite() and interval > 0: + # Check for greater than zero becaues if the value is zero, + # then the logic makes no sense. + limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) + query = (query.filter(models.Conductor.online.is_(True)) + .filter(models.Conductor.updated_at >= limit)) + else: + query = query.filter(models.Conductor.online.is_(True)) return query @@ -1433,10 +1437,16 @@ class Connection(api.Connection): def get_offline_conductors(self, field='hostname'): with _session_for_read() as session: field = getattr(models.Conductor, field) - interval = CONF.conductor.heartbeat_timeout - limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) - result = (session.query(field) - .filter(models.Conductor.updated_at < limit)) + if not utils.is_ironic_using_sqlite(): + interval = CONF.conductor.heartbeat_timeout + limit = (timeutils.utcnow() + - datetime.timedelta(seconds=interval)) + result = (session.query(field) + .filter(models.Conductor.updated_at < limit)) + else: + result = session.query( + field + ).filter(models.Conductor.online.is_(False)) return [row[0] for row in result] def get_online_conductors(self): diff --git a/ironic/tests/unit/common/test_hash_ring.py b/ironic/tests/unit/common/test_hash_ring.py index a84b9ffcfa..085420404d 100644 --- a/ironic/tests/unit/common/test_hash_ring.py +++ b/ironic/tests/unit/common/test_hash_ring.py @@ -14,11 +14,13 @@ # under the License. import time +from unittest import mock from oslo_config import cfg from ironic.common import exception from ironic.common import hash_ring +from ironic.common import utils from ironic.tests.unit.db import base as db_base CONF = cfg.CONF @@ -92,7 +94,9 @@ class HashRingManagerTestCase(db_base.DbTestCase): self.register_conductors() self.ring_manager.get_ring('hardware-type', '') - def test_hash_ring_manager_reset_interval(self): + @mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True) + def test_hash_ring_manager_reset_interval(self, is_sqlite_mock): + is_sqlite_mock.return_value = False CONF.set_override('hash_ring_reset_interval', 30) # Just to simplify calculations CONF.set_override('hash_partition_exponent', 0) @@ -133,6 +137,53 @@ class HashRingManagerTestCase(db_base.DbTestCase): ) ring = self.ring_manager.get_ring('hardware-type', '') self.assertEqual(2, len(ring)) + self.assertEqual(3, is_sqlite_mock.call_count) + + @mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True) + def test_hash_ring_manager_reset_interval_not_happen_sqlite( + self, is_sqlite_mock): + is_sqlite_mock.return_value = True + CONF.set_override('hash_ring_reset_interval', 30) + # Just to simplify calculations + CONF.set_override('hash_partition_exponent', 0) + c1 = self.dbapi.register_conductor({ + 'hostname': 'host1', + 'drivers': ['driver1', 'driver2'], + }) + c2 = self.dbapi.register_conductor({ + 'hostname': 'host2', + 'drivers': ['driver1'], + }) + self.dbapi.register_conductor_hardware_interfaces( + c1.id, + [{'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'ansible', 'default': True}, + {'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'direct', 'default': False}] + ) + + ring = self.ring_manager.get_ring('hardware-type', '') + self.assertEqual(1, len(ring)) + + self.dbapi.register_conductor_hardware_interfaces( + c2.id, + [{'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'ansible', 'default': True}, + {'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'direct', 'default': False}] + ) + ring = self.ring_manager.get_ring('hardware-type', '') + # The new conductor is not known yet. Automatic retry does not kick in, + # since there is an active conductor for the requested hardware type. + self.assertEqual(1, len(ring)) + + self.ring_manager.__class__._hash_rings = ( + self.ring_manager.__class__._hash_rings[0], + time.monotonic() - 31 + ) + ring = self.ring_manager.get_ring('hardware-type', '') + self.assertEqual(1, len(ring)) + self.assertEqual(2, is_sqlite_mock.call_count) def test_hash_ring_manager_uncached(self): ring_mgr = hash_ring.HashRingManager(cache=False, diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index e690031232..7de7be3724 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -27,6 +27,7 @@ from oslo_utils import uuidutils from ironic.common import driver_factory from ironic.common import exception from ironic.common import states +from ironic.common import utils as common_utils from ironic.conductor import base_manager from ironic.conductor import manager from ironic.conductor import notification_utils @@ -322,12 +323,24 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): 'is_set', autospec=True) as mock_is_set: mock_is_set.side_effect = [False, True] self.service._conductor_service_record_keepalive() + mock_touch.assert_not_called() + with mock.patch.object(self.service._keepalive_evt, + 'is_set', autospec=True) as mock_is_set: + mock_is_set.side_effect = [False, True] + with mock.patch.object(common_utils, 'is_ironic_using_sqlite', + autospec=True) as mock_is_sqlite: + mock_is_sqlite.return_value = False + self.service._conductor_service_record_keepalive() + self.assertEqual(1, mock_is_sqlite.call_count) mock_touch.assert_called_once_with(self.hostname) - def test__conductor_service_record_keepalive_failed_db_conn(self): + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + def test__conductor_service_record_keepalive_failed_db_conn( + self, is_sqlite_mock): self._start_service() # avoid wasting time at the event.wait() CONF.set_override('heartbeat_interval', 0, 'conductor') + is_sqlite_mock.return_value = False with mock.patch.object(self.dbapi, 'touch_conductor', autospec=True) as mock_touch: mock_touch.side_effect = [None, db_exception.DBConnectionError(), @@ -337,11 +350,15 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_is_set.side_effect = [False, False, False, True] self.service._conductor_service_record_keepalive() self.assertEqual(3, mock_touch.call_count) + self.assertEqual(1, is_sqlite_mock.call_count) - def test__conductor_service_record_keepalive_failed_error(self): + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + def test__conductor_service_record_keepalive_failed_error(self, + is_sqlite_mock): self._start_service() - # avoid wasting time at the event.wait() + # minimal time at the event.wait() CONF.set_override('heartbeat_interval', 0, 'conductor') + is_sqlite_mock.return_value = False with mock.patch.object(self.dbapi, 'touch_conductor', autospec=True) as mock_touch: mock_touch.side_effect = [None, Exception(), @@ -351,6 +368,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_is_set.side_effect = [False, False, False, True] self.service._conductor_service_record_keepalive() self.assertEqual(3, mock_touch.call_count) + self.assertEqual(1, is_sqlite_mock.call_count) class ManagerSpawnWorkerTestCase(tests_base.TestCase): diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py index efc3a38a39..a16ef08b5c 100644 --- a/ironic/tests/unit/db/test_conductor.py +++ b/ironic/tests/unit/db/test_conductor.py @@ -21,6 +21,7 @@ from unittest import mock from oslo_utils import timeutils from ironic.common import exception +from ironic.common import utils as common_utils from ironic.tests.unit.db import base from ironic.tests.unit.db import utils @@ -295,9 +296,11 @@ class DbConductorTestCase(base.DbTestCase): result = self.dbapi.get_active_hardware_type_dict() self.assertEqual(expected, result) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_active_hardware_type_dict_with_old_conductor(self, - mock_utcnow): + def test_get_active_hardware_type_dict_with_old_conductor( + self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False past = datetime.datetime(2000, 1, 1, 0, 0) present = past + datetime.timedelta(minutes=2) @@ -326,9 +329,12 @@ class DbConductorTestCase(base.DbTestCase): expected = {ht: {h1, h2}, ht1: {h1}, ht2: {h2}} result = self.dbapi.get_active_hardware_type_dict() self.assertEqual(expected, result) + self.assertEqual(2, mock_is_sqlite.call_count) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_offline_conductors(self, mock_utcnow): + def test_get_offline_conductors(self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False self.config(heartbeat_timeout=60, group='conductor') time_ = datetime.datetime(2000, 1, 1, 0, 0) @@ -344,9 +350,34 @@ class DbConductorTestCase(base.DbTestCase): mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) self.assertEqual([c.hostname], self.dbapi.get_offline_conductors()) self.assertEqual([c.id], self.dbapi.get_offline_conductors(field='id')) + self.assertEqual(3, mock_is_sqlite.call_count) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_online_conductors(self, mock_utcnow): + def test_get_offline_conductors_with_sqlite(self, mock_utcnow, + mock_is_sqlite): + mock_is_sqlite.return_value = True + self.config(heartbeat_timeout=60, group='conductor') + time_ = datetime.datetime(2000, 1, 1, 0, 0) + + mock_utcnow.return_value = time_ + self._create_test_cdr() + + # Only 30 seconds passed since last heartbeat, it's still + # considered alive + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30) + self.assertEqual([], self.dbapi.get_offline_conductors()) + + # 61 seconds passed since last heartbeat, it's dead + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) + self.assertEqual([], self.dbapi.get_offline_conductors()) + self.assertEqual([], self.dbapi.get_offline_conductors(field='id')) + self.assertEqual(3, mock_is_sqlite.call_count) + + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + @mock.patch.object(timeutils, 'utcnow', autospec=True) + def test_get_online_conductors(self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False self.config(heartbeat_timeout=60, group='conductor') time_ = datetime.datetime(2000, 1, 1, 0, 0) @@ -361,6 +392,26 @@ class DbConductorTestCase(base.DbTestCase): # 61 seconds passed since last heartbeat, it's dead mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) self.assertEqual([], self.dbapi.get_online_conductors()) + self.assertEqual(2, mock_is_sqlite.call_count) + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + def test_get_online_conductors_with_sqlite(self, mock_utcnow): + # NOTE(TheJulia): Explicitly skipping the mock so the underlying + # code on the test is 'tested'. + self.config(heartbeat_timeout=60, group='conductor') + time_ = datetime.datetime(2000, 1, 1, 0, 0) + + mock_utcnow.return_value = time_ + c = self._create_test_cdr() + + # Only 30 seconds passed since last heartbeat, it's still + # considered alive + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30) + self.assertEqual([c.hostname], self.dbapi.get_online_conductors()) + + # 61 seconds passed since last heartbeat, it's dead + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) + self.assertEqual([c.hostname], self.dbapi.get_online_conductors()) @mock.patch.object(timeutils, 'utcnow', autospec=True) def test_list_hardware_type_interfaces(self, mock_utcnow): @@ -415,5 +466,14 @@ class DbConductorTestCase(base.DbTestCase): # 61 seconds passed since last heartbeat, it's dead mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) - result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + with mock.patch.object(common_utils, 'is_ironic_using_sqlite', + autospec=True) as mock_is_sqlite: + mock_is_sqlite.return_value = False + result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + self.assertEqual(1, mock_is_sqlite.call_count) self.assertEqual([], result) + + # Validate we still have four entries with SQLite, which is the + # default for all unit tests running. + result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + self.assertEqual(4, len(result)) diff --git a/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml b/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml new file mode 100644 index 0000000000..fb0eeaae4f --- /dev/null +++ b/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml @@ -0,0 +1,13 @@ +--- +deprecations: + - | + The use of a SQLite database with mutli-process (i.e. ``ironic-api`` and + ``ironic-conductor`` services) is not supported, and the ability to launch + a dedicated ``ironic-api`` process with a SQLite database backend will + be an error in the future. In this case, the single process combined + API and Conductor service should be utilized. +fixes: + - | + Database locks with a ``sqlite`` database backend should now be lessened + as the conductor will no longer perform a keepalive heartbeat operation + when the use of SQLite has been detected.