diff --git a/ironic/cmd/api.py b/ironic/cmd/api.py index f75aa9ee7d..0ec925168e 100644 --- a/ironic/cmd/api.py +++ b/ironic/cmd/api.py @@ -30,6 +30,17 @@ CONF = cfg.CONF LOG = log.getLogger(__name__) +def _error_about_sqlite_usage(): + if 'sqlite' in CONF.database.connection.lower(): + # TODO(TheJulia): Make this a hard error in C* + LOG.error('We have detected the API is being launched with a SQLite ' + 'database backend. This is unsupported, and will be a hard ' + 'error in the future. This is becaues multi-process use of ' + 'a single SQLite database is problematic in terms of ' + 'locking. A single process ironic model exists for use with ' + 'SQLite.') + + def main(): # Parse config file and command line options, then start logging ironic_service.prepare_service('ironic_api', sys.argv) @@ -43,4 +54,5 @@ def main(): if __name__ == '__main__': + _error_about_sqlite_usage() sys.exit(main()) diff --git a/ironic/cmd/conductor.py b/ironic/cmd/conductor.py index a932cb22fb..36330c9949 100644 --- a/ironic/cmd/conductor.py +++ b/ironic/cmd/conductor.py @@ -27,6 +27,7 @@ from oslo_service import service from ironic.common import rpc_service from ironic.common import service as ironic_service +from ironic.common import utils CONF = cfg.CONF @@ -43,8 +44,24 @@ def warn_about_unsafe_shred_parameters(conf): 'Secure Erase. This is a possible SECURITY ISSUE!') +def warn_about_sqlite(): + # We are intentionally calling the helper here to ensure it caches + # for all future calls. + if utils.is_ironic_using_sqlite(): + LOG.warning('Ironic has been configured to utilize SQLite. ' + 'This has some restrictions and impacts. You must run ' + 'as as a single combined ironic process, and some ' + 'internal mechanisms do not execute such as the hash ' + 'ring will remain static and the conductor\'s ' + '``last_updated`` field will also not update. This is ' + 'in order to minimize database locking issues present ' + 'as a result of SQLAlchemy 2.0 and the removal of ' + 'autocommit support.') + + def issue_startup_warnings(conf): warn_about_unsafe_shred_parameters(conf) + warn_about_sqlite() def main(): diff --git a/ironic/common/hash_ring.py b/ironic/common/hash_ring.py index 8986d5bd25..fab102b0e7 100644 --- a/ironic/common/hash_ring.py +++ b/ironic/common/hash_ring.py @@ -21,6 +21,7 @@ from tooz import hashring from ironic.common import exception from ironic.common.i18n import _ +from ironic.common import utils from ironic.conf import CONF from ironic.db import api as dbapi @@ -48,7 +49,11 @@ class HashRingManager(object): # Hot path, no lock. Using a local variable to avoid races with code # changing the class variable. hash_rings, updated_at = self.__class__._hash_rings - if hash_rings is not None and updated_at >= limit: + if (hash_rings is not None + and (updated_at >= limit + or utils.is_ironic_using_sqlite())): + # Returning the hash ring for us, if it is still valid, + # or if we're using sqlite. return hash_rings with self._lock: diff --git a/ironic/common/utils.py b/ironic/common/utils.py index 793b4b501d..6bae4e7e88 100644 --- a/ironic/common/utils.py +++ b/ironic/common/utils.py @@ -57,6 +57,8 @@ TZ_RE = r'((?P[+-])(?P\d{2}):(?P\d{2}))' + \ DATETIME_RE = re.compile( '%sT%s(%s)?' % (DATE_RE, TIME_RE, TZ_RE)) +USING_SQLITE = None + def _get_root_helper(): # NOTE(jlvillal): This function has been moved to ironic-lib. And is @@ -724,3 +726,15 @@ def parse_kernel_params(params): else: result[key] = value return result + + +def is_ironic_using_sqlite(): + """Return True if Ironic is configured to use SQLite""" + global USING_SQLITE + if USING_SQLITE is not None: + return USING_SQLITE + + # We're being called for the first time, lets cache and + # return the value. + USING_SQLITE = 'sqlite' in CONF.database.connection.lower() + return USING_SQLITE diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 544411e1d6..c5c677ab38 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -34,6 +34,7 @@ from ironic.common.i18n import _ from ironic.common import release_mappings as versions from ironic.common import rpc from ironic.common import states +from ironic.common import utils as common_utils from ironic.conductor import allocations from ironic.conductor import notification_utils as notify_utils from ironic.conductor import task_manager @@ -440,6 +441,10 @@ class BaseConductorManager(object): raise exception.NoFreeConductorWorker() def _conductor_service_record_keepalive(self): + if common_utils.is_ironic_using_sqlite(): + # Exit this keepalive heartbeats are disabled and not + # considered. + return while not self._keepalive_evt.is_set(): try: self.conductor.touch() diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 6e3daf1422..cbacf32024 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -355,10 +355,14 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None, def _filter_active_conductors(query, interval=None): if interval is None: interval = CONF.conductor.heartbeat_timeout - limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) - - query = (query.filter(models.Conductor.online.is_(True)) - .filter(models.Conductor.updated_at >= limit)) + if not utils.is_ironic_using_sqlite() and interval > 0: + # Check for greater than zero becaues if the value is zero, + # then the logic makes no sense. + limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) + query = (query.filter(models.Conductor.online.is_(True)) + .filter(models.Conductor.updated_at >= limit)) + else: + query = query.filter(models.Conductor.online.is_(True)) return query @@ -1433,10 +1437,16 @@ class Connection(api.Connection): def get_offline_conductors(self, field='hostname'): with _session_for_read() as session: field = getattr(models.Conductor, field) - interval = CONF.conductor.heartbeat_timeout - limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) - result = (session.query(field) - .filter(models.Conductor.updated_at < limit)) + if not utils.is_ironic_using_sqlite(): + interval = CONF.conductor.heartbeat_timeout + limit = (timeutils.utcnow() + - datetime.timedelta(seconds=interval)) + result = (session.query(field) + .filter(models.Conductor.updated_at < limit)) + else: + result = session.query( + field + ).filter(models.Conductor.online.is_(False)) return [row[0] for row in result] def get_online_conductors(self): diff --git a/ironic/tests/unit/common/test_hash_ring.py b/ironic/tests/unit/common/test_hash_ring.py index a84b9ffcfa..085420404d 100644 --- a/ironic/tests/unit/common/test_hash_ring.py +++ b/ironic/tests/unit/common/test_hash_ring.py @@ -14,11 +14,13 @@ # under the License. import time +from unittest import mock from oslo_config import cfg from ironic.common import exception from ironic.common import hash_ring +from ironic.common import utils from ironic.tests.unit.db import base as db_base CONF = cfg.CONF @@ -92,7 +94,9 @@ class HashRingManagerTestCase(db_base.DbTestCase): self.register_conductors() self.ring_manager.get_ring('hardware-type', '') - def test_hash_ring_manager_reset_interval(self): + @mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True) + def test_hash_ring_manager_reset_interval(self, is_sqlite_mock): + is_sqlite_mock.return_value = False CONF.set_override('hash_ring_reset_interval', 30) # Just to simplify calculations CONF.set_override('hash_partition_exponent', 0) @@ -133,6 +137,53 @@ class HashRingManagerTestCase(db_base.DbTestCase): ) ring = self.ring_manager.get_ring('hardware-type', '') self.assertEqual(2, len(ring)) + self.assertEqual(3, is_sqlite_mock.call_count) + + @mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True) + def test_hash_ring_manager_reset_interval_not_happen_sqlite( + self, is_sqlite_mock): + is_sqlite_mock.return_value = True + CONF.set_override('hash_ring_reset_interval', 30) + # Just to simplify calculations + CONF.set_override('hash_partition_exponent', 0) + c1 = self.dbapi.register_conductor({ + 'hostname': 'host1', + 'drivers': ['driver1', 'driver2'], + }) + c2 = self.dbapi.register_conductor({ + 'hostname': 'host2', + 'drivers': ['driver1'], + }) + self.dbapi.register_conductor_hardware_interfaces( + c1.id, + [{'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'ansible', 'default': True}, + {'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'direct', 'default': False}] + ) + + ring = self.ring_manager.get_ring('hardware-type', '') + self.assertEqual(1, len(ring)) + + self.dbapi.register_conductor_hardware_interfaces( + c2.id, + [{'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'ansible', 'default': True}, + {'hardware_type': 'hardware-type', 'interface_type': 'deploy', + 'interface_name': 'direct', 'default': False}] + ) + ring = self.ring_manager.get_ring('hardware-type', '') + # The new conductor is not known yet. Automatic retry does not kick in, + # since there is an active conductor for the requested hardware type. + self.assertEqual(1, len(ring)) + + self.ring_manager.__class__._hash_rings = ( + self.ring_manager.__class__._hash_rings[0], + time.monotonic() - 31 + ) + ring = self.ring_manager.get_ring('hardware-type', '') + self.assertEqual(1, len(ring)) + self.assertEqual(2, is_sqlite_mock.call_count) def test_hash_ring_manager_uncached(self): ring_mgr = hash_ring.HashRingManager(cache=False, diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index e690031232..7de7be3724 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -27,6 +27,7 @@ from oslo_utils import uuidutils from ironic.common import driver_factory from ironic.common import exception from ironic.common import states +from ironic.common import utils as common_utils from ironic.conductor import base_manager from ironic.conductor import manager from ironic.conductor import notification_utils @@ -322,12 +323,24 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): 'is_set', autospec=True) as mock_is_set: mock_is_set.side_effect = [False, True] self.service._conductor_service_record_keepalive() + mock_touch.assert_not_called() + with mock.patch.object(self.service._keepalive_evt, + 'is_set', autospec=True) as mock_is_set: + mock_is_set.side_effect = [False, True] + with mock.patch.object(common_utils, 'is_ironic_using_sqlite', + autospec=True) as mock_is_sqlite: + mock_is_sqlite.return_value = False + self.service._conductor_service_record_keepalive() + self.assertEqual(1, mock_is_sqlite.call_count) mock_touch.assert_called_once_with(self.hostname) - def test__conductor_service_record_keepalive_failed_db_conn(self): + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + def test__conductor_service_record_keepalive_failed_db_conn( + self, is_sqlite_mock): self._start_service() # avoid wasting time at the event.wait() CONF.set_override('heartbeat_interval', 0, 'conductor') + is_sqlite_mock.return_value = False with mock.patch.object(self.dbapi, 'touch_conductor', autospec=True) as mock_touch: mock_touch.side_effect = [None, db_exception.DBConnectionError(), @@ -337,11 +350,15 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_is_set.side_effect = [False, False, False, True] self.service._conductor_service_record_keepalive() self.assertEqual(3, mock_touch.call_count) + self.assertEqual(1, is_sqlite_mock.call_count) - def test__conductor_service_record_keepalive_failed_error(self): + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + def test__conductor_service_record_keepalive_failed_error(self, + is_sqlite_mock): self._start_service() - # avoid wasting time at the event.wait() + # minimal time at the event.wait() CONF.set_override('heartbeat_interval', 0, 'conductor') + is_sqlite_mock.return_value = False with mock.patch.object(self.dbapi, 'touch_conductor', autospec=True) as mock_touch: mock_touch.side_effect = [None, Exception(), @@ -351,6 +368,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_is_set.side_effect = [False, False, False, True] self.service._conductor_service_record_keepalive() self.assertEqual(3, mock_touch.call_count) + self.assertEqual(1, is_sqlite_mock.call_count) class ManagerSpawnWorkerTestCase(tests_base.TestCase): diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py index efc3a38a39..a16ef08b5c 100644 --- a/ironic/tests/unit/db/test_conductor.py +++ b/ironic/tests/unit/db/test_conductor.py @@ -21,6 +21,7 @@ from unittest import mock from oslo_utils import timeutils from ironic.common import exception +from ironic.common import utils as common_utils from ironic.tests.unit.db import base from ironic.tests.unit.db import utils @@ -295,9 +296,11 @@ class DbConductorTestCase(base.DbTestCase): result = self.dbapi.get_active_hardware_type_dict() self.assertEqual(expected, result) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_active_hardware_type_dict_with_old_conductor(self, - mock_utcnow): + def test_get_active_hardware_type_dict_with_old_conductor( + self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False past = datetime.datetime(2000, 1, 1, 0, 0) present = past + datetime.timedelta(minutes=2) @@ -326,9 +329,12 @@ class DbConductorTestCase(base.DbTestCase): expected = {ht: {h1, h2}, ht1: {h1}, ht2: {h2}} result = self.dbapi.get_active_hardware_type_dict() self.assertEqual(expected, result) + self.assertEqual(2, mock_is_sqlite.call_count) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_offline_conductors(self, mock_utcnow): + def test_get_offline_conductors(self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False self.config(heartbeat_timeout=60, group='conductor') time_ = datetime.datetime(2000, 1, 1, 0, 0) @@ -344,9 +350,34 @@ class DbConductorTestCase(base.DbTestCase): mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) self.assertEqual([c.hostname], self.dbapi.get_offline_conductors()) self.assertEqual([c.id], self.dbapi.get_offline_conductors(field='id')) + self.assertEqual(3, mock_is_sqlite.call_count) + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True) - def test_get_online_conductors(self, mock_utcnow): + def test_get_offline_conductors_with_sqlite(self, mock_utcnow, + mock_is_sqlite): + mock_is_sqlite.return_value = True + self.config(heartbeat_timeout=60, group='conductor') + time_ = datetime.datetime(2000, 1, 1, 0, 0) + + mock_utcnow.return_value = time_ + self._create_test_cdr() + + # Only 30 seconds passed since last heartbeat, it's still + # considered alive + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30) + self.assertEqual([], self.dbapi.get_offline_conductors()) + + # 61 seconds passed since last heartbeat, it's dead + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) + self.assertEqual([], self.dbapi.get_offline_conductors()) + self.assertEqual([], self.dbapi.get_offline_conductors(field='id')) + self.assertEqual(3, mock_is_sqlite.call_count) + + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + @mock.patch.object(timeutils, 'utcnow', autospec=True) + def test_get_online_conductors(self, mock_utcnow, mock_is_sqlite): + mock_is_sqlite.return_value = False self.config(heartbeat_timeout=60, group='conductor') time_ = datetime.datetime(2000, 1, 1, 0, 0) @@ -361,6 +392,26 @@ class DbConductorTestCase(base.DbTestCase): # 61 seconds passed since last heartbeat, it's dead mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) self.assertEqual([], self.dbapi.get_online_conductors()) + self.assertEqual(2, mock_is_sqlite.call_count) + + @mock.patch.object(timeutils, 'utcnow', autospec=True) + def test_get_online_conductors_with_sqlite(self, mock_utcnow): + # NOTE(TheJulia): Explicitly skipping the mock so the underlying + # code on the test is 'tested'. + self.config(heartbeat_timeout=60, group='conductor') + time_ = datetime.datetime(2000, 1, 1, 0, 0) + + mock_utcnow.return_value = time_ + c = self._create_test_cdr() + + # Only 30 seconds passed since last heartbeat, it's still + # considered alive + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30) + self.assertEqual([c.hostname], self.dbapi.get_online_conductors()) + + # 61 seconds passed since last heartbeat, it's dead + mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) + self.assertEqual([c.hostname], self.dbapi.get_online_conductors()) @mock.patch.object(timeutils, 'utcnow', autospec=True) def test_list_hardware_type_interfaces(self, mock_utcnow): @@ -415,5 +466,14 @@ class DbConductorTestCase(base.DbTestCase): # 61 seconds passed since last heartbeat, it's dead mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) - result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + with mock.patch.object(common_utils, 'is_ironic_using_sqlite', + autospec=True) as mock_is_sqlite: + mock_is_sqlite.return_value = False + result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + self.assertEqual(1, mock_is_sqlite.call_count) self.assertEqual([], result) + + # Validate we still have four entries with SQLite, which is the + # default for all unit tests running. + result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) + self.assertEqual(4, len(result)) diff --git a/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml b/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml new file mode 100644 index 0000000000..fb0eeaae4f --- /dev/null +++ b/releasenotes/notes/not-heartbeat-the-conductor-with-sqlite-7cfc71e717d69bc5.yaml @@ -0,0 +1,13 @@ +--- +deprecations: + - | + The use of a SQLite database with mutli-process (i.e. ``ironic-api`` and + ``ironic-conductor`` services) is not supported, and the ability to launch + a dedicated ``ironic-api`` process with a SQLite database backend will + be an error in the future. In this case, the single process combined + API and Conductor service should be utilized. +fixes: + - | + Database locks with a ``sqlite`` database backend should now be lessened + as the conductor will no longer perform a keepalive heartbeat operation + when the use of SQLite has been detected.