Don't actually heartbeat with sqlite

Disables internal heartbeat mechanism when ironic has been
configured to utilize a SQLite database backend.

This is done to lessen the possibility of a
"database is locked" error, which can occur when two
distinct threads attempt to write to the database
at the same time with open writers.

The process keepalive heartbeat process was identified as
a major source of these write operations as it was writing
every ten seconds by default, which would also collide with
periodic tasks.

Change-Id: I7b6d7a78ba2910f22673ad8e72e255f321d3fdff
This commit is contained in:
Julia Kreger 2023-07-13 14:35:49 -07:00
parent 54da324900
commit 0099d1812d
10 changed files with 223 additions and 18 deletions

View File

@ -30,6 +30,17 @@ CONF = cfg.CONF
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
def _error_about_sqlite_usage():
if 'sqlite' in CONF.database.connection.lower():
# TODO(TheJulia): Make this a hard error in C*
LOG.error('We have detected the API is being launched with a SQLite '
'database backend. This is unsupported, and will be a hard '
'error in the future. This is becaues multi-process use of '
'a single SQLite database is problematic in terms of '
'locking. A single process ironic model exists for use with '
'SQLite.')
def main(): def main():
# Parse config file and command line options, then start logging # Parse config file and command line options, then start logging
ironic_service.prepare_service('ironic_api', sys.argv) ironic_service.prepare_service('ironic_api', sys.argv)
@ -43,4 +54,5 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
_error_about_sqlite_usage()
sys.exit(main()) sys.exit(main())

View File

@ -27,6 +27,7 @@ from oslo_service import service
from ironic.common import rpc_service from ironic.common import rpc_service
from ironic.common import service as ironic_service from ironic.common import service as ironic_service
from ironic.common import utils
CONF = cfg.CONF CONF = cfg.CONF
@ -43,8 +44,24 @@ def warn_about_unsafe_shred_parameters(conf):
'Secure Erase. This is a possible SECURITY ISSUE!') 'Secure Erase. This is a possible SECURITY ISSUE!')
def warn_about_sqlite():
# We are intentionally calling the helper here to ensure it caches
# for all future calls.
if utils.is_ironic_using_sqlite():
LOG.warning('Ironic has been configured to utilize SQLite. '
'This has some restrictions and impacts. You must run '
'as as a single combined ironic process, and some '
'internal mechanisms do not execute such as the hash '
'ring will remain static and the conductor\'s '
'``last_updated`` field will also not update. This is '
'in order to minimize database locking issues present '
'as a result of SQLAlchemy 2.0 and the removal of '
'autocommit support.')
def issue_startup_warnings(conf): def issue_startup_warnings(conf):
warn_about_unsafe_shred_parameters(conf) warn_about_unsafe_shred_parameters(conf)
warn_about_sqlite()
def main(): def main():

View File

@ -21,6 +21,7 @@ from tooz import hashring
from ironic.common import exception from ironic.common import exception
from ironic.common.i18n import _ from ironic.common.i18n import _
from ironic.common import utils
from ironic.conf import CONF from ironic.conf import CONF
from ironic.db import api as dbapi from ironic.db import api as dbapi
@ -48,7 +49,11 @@ class HashRingManager(object):
# Hot path, no lock. Using a local variable to avoid races with code # Hot path, no lock. Using a local variable to avoid races with code
# changing the class variable. # changing the class variable.
hash_rings, updated_at = self.__class__._hash_rings hash_rings, updated_at = self.__class__._hash_rings
if hash_rings is not None and updated_at >= limit: if (hash_rings is not None
and (updated_at >= limit
or utils.is_ironic_using_sqlite())):
# Returning the hash ring for us, if it is still valid,
# or if we're using sqlite.
return hash_rings return hash_rings
with self._lock: with self._lock:

View File

@ -57,6 +57,8 @@ TZ_RE = r'((?P<tz_sign>[+-])(?P<tz_hour>\d{2}):(?P<tz_min>\d{2}))' + \
DATETIME_RE = re.compile( DATETIME_RE = re.compile(
'%sT%s(%s)?' % (DATE_RE, TIME_RE, TZ_RE)) '%sT%s(%s)?' % (DATE_RE, TIME_RE, TZ_RE))
USING_SQLITE = None
def _get_root_helper(): def _get_root_helper():
# NOTE(jlvillal): This function has been moved to ironic-lib. And is # NOTE(jlvillal): This function has been moved to ironic-lib. And is
@ -724,3 +726,15 @@ def parse_kernel_params(params):
else: else:
result[key] = value result[key] = value
return result return result
def is_ironic_using_sqlite():
"""Return True if Ironic is configured to use SQLite"""
global USING_SQLITE
if USING_SQLITE is not None:
return USING_SQLITE
# We're being called for the first time, lets cache and
# return the value.
USING_SQLITE = 'sqlite' in CONF.database.connection.lower()
return USING_SQLITE

View File

@ -34,6 +34,7 @@ from ironic.common.i18n import _
from ironic.common import release_mappings as versions from ironic.common import release_mappings as versions
from ironic.common import rpc from ironic.common import rpc
from ironic.common import states from ironic.common import states
from ironic.common import utils as common_utils
from ironic.conductor import allocations from ironic.conductor import allocations
from ironic.conductor import notification_utils as notify_utils from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager from ironic.conductor import task_manager
@ -440,6 +441,10 @@ class BaseConductorManager(object):
raise exception.NoFreeConductorWorker() raise exception.NoFreeConductorWorker()
def _conductor_service_record_keepalive(self): def _conductor_service_record_keepalive(self):
if common_utils.is_ironic_using_sqlite():
# Exit this keepalive heartbeats are disabled and not
# considered.
return
while not self._keepalive_evt.is_set(): while not self._keepalive_evt.is_set():
try: try:
self.conductor.touch() self.conductor.touch()

View File

@ -355,10 +355,14 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None,
def _filter_active_conductors(query, interval=None): def _filter_active_conductors(query, interval=None):
if interval is None: if interval is None:
interval = CONF.conductor.heartbeat_timeout interval = CONF.conductor.heartbeat_timeout
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) if not utils.is_ironic_using_sqlite() and interval > 0:
# Check for greater than zero becaues if the value is zero,
query = (query.filter(models.Conductor.online.is_(True)) # then the logic makes no sense.
.filter(models.Conductor.updated_at >= limit)) limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
query = (query.filter(models.Conductor.online.is_(True))
.filter(models.Conductor.updated_at >= limit))
else:
query = query.filter(models.Conductor.online.is_(True))
return query return query
@ -1433,10 +1437,16 @@ class Connection(api.Connection):
def get_offline_conductors(self, field='hostname'): def get_offline_conductors(self, field='hostname'):
with _session_for_read() as session: with _session_for_read() as session:
field = getattr(models.Conductor, field) field = getattr(models.Conductor, field)
interval = CONF.conductor.heartbeat_timeout if not utils.is_ironic_using_sqlite():
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval) interval = CONF.conductor.heartbeat_timeout
result = (session.query(field) limit = (timeutils.utcnow()
.filter(models.Conductor.updated_at < limit)) - datetime.timedelta(seconds=interval))
result = (session.query(field)
.filter(models.Conductor.updated_at < limit))
else:
result = session.query(
field
).filter(models.Conductor.online.is_(False))
return [row[0] for row in result] return [row[0] for row in result]
def get_online_conductors(self): def get_online_conductors(self):

View File

@ -14,11 +14,13 @@
# under the License. # under the License.
import time import time
from unittest import mock
from oslo_config import cfg from oslo_config import cfg
from ironic.common import exception from ironic.common import exception
from ironic.common import hash_ring from ironic.common import hash_ring
from ironic.common import utils
from ironic.tests.unit.db import base as db_base from ironic.tests.unit.db import base as db_base
CONF = cfg.CONF CONF = cfg.CONF
@ -92,7 +94,9 @@ class HashRingManagerTestCase(db_base.DbTestCase):
self.register_conductors() self.register_conductors()
self.ring_manager.get_ring('hardware-type', '') self.ring_manager.get_ring('hardware-type', '')
def test_hash_ring_manager_reset_interval(self): @mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True)
def test_hash_ring_manager_reset_interval(self, is_sqlite_mock):
is_sqlite_mock.return_value = False
CONF.set_override('hash_ring_reset_interval', 30) CONF.set_override('hash_ring_reset_interval', 30)
# Just to simplify calculations # Just to simplify calculations
CONF.set_override('hash_partition_exponent', 0) CONF.set_override('hash_partition_exponent', 0)
@ -133,6 +137,53 @@ class HashRingManagerTestCase(db_base.DbTestCase):
) )
ring = self.ring_manager.get_ring('hardware-type', '') ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(2, len(ring)) self.assertEqual(2, len(ring))
self.assertEqual(3, is_sqlite_mock.call_count)
@mock.patch.object(utils, 'is_ironic_using_sqlite', autospec=True)
def test_hash_ring_manager_reset_interval_not_happen_sqlite(
self, is_sqlite_mock):
is_sqlite_mock.return_value = True
CONF.set_override('hash_ring_reset_interval', 30)
# Just to simplify calculations
CONF.set_override('hash_partition_exponent', 0)
c1 = self.dbapi.register_conductor({
'hostname': 'host1',
'drivers': ['driver1', 'driver2'],
})
c2 = self.dbapi.register_conductor({
'hostname': 'host2',
'drivers': ['driver1'],
})
self.dbapi.register_conductor_hardware_interfaces(
c1.id,
[{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
'interface_name': 'ansible', 'default': True},
{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
'interface_name': 'direct', 'default': False}]
)
ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(1, len(ring))
self.dbapi.register_conductor_hardware_interfaces(
c2.id,
[{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
'interface_name': 'ansible', 'default': True},
{'hardware_type': 'hardware-type', 'interface_type': 'deploy',
'interface_name': 'direct', 'default': False}]
)
ring = self.ring_manager.get_ring('hardware-type', '')
# The new conductor is not known yet. Automatic retry does not kick in,
# since there is an active conductor for the requested hardware type.
self.assertEqual(1, len(ring))
self.ring_manager.__class__._hash_rings = (
self.ring_manager.__class__._hash_rings[0],
time.monotonic() - 31
)
ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(1, len(ring))
self.assertEqual(2, is_sqlite_mock.call_count)
def test_hash_ring_manager_uncached(self): def test_hash_ring_manager_uncached(self):
ring_mgr = hash_ring.HashRingManager(cache=False, ring_mgr = hash_ring.HashRingManager(cache=False,

View File

@ -27,6 +27,7 @@ from oslo_utils import uuidutils
from ironic.common import driver_factory from ironic.common import driver_factory
from ironic.common import exception from ironic.common import exception
from ironic.common import states from ironic.common import states
from ironic.common import utils as common_utils
from ironic.conductor import base_manager from ironic.conductor import base_manager
from ironic.conductor import manager from ironic.conductor import manager
from ironic.conductor import notification_utils from ironic.conductor import notification_utils
@ -322,12 +323,24 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
'is_set', autospec=True) as mock_is_set: 'is_set', autospec=True) as mock_is_set:
mock_is_set.side_effect = [False, True] mock_is_set.side_effect = [False, True]
self.service._conductor_service_record_keepalive() self.service._conductor_service_record_keepalive()
mock_touch.assert_not_called()
with mock.patch.object(self.service._keepalive_evt,
'is_set', autospec=True) as mock_is_set:
mock_is_set.side_effect = [False, True]
with mock.patch.object(common_utils, 'is_ironic_using_sqlite',
autospec=True) as mock_is_sqlite:
mock_is_sqlite.return_value = False
self.service._conductor_service_record_keepalive()
self.assertEqual(1, mock_is_sqlite.call_count)
mock_touch.assert_called_once_with(self.hostname) mock_touch.assert_called_once_with(self.hostname)
def test__conductor_service_record_keepalive_failed_db_conn(self): @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
def test__conductor_service_record_keepalive_failed_db_conn(
self, is_sqlite_mock):
self._start_service() self._start_service()
# avoid wasting time at the event.wait() # avoid wasting time at the event.wait()
CONF.set_override('heartbeat_interval', 0, 'conductor') CONF.set_override('heartbeat_interval', 0, 'conductor')
is_sqlite_mock.return_value = False
with mock.patch.object(self.dbapi, 'touch_conductor', with mock.patch.object(self.dbapi, 'touch_conductor',
autospec=True) as mock_touch: autospec=True) as mock_touch:
mock_touch.side_effect = [None, db_exception.DBConnectionError(), mock_touch.side_effect = [None, db_exception.DBConnectionError(),
@ -337,11 +350,15 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_is_set.side_effect = [False, False, False, True] mock_is_set.side_effect = [False, False, False, True]
self.service._conductor_service_record_keepalive() self.service._conductor_service_record_keepalive()
self.assertEqual(3, mock_touch.call_count) self.assertEqual(3, mock_touch.call_count)
self.assertEqual(1, is_sqlite_mock.call_count)
def test__conductor_service_record_keepalive_failed_error(self): @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
def test__conductor_service_record_keepalive_failed_error(self,
is_sqlite_mock):
self._start_service() self._start_service()
# avoid wasting time at the event.wait() # minimal time at the event.wait()
CONF.set_override('heartbeat_interval', 0, 'conductor') CONF.set_override('heartbeat_interval', 0, 'conductor')
is_sqlite_mock.return_value = False
with mock.patch.object(self.dbapi, 'touch_conductor', with mock.patch.object(self.dbapi, 'touch_conductor',
autospec=True) as mock_touch: autospec=True) as mock_touch:
mock_touch.side_effect = [None, Exception(), mock_touch.side_effect = [None, Exception(),
@ -351,6 +368,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_is_set.side_effect = [False, False, False, True] mock_is_set.side_effect = [False, False, False, True]
self.service._conductor_service_record_keepalive() self.service._conductor_service_record_keepalive()
self.assertEqual(3, mock_touch.call_count) self.assertEqual(3, mock_touch.call_count)
self.assertEqual(1, is_sqlite_mock.call_count)
class ManagerSpawnWorkerTestCase(tests_base.TestCase): class ManagerSpawnWorkerTestCase(tests_base.TestCase):

View File

@ -21,6 +21,7 @@ from unittest import mock
from oslo_utils import timeutils from oslo_utils import timeutils
from ironic.common import exception from ironic.common import exception
from ironic.common import utils as common_utils
from ironic.tests.unit.db import base from ironic.tests.unit.db import base
from ironic.tests.unit.db import utils from ironic.tests.unit.db import utils
@ -295,9 +296,11 @@ class DbConductorTestCase(base.DbTestCase):
result = self.dbapi.get_active_hardware_type_dict() result = self.dbapi.get_active_hardware_type_dict()
self.assertEqual(expected, result) self.assertEqual(expected, result)
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
@mock.patch.object(timeutils, 'utcnow', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_active_hardware_type_dict_with_old_conductor(self, def test_get_active_hardware_type_dict_with_old_conductor(
mock_utcnow): self, mock_utcnow, mock_is_sqlite):
mock_is_sqlite.return_value = False
past = datetime.datetime(2000, 1, 1, 0, 0) past = datetime.datetime(2000, 1, 1, 0, 0)
present = past + datetime.timedelta(minutes=2) present = past + datetime.timedelta(minutes=2)
@ -326,9 +329,12 @@ class DbConductorTestCase(base.DbTestCase):
expected = {ht: {h1, h2}, ht1: {h1}, ht2: {h2}} expected = {ht: {h1, h2}, ht1: {h1}, ht2: {h2}}
result = self.dbapi.get_active_hardware_type_dict() result = self.dbapi.get_active_hardware_type_dict()
self.assertEqual(expected, result) self.assertEqual(expected, result)
self.assertEqual(2, mock_is_sqlite.call_count)
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
@mock.patch.object(timeutils, 'utcnow', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_offline_conductors(self, mock_utcnow): def test_get_offline_conductors(self, mock_utcnow, mock_is_sqlite):
mock_is_sqlite.return_value = False
self.config(heartbeat_timeout=60, group='conductor') self.config(heartbeat_timeout=60, group='conductor')
time_ = datetime.datetime(2000, 1, 1, 0, 0) time_ = datetime.datetime(2000, 1, 1, 0, 0)
@ -344,9 +350,34 @@ class DbConductorTestCase(base.DbTestCase):
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
self.assertEqual([c.hostname], self.dbapi.get_offline_conductors()) self.assertEqual([c.hostname], self.dbapi.get_offline_conductors())
self.assertEqual([c.id], self.dbapi.get_offline_conductors(field='id')) self.assertEqual([c.id], self.dbapi.get_offline_conductors(field='id'))
self.assertEqual(3, mock_is_sqlite.call_count)
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
@mock.patch.object(timeutils, 'utcnow', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_online_conductors(self, mock_utcnow): def test_get_offline_conductors_with_sqlite(self, mock_utcnow,
mock_is_sqlite):
mock_is_sqlite.return_value = True
self.config(heartbeat_timeout=60, group='conductor')
time_ = datetime.datetime(2000, 1, 1, 0, 0)
mock_utcnow.return_value = time_
self._create_test_cdr()
# Only 30 seconds passed since last heartbeat, it's still
# considered alive
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30)
self.assertEqual([], self.dbapi.get_offline_conductors())
# 61 seconds passed since last heartbeat, it's dead
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
self.assertEqual([], self.dbapi.get_offline_conductors())
self.assertEqual([], self.dbapi.get_offline_conductors(field='id'))
self.assertEqual(3, mock_is_sqlite.call_count)
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
@mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_online_conductors(self, mock_utcnow, mock_is_sqlite):
mock_is_sqlite.return_value = False
self.config(heartbeat_timeout=60, group='conductor') self.config(heartbeat_timeout=60, group='conductor')
time_ = datetime.datetime(2000, 1, 1, 0, 0) time_ = datetime.datetime(2000, 1, 1, 0, 0)
@ -361,6 +392,26 @@ class DbConductorTestCase(base.DbTestCase):
# 61 seconds passed since last heartbeat, it's dead # 61 seconds passed since last heartbeat, it's dead
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
self.assertEqual([], self.dbapi.get_online_conductors()) self.assertEqual([], self.dbapi.get_online_conductors())
self.assertEqual(2, mock_is_sqlite.call_count)
@mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_get_online_conductors_with_sqlite(self, mock_utcnow):
# NOTE(TheJulia): Explicitly skipping the mock so the underlying
# code on the test is 'tested'.
self.config(heartbeat_timeout=60, group='conductor')
time_ = datetime.datetime(2000, 1, 1, 0, 0)
mock_utcnow.return_value = time_
c = self._create_test_cdr()
# Only 30 seconds passed since last heartbeat, it's still
# considered alive
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=30)
self.assertEqual([c.hostname], self.dbapi.get_online_conductors())
# 61 seconds passed since last heartbeat, it's dead
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
self.assertEqual([c.hostname], self.dbapi.get_online_conductors())
@mock.patch.object(timeutils, 'utcnow', autospec=True) @mock.patch.object(timeutils, 'utcnow', autospec=True)
def test_list_hardware_type_interfaces(self, mock_utcnow): def test_list_hardware_type_interfaces(self, mock_utcnow):
@ -415,5 +466,14 @@ class DbConductorTestCase(base.DbTestCase):
# 61 seconds passed since last heartbeat, it's dead # 61 seconds passed since last heartbeat, it's dead
mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61) mock_utcnow.return_value = time_ + datetime.timedelta(seconds=61)
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2]) with mock.patch.object(common_utils, 'is_ironic_using_sqlite',
autospec=True) as mock_is_sqlite:
mock_is_sqlite.return_value = False
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
self.assertEqual(1, mock_is_sqlite.call_count)
self.assertEqual([], result) self.assertEqual([], result)
# Validate we still have four entries with SQLite, which is the
# default for all unit tests running.
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
self.assertEqual(4, len(result))

View File

@ -0,0 +1,13 @@
---
deprecations:
- |
The use of a SQLite database with mutli-process (i.e. ``ironic-api`` and
``ironic-conductor`` services) is not supported, and the ability to launch
a dedicated ``ironic-api`` process with a SQLite database backend will
be an error in the future. In this case, the single process combined
API and Conductor service should be utilized.
fixes:
- |
Database locks with a ``sqlite`` database backend should now be lessened
as the conductor will no longer perform a keepalive heartbeat operation
when the use of SQLite has been detected.