diff --git a/nova/conf/quota.py b/nova/conf/quota.py
index 212d44299669..b856096b1a62 100644
--- a/nova/conf/quota.py
+++ b/nova/conf/quota.py
@@ -293,6 +293,45 @@
 impossible for a user to exceed their quota with the caveat that it will,
 however, be possible for a REST API user to be rejected with a 403 response in
 the event of a collision close to reaching their quota limit, even if the user
 has enough quota available when they made the request.
+"""),
+    cfg.BoolOpt(
+        'count_usage_from_placement',
+        default=False,
+        help="""
+Enable the counting of quota usage from the placement service.
+
+Starting in Train, it is possible to count quota usage for cores and ram from
+the placement service and instances from the API database instead of counting
+from cell databases.
+
+This works well if there is only one Nova deployment running per placement
+deployment. However, if an operator is running more than one Nova deployment
+sharing a placement deployment, they should not set this option to True because
+currently the placement service has no way to partition resource providers per
+Nova deployment. When this option is left as the default or set to False, Nova
+will use the legacy counting method to count quota usage for instances, cores,
+and ram from its cell databases.
+
+Note that quota usage behavior related to resizes will be affected if this
+option is set to True. Placement resource allocations are claimed on the
+destination while holding allocations on the source during a resize, until the
+resize is confirmed or reverted. During this time, when the server is in
+VERIFY_RESIZE state, quota usage will reflect resource consumption on both the
+source and the destination. This can be beneficial as it reserves space for a
+revert of a downsize, but it also means quota usage will be inflated until a
+resize is confirmed or reverted.
+
+Behavior will also be different for unscheduled servers in ERROR state. A
+server in ERROR state that has never been scheduled to a compute host will
+not have placement allocations, so it will not consume quota usage for cores
+and ram.
+
+Behavior will be different for servers in SHELVED_OFFLOADED state. A server in
+SHELVED_OFFLOADED state will not have placement allocations, so it will not
+consume quota usage for cores and ram. Note that because of this, it will be
+possible for a request to unshelve a server to be rejected if the user does not
+have enough quota available to support the cores and ram needed by the server
+to be unshelved.
 """),
 ]
diff --git a/nova/quota.py b/nova/quota.py
index bf5b94ac5925..3becabc4678e 100644
--- a/nova/quota.py
+++ b/nova/quota.py
@@ -20,18 +20,29 @@ import copy
 
 from oslo_log import log as logging
 from oslo_utils import importutils
+from sqlalchemy.sql import and_
+from sqlalchemy.sql import false
+from sqlalchemy.sql import null
+from sqlalchemy.sql import or_
 
 import nova.conf
 from nova import context as nova_context
 from nova.db import api as db
+from nova.db.sqlalchemy import api as db_api
+from nova.db.sqlalchemy import api_models
 from nova import exception
 from nova import objects
+from nova.scheduler.client import report
 from nova import utils
 
 LOG = logging.getLogger(__name__)
-
-
 CONF = nova.conf.CONF
+# Lazy-loaded on first access.
+# Avoid constructing the KSA adapter and provider tree on every access.
+PLACEMENT_CLIENT = None
+# If user_id and queued_for_delete are populated for a project, cache the
+# result to avoid doing unnecessary EXISTS database queries.
+UID_QFD_POPULATED_CACHE_BY_PROJECT = set()
 
 
 class DbQuotaDriver(object):
@@ -1050,6 +1061,49 @@ class QuotaEngine(object):
         return 0
 
 
+@db_api.api_context_manager.reader
+def _user_id_queued_for_delete_populated(context, project_id=None):
+    """Determine whether user_id and queued_for_delete are set.
+
+    This will be used to determine whether we need to fall back on
+    the legacy quota counting method (if we cannot rely on counting
+    instance mappings for the instance count). If any records with user_id=None
+    and queued_for_delete=False are found, we need to fall back to the legacy
+    counting method. If any records with queued_for_delete=None are found, we
+    need to fall back to the legacy counting method.
+
+    Note that this check specifies queued_for_delete=False, which excludes
+    deleted and SOFT_DELETED instances. The 'populate_user_id' data migration
+    migrates SOFT_DELETED instances because they could be restored at any time
+    in the future. However, for this quota-check-time method, it is acceptable
+    to ignore SOFT_DELETED instances, since we just want to know if it is safe
+    to use instance mappings to count instances at this point in time (and
+    SOFT_DELETED instances do not count against quota limits).
+
+    We also want to fall back to the legacy counting method if we detect any
+    records that have not yet populated the queued_for_delete field. We do this
+    instead of counting queued_for_delete=None records since that might not
+    accurately reflect the project or project user's quota usage.
+
+    :param project_id: The project to check
+    :returns: True if user_id is set for all non-deleted instances and
+              queued_for_delete is set for all instances, else False
+    """
+    user_id_not_populated = and_(
+        api_models.InstanceMapping.user_id == null(),
+        api_models.InstanceMapping.queued_for_delete == false())
+    # If either queued_for_delete or user_id are unmigrated, we will return
+    # False.
+    unmigrated_filter = or_(
+        api_models.InstanceMapping.queued_for_delete == null(),
+        user_id_not_populated)
+    query = context.session.query(api_models.InstanceMapping).filter(
+        unmigrated_filter)
+    if project_id:
+        query = query.filter_by(project_id=project_id)
+    return not context.session.query(query.exists()).scalar()
+
+
 def _keypair_get_count_by_user(context, user_id):
     count = objects.KeyPairList.get_count_by_user(context, user_id)
     return {'user': {'key_pairs': count}}
@@ -1121,8 +1175,8 @@ def _floating_ip_count(context, project_id):
     return {'project': {'floating_ips': count}}
 
 
-def _instances_cores_ram_count(context, project_id, user_id=None):
-    """Get the counts of instances, cores, and ram in the database.
+def _instances_cores_ram_count_legacy(context, project_id, user_id=None):
+    """Get the counts of instances, cores, and ram in cell databases.
 
     :param context: The request context for database access
     :param project_id: The project_id to count across
@@ -1137,10 +1191,8 @@
                          'cores': <count across user>,
                          'ram': <count across user>}}
     """
-    # TODO(melwitt): Counting across cells for instances means we will miss
-    # counting resources if a cell is down. In the future, we should query
-    # placement for cores/ram and InstanceMappings for instances (once we are
-    # deleting InstanceMappings when we delete instances).
+    # NOTE(melwitt): Counting across cells for instances, cores, and ram means
+    # we will miss counting resources if a cell is down.
     # NOTE(tssurya): We only go into those cells in which the tenant has
     # instances. We could optimize this to avoid the CellMappingList query
     # for single-cell deployments by checking the cell cache and only doing
@@ -1165,6 +1217,70 @@
     return total_counts
 
 
+def _cores_ram_count_placement(context, project_id, user_id=None):
+    global PLACEMENT_CLIENT
+    if not PLACEMENT_CLIENT:
+        PLACEMENT_CLIENT = report.SchedulerReportClient()
+    return PLACEMENT_CLIENT.get_usages_counts_for_quota(context, project_id,
+                                                        user_id=user_id)
+
+
+def _instances_cores_ram_count_api_db_placement(context, project_id,
+                                                user_id=None):
+    # Will return a dict with format: {'project': {'instances': M},
+    #                                  'user': {'instances': N}}
+    # where the 'user' key is optional.
+    total_counts = objects.InstanceMappingList.get_counts(context,
+                                                          project_id,
+                                                          user_id=user_id)
+    cores_ram_counts = _cores_ram_count_placement(context, project_id,
+                                                  user_id=user_id)
+    total_counts['project'].update(cores_ram_counts['project'])
+    if 'user' in total_counts:
+        total_counts['user'].update(cores_ram_counts['user'])
+    return total_counts
+
+
+def _instances_cores_ram_count(context, project_id, user_id=None):
+    """Get the counts of instances, cores, and ram.
+
+    :param context: The request context for database access
+    :param project_id: The project_id to count across
+    :param user_id: The user_id to count across
+    :returns: A dict containing the project-scoped counts and user-scoped
+              counts if user_id is specified. For example:
+
+                {'project': {'instances': <count across project>,
+                             'cores': <count across project>,
+                             'ram': <count across project>},
+                 'user': {'instances': <count across user>,
+                          'cores': <count across user>,
+                          'ram': <count across user>}}
+    """
+    global UID_QFD_POPULATED_CACHE_BY_PROJECT
+    if CONF.quota.count_usage_from_placement:
+        # If a project has all user_id and queued_for_delete data populated,
+        # cache the result to avoid needless database checking in the future.
+        if project_id not in UID_QFD_POPULATED_CACHE_BY_PROJECT:
+            LOG.debug('Checking whether user_id and queued_for_delete are '
+                      'populated for project_id %s', project_id)
+            uid_qfd_populated = _user_id_queued_for_delete_populated(
+                context, project_id)
+            if uid_qfd_populated:
+                UID_QFD_POPULATED_CACHE_BY_PROJECT.add(project_id)
+        else:
+            uid_qfd_populated = True
+        if not uid_qfd_populated:
+            LOG.warning('Falling back to legacy quota counting method for '
+                        'instances, cores, and ram')
+
+    if CONF.quota.count_usage_from_placement and uid_qfd_populated:
+        return _instances_cores_ram_count_api_db_placement(context, project_id,
+                                                           user_id=user_id)
+    return _instances_cores_ram_count_legacy(context, project_id,
+                                             user_id=user_id)
+
+
 def _server_group_count(context, project_id, user_id=None):
     """Get the counts of server groups in the database.
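
The new _cores_ram_count_placement helper above relies on the report client's
get_usages_counts_for_quota returning cores/ram usage in the same
{'project': ..., 'user': ...} shape that the legacy cell counting produces.
A minimal sketch of that mapping, assuming the standard placement resource
classes VCPU and MEMORY_MB; the helper name below is illustrative only and is
not part of this change:

    def usages_to_quota_counts(project_usages, user_usages=None):
        """Map placement GET /usages payloads to the quota counts shape."""
        # 'cores' is backed by VCPU usage and 'ram' by MEMORY_MB usage.
        counts = {'project': {'cores': project_usages.get('VCPU', 0),
                              'ram': project_usages.get('MEMORY_MB', 0)}}
        if user_usages is not None:
            counts['user'] = {'cores': user_usages.get('VCPU', 0),
                              'ram': user_usages.get('MEMORY_MB', 0)}
        return counts

    # Example: placement reports {"usages": {"VCPU": 2, "MEMORY_MB": 4096}}
    # for the project and {"usages": {"VCPU": 1, "MEMORY_MB": 2048}} for the
    # user, giving:
    #   {'project': {'cores': 2, 'ram': 4096},
    #    'user': {'cores': 1, 'ram': 2048}}
    usages_to_quota_counts({'VCPU': 2, 'MEMORY_MB': 4096},
                           {'VCPU': 1, 'MEMORY_MB': 2048})
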
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 09e0279935e9..95caa89497a5 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -2358,6 +2358,8 @@
         """
         total_counts = {'project': {}}
         # First query counts across all users of a project
+        LOG.debug('Getting usages for project_id %s from placement',
+                  project_id)
         resp = self._get_usages(context, project_id)
         if resp:
             data = resp.json()
@@ -2370,6 +2372,8 @@
             self._handle_usages_error_from_placement(resp, project_id)
         # If specified, second query counts across one user in the project
         if user_id:
+            LOG.debug('Getting usages for project_id %s and user_id %s from '
+                      'placement', project_id, user_id)
             resp = self._get_usages(context, project_id, user_id=user_id)
             if resp:
                 data = resp.json()
diff --git a/nova/test.py b/nova/test.py
index c30b71fbcf6b..dae66cbfa472 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -57,6 +57,7 @@ from nova.network import manager as network_manager
 from nova.network.security_group import openstack_driver
 from nova import objects
 from nova.objects import base as objects_base
+from nova import quota
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.unit import conf_fixture
 from nova.tests.unit import policy_fixture
@@ -305,6 +306,9 @@
         self.flags(build_failure_weight_multiplier=0.0,
                    group='filter_scheduler')
 
+        # NOTE(melwitt): Reset the cached set of projects
+        quota.UID_QFD_POPULATED_CACHE_BY_PROJECT = set()
+
     def _setup_cells(self):
         """Setup a normal cellsv2 environment.
diff --git a/nova/tests/functional/db/test_quota.py b/nova/tests/functional/db/test_quota.py
index 1fb2452daedb..65c5aaedf952 100644
--- a/nova/tests/functional/db/test_quota.py
+++ b/nova/tests/functional/db/test_quota.py
@@ -17,6 +17,7 @@ from nova import objects
 from nova import quota
 from nova import test
 from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional.db import test_instance_mapping
 
 
 class QuotaTestCase(test.NoDBTestCase):
@@ -146,3 +147,51 @@
         self.assertEqual(2, count['user']['instances'])
         self.assertEqual(6, count['user']['cores'])
         self.assertEqual(1536, count['user']['ram'])
+
+    def test_user_id_queued_for_delete_populated(self):
+        ctxt = context.RequestContext('fake-user', 'fake-project')
+
+        # One deleted or SOFT_DELETED instance with user_id=None, should not be
+        # considered by the check.
+        test_instance_mapping.create_mapping(user_id=None,
+                                             queued_for_delete=True)
+
+        # Should be True because deleted instances are not considered.
+        self.assertTrue(quota._user_id_queued_for_delete_populated(ctxt))
+
+        # A non-deleted instance with user_id=None, should be considered in the
+        # check.
+        test_instance_mapping.create_mapping(user_id=None,
+                                             queued_for_delete=False)
+
+        # Should be False because it's not deleted and user_id is unmigrated.
+        self.assertFalse(quota._user_id_queued_for_delete_populated(ctxt))
+
+        # A non-deleted instance in a different project, should be considered
+        # in the check (if project_id is not passed).
+        test_instance_mapping.create_mapping(queued_for_delete=False,
+                                             project_id='other-project')
+
+        # Should be False since only instance 3 has user_id set and we're not
+        # filtering on project.
+        self.assertFalse(quota._user_id_queued_for_delete_populated(ctxt))
+
+        # Should be True because only instance 3 will be considered when we
+        # filter on project.
+        self.assertTrue(
+            quota._user_id_queued_for_delete_populated(
+                ctxt, project_id='other-project'))
+
+        # Add a mapping for an instance that has not yet migrated
+        # queued_for_delete.
+        test_instance_mapping.create_mapping(queued_for_delete=None)
+
+        # Should be False because an unmigrated queued_for_delete was found.
+        self.assertFalse(
+            quota._user_id_queued_for_delete_populated(ctxt))
+
+        # Check again filtering on project. Should be True because the
+        # unmigrated queued_for_delete record is part of a different project.
+        self.assertTrue(
+            quota._user_id_queued_for_delete_populated(
+                ctxt, project_id='other-project'))
diff --git a/nova/tests/unit/test_quota.py b/nova/tests/unit/test_quota.py
index bcb4838c7663..30824e1ba858 100644
--- a/nova/tests/unit/test_quota.py
+++ b/nova/tests/unit/test_quota.py
@@ -14,6 +14,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import ddt
 import mock
 from oslo_db.sqlalchemy import enginefacade
 from six.moves import range
@@ -1961,3 +1962,153 @@
             quota.QUOTAS._resources, 'test_project')
         self.assertEqual(self.expected_settable_quotas, result)
+
+
+@ddt.ddt
+class QuotaCountTestCase(test.NoDBTestCase):
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_usages_counts_for_quota')
+    def test_cores_ram_count_placement(self, mock_get_usages):
+        usages = quota._cores_ram_count_placement(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+        mock_get_usages.assert_called_once_with(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+        self.assertEqual(mock_get_usages.return_value, usages)
+
+    @mock.patch('nova.objects.InstanceMappingList.get_counts')
+    @mock.patch('nova.quota._cores_ram_count_placement')
+    def test_instances_cores_ram_count_api_db_placement(
+            self, mock_placement_count, mock_get_im_count):
+        # Fake response from placement with project and user usages of cores
+        # and ram.
+        mock_placement_count.return_value = {'project': {'cores': 2, 'ram': 4},
+                                             'user': {'cores': 1, 'ram': 2}}
+        # Fake count of instances based on instance mappings in the API DB.
+        mock_get_im_count.return_value = {'project': {'instances': 2},
+                                          'user': {'instances': 1}}
+
+        counts = quota._instances_cores_ram_count_api_db_placement(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+
+        mock_get_im_count.assert_called_once_with(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+        mock_placement_count.assert_called_once_with(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+        expected = {'project': {'instances': 2, 'cores': 2, 'ram': 4},
+                    'user': {'instances': 1, 'cores': 1, 'ram': 2}}
+        self.assertDictEqual(expected, counts)
+
+    @ddt.data((True, True),
+              (True, False),
+              (False, True),
+              (False, False))
+    @ddt.unpack
+    @mock.patch('nova.quota.LOG.warning')
+    @mock.patch('nova.quota._user_id_queued_for_delete_populated')
+    @mock.patch('nova.quota._instances_cores_ram_count_legacy')
+    @mock.patch('nova.quota._instances_cores_ram_count_api_db_placement')
+    def test_instances_cores_ram_count(self, quota_from_placement,
+                                       uid_qfd_populated,
+                                       mock_api_db_placement_count,
+                                       mock_legacy_count,
+                                       mock_uid_qfd_populated, mock_warn_log):
+        # Check that all the combinations of
+        # [quota]count_usage_from_placement (True/False) and
+        # user_id_queued_for_delete_populated (True/False) do the right things.
+
+        # Fake count of instances, cores, and ram.
+        expected = {'project': {'instances': 2, 'cores': 2, 'ram': 4},
+                    'user': {'instances': 1, 'cores': 1, 'ram': 2}}
+        mock_api_db_placement_count.return_value = expected
+        mock_legacy_count.return_value = expected
+        # user_id and queued_for_delete populated/migrated (True/False)
+        mock_uid_qfd_populated.return_value = uid_qfd_populated
+        # Counting quota usage from placement enabled (True/False)
+        self.flags(count_usage_from_placement=quota_from_placement,
+                   group='quota')
+
+        counts = quota._instances_cores_ram_count(
+            mock.sentinel.context, mock.sentinel.project_id,
+            user_id=mock.sentinel.user_id)
+
+        if quota_from_placement and uid_qfd_populated:
+            # If we are counting quota usage from placement and user_id and
+            # queued_for_delete data has all been migrated, we should count
+            # instances from the API DB using instance mappings and count
+            # cores and ram from placement.
+            mock_api_db_placement_count.assert_called_once_with(
+                mock.sentinel.context, mock.sentinel.project_id,
+                user_id=mock.sentinel.user_id)
+            # We should not have called the legacy counting method.
+            mock_legacy_count.assert_not_called()
+            # We should not have logged a warn message saying we were falling
+            # back to the legacy counting method.
+            mock_warn_log.assert_not_called()
+        else:
+            # If counting quota usage from placement is not enabled or if
+            # user_id or queued_for_delete data has not all been migrated yet,
+            # we should use the legacy counting method.
+            mock_legacy_count.assert_called_once_with(
+                mock.sentinel.context, mock.sentinel.project_id,
+                user_id=mock.sentinel.user_id)
+            # We should have logged a warn message saying we were falling back
+            # to the legacy counting method.
+            if quota_from_placement:
+                # We only log the message if someone has opted in to counting
+                # from placement.
+                mock_warn_log.assert_called_once()
+            # We should not have called the API DB and placement counting
+            # method.
+            mock_api_db_placement_count.assert_not_called()
+
+        self.assertDictEqual(expected, counts)
+
+    @mock.patch('nova.quota._user_id_queued_for_delete_populated')
+    @mock.patch('nova.quota._instances_cores_ram_count_legacy')
+    @mock.patch('nova.quota._instances_cores_ram_count_api_db_placement')
+    def test_user_id_queued_for_delete_populated_cache_by_project(
+            self, mock_api_db_placement_count, mock_legacy_count,
+            mock_uid_qfd_populated):
+        # We need quota usage from placement enabled to test this. For legacy
+        # counting, the cache is not used.
+        self.flags(count_usage_from_placement=True, group='quota')
+        # Fake count of instances, cores, and ram.
+        fake_counts = {'project': {'instances': 2, 'cores': 2, 'ram': 4},
+                       'user': {'instances': 1, 'cores': 1, 'ram': 2}}
+        mock_api_db_placement_count.return_value = fake_counts
+        mock_legacy_count.return_value = fake_counts
+
+        # First, check the case where user_id and queued_for_delete are found
+        # not to be migrated.
+        mock_uid_qfd_populated.return_value = False
+        quota._instances_cores_ram_count(mock.sentinel.context,
+                                         mock.sentinel.project_id,
+                                         user_id=mock.sentinel.user_id)
+        mock_uid_qfd_populated.assert_called_once()
+        # The second call should check for unmigrated records again, since the
+        # project was found not to be completely migrated last time.
+        quota._instances_cores_ram_count(mock.sentinel.context,
+                                         mock.sentinel.project_id,
+                                         user_id=mock.sentinel.user_id)
+        self.assertEqual(2, mock_uid_qfd_populated.call_count)
+
+        # Now check the case where the data migration was found to be complete.
+        mock_uid_qfd_populated.reset_mock()
+        mock_uid_qfd_populated.return_value = True
+        # The first call will check whether there are any unmigrated records.
+        quota._instances_cores_ram_count(mock.sentinel.context,
+                                         mock.sentinel.project_id,
+                                         user_id=mock.sentinel.user_id)
+        mock_uid_qfd_populated.assert_called_once()
+        # Second call should skip the check for user_id and queued_for_delete
+        # migrated because the result was cached.
+        mock_uid_qfd_populated.reset_mock()
+        quota._instances_cores_ram_count(mock.sentinel.context,
+                                         mock.sentinel.project_id,
+                                         user_id=mock.sentinel.user_id)
+        mock_uid_qfd_populated.assert_not_called()
diff --git a/releasenotes/notes/quota-usage-placement-5b3f62e83056f59d.yaml b/releasenotes/notes/quota-usage-placement-5b3f62e83056f59d.yaml
new file mode 100644
index 000000000000..4e48c099e57c
--- /dev/null
+++ b/releasenotes/notes/quota-usage-placement-5b3f62e83056f59d.yaml
@@ -0,0 +1,49 @@
+upgrade:
+  - |
+    It is now possible to count quota usage for cores and ram from the
+    placement service and instances from instance mappings in the API database
+    instead of counting resources from cell databases. This makes quota usage
+    counting resilient in the presence of down or poor-performing cells.
+
+    Quota usage counting from placement is opt-in via the
+    ``[quota]count_usage_from_placement`` configuration option.
+
+    There are some things to note when opting in to counting quota usage from
+    placement:
+
+    * Counted usage will not be accurate in an environment where multiple Nova
+      deployments are sharing a placement deployment because currently
+      placement has no way of partitioning resource providers between different
+      Nova deployments. Operators who are running multiple Nova deployments
+      that share a placement deployment should not set the
+      ``[quota]count_usage_from_placement`` configuration option to ``True``.
+
+    * Behavior will be different for resizes. During a resize, resource
+      allocations are held on both the source and destination (even on the same
+      host, see https://bugs.launchpad.net/nova/+bug/1790204) until the resize
+      is confirmed or reverted. Quota usage will be inflated for servers in
+      the ``VERIFY_RESIZE`` state and operators should weigh the advantages and
+      disadvantages before enabling ``[quota]count_usage_from_placement``.
+
+    * The ``populate_queued_for_delete`` and ``populate_user_id`` online data
+      migrations must be completed before usage can be counted from placement.
+      Until the data migration is complete, the system will fall back to legacy
+      quota usage counting from cell databases depending on the result of an
+      ``EXISTS`` database query during each quota check, if
+      ``[quota]count_usage_from_placement`` is set to ``True``. Operators who
+      want to avoid the performance hit from the ``EXISTS`` queries should wait
+      to set the ``[quota]count_usage_from_placement`` configuration option to
+      ``True`` until after they have completed their online data migrations via
+      ``nova-manage db online_data_migrations``.
+
+    * Behavior will be different for unscheduled servers in ``ERROR`` state.
+      A server in ``ERROR`` state that has never been scheduled to a compute
+      host will not have placement allocations, so it will not consume quota
+      usage for cores and ram.
+
+    * Behavior will be different for servers in ``SHELVED_OFFLOADED`` state.
+      A server in ``SHELVED_OFFLOADED`` state will not have placement
+      allocations, so it will not consume quota usage for cores and ram. Note
+      that because of this, it will be possible for a request to unshelve a
+      server to be rejected if the user does not have enough quota available to
+      support the cores and ram needed by the server to be unshelved.
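
For operators, the opt-in flow described in this note amounts to finishing the
online data migrations and then enabling the new option. A minimal sketch,
assuming the stock nova-manage workflow referenced above:

    # Complete the online data migrations first, so quota checks do not fall
    # back to legacy counting (and do not pay for the per-check EXISTS query):
    #   nova-manage db online_data_migrations
    #
    # Then opt in to counting quota usage from placement in nova.conf:
    [quota]
    count_usage_from_placement = True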