diff --git a/contrib/profile_caching_scheduler.sh b/contrib/profile_caching_scheduler.sh new file mode 100755 index 000000000000..df38ab12bfeb --- /dev/null +++ b/contrib/profile_caching_scheduler.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2014 Rackspace Hosting +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# This runs a unit test that uses pycallgraph +# to profile the select_destinations call +# in the CachingScheduler +# +# For this script to work please run: +# python setup.py develop +# pip install -r requirements.txt +# pip install -r test-requirements.txt +# pip install pycallgraph +# export EVENTLET_NO_GREENDNS='yes' +# +BASEDIR=$(dirname $0) +TEST=$BASEDIR/../nova/tests/scheduler/test_caching_scheduler.py +echo +echo "Running this unit test file as a python script:" +echo $TEST + +python $TEST + +RESULTDIR=$(pwd) +echo +echo "For profiler result see: " +echo $RESULTDIR/scheduler.png +echo diff --git a/etc/nova/nova.conf.sample b/etc/nova/nova.conf.sample index d842c12b9673..650de441756f 100644 --- a/etc/nova/nova.conf.sample +++ b/etc/nova/nova.conf.sample @@ -1836,6 +1836,13 @@ # Default driver to use for the scheduler (string value) #scheduler_driver=nova.scheduler.filter_scheduler.FilterScheduler +# How often (in seconds) to run periodic tasks in the +# scheduler driver of your choice. 
Please note this is likely +# to interact with the value of service_down_time, but exactly +# how they interact will depend on your choice of scheduler +# driver. (integer value) +#scheduler_driver_task_period=60 + # # Options defined in nova.scheduler.rpcapi diff --git a/nova/scheduler/caching_scheduler.py b/nova/scheduler/caching_scheduler.py new file mode 100644 index 000000000000..98033e23ac7c --- /dev/null +++ b/nova/scheduler/caching_scheduler.py @@ -0,0 +1,77 @@ +# Copyright (c) 2014 Rackspace Hosting +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova.scheduler import filter_scheduler + + +class CachingScheduler(filter_scheduler.FilterScheduler): + """Scheduler to test aggressive caching of the host list. + + Please note, this is a very opinionated scheduler. Be sure to + review the caveats listed here before selecting this scheduler. + + The aim of this scheduler is to reduce server build times when + you have large bursts of server builds, by reducing the time it + takes, from the users point of view, to service each schedule + request. 
+ + There are two main parts to scheduling a users request: + * getting the current state of the system + * using filters and weights to pick the best host + + This scheduler tries its best to cache in memory the current + state of the system, so we don't need to make the expensive + call to get the current state of the system while processing + a user's request, we can do that query in a periodic task + before the user even issues their request. + + To reduce races, cached info of the chosen host is updated using + the existing host state call: consume_from_instance + + Please note, the way this works, each scheduler worker has its own + copy of the cache. So if you run multiple schedulers, you will get + more retries, because the data stored on any additional scheduler will + be more out of date, than if it was fetched from the database. + + In a similar way, if you have a high number of server deletes, the + extra capacity from those deletes will not show up until the cache is + refreshed. + """ + + def __init__(self, *args, **kwargs): + super(CachingScheduler, self).__init__(*args, **kwargs) + self.all_host_states = None + + def run_periodic_tasks(self, context): + """Called from a periodic tasks in the manager.""" + elevated = context.elevated() + # NOTE(johngarbutt) Fetching the list of hosts before we get + # a user request, so no user requests have to wait while we + # fetch the list of hosts. + self.all_host_states = self._get_up_hosts(elevated) + + def _get_all_host_states(self, context): + """Called from the filter scheduler, in a template pattern.""" + if self.all_host_states is None: + # NOTE(johngarbutt) We only get here when we a scheduler request + # comes in before the first run of the periodic task. + # Rather than raise an error, we fetch the list of hosts. 
+ self.all_host_states = self._get_up_hosts(context) + + return self.all_host_states + + def _get_up_hosts(self, context): + all_hosts_iterator = self.host_manager.get_all_host_states(context) + return list(all_hosts_iterator) diff --git a/nova/scheduler/driver.py b/nova/scheduler/driver.py index fb39638f1c43..ade4ef5cf844 100644 --- a/nova/scheduler/driver.py +++ b/nova/scheduler/driver.py @@ -104,6 +104,10 @@ class Scheduler(object): CONF.scheduler_host_manager) self.servicegroup_api = servicegroup.API() + def run_periodic_tasks(self, context): + """Manager calls this so drivers can perform periodic tasks.""" + pass + def hosts_up(self, context, topic): """Return the list of hosts that have a running service for topic.""" diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py index 023b30f29d82..444170109963 100644 --- a/nova/scheduler/filter_scheduler.py +++ b/nova/scheduler/filter_scheduler.py @@ -312,7 +312,7 @@ class FilterScheduler(driver.Scheduler): # Note: remember, we are using an iterator here. So only # traverse this list once. This can bite you if the hosts # are being scanned in a filter or weighing function. - hosts = self.host_manager.get_all_host_states(elevated) + hosts = self._get_all_host_states(elevated) selected_hosts = [] if instance_uuids: @@ -350,3 +350,7 @@ class FilterScheduler(driver.Scheduler): if update_group_hosts is True: filter_properties['group_hosts'].append(chosen_host.obj.host) return selected_hosts + + def _get_all_host_states(self, context): + """Template method, so a subclass can implement caching.""" + return self.host_manager.get_all_host_states(context) diff --git a/nova/scheduler/host_manager.py b/nova/scheduler/host_manager.py index 3ef1547f48b0..7a57879c6b85 100644 --- a/nova/scheduler/host_manager.py +++ b/nova/scheduler/host_manager.py @@ -114,6 +114,7 @@ class HostState(object): # Mutable available resources. # These will change as resources are virtually "consumed". 
+ self.total_usable_ram_mb = 0 self.total_usable_disk_gb = 0 self.disk_mb_used = 0 self.free_ram_mb = 0 diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py index 889e13c493d9..0ce57040c18b 100644 --- a/nova/scheduler/manager.py +++ b/nova/scheduler/manager.py @@ -42,12 +42,20 @@ from nova.scheduler import utils as scheduler_utils LOG = logging.getLogger(__name__) -scheduler_driver_opt = cfg.StrOpt('scheduler_driver', - default='nova.scheduler.filter_scheduler.FilterScheduler', - help='Default driver to use for the scheduler') - +scheduler_driver_opts = [ + cfg.StrOpt('scheduler_driver', + default='nova.scheduler.filter_scheduler.FilterScheduler', + help='Default driver to use for the scheduler'), + cfg.IntOpt('scheduler_driver_task_period', + default=60, + help='How often (in seconds) to run periodic tasks in ' + 'the scheduler driver of your choice. ' + 'Please note this is likely to interact with the value ' + 'of service_down_time, but exactly how they interact ' + 'will depend on your choice of scheduler driver.'), +] CONF = cfg.CONF -CONF.register_opt(scheduler_driver_opt) +CONF.register_opts(scheduler_driver_opts) QUOTAS = quota.QUOTAS @@ -256,6 +264,11 @@ class SchedulerManager(manager.Manager): def _expire_reservations(self, context): QUOTAS.expire(context) + @periodic_task.periodic_task(spacing=CONF.scheduler_driver_task_period, + run_immediately=True) + def _run_periodic_tasks(self, context): + self.driver.run_periodic_tasks(context) + # NOTE(russellb) This method can be removed in 3.0 of this API. It is # deprecated in favor of the method in the base API. def get_backdoor_port(self, context): diff --git a/nova/tests/scheduler/test_caching_scheduler.py b/nova/tests/scheduler/test_caching_scheduler.py new file mode 100644 index 000000000000..89cd842f944b --- /dev/null +++ b/nova/tests/scheduler/test_caching_scheduler.py @@ -0,0 +1,192 @@ +# Copyright (c) 2014 Rackspace Hosting +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock + +from nova import exception +from nova.openstack.common import timeutils +from nova.scheduler import caching_scheduler +from nova.scheduler import host_manager +from nova.tests.scheduler import test_scheduler + +ENABLE_PROFILER = False + + +class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase): + """Test case for Caching Scheduler.""" + + driver_cls = caching_scheduler.CachingScheduler + + @mock.patch.object(caching_scheduler.CachingScheduler, + "_get_up_hosts") + def test_run_periodic_tasks_loads_hosts(self, mock_up_hosts): + mock_up_hosts.return_value = [] + context = mock.Mock() + + self.driver.run_periodic_tasks(context) + + self.assertTrue(mock_up_hosts.called) + self.assertEqual([], self.driver.all_host_states) + context.elevated.assert_called_with() + + @mock.patch.object(caching_scheduler.CachingScheduler, + "_get_up_hosts") + def test_get_all_host_states_returns_cached_value(self, mock_up_hosts): + self.driver.all_host_states = [] + + result = self.driver._get_all_host_states(self.context) + + self.assertFalse(mock_up_hosts.called) + self.assertEqual([], self.driver.all_host_states) + + @mock.patch.object(caching_scheduler.CachingScheduler, + "_get_up_hosts") + def test_get_all_host_states_loads_hosts(self, mock_up_hosts): + mock_up_hosts.return_value = ["asdf"] + + result = self.driver._get_all_host_states(self.context) + + self.assertTrue(mock_up_hosts.called) + 
self.assertEqual(["asdf"], self.driver.all_host_states) + self.assertEqual(["asdf"], result) + + def test_get_up_hosts(self): + with mock.patch.object(self.driver.host_manager, + "get_all_host_states") as mock_get_hosts: + mock_get_hosts.return_value = ["asdf"] + + result = self.driver._get_up_hosts(self.context) + + self.assertTrue(mock_get_hosts.called) + self.assertEqual(mock_get_hosts.return_value, result) + + def test_select_destination_raises_with_no_hosts(self): + fake_request_spec = self._get_fake_request_spec() + self.driver.all_host_states = [] + + self.assertRaises(exception.NoValidHost, + self.driver.select_destinations, + self.context, fake_request_spec, {}) + + def test_select_destination_works(self): + fake_request_spec = self._get_fake_request_spec() + fake_host = self._get_fake_host_state() + self.driver.all_host_states = [fake_host] + + result = self._test_select_destinations(fake_request_spec) + + self.assertEqual(1, len(result)) + self.assertEqual(result[0]["host"], fake_host.host) + + def _test_select_destinations(self, request_spec): + return self.driver.select_destinations( + self.context, request_spec, {}) + + def _get_fake_request_spec(self): + flavor = { + "flavorid": "small", + "memory_mb": 512, + "root_gb": 1, + "ephemeral_gb": 1, + "vcpus": 1, + } + instance_properties = { + "os_type": "linux", + "project_id": "1234", + "memory_mb": 512, + "root_gb": 1, + "ephemeral_gb": 1, + "vcpus": 1, + } + request_spec = { + "instance_type": flavor, + "instance_properties": instance_properties, + "num_instances": 1, + } + return request_spec + + def _get_fake_host_state(self, index=0): + host_state = host_manager.HostState( + 'host_%s' % index, + 'node_%s' % index) + host_state.free_ram_mb = 50000 + host_state.service = { + "disabled": False, + "updated_at": timeutils.utcnow(), + "created_at": timeutils.utcnow(), + } + return host_state + + def test_performance_check_select_destination(self): + hosts = 2 + requests = 1 + + 
self.flags(service_down_time=240) + + request_spec = self._get_fake_request_spec() + host_states = [] + for x in xrange(hosts): + host_state = self._get_fake_host_state(x) + host_states.append(host_state) + self.driver.all_host_states = host_states + + def run_test(): + a = timeutils.utcnow() + + for x in xrange(requests): + self.driver.select_destinations( + self.context, request_spec, {}) + + b = timeutils.utcnow() + c = b - a + + seconds = (c.days * 24 * 60 * 60 + c.seconds) + microseconds = seconds * 1000 + c.microseconds / 1000.0 + per_request_ms = microseconds / requests + return per_request_ms + + per_request_ms = None + if ENABLE_PROFILER: + import pycallgraph + from pycallgraph import output + config = pycallgraph.Config(max_depth=10) + config.trace_filter = pycallgraph.GlobbingFilter(exclude=[ + 'pycallgraph.*', + 'unittest.*', + 'nova.tests.*', + ]) + graphviz = output.GraphvizOutput(output_file='scheduler.png') + + with pycallgraph.PyCallGraph(output=graphviz): + per_request_ms = run_test() + + else: + per_request_ms = run_test() + + # This has proved to be around 1 ms on a random dev box + # But this is here so you can do simple performance testing easily. + self.assertTrue(per_request_ms < 1000) + + +if __name__ == '__main__': + # A handy tool to help profile the scheduler's performance + ENABLE_PROFILER = True + import unittest + suite = unittest.TestSuite() + test = "test_performance_check_select_destination" + test_case = CachingSchedulerTestCase(test) + suite.addTest(test_case) + runner = unittest.TextTestRunner() + runner.run(suite)