Add a caching scheduler driver

This adds a new scheduler driver, CachingScheduler

It currently subclasses the filter scheduler and only adds caching of
calls to host_manager.get_all_host_states.

It relies on there being a small number of schedulers (ideally one), on
calls to consume_from_instance within the filter scheduler to update the
current cached list of hosts, and on a low rate of deletes.

It introduces the new config value:
scheduler_driver_task_period

The periodic task it controls is used by the caching scheduler to fetch
an updated copy of host_manager.get_all_host_states.

If your value for service_down_time is much smaller than
scheduler_driver_task_period, there will be issues with hosts appearing
to be dead, just because the list of hosts is being cached. Correct
configuration of those two values can avoid this issue.

DocImpact
Part of blueprint caching-scheduler
Change-Id: I2eb873ce8024a576e597205e2d52b5d5e8aee97a
This commit is contained in:
John Garbutt 2014-01-13 09:56:14 +00:00 committed by Gerrit Code Review
parent b40f2d79aa
commit 1b24bfbd73
8 changed files with 344 additions and 6 deletions

View File

@ -0,0 +1,40 @@
#!/bin/bash
# Copyright (c) 2014 Rackspace Hosting
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# This runs a unit test that uses pycallgraph
# to profile the select_destinations call
# in the CachingScheduler
#
# For this script to work please run:
# python setup.py develop
# pip install -r requirements.txt
# pip install -r test-requirements.txt
# pip install pycallgraph
# export EVENTLET_NO_GREENDNS='yes'
#
# NOTE: all expansions are quoted so the script still works when the
# checkout lives in a path containing spaces.
BASEDIR=$(dirname "$0")
TEST="$BASEDIR/../nova/tests/scheduler/test_caching_scheduler.py"
echo
echo "Running this unit test file as a python script:"
echo "$TEST"
python "$TEST"
RESULTDIR=$(pwd)
echo
echo "For profiler result see: "
echo "$RESULTDIR/scheduler.png"
echo

View File

@ -1836,6 +1836,13 @@
# Default driver to use for the scheduler (string value)
#scheduler_driver=nova.scheduler.filter_scheduler.FilterScheduler
# How often (in seconds) to run periodic tasks in the
# scheduler driver of your choice. Please note this is likely
# to interact with the value of service_down_time, but exactly
# how they interact will depend on your choice of scheduler
# driver. (integer value)
#scheduler_driver_task_period=60
#
# Options defined in nova.scheduler.rpcapi

View File

@ -0,0 +1,77 @@
# Copyright (c) 2014 Rackspace Hosting
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from nova.scheduler import filter_scheduler
class CachingScheduler(filter_scheduler.FilterScheduler):
    """Scheduler that aggressively caches the list of host states.

    Please note, this is a very opinionated scheduler; be sure to
    review the caveats below before selecting it.

    Scheduling a user's request has two main parts:

    * getting the current state of the system
    * using filters and weights to pick the best host

    The aim here is to cut server build times during large bursts of
    builds by keeping an in-memory copy of the system state, so a user
    request never has to wait on the expensive state query — that query
    runs in a periodic task before the request even arrives.

    To reduce races, the cached entry for a chosen host is updated via
    the existing host state call: consume_from_instance.

    Caveats: every scheduler worker keeps its own private cache, so
    running multiple schedulers yields more retries, because any extra
    scheduler's data is staler than a fresh database fetch would be.
    Similarly, capacity freed by a high rate of deletes only becomes
    visible once the cache is refreshed.
    """

    def __init__(self, *args, **kwargs):
        super(CachingScheduler, self).__init__(*args, **kwargs)
        # None until the first fetch; afterwards, a list of host states.
        self.all_host_states = None

    def run_periodic_tasks(self, context):
        """Called from a periodic tasks in the manager."""
        # NOTE(johngarbutt) Refresh the host list ahead of any user
        # request, so no request has to wait while we fetch it.
        self.all_host_states = self._get_up_hosts(context.elevated())

    def _get_all_host_states(self, context):
        """Called from the filter scheduler, in a template pattern."""
        if self.all_host_states is None:
            # NOTE(johngarbutt) We only get here when a scheduling
            # request arrives before the first run of the periodic
            # task. Rather than raise an error, fetch the hosts now.
            self.all_host_states = self._get_up_hosts(context)
        return self.all_host_states

    def _get_up_hosts(self, context):
        # Materialize the iterator so the cached list can be reused.
        return list(self.host_manager.get_all_host_states(context))

View File

@ -104,6 +104,10 @@ class Scheduler(object):
CONF.scheduler_host_manager)
self.servicegroup_api = servicegroup.API()
def run_periodic_tasks(self, context):
    """Manager calls this so drivers can perform periodic tasks."""
    # Intentionally a no-op in the base driver; subclasses override
    # this when they have periodic work to do.
    pass
def hosts_up(self, context, topic):
"""Return the list of hosts that have a running service for topic."""

View File

@ -312,7 +312,7 @@ class FilterScheduler(driver.Scheduler):
# Note: remember, we are using an iterator here. So only
# traverse this list once. This can bite you if the hosts
# are being scanned in a filter or weighing function.
hosts = self.host_manager.get_all_host_states(elevated)
hosts = self._get_all_host_states(elevated)
selected_hosts = []
if instance_uuids:
@ -350,3 +350,7 @@ class FilterScheduler(driver.Scheduler):
if update_group_hosts is True:
filter_properties['group_hosts'].append(chosen_host.obj.host)
return selected_hosts
def _get_all_host_states(self, context):
    """Template method, so a subclass can implement caching."""
    # Delegates straight to the host manager; subclasses may return
    # a cached copy instead.
    host_states = self.host_manager.get_all_host_states(context)
    return host_states

View File

@ -114,6 +114,7 @@ class HostState(object):
# Mutable available resources.
# These will change as resources are virtually "consumed".
self.total_usable_ram_mb = 0
self.total_usable_disk_gb = 0
self.disk_mb_used = 0
self.free_ram_mb = 0

View File

@ -42,12 +42,20 @@ from nova.scheduler import utils as scheduler_utils
LOG = logging.getLogger(__name__)
scheduler_driver_opt = cfg.StrOpt('scheduler_driver',
default='nova.scheduler.filter_scheduler.FilterScheduler',
help='Default driver to use for the scheduler')
# Options controlling which scheduler driver the manager loads, and how
# often that driver's periodic tasks are run.
scheduler_driver_opts = [
    cfg.StrOpt('scheduler_driver',
               default='nova.scheduler.filter_scheduler.FilterScheduler',
               help='Default driver to use for the scheduler'),
    cfg.IntOpt('scheduler_driver_task_period',
               default=60,
               help='How often (in seconds) to run periodic tasks in '
                    'the scheduler driver of your choice. '
                    'Please note this is likely to interact with the value '
                    'of service_down_time, but exactly how they interact '
                    'will depend on your choice of scheduler driver.'),
]
CONF = cfg.CONF
CONF.register_opt(scheduler_driver_opt)
CONF.register_opts(scheduler_driver_opts)
QUOTAS = quota.QUOTAS
@ -256,6 +264,11 @@ class SchedulerManager(manager.Manager):
def _expire_reservations(self, context):
    """Ask the quota engine to expire outstanding reservations."""
    QUOTAS.expire(context)
# Runs every scheduler_driver_task_period seconds, starting immediately,
# giving the configured driver a chance to do its periodic work.
@periodic_task.periodic_task(spacing=CONF.scheduler_driver_task_period,
                             run_immediately=True)
def _run_periodic_tasks(self, context):
    """Delegate periodic work to the loaded scheduler driver."""
    self.driver.run_periodic_tasks(context)
# NOTE(russellb) This method can be removed in 3.0 of this API. It is
# deprecated in favor of the method in the base API.
def get_backdoor_port(self, context):

View File

@ -0,0 +1,192 @@
# Copyright (c) 2014 Rackspace Hosting
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from nova import exception
from nova.openstack.common import timeutils
from nova.scheduler import caching_scheduler
from nova.scheduler import host_manager
from nova.tests.scheduler import test_scheduler
ENABLE_PROFILER = False
class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase):
    """Test case for Caching Scheduler."""

    driver_cls = caching_scheduler.CachingScheduler

    @mock.patch.object(caching_scheduler.CachingScheduler,
                       "_get_up_hosts")
    def test_run_periodic_tasks_loads_hosts(self, mock_up_hosts):
        mock_up_hosts.return_value = []
        context = mock.Mock()

        self.driver.run_periodic_tasks(context)

        self.assertTrue(mock_up_hosts.called)
        self.assertEqual([], self.driver.all_host_states)
        context.elevated.assert_called_with()

    @mock.patch.object(caching_scheduler.CachingScheduler,
                       "_get_up_hosts")
    def test_get_all_host_states_returns_cached_value(self, mock_up_hosts):
        self.driver.all_host_states = []

        result = self.driver._get_all_host_states(self.context)

        self.assertFalse(mock_up_hosts.called)
        # Check the returned value as well as the cache attribute;
        # previously the result was assigned but never asserted on.
        self.assertEqual([], result)
        self.assertEqual([], self.driver.all_host_states)

    @mock.patch.object(caching_scheduler.CachingScheduler,
                       "_get_up_hosts")
    def test_get_all_host_states_loads_hosts(self, mock_up_hosts):
        mock_up_hosts.return_value = ["asdf"]

        result = self.driver._get_all_host_states(self.context)

        self.assertTrue(mock_up_hosts.called)
        self.assertEqual(["asdf"], self.driver.all_host_states)
        self.assertEqual(["asdf"], result)

    def test_get_up_hosts(self):
        with mock.patch.object(self.driver.host_manager,
                               "get_all_host_states") as mock_get_hosts:
            mock_get_hosts.return_value = ["asdf"]

            result = self.driver._get_up_hosts(self.context)

            self.assertTrue(mock_get_hosts.called)
            self.assertEqual(mock_get_hosts.return_value, result)

    def test_select_destination_raises_with_no_hosts(self):
        fake_request_spec = self._get_fake_request_spec()
        self.driver.all_host_states = []

        self.assertRaises(exception.NoValidHost,
                          self.driver.select_destinations,
                          self.context, fake_request_spec, {})

    def test_select_destination_works(self):
        fake_request_spec = self._get_fake_request_spec()
        fake_host = self._get_fake_host_state()
        self.driver.all_host_states = [fake_host]

        result = self._test_select_destinations(fake_request_spec)

        self.assertEqual(1, len(result))
        self.assertEqual(result[0]["host"], fake_host.host)

    def _test_select_destinations(self, request_spec):
        return self.driver.select_destinations(
            self.context, request_spec, {})

    def _get_fake_request_spec(self):
        flavor = {
            "flavorid": "small",
            "memory_mb": 512,
            "root_gb": 1,
            "ephemeral_gb": 1,
            "vcpus": 1,
        }
        instance_properties = {
            "os_type": "linux",
            "project_id": "1234",
            "memory_mb": 512,
            "root_gb": 1,
            "ephemeral_gb": 1,
            "vcpus": 1,
        }
        request_spec = {
            "instance_type": flavor,
            "instance_properties": instance_properties,
            "num_instances": 1,
        }
        return request_spec

    def _get_fake_host_state(self, index=0):
        host_state = host_manager.HostState(
            'host_%s' % index,
            'node_%s' % index)
        host_state.free_ram_mb = 50000
        host_state.service = {
            "disabled": False,
            "updated_at": timeutils.utcnow(),
            "created_at": timeutils.utcnow(),
        }
        return host_state

    def test_performance_check_select_destination(self):
        hosts = 2
        requests = 1

        self.flags(service_down_time=240)

        request_spec = self._get_fake_request_spec()
        host_states = []
        for x in xrange(hosts):
            host_state = self._get_fake_host_state(x)
            host_states.append(host_state)
        self.driver.all_host_states = host_states

        def run_test():
            a = timeutils.utcnow()

            for x in xrange(requests):
                self.driver.select_destinations(
                    self.context, request_spec, {})

            b = timeutils.utcnow()
            c = b - a

            seconds = (c.days * 24 * 60 * 60 + c.seconds)
            # NOTE: seconds * 1000 yields milliseconds, not microseconds
            # (the variable was previously misnamed "microseconds").
            milliseconds = seconds * 1000 + c.microseconds / 1000.0
            per_request_ms = milliseconds / requests
            return per_request_ms

        per_request_ms = None
        if ENABLE_PROFILER:
            import pycallgraph
            from pycallgraph import output
            config = pycallgraph.Config(max_depth=10)
            config.trace_filter = pycallgraph.GlobbingFilter(exclude=[
                'pycallgraph.*',
                'unittest.*',
                'nova.tests.*',
            ])
            graphviz = output.GraphvizOutput(output_file='scheduler.png')

            with pycallgraph.PyCallGraph(output=graphviz):
                per_request_ms = run_test()

        else:
            per_request_ms = run_test()

        # This has proved to be around 1 ms on a random dev box
        # But this is here so you can do simply performance testing easily.
        # assertLess gives a useful failure message with the actual value.
        self.assertLess(per_request_ms, 1000)
if __name__ == '__main__':
    # A handy tool to help profile the scheduler's performance:
    # running this file directly (rather than via the test runner)
    # enables pycallgraph profiling of the performance test only.
    ENABLE_PROFILER = True
    import unittest
    suite = unittest.TestSuite()
    test = "test_performance_check_select_destination"
    test_case = CachingSchedulerTestCase(test)
    suite.addTest(test_case)
    runner = unittest.TextTestRunner()
    runner.run(suite)