6047d790a3
This lifts some hash ring code from ironic (to be put into oslo soon), to be
used to do consistent hashing of ironic nodes among multiple nova-compute
services. The hash ring is used within the driver itself, and is refreshed at
each resource tracker run.

get_available_nodes() will now return a subset of nodes, determined by the
following rules (a sketch of this filtering follows below):

* any node with an instance managed by the compute service
* any node that is mapped to the compute service on the hash ring
* no nodes with instances managed by another compute service

The virt driver finds all compute services that are running the ironic driver
by joining the services table and the compute_nodes table. Since there won't
be any records in the compute_nodes table for a service that is starting for
the first time, the virt driver also adds its own compute service into this
list. The hostnames of all services in this list are what is used to
instantiate the hash ring.

As nova-compute services are brought up or down, the ring will re-balance.
It's important to note that this re-balance does not occur at the same time on
all compute services, so for some amount of time, an ironic node may be
managed by more than one compute service. In other words, there may be two
compute_nodes records for a single ironic node, each with a different host
value. For scheduling purposes, this is okay, because either compute service
is capable of actually spawning an instance on the node (the ironic service
doesn't know about this hashing). This will cause capacity reporting (e.g.
nova hypervisor-stats) to over-report capacity during this window. Once all
compute services in the cluster have done a resource tracker run and
re-balanced the hash ring, reporting returns to normal.

It's also important to note that, due to the way nodes with instances are
handled, if an instance is deleted while the compute service is down, that
node will be removed from the compute_nodes table when the service comes back
up (each service will see an instance on the node object and assume another
compute service manages that instance). The ironic node will remain active
and orphaned. Once the periodic task to reap deleted instances runs, the
ironic node will be torn down and the node will again be reported in the
compute_nodes table. It's all very eventually consistent, with a potentially
long time to eventual.

There's no configuration to enable this mode; it's always running. The code
paths are exercised (though trivially) when running with a single compute
service; spinning up more invokes the hard bits. As such, the release note
for this change clarifies that this feature is new and untested for running
with multiple compute services.

Implements: blueprint ironic-multiple-compute-hosts
Change-Id: I852f62b29f1faedf7ff19b42bbfb966f61d95c6e
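As a rough illustration of the three rules above, the node filtering in the
driver's get_available_nodes() boils down to something like the sketch below.
This is illustrative only: the helper and attribute names
(_should_manage_node, instances_by_node, self.hash_ring) are hypothetical,
not the actual driver internals.

def _should_manage_node(self, node, instances_by_node):
    # instances_by_node maps node UUID -> hostname of the compute service
    # managing the node's instance (if the node has one).
    instance_host = instances_by_node.get(node.uuid)
    if instance_host is not None:
        # Rules 1 and 3: a node with an instance is reported only by the
        # compute service that manages that instance.
        return instance_host == self.host
    # Rule 2: a node without an instance is reported by whichever host(s)
    # the hash ring maps its UUID onto.
    return self.host in self.hash_ring.get_hosts(node.uuid)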
# Copyright 2013 Hewlett-Packard Development Company, L.P.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import bisect
import hashlib

import six

from nova import exception
from nova.i18n import _


# NOTE(jroll) these constants will be config options in Ocata, when the hash
# ring code is in oslo.
# Number of partitions per service is 2^PARTITION_EXPONENT.
# 5 should be fine for most deployments, as an experimental feature.
PARTITION_EXPONENT = 5
# This should always be 1 in nova, as two compute daemons handling the same
# node should not be possible.
DISTRIBUTION_REPLICAS = 1


class HashRing(object):
    """A stable hash ring.

    We map item N to a host Y based on the closest lower hash:

    - hash(item) -> partition
    - hash(host) -> divider
    - closest lower divider is the host to use
    - we hash each host many times to spread load more finely
      as otherwise adding a host gets (on average) 50% of the load of
      just one other host assigned to it.
    """

    def __init__(self, hosts):
        """Create a new hash ring across the specified hosts.

        :param hosts: an iterable of hosts which will be mapped.
        """
        replicas = DISTRIBUTION_REPLICAS

        try:
            self.hosts = set(hosts)
            self.replicas = replicas if replicas <= len(hosts) else len(hosts)
        except TypeError:
            raise exception.Invalid(
                _("Invalid hosts supplied when building HashRing."))

        self._host_hashes = {}
        for host in hosts:
            key = str(host).encode('utf8')
            key_hash = hashlib.md5(key)
            for p in range(2 ** PARTITION_EXPONENT):
                key_hash.update(key)
                hashed_key = self._hash2int(key_hash)
                self._host_hashes[hashed_key] = host
        # Gather the (possibly colliding) resulting hashes into a bisectable
        # list.
        self._partitions = sorted(self._host_hashes.keys())

    def _hash2int(self, key_hash):
        """Convert the given hash's digest to a numerical value for the ring.

        :returns: An integer equivalent value of the digest.
        """
        return int(key_hash.hexdigest(), 16)

    def _get_partition(self, data):
        try:
            if six.PY3 and data is not None:
                data = data.encode('utf-8')
            key_hash = hashlib.md5(data)
            hashed_key = self._hash2int(key_hash)
            position = bisect.bisect(self._partitions, hashed_key)
            return position if position < len(self._partitions) else 0
        except TypeError:
            raise exception.Invalid(
                _("Invalid data supplied to HashRing.get_hosts."))

    def get_hosts(self, data, ignore_hosts=None):
        """Get the list of hosts which the supplied data maps onto.

        :param data: A string identifier to be mapped across the ring.
        :param ignore_hosts: A list of hosts to skip when performing the hash.
                             Useful to temporarily skip down hosts without
                             performing a full rebalance.
                             Default: None.
        :returns: a list of hosts.
                  The length of this list depends on the number of replicas
                  this `HashRing` was created with. It may be less than this
                  if ignore_hosts is not None.
        """
        hosts = []
        if ignore_hosts is None:
            ignore_hosts = set()
        else:
            ignore_hosts = set(ignore_hosts)
            ignore_hosts.intersection_update(self.hosts)
        partition = self._get_partition(data)
        for replica in range(0, self.replicas):
            if len(hosts) + len(ignore_hosts) == len(self.hosts):
                # prevent infinite loop - cannot allocate more fallbacks.
                break
            # Linear probing: partition N, then N+1 etc.
            host = self._get_host(partition)
            while host in hosts or host in ignore_hosts:
                partition += 1
                if partition >= len(self._partitions):
                    partition = 0
                host = self._get_host(partition)
            hosts.append(host)
        return hosts

    def _get_host(self, partition):
        """Find what host is serving a partition.

        :param partition: The index of the partition in the partition map.
                          e.g. 0 is the first partition, 1 is the second.
        :return: The host object the ring was constructed with.
        """
        return self._host_hashes[self._partitions[partition]]
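For a concrete feel of how the ring behaves, here is a minimal usage sketch.
The host names and node UUID are hypothetical, and it assumes the module
above is importable (it pulls in nova.exception and nova.i18n).

ring = HashRing(['compute1', 'compute2', 'compute3'])

# Each node UUID maps onto exactly one host, since DISTRIBUTION_REPLICAS
# is 1 in nova.
print(ring.get_hosts('1be26c0b-03f2-4d2e-ae87-c02d7f33c123'))

# Temporarily route around a down host without a full rebalance; get_hosts
# probes forward on the ring past any ignored hosts.
print(ring.get_hosts('1be26c0b-03f2-4d2e-ae87-c02d7f33c123',
                     ignore_hosts=['compute1']))

# Growing the cluster remaps only a fraction (roughly 1/N) of the nodes,
# because each host contributes 2**PARTITION_EXPONENT (i.e. 32) points on
# the ring rather than a single one.
bigger_ring = HashRing(['compute1', 'compute2', 'compute3', 'compute4'])
print(bigger_ring.get_hosts('1be26c0b-03f2-4d2e-ae87-c02d7f33c123'))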