Lowercase ironic driver hash ring and ignore case in cache
Recently we had a customer case where attempts to add new ironic nodes to an existing undercloud resulted in half of the nodes failing to be detected and added to nova. Ironic API returned all of the newly added nodes when called by the driver, but half of the nodes were not returned to the compute manager by the driver. There was only one nova-compute service managing all of the ironic nodes of the all-in-one typical undercloud deployment. After days of investigation and examination of a database dump from the customer, we noticed that at some point the customer had changed the hostname of the machine from something containing uppercase letters to the same name but all lowercase. The nova-compute service record had the mixed case name and the CONF.host (socket.gethostname()) had the lowercase name. The hash ring logic adds all of the nova-compute service hostnames plus CONF.host to hash ring, then the ironic driver reports only the nodes it owns by retrieving a service hostname from the ring based on a hash of each ironic node UUID. Because of the machine hostname change, the hash ring contained, for example: {'MachineHostName', 'machinehostname'} when it should have contained only one hostname. And because the hash ring contained two hostnames, the driver was able to retrieve only half of the nodes as nodes that it owned. So half of the new nodes were excluded and not added as new compute nodes. This adds lowercasing of hosts that are added to the hash ring and ignores case when comparing the CONF.host to the hash ring members to avoid unnecessary pain and confusion for users that make hostname changes that are otherwise functionally harmless. This also adds logging of the set of hash ring members at level DEBUG to help enable easier debugging of hash ring related situations. Closes-Bug: #1866380 Change-Id: I617fd59de327de05a198f12b75a381f21945afb0 (cherry picked from commit7145100ee4
) (cherry picked from commit588b0484bf
) (cherry picked from commit8f8667a8dd
) (cherry picked from commit019e3da75b
)
This commit is contained in:
parent
f5b6dc603c
commit
620e5da840
|
@ -2951,17 +2951,28 @@ class HashRingTestCase(test.NoDBTestCase):
|
||||||
self.assertEqual(SENTINEL, self.driver.hash_ring)
|
self.assertEqual(SENTINEL, self.driver.hash_ring)
|
||||||
self.mock_is_up.assert_has_calls(is_up_calls)
|
self.mock_is_up.assert_has_calls(is_up_calls)
|
||||||
|
|
||||||
|
def test__refresh_hash_ring_same_host_different_case(self):
|
||||||
|
# Test that we treat Host1 and host1 as the same host
|
||||||
|
# CONF.host is set to 'host1' in __test_refresh_hash_ring
|
||||||
|
services = ['Host1']
|
||||||
|
expected_hosts = {'host1'}
|
||||||
|
self.mock_is_up.return_value = True
|
||||||
|
self._test__refresh_hash_ring(services, expected_hosts)
|
||||||
|
|
||||||
def test__refresh_hash_ring_one_compute(self):
|
def test__refresh_hash_ring_one_compute(self):
|
||||||
services = ['host1']
|
services = ['host1']
|
||||||
expected_hosts = {'host1'}
|
expected_hosts = {'host1'}
|
||||||
self.mock_is_up.return_value = True
|
self.mock_is_up.return_value = True
|
||||||
self._test__refresh_hash_ring(services, expected_hosts)
|
self._test__refresh_hash_ring(services, expected_hosts)
|
||||||
|
|
||||||
def test__refresh_hash_ring_many_computes(self):
|
@mock.patch('nova.virt.ironic.driver.LOG.debug')
|
||||||
|
def test__refresh_hash_ring_many_computes(self, mock_log_debug):
|
||||||
services = ['host1', 'host2', 'host3']
|
services = ['host1', 'host2', 'host3']
|
||||||
expected_hosts = {'host1', 'host2', 'host3'}
|
expected_hosts = {'host1', 'host2', 'host3'}
|
||||||
self.mock_is_up.return_value = True
|
self.mock_is_up.return_value = True
|
||||||
self._test__refresh_hash_ring(services, expected_hosts)
|
self._test__refresh_hash_ring(services, expected_hosts)
|
||||||
|
expected_msg = 'Hash ring members are %s'
|
||||||
|
mock_log_debug.assert_called_once_with(expected_msg, set(services))
|
||||||
|
|
||||||
def test__refresh_hash_ring_one_compute_new_compute(self):
|
def test__refresh_hash_ring_one_compute_new_compute(self):
|
||||||
services = []
|
services = []
|
||||||
|
@ -3016,6 +3027,26 @@ class NodeCacheTestCase(test.NoDBTestCase):
|
||||||
limit=0)
|
limit=0)
|
||||||
self.assertIsNotNone(self.driver.node_cache_time)
|
self.assertIsNotNone(self.driver.node_cache_time)
|
||||||
|
|
||||||
|
def test__refresh_cache_same_host_different_case(self):
|
||||||
|
# Test that we treat Host1 and host1 as the same host
|
||||||
|
self.host = 'Host1'
|
||||||
|
self.flags(host=self.host)
|
||||||
|
instances = []
|
||||||
|
nodes = [
|
||||||
|
_get_cached_node(
|
||||||
|
uuid=uuidutils.generate_uuid(), instance_uuid=None),
|
||||||
|
_get_cached_node(
|
||||||
|
uuid=uuidutils.generate_uuid(), instance_uuid=None),
|
||||||
|
_get_cached_node(
|
||||||
|
uuid=uuidutils.generate_uuid(), instance_uuid=None),
|
||||||
|
]
|
||||||
|
hosts = ['host1', 'host1', 'host1']
|
||||||
|
|
||||||
|
self._test__refresh_cache(instances, nodes, hosts)
|
||||||
|
|
||||||
|
expected_cache = {n.uuid: n for n in nodes}
|
||||||
|
self.assertEqual(expected_cache, self.driver.node_cache)
|
||||||
|
|
||||||
def test__refresh_cache(self):
|
def test__refresh_cache(self):
|
||||||
# normal operation, one compute service
|
# normal operation, one compute service
|
||||||
instances = []
|
instances = []
|
||||||
|
|
|
@ -686,14 +686,15 @@ class IronicDriver(virt_driver.ComputeDriver):
|
||||||
for svc in service_list:
|
for svc in service_list:
|
||||||
is_up = self.servicegroup_api.service_is_up(svc)
|
is_up = self.servicegroup_api.service_is_up(svc)
|
||||||
if is_up:
|
if is_up:
|
||||||
services.add(svc.host)
|
services.add(svc.host.lower())
|
||||||
# NOTE(jroll): always make sure this service is in the list, because
|
# NOTE(jroll): always make sure this service is in the list, because
|
||||||
# only services that have something registered in the compute_nodes
|
# only services that have something registered in the compute_nodes
|
||||||
# table will be here so far, and we might be brand new.
|
# table will be here so far, and we might be brand new.
|
||||||
services.add(CONF.host)
|
services.add(CONF.host.lower())
|
||||||
|
|
||||||
self.hash_ring = hash_ring.HashRing(services,
|
self.hash_ring = hash_ring.HashRing(services,
|
||||||
partitions=_HASH_RING_PARTITIONS)
|
partitions=_HASH_RING_PARTITIONS)
|
||||||
|
LOG.debug('Hash ring members are %s', services)
|
||||||
|
|
||||||
def _refresh_cache(self):
|
def _refresh_cache(self):
|
||||||
# NOTE(lucasagomes): limit == 0 is an indicator to continue
|
# NOTE(lucasagomes): limit == 0 is an indicator to continue
|
||||||
|
@ -715,7 +716,7 @@ class IronicDriver(virt_driver.ComputeDriver):
|
||||||
# nova while the service was down, and not yet reaped, will not be
|
# nova while the service was down, and not yet reaped, will not be
|
||||||
# reported until the periodic task cleans it up.
|
# reported until the periodic task cleans it up.
|
||||||
elif (node.instance_uuid is None and
|
elif (node.instance_uuid is None and
|
||||||
CONF.host in
|
CONF.host.lower() in
|
||||||
self.hash_ring.get_nodes(node.uuid.encode('utf-8'))):
|
self.hash_ring.get_nodes(node.uuid.encode('utf-8'))):
|
||||||
node_cache[node.uuid] = node
|
node_cache[node.uuid] = node
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue