Fix TypeError in _update_from_compute_node race

A compute node record is created in the database from the
_init_compute_node method in the resource tracker in the
nova-compute service. It is created without the updated_at
or free_disk_gb fields being set.

Shortly after it's created, update_resource_stats() is called
on the scheduler report client which calls compute_node.save()
which calls the update_compute_node() method in the DB API which
*always* updates the updated_at field even if nothing else
was changed.

So at this point, we have a compute node with updated_at set
but not free_disk_gb. The free_disk_gb field gets set by the
_update_usage_from_instances() method in the resource tracker
and then that value is later saved off when the compute node
is updated in the database on a second call to the
update_resource_stats() method in the scheduler report client.
At that point the free_disk_gb field is set on the compute
node record.

There is a race in between the compute node create and initial
update but before free_disk_gb is set where the scheduler host
manager can be attempting to update the HostState object from
the same compute node when selecting host destinations during
a server build. If that picks up the compute node before its
free_disk_gb field is set it will result in a TypeError when
trying to multiply None * 1024.

Change 36a0ba9c81 was an earlier
attempt at fixing this bug and shortened the race window but
didn't handle the problem where updated_at is set but free_disk_gb
is not yet.

This change builds on that by simply checking for the thing
the scheduler host manager actually cares about, which is the
free_disk_gb field.

Closes-Bug: #1654102
Closes-Bug: #1610679

Change-Id: I37b75dabb3ea7ec2d5678550d9aff30b1a0c29e6
This commit is contained in:
Matt Riedemann 2017-01-04 22:06:23 -05:00
parent 1bcf3b553a
commit 2f1245a56c
4 changed files with 27 additions and 3 deletions

View File

@ -183,7 +183,9 @@ class HostState(object):
"""Update information about a host from a ComputeNode object."""
# NOTE(jichenjc): if the compute record is just created but not updated
# some field such as free_disk_gb can be None
if compute.updated_at is None:
if 'free_disk_gb' not in compute or compute.free_disk_gb is None:
LOG.debug('Ignoring compute node %s as its usage has not been '
'updated yet.', compute.uuid)
return
if (self.updated and compute.updated_at

View File

@ -21,6 +21,8 @@ This host manager will consume all cpu's, disk space, and
ram from a host / node as it is supporting Baremetal hosts, which can not be
subdivided into multiple instances.
"""
from oslo_log import log as logging
import nova.conf
from nova import context as context_module
from nova import objects
@ -29,6 +31,8 @@ from nova.scheduler import host_manager
CONF = nova.conf.CONF
LOG = logging.getLogger(__name__)
class IronicNodeState(host_manager.HostState):
"""Mutable and immutable information tracked for a host.
@ -38,6 +42,14 @@ class IronicNodeState(host_manager.HostState):
def _update_from_compute_node(self, compute):
"""Update information about a host from a ComputeNode object."""
# NOTE(jichenjc): if the compute record is just created but not updated
# some field such as free_disk_gb can be None
if 'free_disk_gb' not in compute or compute.free_disk_gb is None:
LOG.debug('Ignoring compute node %s as its usage has not been '
'updated yet.', compute.uuid)
return
self.vcpus_total = compute.vcpus
self.vcpus_used = compute.vcpus_used

View File

@ -1218,7 +1218,7 @@ class HostStateTestCase(test.NoDBTestCase):
def test_stat_consumption_from_compute_node_not_ready(self):
compute = objects.ComputeNode(free_ram_mb=100,
updated_at=None)
uuid=uuids.compute_node_uuid)
host = host_manager.HostState("fakehost", "fakenode")
host._update_from_compute_node(compute)

View File

@ -182,7 +182,8 @@ class IronicHostManagerChangedNodesTestCase(test.NoDBTestCase):
hypervisor_version=1,
hypervisor_hostname='fake_host',
cpu_allocation_ratio=16.0, ram_allocation_ratio=1.5,
disk_allocation_ratio=1.0)
disk_allocation_ratio=1.0,
uuid=uuids.compute_node_uuid)
@mock.patch.object(ironic_host_manager.IronicNodeState, '__init__')
def test_create_ironic_node_state(self, init_mock):
@ -265,6 +266,15 @@ class IronicHostManagerChangedNodesTestCase(test.NoDBTestCase):
self.assertEqual(1, host.hypervisor_version)
self.assertEqual('fake_host', host.hypervisor_hostname)
def test_update_from_compute_node_not_ready(self):
"""Tests that we ignore a compute node that does not have its
free_disk_gb field set yet from the compute resource tracker.
"""
host = ironic_host_manager.IronicNodeState("fakehost", "fakenode")
self.compute_node.free_disk_gb = None
host.update(compute=self.compute_node)
self.assertEqual(0, host.free_disk_mb)
def test_consume_identical_instance_from_compute(self):
host = ironic_host_manager.IronicNodeState("fakehost", "fakenode")
host.update(compute=self.compute_node)