Log some stats for image pre-cache

This attempts to log some statistics about a precache operation so that,
barring something more complicated on the operator side, there is some
useful information in the logs about what is going on.

Related to blueprint image-precache-support

Change-Id: I550afc344eca30c366ba0e5342966bcbaac96bfe
This commit is contained in:
Dan Smith 2019-10-11 09:53:01 -07:00
parent 9742a64403
commit 7ecd502f6d
3 changed files with 102 additions and 8 deletions

View File

@ -14,6 +14,7 @@
"""Handles database requests from other nova services."""
import collections
import contextlib
import copy
import eventlet
@ -1667,6 +1668,26 @@ class ComputeTaskManager(base.Base):
'cells': len(hosts_by_cell)})
clock.start()
stats = collections.defaultdict(lambda: (0, 0, 0, 0))
failed_images = collections.defaultdict(int)
down_hosts = set()
def wrap_cache_images(ctxt, host, image_ids):
result = self.compute_rpcapi.cache_images(
ctxt, host=host, image_ids=image_ids)
for image_id, status in result.items():
cached, existing, error, unsupported = stats[image_id]
if status == 'error':
failed_images[image_id] += 1
error += 1
elif status == 'cached':
cached += 1
elif status == 'existing':
existing += 1
elif status == 'unsupported':
unsupported += 1
stats[image_id] = (cached, existing, error, unsupported)
for cell_uuid, hosts in hosts_by_cell.items():
cell = cells_by_uuid[cell_uuid]
with nova_context.target_cell(context, cell) as target_ctxt:
@ -1674,27 +1695,47 @@ class ComputeTaskManager(base.Base):
service = objects.Service.get_by_compute_host(target_ctxt,
host)
if not self.servicegroup_api.service_is_up(service):
down_hosts.add(host)
LOG.info(
'Skipping image pre-cache request to compute '
'%(host)r because it is not up',
{'host': host})
continue
fetch_pool.spawn_n(self.compute_rpcapi.cache_images,
target_ctxt,
host=host,
image_ids=image_ids)
fetch_pool.spawn_n(wrap_cache_images, target_ctxt, host,
image_ids)
# Wait until all those things finish
fetch_pool.waitall()
overall_stats = {'cached': 0, 'existing': 0, 'error': 0,
'unsupported': 0}
for cached, existing, error, unsupported in stats.values():
overall_stats['cached'] += cached
overall_stats['existing'] += existing
overall_stats['error'] += error
overall_stats['unsupported'] += unsupported
clock.stop()
# FIXME(danms): Calculate some interesting statistics about the image
# download process to log for the admin.
LOG.info('Image pre-cache operation for image(s) %(image_ids)s '
'completed in %(time).2f seconds',
'completed in %(time).2f seconds; '
'%(cached)i cached, %(existing)i existing, %(error)i errors, '
'%(unsupported)i unsupported, %(skipped)i skipped (down) '
'hosts',
{'image_ids': ','.join(image_ids),
'time': clock.elapsed()})
'time': clock.elapsed(),
'cached': overall_stats['cached'],
'existing': overall_stats['existing'],
'error': overall_stats['error'],
'unsupported': overall_stats['unsupported'],
'skipped': len(down_hosts),
})
# Log error'd images specifically at warning level
for image_id, fails in failed_images.items():
LOG.warning('Image pre-cache operation for image %(image)s '
'failed %(fails)i times',
{'image': image_id,
'fails': fails})
compute_utils.notify_about_aggregate_action(
context, aggregate,

View File

@ -74,3 +74,12 @@ class ImageCacheTest(test.TestCase):
'aggregate.cache_images.start')
fake_notifier.wait_for_versioned_notifications(
'aggregate.cache_images.end')
logtext = self.stdlog.logger.output
self.assertIn(
'3 cached, 0 existing, 0 errors, 0 unsupported, 1 skipped',
logtext)
self.assertNotIn(
'Image pre-cache operation for image an-image failed',
logtext)

View File

@ -3661,3 +3661,47 @@ class ConductorTaskAPITestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
mock_cache.assert_not_called()
_test()
@mock.patch('nova.objects.HostMapping.get_by_host')
@mock.patch('nova.context.target_cell')
@mock.patch('nova.objects.Service.get_by_compute_host')
def test_cache_images_failed_compute(self, mock_service, mock_target,
mock_gbh):
"""Test the edge cases for cache_images(), specifically the
error, skip, and down situations.
"""
fake_service = objects.Service(disabled=False, forced_down=False,
last_seen_up=timeutils.utcnow())
fake_down_service = objects.Service(disabled=False, forced_down=True,
last_seen_up=None)
mock_service.side_effect = [fake_service, fake_service,
fake_down_service]
mock_target.__return_value.__enter__.return_value = self.context
fake_cell = objects.CellMapping(uuid=uuids.cell,
database_connection='',
transport_url='')
fake_mapping = objects.HostMapping(cell_mapping=fake_cell)
mock_gbh.return_value = fake_mapping
fake_agg = objects.Aggregate(name='agg', uuid=uuids.agg,
hosts=['host1', 'host2', 'host3'])
@mock.patch.object(self.conductor_manager.compute_rpcapi,
'cache_images')
def _test(mock_cache):
mock_cache.side_effect = [
{'image1': 'unsupported'},
{'image1': 'error'},
]
self.conductor_manager.cache_images(self.context,
fake_agg,
['image1'])
_test()
logtext = self.stdlog.logger.output
self.assertIn(
'0 cached, 0 existing, 1 errors, 1 unsupported, 1 skipped',
logtext)
self.assertIn('host3\' because it is not up', logtext)
self.assertIn('image1 failed 1 times', logtext)