Validate network downtime during live migration

This patch adds the ability to measure network downtime during the
live migration process.
A fixture has been added to start and stop a background pinger
process and to read its status.
The downtime measurement has a granularity of 0.2 seconds.
In order to reduce overall traffic, the ping payload size is set to
the minimal value.

Change-Id: I83c6a5d49f5d4da05deb677907e5048ecdd2242b
This commit is contained in:
Roman Safronov 2022-02-10 12:26:43 +02:00 committed by Eduardo Olivares
parent 569c7a89f5
commit 72575889c8
4 changed files with 97 additions and 0 deletions

View File

@ -0,0 +1,9 @@
---
features:
- |
Added new module net_downtime including the fixture NetDowntimeMeter that
can be used to measure how long the connectivity with an IP is lost
during certain operations like a server live migration.
The configuration option allowed_network_downtime has been added with a
default value of 5.0 seconds, which would be the maximum time that
the connectivity downtime is expected to last.

View File

@ -0,0 +1,63 @@
# Copyright 2022 OpenStack Foundation
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import signal
import subprocess
import fixtures
from oslo_log import log
LOG = log.getLogger(__name__)
class NetDowntimeMeter(fixtures.Fixture):
    """Fixture measuring connectivity loss to an IP with a background ping.

    A ``ping`` subprocess is started when the fixture is set up and is
    stopped by a registered cleanup. While it is running,
    :meth:`get_downtime` can be called to estimate for how long the
    destination did not reply.
    """

    # Seconds to wait for the pinger to exit after SIGTERM before it is
    # killed.
    _TERMINATE_TIMEOUT = 5

    def __init__(self, dest_ip, interval='0.2'):
        """Store the ping parameters; the pinger is not started yet.

        :param dest_ip: IP address that will be pinged.
        :param interval: time between two pings in seconds, passed to
            ``ping -i`` and used to convert lost packets into seconds.
        """
        self.dest_ip = dest_ip
        # Note: for intervals lower than 0.2 ping requires root privileges
        self.interval = interval
        self.ping_process = None

    def _setUp(self):
        self.start_background_pinger()

    def start_background_pinger(self):
        """Start the background ``ping`` process and register its cleanup."""
        cmd = [
            'ping', '-q', '-s1',
            '-i{}'.format(self.interval),
            self.dest_ip,
        ]
        LOG.debug("Starting background pinger to '%s' with interval %s",
                  self.dest_ip, self.interval)
        self.ping_process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.addCleanup(self.cleanup)

    def cleanup(self):
        """Stop the pinger, reap it and release its pipes.

        Safe to call more than once and when the pinger was never started.
        """
        proc, self.ping_process = self.ping_process, None
        if proc is None:
            return
        if proc.poll() is None:
            LOG.debug('Terminating background pinger with pid %s', proc.pid)
            proc.terminate()
        # communicate() waits for the child (so no zombie is left behind)
        # and closes the stdout/stderr pipes, also when the pinger had
        # already exited on its own.
        try:
            proc.communicate(timeout=self._TERMINATE_TIMEOUT)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.communicate()

    def get_downtime(self):
        """Return the downtime measured so far, in seconds.

        The pinger is asked for its current counters with SIGQUIT and the
        status line it writes to stderr is parsed. The downtime is the
        number of unanswered pings multiplied by the ping interval.

        :returns: the downtime as a float, or None when the pinger is not
            running or its output could not be parsed (a warning is
            logged in both cases).
        """
        if self.ping_process is None:
            LOG.warning('Background pinger is not running, the downtime '
                        'cannot be measured')
            return None

        self.ping_process.send_signal(signal.SIGQUIT)
        # Example of the expected output:
        # 264/274 packets, 3% loss
        # NOTE: readline() blocks until the pinger writes a line to stderr
        # or closes it.
        raw_line = self.ping_process.stderr.readline()
        output = raw_line.strip().decode('utf-8', errors='replace')

        counters = output.split()[0].split('/') if output else []
        if len(counters) == 2:
            try:
                succ, total = (int(value) for value in counters)
            except ValueError:
                # Not a "<received>/<sent>" pair, reported below.
                pass
            else:
                return (total - succ) * float(self.interval)

        LOG.warning('Unexpected output obtained from the pinger: %s',
                    output)
        return None

View File

@ -965,6 +965,12 @@ ValidationGroup = [
default='ecdsa', default='ecdsa',
help='Type of key to use for ssh connections. ' help='Type of key to use for ssh connections. '
'Valid types are rsa, ecdsa'), 'Valid types are rsa, ecdsa'),
# Declared as a float so that fractional thresholds, such as the 5.0
# default, are accepted as written.
cfg.FloatOpt('allowed_network_downtime',
             default=5.0,
             help="Allowed VM network connection downtime during live "
                  "migration, in seconds. "
                  "When the measured downtime exceeds this value, an "
                  "exception is raised."),
] ]
volume_group = cfg.OptGroup(name='volume', volume_group = cfg.OptGroup(name='volume',

View File

@ -15,7 +15,9 @@
import testtools import testtools
from oslo_log import log
from tempest.common import utils from tempest.common import utils
from tempest.common.utils import net_downtime
from tempest.common import waiters from tempest.common import waiters
from tempest import config from tempest import config
from tempest.lib import decorators from tempest.lib import decorators
@ -23,6 +25,8 @@ from tempest.scenario import manager
CONF = config.CONF CONF = config.CONF
LOG = log.getLogger(__name__)
class TestNetworkAdvancedServerOps(manager.NetworkScenarioTest): class TestNetworkAdvancedServerOps(manager.NetworkScenarioTest):
"""Check VM connectivity after some advanced instance operations executed: """Check VM connectivity after some advanced instance operations executed:
@ -252,6 +256,11 @@ class TestNetworkAdvancedServerOps(manager.NetworkScenarioTest):
block_migration = (CONF.compute_feature_enabled. block_migration = (CONF.compute_feature_enabled.
block_migration_for_live_migration) block_migration_for_live_migration)
old_host = self.get_host_for_server(server['id']) old_host = self.get_host_for_server(server['id'])
downtime_meter = net_downtime.NetDowntimeMeter(
floating_ip['floating_ip_address'])
self.useFixture(downtime_meter)
self.admin_servers_client.live_migrate_server( self.admin_servers_client.live_migrate_server(
server['id'], host=None, block_migration=block_migration, server['id'], host=None, block_migration=block_migration,
disk_over_commit=False) disk_over_commit=False)
@ -261,6 +270,16 @@ class TestNetworkAdvancedServerOps(manager.NetworkScenarioTest):
new_host = self.get_host_for_server(server['id']) new_host = self.get_host_for_server(server['id'])
self.assertNotEqual(old_host, new_host, 'Server did not migrate') self.assertNotEqual(old_host, new_host, 'Server did not migrate')
downtime = downtime_meter.get_downtime()
self.assertIsNotNone(downtime)
LOG.debug("Downtime seconds measured with downtime_meter = %r",
downtime)
allowed_downtime = CONF.validation.allowed_network_downtime
self.assertLess(
downtime, allowed_downtime,
"Downtime of {} seconds is higher than expected '{}'".format(
downtime, allowed_downtime))
self._wait_server_status_and_check_network_connectivity( self._wait_server_status_and_check_network_connectivity(
server, keypair, floating_ip) server, keypair, floating_ip)