Merge "Read keepalived initial state in parallel to interface monitoring"

This commit is contained in:
Zuul 2021-04-02 15:25:03 +00:00 committed by Gerrit Code Review
commit 1ad9ca56b0
2 changed files with 60 additions and 10 deletions

View File

@ -27,11 +27,13 @@ from neutron.agent.linux import daemon
from neutron.agent.linux import ip_lib
from neutron.agent.linux import utils as agent_utils
from neutron.common import config
from neutron.common import utils as common_utils
from neutron.conf.agent.l3 import keepalived
from neutron import privileged
LOG = logging.getLogger(__name__)
INITIAL_STATE_READ_TIMEOUT = 10
class KeepalivedUnixDomainConnection(agent_utils.UnixDomainHTTPConnection):
@ -56,10 +58,24 @@ class MonitorDaemon(daemon.Daemon):
self.event_stop = threading.Event()
self.event_started = threading.Event()
self.queue = queue.Queue()
self._initial_state = None
super(MonitorDaemon, self).__init__(pidfile, uuid=router_id,
user=user, group=group)
@property
def initial_state(self):
return self._initial_state
@initial_state.setter
def initial_state(self, state):
if not self._initial_state:
LOG.debug('Initial status of router %s is %s', self.router_id,
state)
self._initial_state = state
def run(self):
self._thread_initial_state = threading.Thread(
target=self.handle_initial_state)
self._thread_ip_monitor = threading.Thread(
target=ip_lib.ip_monitor,
args=(self.namespace, self.queue, self.event_stop,
@ -67,9 +83,19 @@ class MonitorDaemon(daemon.Daemon):
self._thread_read_queue = threading.Thread(
target=self.read_queue,
args=(self.queue, self.event_stop, self.event_started))
self._thread_initial_state.start()
self._thread_ip_monitor.start()
self._thread_read_queue.start()
self.handle_initial_state()
# NOTE(ralonsoh): if the initial status is not read in a defined
# timeout, "backup" state is set.
self._thread_initial_state.join(timeout=INITIAL_STATE_READ_TIMEOUT)
if not self.initial_state:
LOG.warning('Timeout reading the initial status of router %s, '
'state is set to "backup".', self.router_id)
self.write_state_change('backup')
self.notify_agent('backup')
self._thread_read_queue.join()
def read_queue(self, _queue, event_stop, event_started):
@ -93,21 +119,26 @@ class MonitorDaemon(daemon.Daemon):
def handle_initial_state(self):
try:
state = 'backup'
ip = ip_lib.IPDevice(self.interface, self.namespace)
for address in ip.addr.list():
if address.get('cidr') == self.cidr:
cidr = common_utils.ip_to_cidr(self.cidr)
# NOTE(ralonsoh): "get_devices_with_ip" without passing an IP
# address performs one single pyroute2 command. Because the number
# of interfaces in the namespace is reduced, this is faster.
for address in ip_lib.get_devices_with_ip(self.namespace):
if (address['name'] == self.interface and
address['cidr'] == cidr):
state = 'primary'
break
LOG.debug('Initial status of router %s is %s',
self.router_id, state)
if not self.initial_state:
self.write_state_change(state)
self.notify_agent(state)
except Exception:
if not self.initial_state:
LOG.exception('Failed to get initial status of router %s',
self.router_id)
def write_state_change(self, state):
self.initial_state = state
with open(os.path.join(
self.conf_dir, 'state'), 'w') as state_file:
state_file.write(state)

View File

@ -127,3 +127,22 @@ class TestMonitorDaemon(base.BaseLoggingTestCase):
self._run_monitor()
msg = 'Initial status of router %s is %s' % (self.router_id, 'primary')
self._search_in_file(self.log_file, msg)
def test_handle_initial_state_backup_error_reading_initial_status(self):
# By passing this wrong IP address, the thread "_thread_initial_state"
# will fail generating an exception (caught inside the called method).
# The main thread will timeout waiting for an initial state and
# "backup" will be set.
self.router.port.addr.add(self.cidr)
self._generate_cmd_opts(cidr='failed_IP_address')
self.ext_process = external_process.ProcessManager(
conf=None, uuid=self.router_id, namespace=self.router.namespace,
service='test_ip_mon', pids_path=self.conf_dir,
default_cmd_callback=self._callback, run_as_root=True,
pid_file=self.pid_file)
self._run_monitor()
msg = ('Timeout reading the initial status of router %s' %
self.router_id)
self._search_in_file(self.log_file, msg)
msg = 'Initial status of router %s is %s' % (self.router_id, 'backup')
self._search_in_file(self.log_file, msg)