Router should flip to standby if all L3 nodes down

A HA router should always be active unless all of the agents hosting
that router go down.  In that event, the router should switch to
standby.  This behavior changed with review:
  https://review.openstack.org/#/c/411784

That review seemed to be accounting for a flakey message bus.  This
change should account for that, but also revert to the original behavior
of the router state only changing when its backing agent hosts are down.

Change-Id: I89c3b2546382624f175f8de4de621c3e53adf527
Closes-Bug: 1682145
This commit is contained in:
Drew Thorstensen 2017-04-21 08:02:17 -04:00 committed by Brian Haley
parent 09e053cd91
commit b62d1bfdf7
7 changed files with 109 additions and 22 deletions
neutron
common
db
l3_hamode_db.py
migration/alembic_migrations/versions
models
tests/unit/db
releasenotes/notes

View File

@ -37,7 +37,10 @@ HA_SUBNET_NAME = 'HA subnet tenant %s'
HA_PORT_NAME = 'HA port tenant %s'
HA_ROUTER_STATE_ACTIVE = 'active'
HA_ROUTER_STATE_STANDBY = 'standby'
VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY)
HA_ROUTER_STATE_UNKNOWN = 'unknown'
VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY,
HA_ROUTER_STATE_UNKNOWN)
PAGINATION_INFINITE = 'infinite'
SORT_DIRECTION_ASC = 'asc'

View File

@ -547,20 +547,36 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
"""
with context.session.begin(subtransactions=True):
bindings = self.get_ha_router_port_bindings(context, [router_id])
dead_agents = []
active = [binding for binding in bindings
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
# Check dead agents only if we have more then one active agent
if len(active) > 1:
dead_agents = [binding.agent for binding in active
if not (binding.agent.is_active and
binding.agent.admin_state_up)]
for dead_agent in dead_agents:
self.update_routers_states(
context,
{router_id: n_const.HA_ROUTER_STATE_STANDBY},
dead_agent.host)
if dead_agents:
router_active_agents_dead = []
router_standby_agents_dead = []
# List agents where router is active and agent is dead
# and agents where router is standby and agent is dead
for binding in bindings:
if not (binding.agent.is_active
and binding.agent.admin_state_up):
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE:
router_active_agents_dead.append(binding.agent)
elif binding.state == n_const.HA_ROUTER_STATE_STANDBY:
router_standby_agents_dead.append(binding.agent)
if router_active_agents_dead:
# Just check if all l3_agents are down
# then assuming some communication issue
if (len(router_active_agents_dead) +
len(router_standby_agents_dead) == len(bindings)):
# Make router status as unknown because
# agent communication may be issue but router
# may still be active. We do not know the
# exact status of router.
state = n_const.HA_ROUTER_STATE_UNKNOWN
else:
# Make router status as standby on all dead agents
# as some other agents are alive , router can become
# active on them after some time
state = n_const.HA_ROUTER_STATE_STANDBY
for dead_agent in router_active_agents_dead:
self.update_routers_states(context, {router_id: state},
dead_agent.host)
if router_active_agents_dead:
return self.get_ha_router_port_bindings(context, [router_id])
return bindings

View File

@ -1 +1 @@
594422d373ee
61663558142c

View File

@ -0,0 +1,43 @@
# Copyright 2017 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
"""Add unknown state to HA router
Revision ID: 61663558142c
Revises: 594422d373ee
Create Date: 2017-05-18 14:31:45.725516
"""
revision = '61663558142c'
down_revision = '594422d373ee'
import sqlalchemy as sa
from neutron.common import constants
from neutron.db import migration
ha_port_bindings_table_name = "ha_router_agent_port_bindings"
new_enum = sa.Enum(
constants.HA_ROUTER_STATE_ACTIVE,
constants.HA_ROUTER_STATE_STANDBY,
constants.HA_ROUTER_STATE_UNKNOWN,
name='l3_ha_states'
)
def upgrade():
migration.alter_enum_add_value(ha_port_bindings_table_name, 'state',
new_enum, True, server_default='standby')

View File

@ -53,6 +53,7 @@ class L3HARouterAgentPortBinding(model_base.BASEV2):
state = sa.Column(sa.Enum(n_const.HA_ROUTER_STATE_ACTIVE,
n_const.HA_ROUTER_STATE_STANDBY,
n_const.HA_ROUTER_STATE_UNKNOWN,
name='l3_ha_states'),
default=n_const.HA_ROUTER_STATE_STANDBY,
server_default=n_const.HA_ROUTER_STATE_STANDBY)

View File

@ -179,8 +179,7 @@ class L3HATestCase(L3HATestFramework):
self.admin_ctx, router['id'])
self.assertEqual([], bindings)
def _assert_ha_state_for_agent(self, router, agent,
state=n_const.HA_ROUTER_STATE_STANDBY):
def _assert_ha_state_for_agent(self, router, agent, state):
bindings = (
self.plugin.get_l3_bindings_hosting_router_with_ha_states(
self.admin_ctx, router['id']))
@ -198,7 +197,8 @@ class L3HATestCase(L3HATestFramework):
self.agent2['host'])
with mock.patch.object(agent_utils, 'is_agent_down',
return_value=True):
self._assert_ha_state_for_agent(router, self.agent1)
self._assert_ha_state_for_agent(router, self.agent1,
n_const.HA_ROUTER_STATE_UNKNOWN)
def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
self):
@ -209,8 +209,22 @@ class L3HATestCase(L3HATestFramework):
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent2['host'])
helpers.set_agent_admin_state(self.agent1['id'])
self._assert_ha_state_for_agent(router, self.agent1)
helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=False)
self._assert_ha_state_for_agent(router, self.agent1,
n_const.HA_ROUTER_STATE_STANDBY)
def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_true(
self):
router = self._create_router()
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent1['host'])
self.plugin.update_routers_states(
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
self.agent2['host'])
helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=True)
self._assert_ha_state_for_agent(router, self.agent1,
n_const.HA_ROUTER_STATE_ACTIVE)
def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
router = self._create_router()
@ -222,8 +236,10 @@ class L3HATestCase(L3HATestFramework):
self.agent2['host'])
with mock.patch.object(agent_utils, 'is_agent_down',
return_value=True):
# With above mock all agents are in dead state
# hence router state is Unknown overall.
self._assert_ha_state_for_agent(
router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
router, self.agent1, n_const.HA_ROUTER_STATE_UNKNOWN)
def test_ha_router_create(self):
router = self._create_router()

View File

@ -0,0 +1,8 @@
---
features:
- Added new ``unknown`` state for HA routers. Sometimes l3 agents may not be
able to update health status to Neutron server due to communication issues.
During that time the server may not know whether HA routers hosted by that
agent are active or standby.
fixes:
- Fixes bug `1682145 <https://launchpad.net/bugs/1682145>`_.