Router should flip to standby if all L3 nodes down
An HA router should always be active unless all of the agents hosting that router go down. In that event, the router should switch to standby. This behavior changed with review https://review.openstack.org/#/c/411784, which seemed to be accounting for a flaky message bus. This change still accounts for that, but also reverts to the original behavior of the router state only changing when its backing agent hosts are down. Change-Id: I89c3b2546382624f175f8de4de621c3e53adf527 Closes-Bug: 1682145
This commit is contained in:
parent
09e053cd91
commit
b62d1bfdf7
neutron
releasenotes/notes
@ -37,7 +37,10 @@ HA_SUBNET_NAME = 'HA subnet tenant %s'
|
||||
HA_PORT_NAME = 'HA port tenant %s'
|
||||
HA_ROUTER_STATE_ACTIVE = 'active'
|
||||
HA_ROUTER_STATE_STANDBY = 'standby'
|
||||
VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY)
|
||||
HA_ROUTER_STATE_UNKNOWN = 'unknown'
|
||||
VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY,
|
||||
HA_ROUTER_STATE_UNKNOWN)
|
||||
|
||||
PAGINATION_INFINITE = 'infinite'
|
||||
|
||||
SORT_DIRECTION_ASC = 'asc'
|
||||
|
@ -547,20 +547,36 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
|
||||
"""
|
||||
with context.session.begin(subtransactions=True):
|
||||
bindings = self.get_ha_router_port_bindings(context, [router_id])
|
||||
dead_agents = []
|
||||
active = [binding for binding in bindings
|
||||
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
|
||||
# Check dead agents only if we have more then one active agent
|
||||
if len(active) > 1:
|
||||
dead_agents = [binding.agent for binding in active
|
||||
if not (binding.agent.is_active and
|
||||
binding.agent.admin_state_up)]
|
||||
for dead_agent in dead_agents:
|
||||
self.update_routers_states(
|
||||
context,
|
||||
{router_id: n_const.HA_ROUTER_STATE_STANDBY},
|
||||
dead_agent.host)
|
||||
if dead_agents:
|
||||
router_active_agents_dead = []
|
||||
router_standby_agents_dead = []
|
||||
# List agents where router is active and agent is dead
|
||||
# and agents where router is standby and agent is dead
|
||||
for binding in bindings:
|
||||
if not (binding.agent.is_active
|
||||
and binding.agent.admin_state_up):
|
||||
if binding.state == n_const.HA_ROUTER_STATE_ACTIVE:
|
||||
router_active_agents_dead.append(binding.agent)
|
||||
elif binding.state == n_const.HA_ROUTER_STATE_STANDBY:
|
||||
router_standby_agents_dead.append(binding.agent)
|
||||
if router_active_agents_dead:
|
||||
# Just check if all l3_agents are down
|
||||
# then assuming some communication issue
|
||||
if (len(router_active_agents_dead) +
|
||||
len(router_standby_agents_dead) == len(bindings)):
|
||||
# Make router status as unknown because
|
||||
# agent communication may be issue but router
|
||||
# may still be active. We do not know the
|
||||
# exact status of router.
|
||||
state = n_const.HA_ROUTER_STATE_UNKNOWN
|
||||
else:
|
||||
# Make router status as standby on all dead agents
|
||||
# as some other agents are alive , router can become
|
||||
# active on them after some time
|
||||
state = n_const.HA_ROUTER_STATE_STANDBY
|
||||
for dead_agent in router_active_agents_dead:
|
||||
self.update_routers_states(context, {router_id: state},
|
||||
dead_agent.host)
|
||||
if router_active_agents_dead:
|
||||
return self.get_ha_router_port_bindings(context, [router_id])
|
||||
return bindings
|
||||
|
||||
|
@ -1 +1 @@
|
||||
594422d373ee
|
||||
61663558142c
|
||||
|
@ -0,0 +1,43 @@
|
||||
# Copyright 2017 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
"""Add unknown state to HA router

Revision ID: 61663558142c
Revises: 594422d373ee
Create Date: 2017-05-18 14:31:45.725516

"""

# Alembic revision identifiers; consumed by the migration framework to
# order this script in the upgrade chain.
revision = '61663558142c'
down_revision = '594422d373ee'

import sqlalchemy as sa

from neutron.common import constants
from neutron.db import migration


# Table whose ``state`` enum column gains the new value.
ha_port_bindings_table_name = "ha_router_agent_port_bindings"

# Replacement enum definition: the existing active/standby values plus the
# new ``unknown`` state used when agent health cannot be determined.
_HA_STATES = (
    constants.HA_ROUTER_STATE_ACTIVE,
    constants.HA_ROUTER_STATE_STANDBY,
    constants.HA_ROUTER_STATE_UNKNOWN,
)
new_enum = sa.Enum(*_HA_STATES, name='l3_ha_states')


def upgrade():
    """Extend the ``l3_ha_states`` enum on the HA port bindings table.

    Delegates to the shared migration helper, which rebuilds the enum in a
    backend-appropriate way while keeping ``standby`` as the server default.
    """
    migration.alter_enum_add_value(ha_port_bindings_table_name, 'state',
                                   new_enum, True, server_default='standby')
|
@ -53,6 +53,7 @@ class L3HARouterAgentPortBinding(model_base.BASEV2):
|
||||
|
||||
state = sa.Column(sa.Enum(n_const.HA_ROUTER_STATE_ACTIVE,
|
||||
n_const.HA_ROUTER_STATE_STANDBY,
|
||||
n_const.HA_ROUTER_STATE_UNKNOWN,
|
||||
name='l3_ha_states'),
|
||||
default=n_const.HA_ROUTER_STATE_STANDBY,
|
||||
server_default=n_const.HA_ROUTER_STATE_STANDBY)
|
||||
|
@ -179,8 +179,7 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.admin_ctx, router['id'])
|
||||
self.assertEqual([], bindings)
|
||||
|
||||
def _assert_ha_state_for_agent(self, router, agent,
|
||||
state=n_const.HA_ROUTER_STATE_STANDBY):
|
||||
def _assert_ha_state_for_agent(self, router, agent, state):
|
||||
bindings = (
|
||||
self.plugin.get_l3_bindings_hosting_router_with_ha_states(
|
||||
self.admin_ctx, router['id']))
|
||||
@ -198,7 +197,8 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.agent2['host'])
|
||||
with mock.patch.object(agent_utils, 'is_agent_down',
|
||||
return_value=True):
|
||||
self._assert_ha_state_for_agent(router, self.agent1)
|
||||
self._assert_ha_state_for_agent(router, self.agent1,
|
||||
n_const.HA_ROUTER_STATE_UNKNOWN)
|
||||
|
||||
def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
|
||||
self):
|
||||
@ -209,8 +209,22 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.plugin.update_routers_states(
|
||||
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
|
||||
self.agent2['host'])
|
||||
helpers.set_agent_admin_state(self.agent1['id'])
|
||||
self._assert_ha_state_for_agent(router, self.agent1)
|
||||
helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=False)
|
||||
self._assert_ha_state_for_agent(router, self.agent1,
|
||||
n_const.HA_ROUTER_STATE_STANDBY)
|
||||
|
||||
def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_true(
|
||||
self):
|
||||
router = self._create_router()
|
||||
self.plugin.update_routers_states(
|
||||
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
|
||||
self.agent1['host'])
|
||||
self.plugin.update_routers_states(
|
||||
self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
|
||||
self.agent2['host'])
|
||||
helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=True)
|
||||
self._assert_ha_state_for_agent(router, self.agent1,
|
||||
n_const.HA_ROUTER_STATE_ACTIVE)
|
||||
|
||||
def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
|
||||
router = self._create_router()
|
||||
@ -222,8 +236,10 @@ class L3HATestCase(L3HATestFramework):
|
||||
self.agent2['host'])
|
||||
with mock.patch.object(agent_utils, 'is_agent_down',
|
||||
return_value=True):
|
||||
# With above mock all agents are in dead state
|
||||
# hence router state is Unknown overall.
|
||||
self._assert_ha_state_for_agent(
|
||||
router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
|
||||
router, self.agent1, n_const.HA_ROUTER_STATE_UNKNOWN)
|
||||
|
||||
def test_ha_router_create(self):
|
||||
router = self._create_router()
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
features:
  - Added new ``unknown`` state for HA routers. Sometimes l3 agents may not be
    able to update their health status to the Neutron server due to
    communication issues. During that time the server may not know whether the
    HA routers hosted by that agent are active or standby.
fixes:
  - Fixes bug `1682145 <https://launchpad.net/bugs/1682145>`_.
|
Loading…
Reference in New Issue
Block a user