From b62d1bfdf71c2f8810d9b143d50127b8f3a4942d Mon Sep 17 00:00:00 2001 From: Drew Thorstensen Date: Fri, 21 Apr 2017 08:02:17 -0400 Subject: [PATCH] Router should flip to standby if all L3 nodes down A HA router should always be active unless all of the agents hosting that router go down. In that event, the router should switch to standby. This behavior changed with review: https://review.openstack.org/#/c/411784 That review seemed to be accounting for a flakey message bus. This change should account for that, but also revert to the original behavior of the router state only changing when its backing agent hosts are down. Change-Id: I89c3b2546382624f175f8de4de621c3e53adf527 Closes-Bug: 1682145 --- neutron/common/constants.py | 5 ++- neutron/db/l3_hamode_db.py | 44 +++++++++++++------ .../alembic_migrations/versions/EXPAND_HEAD | 2 +- .../61663558142c_add_ha_router_state.py | 43 ++++++++++++++++++ neutron/db/models/l3ha.py | 1 + neutron/tests/unit/db/test_l3_hamode_db.py | 28 +++++++++--- ...d-new-harouter-state-5612fc5b5c2043a5.yaml | 8 ++++ 7 files changed, 109 insertions(+), 22 deletions(-) create mode 100644 neutron/db/migration/alembic_migrations/versions/rocky/expand/61663558142c_add_ha_router_state.py create mode 100644 releasenotes/notes/add-new-harouter-state-5612fc5b5c2043a5.yaml diff --git a/neutron/common/constants.py b/neutron/common/constants.py index bb5741599aa..cbce6e86ee8 100644 --- a/neutron/common/constants.py +++ b/neutron/common/constants.py @@ -37,7 +37,10 @@ HA_SUBNET_NAME = 'HA subnet tenant %s' HA_PORT_NAME = 'HA port tenant %s' HA_ROUTER_STATE_ACTIVE = 'active' HA_ROUTER_STATE_STANDBY = 'standby' -VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY) +HA_ROUTER_STATE_UNKNOWN = 'unknown' +VALID_HA_STATES = (HA_ROUTER_STATE_ACTIVE, HA_ROUTER_STATE_STANDBY, + HA_ROUTER_STATE_UNKNOWN) + PAGINATION_INFINITE = 'infinite' SORT_DIRECTION_ASC = 'asc' diff --git a/neutron/db/l3_hamode_db.py b/neutron/db/l3_hamode_db.py index d0ee7eb3c60..e9eacc074f9 100644 --- a/neutron/db/l3_hamode_db.py +++ b/neutron/db/l3_hamode_db.py @@ -547,20 +547,36 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin, """ with context.session.begin(subtransactions=True): bindings = self.get_ha_router_port_bindings(context, [router_id]) - dead_agents = [] - active = [binding for binding in bindings - if binding.state == n_const.HA_ROUTER_STATE_ACTIVE] - # Check dead agents only if we have more then one active agent - if len(active) > 1: - dead_agents = [binding.agent for binding in active - if not (binding.agent.is_active and - binding.agent.admin_state_up)] - for dead_agent in dead_agents: - self.update_routers_states( - context, - {router_id: n_const.HA_ROUTER_STATE_STANDBY}, - dead_agent.host) - if dead_agents: + router_active_agents_dead = [] + router_standby_agents_dead = [] + # List agents where router is active and agent is dead + # and agents where router is standby and agent is dead + for binding in bindings: + if not (binding.agent.is_active + and binding.agent.admin_state_up): + if binding.state == n_const.HA_ROUTER_STATE_ACTIVE: + router_active_agents_dead.append(binding.agent) + elif binding.state == n_const.HA_ROUTER_STATE_STANDBY: + router_standby_agents_dead.append(binding.agent) + if router_active_agents_dead: + # Just check if all l3_agents are down + # then assuming some communication issue + if (len(router_active_agents_dead) + + len(router_standby_agents_dead) == len(bindings)): + # Make router status as unknown because + # agent communication may be issue but router + # may still be active. We do not know the + # exact status of router. + state = n_const.HA_ROUTER_STATE_UNKNOWN + else: + # Make router status as standby on all dead agents + # as some other agents are alive , router can become + # active on them after some time + state = n_const.HA_ROUTER_STATE_STANDBY + for dead_agent in router_active_agents_dead: + self.update_routers_states(context, {router_id: state}, + dead_agent.host) + if router_active_agents_dead: return self.get_ha_router_port_bindings(context, [router_id]) return bindings diff --git a/neutron/db/migration/alembic_migrations/versions/EXPAND_HEAD b/neutron/db/migration/alembic_migrations/versions/EXPAND_HEAD index ed9a43eb00c..31d6c4434d7 100644 --- a/neutron/db/migration/alembic_migrations/versions/EXPAND_HEAD +++ b/neutron/db/migration/alembic_migrations/versions/EXPAND_HEAD @@ -1 +1 @@ -594422d373ee +61663558142c diff --git a/neutron/db/migration/alembic_migrations/versions/rocky/expand/61663558142c_add_ha_router_state.py b/neutron/db/migration/alembic_migrations/versions/rocky/expand/61663558142c_add_ha_router_state.py new file mode 100644 index 00000000000..ce87463f25c --- /dev/null +++ b/neutron/db/migration/alembic_migrations/versions/rocky/expand/61663558142c_add_ha_router_state.py @@ -0,0 +1,43 @@ +# Copyright 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +"""Add unknown state to HA router + +Revision ID: 61663558142c +Revises: 594422d373ee +Create Date: 2017-05-18 14:31:45.725516 + +""" + +revision = '61663558142c' +down_revision = '594422d373ee' + +import sqlalchemy as sa + +from neutron.common import constants +from neutron.db import migration + + +ha_port_bindings_table_name = "ha_router_agent_port_bindings" +new_enum = sa.Enum( + constants.HA_ROUTER_STATE_ACTIVE, + constants.HA_ROUTER_STATE_STANDBY, + constants.HA_ROUTER_STATE_UNKNOWN, + name='l3_ha_states' +) + + +def upgrade(): + migration.alter_enum_add_value(ha_port_bindings_table_name, 'state', + new_enum, True, server_default='standby') diff --git a/neutron/db/models/l3ha.py b/neutron/db/models/l3ha.py index 866b10a749a..ca27f810797 100644 --- a/neutron/db/models/l3ha.py +++ b/neutron/db/models/l3ha.py @@ -53,6 +53,7 @@ class L3HARouterAgentPortBinding(model_base.BASEV2): state = sa.Column(sa.Enum(n_const.HA_ROUTER_STATE_ACTIVE, n_const.HA_ROUTER_STATE_STANDBY, + n_const.HA_ROUTER_STATE_UNKNOWN, name='l3_ha_states'), default=n_const.HA_ROUTER_STATE_STANDBY, server_default=n_const.HA_ROUTER_STATE_STANDBY) diff --git a/neutron/tests/unit/db/test_l3_hamode_db.py b/neutron/tests/unit/db/test_l3_hamode_db.py index a595b4b0126..6254c22997c 100644 --- a/neutron/tests/unit/db/test_l3_hamode_db.py +++ b/neutron/tests/unit/db/test_l3_hamode_db.py @@ -179,8 +179,7 @@ class L3HATestCase(L3HATestFramework): self.admin_ctx, router['id']) self.assertEqual([], bindings) - def _assert_ha_state_for_agent(self, router, agent, - state=n_const.HA_ROUTER_STATE_STANDBY): + def _assert_ha_state_for_agent(self, router, agent, state): bindings = ( self.plugin.get_l3_bindings_hosting_router_with_ha_states( self.admin_ctx, router['id'])) @@ -198,7 +197,8 @@ class L3HATestCase(L3HATestFramework): self.agent2['host']) with mock.patch.object(agent_utils, 'is_agent_down', return_value=True): - self._assert_ha_state_for_agent(router, self.agent1) + self._assert_ha_state_for_agent(router, self.agent1, + n_const.HA_ROUTER_STATE_UNKNOWN) def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false( self): @@ -209,8 +209,22 @@ class L3HATestCase(L3HATestFramework): self.plugin.update_routers_states( self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE}, self.agent2['host']) - helpers.set_agent_admin_state(self.agent1['id']) - self._assert_ha_state_for_agent(router, self.agent1) + helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=False) + self._assert_ha_state_for_agent(router, self.agent1, + n_const.HA_ROUTER_STATE_STANDBY) + + def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_true( + self): + router = self._create_router() + self.plugin.update_routers_states( + self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE}, + self.agent1['host']) + self.plugin.update_routers_states( + self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE}, + self.agent2['host']) + helpers.set_agent_admin_state(self.agent1['id'], admin_state_up=True) + self._assert_ha_state_for_agent(router, self.agent1, + n_const.HA_ROUTER_STATE_ACTIVE) def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self): router = self._create_router() @@ -222,8 +236,10 @@ class L3HATestCase(L3HATestFramework): self.agent2['host']) with mock.patch.object(agent_utils, 'is_agent_down', return_value=True): + # With above mock all agents are in dead state + # hence router state is Unknown overall. self._assert_ha_state_for_agent( - router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE) + router, self.agent1, n_const.HA_ROUTER_STATE_UNKNOWN) def test_ha_router_create(self): router = self._create_router() diff --git a/releasenotes/notes/add-new-harouter-state-5612fc5b5c2043a5.yaml b/releasenotes/notes/add-new-harouter-state-5612fc5b5c2043a5.yaml new file mode 100644 index 00000000000..9452f22fd40 --- /dev/null +++ b/releasenotes/notes/add-new-harouter-state-5612fc5b5c2043a5.yaml @@ -0,0 +1,8 @@ +--- +features: + - Added new ``unknown`` state for HA routers. Sometimes l3 agents may not be + able to update health status to Neutron server due to communication issues. + During that time the server may not know whether HA routers hosted by that + agent are active or standby. +fixes: + - Fixes bug `1682145 `_.