Merge "Use neutron-netns-cleanup utility"

Jenkins 2015-08-14 11:39:12 +00:00 committed by Gerrit Code Review
commit 6a36740388
6 changed files with 47 additions and 849 deletions


@@ -1,5 +1,4 @@
files/fuel-ha-utils/ocf/* /usr/lib/ocf/resource.d/fuel
files/fuel-ha-utils/tools/q-agent-cleanup.py /usr/bin
files/fuel-ha-utils/tools/wsrepclustercheckrc /etc
files/fuel-ha-utils/tools/galeracheck /usr/bin
files/fuel-ha-utils/tools/swiftcheck /usr/bin


@@ -6,14 +6,6 @@ class cluster::neutron () {
File<| title == 'ocf-mirantis-path' |> ->
Package['neutron'] ->
# file {'q-agent-cleanup.py':
# path => '/usr/bin/q-agent-cleanup.py',
# mode => '0755',
# owner => root,
# group => root,
# source => "puppet:///modules/cluster/q-agent-cleanup.py",
#} ->
file {'/var/cache/neutron':
ensure => directory,
path => '/var/cache/neutron',


@@ -351,60 +351,6 @@ check_local_reports() {
return $OCF_SUCCESS
}
get_ns_list() {
local rv=`ip netns list | grep -Ee "^qdhcp-.*"`
echo $rv
}
get_pid_list_for_ns_list() {
# Parameters are the namespace names to search for pids
local ns_list="$@"
local pids=`for netns in $ns_list ; do ip netns pids $netns ; done`
echo $pids
}
clean_up() {
# kill processes inside network namespaces
ns_list=`get_ns_list`
# kill all processes in the DHCP agent's network namespaces that are still using the network
count=3 # try to kill the processes up to 3 times
while [ $count -gt 0 ]; do
# we can't use ps, because ps can't select processes for a given network namespace
inside_ns_pids=`get_pid_list_for_ns_list "$ns_list"`
if [ -z "$inside_ns_pids" ] ; then
break
fi
ocf_run kill $inside_ns_pids
sleep 1
((count--))
done
# kill all remaining processes that survived the plain kill
inside_ns_pids=`get_pid_list_for_ns_list "$ns_list"`
if [ ! -z "$inside_ns_pids" ] ; then
for ns_pid in $inside_ns_pids ; do
ocf_run kill -9 $ns_pid
done
fi
# cleanup network interfaces
q-agent-cleanup.py --agent=dhcp --cleanup-ports
}
clean_up_namespaces() {
# remove unneeded network namespaces.
#
# Be careful: no process should still be using the network in any of
# these namespaces!!! Run clean_up first.
ns_list=`get_ns_list`
if [ ! -z "$ns_list" ] ; then
for ns_name in $ns_list ; do
ocf_run ip --force netns del $ns_name
done
fi
}
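
For reference, the removed helpers above boil down to a kill-then-delete routine. A minimal Python sketch of the same cleanup, as an illustration only (a hypothetical standalone script, not part of this change; it assumes the ip utility is on PATH):

import subprocess
import time

def ns_list(prefix='qdhcp-'):
    # equivalent of get_ns_list: namespaces created by the DHCP agent
    out = subprocess.check_output(['ip', 'netns', 'list'])
    return [line.split()[0] for line in out.decode().splitlines()
            if line.startswith(prefix)]

def ns_pids(namespaces):
    # equivalent of get_pid_list_for_ns_list
    pids = []
    for ns in namespaces:
        out = subprocess.check_output(['ip', 'netns', 'pids', ns])
        pids.extend(int(p) for p in out.decode().split())
    return pids

def clean_up(namespaces, attempts=3):
    # SIGTERM everything inside the namespaces a few times, then SIGKILL survivors
    for _ in range(attempts):
        pids = ns_pids(namespaces)
        if not pids:
            break
        for pid in pids:
            subprocess.call(['kill', str(pid)])
        time.sleep(1)
    for pid in ns_pids(namespaces):
        subprocess.call(['kill', '-9', str(pid)])

def clean_up_namespaces(namespaces):
    # safe only after clean_up: the namespaces must hold no processes
    for ns in namespaces:
        subprocess.call(['ip', '--force', 'netns', 'del', ns])
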
neutron_dhcp_agent_monitor() {
local rc
local pid
@@ -445,9 +391,7 @@ neutron_dhcp_agent_start() {
fi
if ocf_is_true "$remove_artifacts_on_stop_start"; then
clean_up
sleep 1
clean_up_namespaces
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
fi
rm -f $OCF_RESKEY_state_reports_file
@@ -487,79 +431,53 @@ neutron_dhcp_agent_stop() {
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
if ocf_is_true "$remove_artifacts_on_stop_start"; then
clean_up
sleep 1
clean_up_namespaces
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
fi
ocf_log info "OpenStack DHCP Agent (${OCF_RESKEY_binary}) already stopped"
return $OCF_SUCCESS
fi
#Try SIGTERM
# Terminate agent daemon
pid=`get_worker_pid`
# stop waiting
shutdown_timeout=15
iteration_time=3
iteration_time=1
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-6))
fi
if ocf_is_true "$remove_artifacts_on_stop_start"; then
all_inside_ns_pids=`get_pid_list_for_ns_list $(get_ns_list)`
all_pids="$pid $all_inside_ns_pids"
else
all_pids="$pid"
fi
count=0
alive=1
while [ $alive -gt 0 ] && [ $count -lt $shutdown_timeout ]; do
alive=0
ocf_run kill -s TERM $all_pids
clock=0
# Try to terminate gracefully
while [ -d /proc/${pid}/ ] && [ $clock -lt $shutdown_timeout ]; do
ocf_log debug "Stopping DHCP Agent (${OCF_RESKEY_binary}) gracefully with SIGTERM"
ocf_run kill -s TERM ${pid}
sleep $iteration_time
#Check if the processes are still alive after the kill;
#if so, send SIGTERM to them again
np=""
for pid in $all_pids ; do
ocf_run kill -s 0 $pid
if [ $? -eq 0 ]; then
np="$np $pid"
((alive++))
fi
done
if [ $alive -gt 0 ] ; then
all_pids=$np
fi
((count+=$iteration_time))
ocf_log debug "OpenStack DHCP Agent (${OCF_RESKEY_binary}) still hasn't stopped yet. Waiting ..."
((clock+=$iteration_time))
done
#Send the kill signal to processes which are still alive
if [ $alive -gt 0 ] ; then
alive=0
ocf_run kill -s KILL $all_pids
# Send kill signal if process is still up
if [ -d /proc/${pid}/ ] ; then
ocf_log debug "Killing DHCP Agent (${OCF_RESKEY_binary}) with SIGKILL"
ocf_run kill -s KILL ${pid}
sleep 1
for pid in $all_pids ; do
ocf_run kill -s 0 $pid
if [ $? -eq 0 ]; then
((alive++))
fi
done
if [ $alive -gt 0 ] ; then
if [ -d /proc/${pid}/ ] ; then
ocf_log err "OpenStack DHCP Agent (${OCF_RESKEY_binary}) stop failed"
return $OCF_ERR_GENERIC
fi
fi
ocf_log info "OpenStack DHCP Agent (${OCF_RESKEY_binary}) stopped"
ocf_log debug "Delete pid file: ${OCF_RESKEY_pid} with content $(cat ${OCF_RESKEY_pid})"
rm -f $OCF_RESKEY_pid
if ocf_is_true "$remove_artifacts_on_stop_start"; then
# cleanup network interfaces
q-agent-cleanup.py --agent=dhcp --cleanup-ports
clean_up_namespaces
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
fi
sleep 3
return $OCF_SUCCESS
}
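
The rewritten stop path above is a plain TERM-then-KILL loop keyed on /proc/<pid>. A standalone Python sketch of that pattern, for illustration (the timeout defaults mirror the script's; a real agent would also guard against the pid exiting between checks):

import os
import signal
import time

def is_alive(pid):
    # mirrors the shell's [ -d /proc/${pid}/ ] test
    return os.path.exists('/proc/%d' % pid)

def stop_gracefully(pid, shutdown_timeout=15, iteration_time=1):
    clock = 0
    # repeat SIGTERM until the process exits or the timeout is spent
    while is_alive(pid) and clock < shutdown_timeout:
        os.kill(pid, signal.SIGTERM)
        time.sleep(iteration_time)
        clock += iteration_time
    # escalate to SIGKILL if the process is still up
    if is_alive(pid):
        os.kill(pid, signal.SIGKILL)
        time.sleep(1)
        if is_alive(pid):
            return False  # stop failed
    return True
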


@@ -362,62 +362,6 @@ check_local_reports() {
return $OCF_SUCCESS
}
get_ns_list() {
local rv=`ip netns list | grep -Ee "^qrouter-.*"`
echo $rv
}
get_pid_list_for_ns_list() {
# Parameters are the namespace names to search for pids
local ns_list="$@"
local pids=`for netns in $ns_list ; do ip netns pids $netns ; done`
echo $pids
}
clean_up() {
# kill processes inside network namespaces
ns_list=`get_ns_list`
# kill all processes in the L3 agent's network namespaces that are still using the network
count=3 # try to kill the processes up to 3 times
while [ $count -gt 0 ]; do
# we can't use ps, because ps can't select processes for a given network namespace
inside_ns_pids=`get_pid_list_for_ns_list "$ns_list"`
if [ -z "$inside_ns_pids" ] ; then
break
fi
for ns_pid in $inside_ns_pids ; do
ocf_run kill $ns_pid
done
sleep 1
count=$(($count - 1))
done
# kill all remaining processes that survived the plain kill
inside_ns_pids=`get_pid_list_for_ns_list "$ns_list"`
if [ ! -z "$inside_ns_pids" ] ; then
for ns_pid in $inside_ns_pids ; do
ocf_run kill -9 $ns_pid
done
fi
# cleanup network interfaces
q-agent-cleanup.py --agent=l3 --cleanup-ports
}
clean_up_namespaces() {
# remove unneeded network namespaces.
#
# Be careful: no process should still be using the network in any of
# these namespaces!!! Run clean_up first.
ns_list=`get_ns_list`
if [ ! -z "$ns_list" ] ; then
for ns_name in $ns_list ; do
ocf_run ip --force netns del $ns_name
done
fi
}
neutron_l3_agent_monitor() {
neutron_l3_agent_status
rc=$?
@@ -442,9 +386,7 @@ neutron_l3_agent_start() {
return $OCF_SUCCESS
fi
clean_up
sleep 1
clean_up_namespaces
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
rm -f $OCF_RESKEY_state_reports_file
# run and detach to background agent as daemon.
@@ -478,69 +420,49 @@ neutron_l3_agent_stop() {
neutron_l3_agent_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
clean_up
sleep 1
clean_up_namespaces
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
ocf_log info "OpenStack L3 agent ($OCF_RESKEY_binary) already stopped"
return $OCF_SUCCESS
fi
# Try SIGTERM
# Terminate agent daemon
pid=`get_worker_pid`
# stop waiting
shutdown_timeout=15
iteration_time=3
iteration_time=1
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-6))
fi
all_inside_ns_pids=`get_pid_list_for_ns_list $(get_ns_list)`
all_pids="$pid $all_inside_ns_pids"
count=0
alive=1
while [ $alive -gt 0 ] && [ $count -lt $shutdown_timeout ]; do
alive=0
ocf_run kill -s TERM $all_pids
clock=0
# Try to terminate gracefully
while [ -d /proc/${pid}/ ] && [ $clock -lt $shutdown_timeout ]; do
ocf_log debug "Stopping L3 agent (${OCF_RESKEY_binary}) gracefully with SIGTERM"
ocf_run kill -s TERM ${pid}
sleep $iteration_time
#Check if the processes are still alive after the kill;
#if so, send SIGTERM to them again
np=""
for pid in $all_pids ; do
ocf_run kill -s 0 $pid
if [ $? -eq 0 ]; then
np="$np $pid"
((alive++))
fi
done
if [ $alive -gt 0 ] ; then
all_pids=$np
fi
((count+=$iteration_time))
ocf_log debug "OpenStack L3 agent ($OCF_RESKEY_binary) still hasn't stopped yet. Waiting ..."
((clock+=$iteration_time))
done
#Send the kill signal to processes which are still alive
if [ $alive -gt 0 ] ; then
alive=0
ocf_run kill -s KILL $all_pids
# Send kill signal if process is still up
if [ -d /proc/${pid}/ ] ; then
ocf_log debug "Killing L3 agent (${OCF_RESKEY_binary}) with SIGKILL"
ocf_run kill -s KILL ${pid}
sleep 1
for pid in $all_pids ; do
ocf_run kill -s 0 $pid
if [ $? -eq 0 ]; then
((alive++))
fi
done
if [ $alive -gt 0 ] ; then
if [ -d /proc/${pid}/ ] ; then
ocf_log err "OpenStack L3 agent (${OCF_RESKEY_binary}) stop failed"
return $OCF_ERR_GENERIC
fi
fi
ocf_log info "OpenStack L3 agent ($OCF_RESKEY_binary) stopped"
ocf_log debug "Delete pid file: ${OCF_RESKEY_pid} with content $(cat ${OCF_RESKEY_pid})"
rm -f $OCF_RESKEY_pid
clean_up
sleep 1
clean_up_namespaces
sleep 3
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
return $OCF_SUCCESS
}


@@ -1,631 +0,0 @@
#!/usr/bin/env python
# Copyright 2013 - 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import argparse
from ConfigParser import SafeConfigParser
import functools
import json
import logging
import logging.config
import logging.handlers
import re
import socket
import StringIO
import subprocess
import sys
from time import sleep
from neutronclient.neutron import client as n_client
LOG_NAME = 'q-agent-cleanup'
API_VER = '2.0'
PORT_ID_PART_LEN = 11
def make_logger(handler=logging.StreamHandler(sys.stdout), level=logging.INFO):
format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(format)
logger = logging.getLogger(LOG_NAME)
logger.addHandler(handler)
logger.setLevel(level)
return logger
LOG = make_logger()
AUTH_KEYS = {
'tenant_name': 'admin_tenant_name',
'username': 'admin_user',
'password': 'admin_password',
'auth_url': 'auth_uri',
}
def get_auth_data(cfg_file, section='keystone_authtoken', keys=AUTH_KEYS):
cfg = SafeConfigParser()
with open(cfg_file) as f:
cfg.readfp(f)
auth_data = {}
for key, value in keys.iteritems():
auth_data[key] = cfg.get(section, value)
return auth_data
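
get_auth_data above simply maps [keystone_authtoken] options onto neutronclient keyword arguments. A usage sketch with a made-up config file (Python 2, matching the script; all values are hypothetical):

cfg_text = """
[keystone_authtoken]
admin_tenant_name = services
admin_user = neutron
admin_password = secret
auth_uri = http://192.168.0.2:5000/v2.0
"""

with open('/tmp/neutron-example.conf', 'w') as f:
    f.write(cfg_text)

print(get_auth_data('/tmp/neutron-example.conf'))
# -> {'tenant_name': 'services', 'username': 'neutron',
#     'password': 'secret', 'auth_url': 'http://192.168.0.2:5000/v2.0'}
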
# Note(xarses): be careful not to inject \n's into the regex pattern
# or it will cause the matching to fail
RECOVERABLE = re.compile((
'(HTTP\s+400\))|'
'(400-\{\'message\'\:\s+\'\'\})|'
'(\[Errno 111\]\s+Connection\s+refused)|'
'(503\s+Service\s+Unavailable)|'
'(504\s+Gateway\s+Time-out)|'
'(\:\s+Maximum\s+attempts\s+reached)|'
'(Unauthorized\:\s+bad\s+credentials)|'
'(Max\s+retries\s+exceeded)|'
"""('*NoneType'*\s+object\s+ha'\s+no\s+attribute\s+'*__getitem__'*$)|"""
'(No\s+route\s+to\s+host$)|'
'(Lost\s+connection\s+to\s+MySQL\s+server)'), flags=re.M)
RETRY_COUNT = 50
RETRY_DELAY = 2
def retry(func, pattern=RECOVERABLE):
@functools.wraps(func)
def wrapper(*args, **kwargs):
i = 0
while True:
try:
return func(*args, **kwargs)
except Exception as e:
if pattern and not pattern.match(e.message):
raise e
i += 1
if i >= RETRY_COUNT:
raise e
LOG.debug("retry request {0}: {1}".format(i, e))
sleep(RETRY_DELAY)
return wrapper
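
A usage sketch of the retry wrapper: any exception whose message matches RECOVERABLE is retried, up to RETRY_COUNT attempts with RETRY_DELAY seconds between them. The flaky function below is hypothetical:

calls = {'n': 0}

@retry
def flaky_list_agents():
    calls['n'] += 1
    if calls['n'] < 3:
        # this message matches the RECOVERABLE pattern, so it is retried
        raise Exception('503 Service Unavailable')
    return ['agent-1', 'agent-2']

print(flaky_list_agents())  # succeeds on the third attempt
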
class NeutronCleaner(object):
PORT_NAME_PREFIXES_BY_DEV_OWNER = {
'network:dhcp': 'tap',
'network:router_gateway': 'qg-',
'network:router_interface': 'qr-',
}
PORT_NAME_PREFIXES = {
# contains tuples of prefixes
'dhcp': (PORT_NAME_PREFIXES_BY_DEV_OWNER['network:dhcp'],),
'l3': (
PORT_NAME_PREFIXES_BY_DEV_OWNER['network:router_gateway'],
PORT_NAME_PREFIXES_BY_DEV_OWNER['network:router_interface']
)
}
BRIDGES_FOR_PORTS_BY_AGENT = {
'dhcp': ('br-int',),
'l3': ('br-int', 'br-ex'),
}
PORT_OWNER_PREFIXES = {
'dhcp': ('network:dhcp',),
'l3': ('network:router_gateway', 'network:router_interface')
}
NS_NAME_PREFIXES = {
'dhcp': 'qdhcp',
'l3': 'qrouter',
}
AGENT_BINARY_NAME = {
'dhcp': 'neutron-dhcp-agent',
'l3': 'neutron-l3-agent',
'ovs': 'neutron-openvswitch-agent'
}
CMD__list_ovs_port = ['ovs-vsctl', 'list-ports']
CMD__remove_ovs_port = ['ovs-vsctl', '--', '--if-exists', 'del-port']
CMD__remove_ip_addr = ['ip', 'address', 'delete']
CMD__ip_netns_list = ['ip', 'netns', 'list']
CMD__ip_netns_exec = ['ip', 'netns', 'exec']
# 14: tap-xxxyyyzzz:
RE__port_in_portlist = re.compile(r"^\s*\d+\:\s+([\w-]+)\:")
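
RE__port_in_portlist pulls the interface name out of ip link output inside a namespace; a quick check against a made-up sample line:

import re

RE__port_in_portlist = re.compile(r"^\s*\d+\:\s+([\w-]+)\:")

sample = "14: tap-xxxyyyzzz: <BROADCAST,MULTICAST,UP> mtu 1500"
match = RE__port_in_portlist.match(sample)
print(match.group(1))  # -> tap-xxxyyyzzz
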
def __init__(self, options, log=None):
self.log = log
self.auth_data = get_auth_data(cfg_file=options.get('authconf'))
self.options = options
self.agents = {}
self.debug = options.get('debug')
self.RESCHEDULING_CALLS = {
'dhcp': self._reschedule_agent_dhcp,
'l3': self._reschedule_agent_l3,
}
self._client = None
@property
@retry
def client(self):
if self._client is None:
self._client = n_client.Client(API_VER, **self.auth_data)
return self._client
@retry
def _get_agents(self, use_cache=True):
return self.client.list_agents()['agents']
@retry
def _get_routers(self, use_cache=True):
return self.client.list_routers()['routers']
@retry
def _get_networks(self, use_cache=True):
return self.client.list_networks()['networks']
@retry
def _list_networks_on_dhcp_agent(self, agent_id):
return self.client.list_networks_on_dhcp_agent(
agent_id)['networks']
@retry
def _list_routers_on_l3_agent(self, agent_id):
return self.client.list_routers_on_l3_agent(
agent_id)['routers']
@retry
def _list_l3_agents_on_router(self, router_id):
return self.client.list_l3_agent_hosting_routers(
router_id)['agents']
@retry
def _list_dhcp_agents_on_network(self, network_id):
return self.client.list_dhcp_agent_hosting_networks(
network_id)['agents']
def _list_orphaned_networks(self):
networks = self._get_networks()
self.log.debug(
"_list_orphaned_networks:, got list of networks {0}".format(
json.dumps(networks, indent=4)))
orphaned_networks = []
for network in networks:
if len(self._list_dhcp_agents_on_network(network['id'])) == 0:
orphaned_networks.append(network['id'])
self.log.debug(
"_list_orphaned_networks:, got list of orphaned networks {0}".
format(orphaned_networks))
return orphaned_networks
def _list_orphaned_routers(self):
routers = self._get_routers()
self.log.debug(
"_list_orphaned_routers:, got list of routers {0}".format(
json.dumps(routers, indent=4)))
orphaned_routers = []
for router in routers:
if len(self._list_l3_agents_on_router(router['id'])) == 0:
orphaned_routers.append(router['id'])
self.log.debug(
"_list_orphaned_routers:, got list of orphaned routers {0}".format(
orphaned_routers))
return orphaned_routers
@retry
def _add_network_to_dhcp_agent(self, agent_id, net_id):
return self.client.add_network_to_dhcp_agent(
agent_id, {"network_id": net_id})
@retry
def _add_router_to_l3_agent(self, agent_id, router_id):
return self.client.add_router_to_l3_agent(
agent_id, {"router_id": router_id})
@retry
def _remove_router_from_l3_agent(self, agent_id, router_id):
return self.client.remove_router_from_l3_agent(
agent_id, router_id)
@retry
def _delete_agent(self, agent_id):
return self.client.delete_agent(agent_id)
def _get_agents_by_type(self, agent, use_cache=True):
self.log.debug("_get_agents_by_type: start.")
rv = self.agents.get(agent, []) if use_cache else []
if not rv:
agents = self._get_agents(use_cache=use_cache)
for i in agents:
if i['binary'] == self.AGENT_BINARY_NAME.get(agent):
rv.append(i)
from_cache = ''
else:
from_cache = ' from local cache'
self.log.debug(
"_get_agents_by_type: end, {0} rv: {1}".format(
from_cache, json.dumps(rv, indent=4)))
return rv
def _execute(self, cmd):
process = subprocess.Popen(
cmd,
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
(stdout, stderr) = process.communicate()
ret_code = process.returncode
if ret_code != 0:
self.log.error(
"ERROR (rc={0}) while execution {1}, stderr: {2}".format(
ret_code, ' '.join(cmd), stderr))
return None
return ret_code, stdout
def __collect_namespaces_for_agent(self, agent):
cmd = self.CMD__ip_netns_list[:]
self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
ret_code, stdout = self._execute(cmd)
if ret_code != 0:
return []
# filter namespaces by given agent type
netns = []
for ns in StringIO.StringIO(stdout):
ns = ns.strip()
self.log.debug("Found network namespace '{0}'".format(ns))
if ns.startswith(self.NS_NAME_PREFIXES[agent]):
netns.append(ns)
return netns
def __collect_ports_for_namespace(self, ns):
cmd = self.CMD__ip_netns_exec[:]
cmd.extend([ns, 'ip', 'l', 'show'])
self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
ret_code, stdout = self._execute(cmd)
if ret_code != 0:
return []
ports = []
for line in StringIO.StringIO(stdout):
pp = self.RE__port_in_portlist.match(line)
if not pp:
continue
port = pp.group(1)
if port != 'lo':
self.log.debug("Found port '{0}'".format(port))
ports.append(port)
return ports
def _cleanup_ports(self, agent):
self.log.debug("_cleanup_ports: start.")
# get namespaces list
netns = self.__collect_namespaces_for_agent(agent)
# collect ports from namespace
ports = []
for ns in netns:
ports.extend(self.__collect_ports_for_namespace(ns))
# iterate over the port list and remove each port from OVS
for port in ports:
cmd = self.CMD__remove_ovs_port[:]
cmd.append(port)
if self.options.get('noop'):
self.log.info("NOOP-execution: '{0}'".format(' '.join(cmd)))
else:
self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
self._execute(cmd)
self.log.debug("_cleanup_ports: end.")
return True
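
_cleanup_ports above issues one ovs-vsctl del-port call per discovered port. The command assembly on its own (the port name is made up):

cmd = ['ovs-vsctl', '--', '--if-exists', 'del-port']  # CMD__remove_ovs_port
cmd.append('tap-xxxyyyzzz')
print(' '.join(cmd))  # -> ovs-vsctl -- --if-exists del-port tap-xxxyyyzzz
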
def _reschedule_agent_dhcp(self, agent_type):
self.log.debug("_reschedule_agent_dhcp: start.")
agents = {
'alive': [],
'dead': []
}
# collect the network list from dead DHCP agents
dead_networks = []
for agent in self._get_agents_by_type(agent_type):
if agent['alive']:
self.log.info(
"found alive DHCP agent: {0}".format(agent['id']))
agents['alive'].append(agent)
else:
# dead agent
self.log.info(
"found dead DHCP agent: {0}".format(agent['id']))
agents['dead'].append(agent)
for net in self._list_networks_on_dhcp_agent(agent['id']):
dead_networks.append(net)
if dead_networks and agents['alive']:
# get the IDs of networks already attached to the alive agent
lucky_ids = set()
map(
lambda net: lucky_ids.add(net['id']),
self._list_networks_on_dhcp_agent(agents['alive'][0]['id'])
)
# add dead networks to alive agent
for net in dead_networks:
if net['id'] not in lucky_ids:
# attach network to agent
self.log.info(
"attach network {net} to DHCP agent {agent}".format(
net=net['id'],
agent=agents['alive'][0]['id']))
if not self.options.get('noop'):
self._add_network_to_dhcp_agent(
agents['alive'][0]['id'], net['id'])
# remove dead agents if requested (and an alive agent was found)
if self.options.get('remove-dead'):
for agent in agents['dead']:
self.log.info(
"remove dead DHCP agent: {0}".format(agent['id']))
if not self.options.get('noop'):
self._delete_agent(agent['id'])
orphaned_networks = self._list_orphaned_networks()
self.log.info("_reschedule_agent_dhcp: rescheduling orphaned networks")
if orphaned_networks and agents['alive']:
for network in orphaned_networks:
self.log.info(
"_reschedule_agent_dhcp: rescheduling {0} to {1}".format(
network, agents['alive'][0]['id']))
if not self.options.get('noop'):
self._add_network_to_dhcp_agent(
agents['alive'][0]['id'], network)
self.log.info(
"_reschedule_agent_dhcp: ended rescheduling of orphaned networks")
self.log.debug("_reschedule_agent_dhcp: end.")
def _reschedule_agent_l3(self, agent_type):
self.log.debug("_reschedule_agent_l3: start.")
agents = {
'alive': [],
'dead': []
}
# collect the router list from dead L3 agents
dead_routers = [] # array of tuples (router, agentID)
for agent in self._get_agents_by_type(agent_type):
if agent['alive']:
self.log.info("found alive L3 agent: {0}".format(agent['id']))
agents['alive'].append(agent)
else:
# dead agent
self.log.info("found dead L3 agent: {0}".format(agent['id']))
agents['dead'].append(agent)
map(
lambda rou: dead_routers.append((rou, agent['id'])),
self._list_routers_on_l3_agent(agent['id'])
)
self.log.debug(
"L3 agents in cluster: {0}".format(
json.dumps(agents, indent=4)))
self.log.debug("Routers, attached to dead L3 agents: {0}".format(
json.dumps(dead_routers, indent=4)))
if dead_routers and agents['alive']:
# get the IDs of routers already attached to the alive agent
lucky_ids = set()
map(
lambda rou: lucky_ids.add(rou['id']),
self._list_routers_on_l3_agent(agents['alive'][0]['id'])
)
# remove dead agents; their routers are re-attached below
for agent in agents['dead']:
self.log.info("remove dead L3 agent: {0}".format(agent['id']))
if not self.options.get('noop'):
self._delete_agent(agent['id'])
# move routers from dead to alive agent
for rou in filter(
lambda rr: not(rr[0]['id'] in lucky_ids), dead_routers):
self.log.info(
"schedule router {0} to L3 agent {1}".format(
rou[0]['id'],
agents['alive'][0]['id']))
if not self.options.get('noop'):
self._add_router_to_l3_agent(
agents['alive'][0]['id'], rou[0]['id'])
orphaned_routers = self._list_orphaned_routers()
self.log.info("_reschedule_agent_l3: rescheduling orphaned routers")
if orphaned_routers and agents['alive']:
for router in orphaned_routers:
self.log.info(
"_reschedule_agent_l3: rescheduling {0} to {1}".format(
router, agents['alive'][0]['id']))
if not self.options.get('noop'):
self._add_router_to_l3_agent(
agents['alive'][0]['id'], router)
self.log.info(
"_reschedule_agent_l3: ended rescheduling of orphaned routers")
self.log.debug("_reschedule_agent_l3: end.")
def _remove_self(self, agent_type):
self.log.debug("_remove_self: start.")
for agent in self._get_agents_by_type(agent_type):
if agent['host'] == socket.gethostname():
self.log.info(
"_remove_self: deleting our own agent {0} of type {1}".
format(agent['id'], agent_type))
if not self.options.get('noop'):
self._delete_agent(agent['id'])
self.log.debug("_remove_self: end.")
def _reschedule_agent(self, agent):
self.log.debug("_reschedule_agents: start.")
task = self.RESCHEDULING_CALLS.get(agent, None)
if task:
task(agent)
self.log.debug("_reschedule_agents: end.")
def do(self, agent):
if self.options.get('cleanup-ports'):
self._cleanup_ports(agent)
if self.options.get('reschedule'):
self._reschedule_agent(agent)
if self.options.get('remove-self'):
self._remove_self(agent)
def _test_healthy(self, agent_list, hostname):
rv = False
for agent in agent_list:
if agent['host'] == hostname and agent['alive']:
return True
return rv
def test_healthy(self, agent_type):
# OCF_FAILED_MASTER,
# http://www.linux-ha.org/doc/dev-guides/_literal_ocf_failed_master_literal_9.html
rc = 9
agentlist = self._get_agents_by_type(agent_type)
for hostname in self.options.get('test-hostnames'):
if self._test_healthy(agentlist, hostname):
return 0
return rc
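
test_healthy maps agent liveness onto OCF exit codes: 0 when any of the given hostnames runs a live agent of the requested type, 9 (OCF_FAILED_MASTER) otherwise. A toy run of the same check (the agent records are made up):

def host_is_healthy(agent_list, hostname):
    # same logic as NeutronCleaner._test_healthy
    for agent in agent_list:
        if agent['host'] == hostname and agent['alive']:
            return True
    return False

agents = [{'host': 'node-1', 'alive': False},
          {'host': 'node-2', 'alive': True}]
print(host_is_healthy(agents, 'node-2'))  # True  -> exit code 0
print(host_is_healthy(agents, 'node-1'))  # False -> exit code 9
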
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Neutron network node cleaning tool.')
parser.add_argument(
"-c",
"--auth-config",
dest="authconf",
default="/etc/neutron/neutron.conf",
help="Read authconfig from service file",
metavar="FILE")
parser.add_argument(
"-t",
"--auth-token",
dest="auth-token",
default=None,
help="Authenticating token (instead username/passwd)",
metavar="TOKEN")
parser.add_argument(
"-u",
"--admin-auth-url",
dest="admin-auth-url",
default=None,
help="Authenticating URL (admin)",
metavar="URL")
parser.add_argument(
"--retries",
dest="retries",
type=int,
default=50,
help="try NN retries for API call",
metavar="NN")
parser.add_argument(
"--sleep",
dest="sleep",
type=int,
default=2,
help="sleep seconds between retries",
metavar="SEC")
parser.add_argument(
"-a",
"--agent",
dest="agent",
action="append",
help="specyfy agents for cleaning",
required=True)
parser.add_argument(
"--cleanup-ports",
dest="cleanup-ports",
action="store_true",
default=False,
help="cleanup ports for given agents on this node")
parser.add_argument(
"--remove-self",
dest="remove-self",
action="store_true",
default=False,
help="remove ourselves from agent list")
parser.add_argument(
"--activeonly",
dest="activeonly",
action="store_true",
default=False,
help="cleanup only active ports")
parser.add_argument(
"--reschedule",
dest="reschedule",
action="store_true",
default=False,
help="reschedule given agents")
parser.add_argument(
"--remove-dead",
dest="remove-dead",
action="store_true",
default=False,
help="remove dead agents while rescheduling")
parser.add_argument(
"--test-alive-for-hostname",
dest="test-hostnames",
action="append",
help="testing agent's healthy for given hostname")
parser.add_argument(
"--external-bridge",
dest="external-bridge",
default="br-ex",
help="external bridge name",
metavar="IFACE")
parser.add_argument(
"--integration-bridge",
dest="integration-bridge",
default="br-int",
help="integration bridge name",
metavar="IFACE")
parser.add_argument(
"-l",
"--log",
dest="log",
action="store",
help="log to file instead of STDOUT")
parser.add_argument(
"--noop",
dest="noop",
action="store_true",
default=False,
help="do not execute, print to log instead")
parser.add_argument(
"--debug",
dest="debug",
action="store_true",
default=False,
help="debug")
args = parser.parse_args()
RETRY_COUNT = args.retries
RETRY_DELAY = args.sleep
# setup logging
if args.log:
LOG = make_logger(
handler=logging.handlers.WatchedFileHandler(args.log))
if args.debug:
LOG.setLevel(logging.DEBUG)
LOG.info("Started: {0}".format(' '.join(sys.argv)))
cleaner = NeutronCleaner(options=vars(args), log=LOG)
rc = 0
if vars(args).get('test-hostnames'):
rc = cleaner.test_healthy(args.agent[0])
else:
for i in args.agent:
cleaner.do(i)
LOG.debug("End.")
sys.exit(rc)


@@ -85,7 +85,6 @@ install -m 0755 %{files_source}/fuel-ha-utils/ocf/ceilometer-alarm-evaluator %{b
install -m 0755 %{files_source}/fuel-ha-utils/ocf/nova-compute %{buildroot}/usr/lib/ocf/resource.d/fuel/nova-compute
install -m 0755 %{files_source}/fuel-ha-utils/ocf/nova-network %{buildroot}/usr/lib/ocf/resource.d/fuel/nova-network
install -m 0755 %{files_source}/fuel-ha-utils/ocf/ceilometer-agent-compute %{buildroot}/usr/lib/ocf/resource.d/fuel/ceilometer-agent-compute
install -m 0755 %{files_source}/fuel-ha-utils/tools/q-agent-cleanup.py %{buildroot}/usr/bin/q-agent-cleanup.py
install -m 0755 %{files_source}/fuel-ha-utils/tools/galeracheck %{buildroot}/usr/bin/galeracheck
install -m 0755 %{files_source}/fuel-ha-utils/tools/swiftcheck %{buildroot}/usr/bin/swiftcheck
install -m 0644 %{files_source}/fuel-ha-utils/tools/wsrepclustercheckrc %{buildroot}/etc/wsrepclustercheckrc
@@ -190,7 +189,6 @@ For further information go to http://wiki.openstack.org/Fuel
%files -n fuel-ha-utils
%defattr(-,root,root)
/usr/lib/ocf/resource.d/fuel
/usr/bin/q-agent-cleanup.py
/usr/bin/galeracheck
/usr/bin/swiftcheck
%config(noreplace) /etc/wsrepclustercheckrc