Merge "Use neutron-netns-cleanup utility"
This commit is contained in:
commit
6a36740388
1
debian/fuel-ha-utils.install
vendored
1
debian/fuel-ha-utils.install
vendored
@ -1,5 +1,4 @@
|
||||
files/fuel-ha-utils/ocf/* /usr/lib/ocf/resource.d/fuel
|
||||
files/fuel-ha-utils/tools/q-agent-cleanup.py /usr/bin
|
||||
files/fuel-ha-utils/tools/wsrepclustercheckrc /etc
|
||||
files/fuel-ha-utils/tools/galeracheck /usr/bin
|
||||
files/fuel-ha-utils/tools/swiftcheck /usr/bin
|
||||
|
@ -6,14 +6,6 @@ class cluster::neutron () {
|
||||
File<| title == 'ocf-mirantis-path' |> ->
|
||||
Package['neutron'] ->
|
||||
|
||||
# file {'q-agent-cleanup.py':
|
||||
# path => '/usr/bin/q-agent-cleanup.py',
|
||||
# mode => '0755',
|
||||
# owner => root,
|
||||
# group => root,
|
||||
# source => "puppet:///modules/cluster/q-agent-cleanup.py",
|
||||
#} ->
|
||||
|
||||
file {'/var/cache/neutron':
|
||||
ensure => directory,
|
||||
path => '/var/cache/neutron',
|
||||
|
@ -351,60 +351,6 @@ check_local_reports() {
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
# Print the names of all DHCP-agent network namespaces (qdhcp-*),
# space-separated on a single line. Prints an empty line when there are none.
get_ns_list() {
    local rv
    # Recent iproute2 prints "NAME (id: N)"; keep only the first field so the
    # "(id:" / "N)" tokens are never mistaken for namespace names.
    rv=$(ip netns list | awk '/^qdhcp-/ {print $1}')
    # Unquoted on purpose: collapses the newline-separated list to one line.
    echo $rv
}
|
||||
|
||||
# Print, on one line, the pids of every process running inside the given
# network namespaces.
#   $@ - namespace names (may also arrive as one space-separated string)
get_pid_list_for_ns_list() {
    local namespaces="$@"
    local collected
    collected=$(
        for ns in $namespaces; do
            ip netns pids $ns
        done
    )
    # Unquoted on purpose: joins the per-namespace output into one line.
    echo $collected
}
|
||||
|
||||
# Kill every process living in the DHCP agent's network namespaces, then
# remove the agent's ports via q-agent-cleanup.py.
# Sets the globals ns_list, count, inside_ns_pids and ns_pid as a side effect.
clean_up() {
    ns_list=$(get_ns_list)

    # Up to three rounds of plain kill (SIGTERM), one second apart.
    count=3
    while [ $count -gt 0 ]; do
        # ps cannot select by network namespace, hence "ip netns pids".
        inside_ns_pids=$(get_pid_list_for_ns_list "$ns_list")
        if [ -z "$inside_ns_pids" ]; then
            break
        fi
        ocf_run kill $inside_ns_pids
        sleep 1
        count=$((count - 1))
    done

    # Whatever survived the polite rounds gets SIGKILL, one pid at a time.
    inside_ns_pids=$(get_pid_list_for_ns_list "$ns_list")
    for ns_pid in $inside_ns_pids; do
        ocf_run kill -9 $ns_pid
    done

    # cleanup network interfaces
    q-agent-cleanup.py --agent=dhcp --cleanup-ports
}
|
||||
|
||||
# Delete every DHCP-agent network namespace reported by get_ns_list.
#
# Be careful: no process may still be using the network inside these
# namespaces. Run clean_up first.
# Sets the globals ns_list and ns_name as a side effect.
clean_up_namespaces() {
    ns_list=$(get_ns_list)
    # An empty list simply makes the loop a no-op.
    for ns_name in $ns_list; do
        ocf_run ip --force netns del $ns_name
    done
}
|
||||
|
||||
neutron_dhcp_agent_monitor() {
|
||||
local rc
|
||||
local pid
|
||||
@ -445,9 +391,7 @@ neutron_dhcp_agent_start() {
|
||||
fi
|
||||
|
||||
if ocf_is_true "$remove_artifacts_on_stop_start"; then
|
||||
clean_up
|
||||
sleep 1
|
||||
clean_up_namespaces
|
||||
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
|
||||
fi
|
||||
rm -f $OCF_RESKEY_state_reports_file
|
||||
|
||||
@ -487,79 +431,53 @@ neutron_dhcp_agent_stop() {
|
||||
rc=$?
|
||||
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
||||
if ocf_is_true "$remove_artifacts_on_stop_start"; then
|
||||
clean_up
|
||||
sleep 1
|
||||
clean_up_namespaces
|
||||
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
|
||||
fi
|
||||
ocf_log info "OpenStack DHCP Agent (${OCF_RESKEY_binary}) already stopped"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
#Try SIGTERM
|
||||
# Terminate agent daemon
|
||||
pid=`get_worker_pid`
|
||||
# stop waiting
|
||||
shutdown_timeout=15
|
||||
iteration_time=3
|
||||
iteration_time=1
|
||||
|
||||
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|
||||
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-6))
|
||||
fi
|
||||
if ocf_is_true "$remove_artifacts_on_stop_start"; then
|
||||
all_inside_ns_pids=`get_pid_list_for_ns_list $(get_ns_list)`
|
||||
all_pids="$pid $all_inside_ns_pids"
|
||||
else
|
||||
all_pids="$pid"
|
||||
fi
|
||||
count=0
|
||||
alive=1
|
||||
while [ $alive -gt 0 ] && [ $count -lt $shutdown_timeout ]; do
|
||||
alive=0
|
||||
ocf_run kill -s TERM $all_pids
|
||||
|
||||
clock=0
|
||||
|
||||
# Try to terminate gracefully
|
||||
while [ -d /proc/${pid}/ ] && [ $clock -lt $shutdown_timeout ]; do
|
||||
ocf_log debug "Stopping DHCP Agent (${OCF_RESKEY_binary}) gracefully with SIGTERM"
|
||||
ocf_run kill -s TERM ${pid}
|
||||
|
||||
sleep $iteration_time
|
||||
#Check if processes are alive after command kill
|
||||
#if yes, send to them the term signal again
|
||||
np=""
|
||||
for pid in $all_pids ; do
|
||||
ocf_run kill -s 0 $pid
|
||||
if [ $? -eq 0 ]; then
|
||||
np="$np $pid"
|
||||
((alive++))
|
||||
fi
|
||||
done
|
||||
if [ $alive -gt 0 ] ; then
|
||||
all_pids=$np
|
||||
fi
|
||||
((count+=$iteration_time))
|
||||
ocf_log debug "OpenStack DHCP Agent (${OCF_RESKEY_binary}) still hasn't stopped yet. Waiting ..."
|
||||
((clock+=$iteration_time))
|
||||
done
|
||||
#Send the kill signal to processes which are still alive
|
||||
if [ $alive -gt 0 ] ; then
|
||||
alive=0
|
||||
ocf_run kill -s KILL $all_pids
|
||||
|
||||
# Send kill signal if process is still up
|
||||
if [ -d /proc/${pid}/ ] ; then
|
||||
ocf_log debug "Killing DHCP Agent (${OCF_RESKEY_binary}) with SIGKILL"
|
||||
ocf_run kill -s KILL ${pid}
|
||||
|
||||
sleep 1
|
||||
for pid in $all_pids ; do
|
||||
ocf_run kill -s 0 $pid
|
||||
if [ $? -eq 0 ]; then
|
||||
((alive++))
|
||||
fi
|
||||
done
|
||||
if [ $alive -gt 0 ] ; then
|
||||
if [ -d /proc/${pid}/ ] ; then
|
||||
ocf_log err "OpenStack DHCP Agent (${OCF_RESKEY_binary}) stop failed"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
fi
|
||||
|
||||
ocf_log info "OpenStack DHCP Agent (${OCF_RESKEY_binary}) stopped"
|
||||
|
||||
ocf_log debug "Delete pid file: ${OCF_RESKEY_pid} with content $(cat ${OCF_RESKEY_pid})"
|
||||
rm -f $OCF_RESKEY_pid
|
||||
|
||||
if ocf_is_true "$remove_artifacts_on_stop_start"; then
|
||||
# cleanup network interfaces
|
||||
q-agent-cleanup.py --agent=dhcp --cleanup-ports
|
||||
clean_up_namespaces
|
||||
neutron-netns-cleanup --agent-type=dhcp --force --config-file ${OCF_RESKEY_config}
|
||||
fi
|
||||
|
||||
sleep 3
|
||||
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
|
@ -362,62 +362,6 @@ check_local_reports() {
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
# Print the names of all L3-agent network namespaces (qrouter-*),
# space-separated on a single line. Prints an empty line when there are none.
get_ns_list() {
    local rv
    # Recent iproute2 prints "NAME (id: N)"; keep only the first field so the
    # "(id:" / "N)" tokens are never mistaken for namespace names.
    rv=$(ip netns list | awk '/^qrouter-/ {print $1}')
    # Unquoted on purpose: collapses the newline-separated list to one line.
    echo $rv
}
|
||||
|
||||
# Print, on one line, the pids of every process running inside the given
# network namespaces.
#   $@ - namespace names (may also arrive as one space-separated string)
get_pid_list_for_ns_list() {
    local namespaces="$@"
    local collected
    collected=$(
        for ns in $namespaces; do
            ip netns pids $ns
        done
    )
    # Unquoted on purpose: joins the per-namespace output into one line.
    echo $collected
}
|
||||
|
||||
# Kill every process living in the L3 agent's network namespaces, then
# remove the agent's ports via q-agent-cleanup.py.
# Sets the globals ns_list, count, inside_ns_pids and ns_pid as a side effect.
clean_up() {
    ns_list=$(get_ns_list)

    # Up to three rounds of plain kill (SIGTERM), one second apart.
    count=3
    while [ $count -gt 0 ]; do
        # ps cannot select by network namespace, hence "ip netns pids".
        inside_ns_pids=$(get_pid_list_for_ns_list "$ns_list")
        if [ -z "$inside_ns_pids" ]; then
            break
        fi
        for ns_pid in $inside_ns_pids; do
            ocf_run kill $ns_pid
        done
        sleep 1
        count=$((count - 1))
    done

    # Whatever survived the polite rounds gets SIGKILL, one pid at a time.
    inside_ns_pids=$(get_pid_list_for_ns_list "$ns_list")
    for ns_pid in $inside_ns_pids; do
        ocf_run kill -9 $ns_pid
    done

    # cleanup network interfaces
    q-agent-cleanup.py --agent=l3 --cleanup-ports
}
|
||||
|
||||
# Delete every L3-agent network namespace reported by get_ns_list.
#
# Be careful: no process may still be using the network inside these
# namespaces. Run clean_up first.
# Sets the globals ns_list and ns_name as a side effect.
clean_up_namespaces() {
    ns_list=$(get_ns_list)
    # An empty list simply makes the loop a no-op.
    for ns_name in $ns_list; do
        ocf_run ip --force netns del $ns_name
    done
}
|
||||
|
||||
neutron_l3_agent_monitor() {
|
||||
neutron_l3_agent_status
|
||||
rc=$?
|
||||
@ -442,9 +386,7 @@ neutron_l3_agent_start() {
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
clean_up
|
||||
sleep 1
|
||||
clean_up_namespaces
|
||||
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
|
||||
rm -f $OCF_RESKEY_state_reports_file
|
||||
|
||||
# run and detach to background agent as daemon.
|
||||
@ -478,69 +420,49 @@ neutron_l3_agent_stop() {
|
||||
neutron_l3_agent_status
|
||||
rc=$?
|
||||
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
||||
clean_up
|
||||
sleep 1
|
||||
clean_up_namespaces
|
||||
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
|
||||
ocf_log info "OpenStack L3 agent ($OCF_RESKEY_binary) already stopped"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
# Try SIGTERM
|
||||
# Terminate agent daemon
|
||||
pid=`get_worker_pid`
|
||||
# stop waiting
|
||||
shutdown_timeout=15
|
||||
iteration_time=3
|
||||
iteration_time=1
|
||||
|
||||
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|
||||
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-6))
|
||||
fi
|
||||
all_inside_ns_pids=`get_pid_list_for_ns_list $(get_ns_list)`
|
||||
all_pids="$pid $all_inside_ns_pids"
|
||||
count=0
|
||||
alive=1
|
||||
while [ $alive -gt 0 ] && [ $count -lt $shutdown_timeout ]; do
|
||||
alive=0
|
||||
ocf_run kill -s TERM $all_pids
|
||||
|
||||
clock=0
|
||||
|
||||
# Try to terminate gracefully
|
||||
while [ -d /proc/${pid}/ ] && [ $clock -lt $shutdown_timeout ]; do
|
||||
ocf_log debug "Stopping L3 agent (${OCF_RESKEY_binary}) gracefully with SIGTERM"
|
||||
ocf_run kill -s TERM ${pid}
|
||||
|
||||
sleep $iteration_time
|
||||
#Check if processes are alive after command kill
|
||||
#if yes, send to them the term signal again
|
||||
np=""
|
||||
for pid in $all_pids ; do
|
||||
ocf_run kill -s 0 $pid
|
||||
if [ $? -eq 0 ]; then
|
||||
np="$np $pid"
|
||||
((alive++))
|
||||
fi
|
||||
done
|
||||
if [ $alive -gt 0 ] ; then
|
||||
all_pids=$np
|
||||
fi
|
||||
((count+=$iteration_time))
|
||||
ocf_log debug "OpenStack L3 agent ($OCF_RESKEY_binary) still hasn't stopped yet. Waiting ..."
|
||||
((clock+=$iteration_time))
|
||||
done
|
||||
#Send the kill signal to processes which are still alive
|
||||
if [ $alive -gt 0 ] ; then
|
||||
alive=0
|
||||
ocf_run kill -s KILL $all_pids
|
||||
|
||||
# Send kill signal if process is still up
|
||||
if [ -d /proc/${pid}/ ] ; then
|
||||
ocf_log debug "Killing L3 agent (${OCF_RESKEY_binary}) with SIGKILL"
|
||||
ocf_run kill -s KILL ${pid}
|
||||
|
||||
sleep 1
|
||||
for pid in $all_pids ; do
|
||||
ocf_run kill -s 0 $pid
|
||||
if [ $? -eq 0 ]; then
|
||||
((alive++))
|
||||
fi
|
||||
done
|
||||
if [ $alive -gt 0 ] ; then
|
||||
if [ -d /proc/${pid}/ ] ; then
|
||||
ocf_log err "OpenStack L3 agent (${OCF_RESKEY_binary}) stop failed"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
fi
|
||||
|
||||
ocf_log info "OpenStack L3 agent ($OCF_RESKEY_binary) stopped"
|
||||
|
||||
ocf_log debug "Delete pid file: ${OCF_RESKEY_pid} with content $(cat ${OCF_RESKEY_pid})"
|
||||
rm -f $OCF_RESKEY_pid
|
||||
clean_up
|
||||
sleep 1
|
||||
clean_up_namespaces
|
||||
sleep 3
|
||||
|
||||
neutron-netns-cleanup --agent-type=l3 --force --config-file $OCF_RESKEY_config
|
||||
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
@ -1,631 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2013 - 2015 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import argparse
|
||||
from ConfigParser import SafeConfigParser
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import logging.config
|
||||
import logging.handlers
|
||||
import re
|
||||
import socket
|
||||
import StringIO
|
||||
import subprocess
|
||||
import sys
|
||||
from time import sleep
|
||||
|
||||
from neutronclient.neutron import client as n_client
|
||||
|
||||
LOG_NAME = 'q-agent-cleanup'

API_VER = '2.0'
PORT_ID_PART_LEN = 11


def make_logger(handler=None, level=logging.INFO):
    """Configure and return the tool's named logger.

    :param handler: logging handler to install; a new stdout StreamHandler
                    is created when omitted.
    :param level: level set on the logger.
    :returns: the ``logging.Logger`` registered under ``LOG_NAME``.

    The logger is a process-wide singleton, so handlers installed by an
    earlier call are removed first. Without that, every call would stack
    another handler and each record would be emitted more than once.
    """
    if handler is None:
        # Built per call instead of as a default argument, which would be
        # evaluated once at definition time and shared by every caller.
        handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger = logging.getLogger(LOG_NAME)
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger


LOG = make_logger()
|
||||
|
||||
# Maps neutronclient keyword names to the option names read from the config.
AUTH_KEYS = {
    'tenant_name': 'admin_tenant_name',
    'username': 'admin_user',
    'password': 'admin_password',
    'auth_url': 'auth_uri',
}


def get_auth_data(cfg_file, section='keystone_authtoken', keys=AUTH_KEYS):
    """Read authentication settings from an INI-style config file.

    :param cfg_file: path of the config file to parse.
    :param section: section holding the credentials.
    :param keys: mapping of result key -> option name inside ``section``.
    :returns: dict with one entry per item of ``keys``.
    :raises: IOError/OSError if the file cannot be opened, and the config
             parser's NoSectionError/NoOptionError for missing entries.
    """
    # Local import so the helper runs on both Python 2 and Python 3.
    try:
        from configparser import ConfigParser as _Parser
    except ImportError:
        from ConfigParser import SafeConfigParser as _Parser

    cfg = _Parser()
    with open(cfg_file) as f:
        # read_file() replaced readfp(), which newer Python 3 releases removed.
        reader = getattr(cfg, 'read_file', None) or cfg.readfp
        reader(f)
    # raw=True: a literal '%' in a value (e.g. a password) must not be
    # treated as an interpolation marker.
    return dict(
        (key, cfg.get(section, option, raw=True))
        for key, option in keys.items())
|
||||
|
||||
# Note(xarses): be careful not to inject \n's into the regex pattern
# or it will cause the matching to fail.
# Error messages that indicate a transient failure worth retrying.
RECOVERABLE = re.compile((
    r'(HTTP\s+400\))|'
    r"(400-\{'message'\:\s+''\})|"
    r'(\[Errno 111\]\s+Connection\s+refused)|'
    r'(503\s+Service\s+Unavailable)|'
    r'(504\s+Gateway\s+Time-out)|'
    r'(\:\s+Maximum\s+attempts\s+reached)|'
    r'(Unauthorized\:\s+bad\s+credentials)|'
    r'(Max\s+retries\s+exceeded)|'
    r"""('*NoneType'*\s+object\s+has\s+no\s+attribute\s+'*__getitem__'*$)|"""
    r'(No\s+route\s+to\s+host$)|'
    r'(Lost\s+connection\s+to\s+MySQL\s+server)'), flags=re.M)

RETRY_COUNT = 50
RETRY_DELAY = 2


def retry(func, pattern=RECOVERABLE):
    """Decorator that re-invokes ``func`` after a recoverable failure.

    :param func: callable to wrap.
    :param pattern: compiled regex; an exception is retried only when its
                    message contains a match. A falsy pattern retries on
                    every exception.
    :returns: wrapper that returns ``func``'s result, or re-raises the last
              exception once it is not recoverable or ``RETRY_COUNT``
              attempts have failed. Sleeps ``RETRY_DELAY`` seconds between
              attempts; both globals are read at call time.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        attempt = 0
        while True:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # search(), not match(): the recoverable text is usually in
                # the middle of the message. str(e) works on Python 2 and 3,
                # unlike e.message.
                if pattern and not pattern.search(str(e)):
                    raise
                attempt += 1
                if attempt >= RETRY_COUNT:
                    raise
                LOG.debug("retry request {0}: {1}".format(attempt, e))
                sleep(RETRY_DELAY)
    return wrapper
|
||||
|
||||
|
||||
class NeutronCleaner(object):
    """Cleans up and reschedules Neutron DHCP/L3 agent resources.

    Talks to the Neutron API through ``neutronclient`` (every API call is
    wrapped in ``retry``) and to the local host through ``ip`` and
    ``ovs-vsctl``. ``options`` is the dict of parsed command-line options;
    with ``options['noop']`` set, mutating actions are only logged.
    """

    PORT_NAME_PREFIXES_BY_DEV_OWNER = {
        'network:dhcp': 'tap',
        'network:router_gateway': 'qg-',
        'network:router_interface': 'qr-',
    }
    PORT_NAME_PREFIXES = {
        # contains tuples of prefixes
        'dhcp': (PORT_NAME_PREFIXES_BY_DEV_OWNER['network:dhcp'],),
        'l3': (
            PORT_NAME_PREFIXES_BY_DEV_OWNER['network:router_gateway'],
            PORT_NAME_PREFIXES_BY_DEV_OWNER['network:router_interface']
        )
    }
    BRIDGES_FOR_PORTS_BY_AGENT = {
        'dhcp': ('br-int',),
        'l3': ('br-int', 'br-ex'),
    }
    PORT_OWNER_PREFIXES = {
        'dhcp': ('network:dhcp',),
        'l3': ('network:router_gateway', 'network:router_interface')
    }
    NS_NAME_PREFIXES = {
        'dhcp': 'qdhcp',
        'l3': 'qrouter',
    }
    AGENT_BINARY_NAME = {
        'dhcp': 'neutron-dhcp-agent',
        'l3': 'neutron-l3-agent',
        'ovs': 'neutron-openvswitch-agent'
    }

    CMD__list_ovs_port = ['ovs-vsctl', 'list-ports']
    CMD__remove_ovs_port = ['ovs-vsctl', '--', '--if-exists', 'del-port']
    CMD__remove_ip_addr = ['ip', 'address', 'delete']
    CMD__ip_netns_list = ['ip', 'netns', 'list']
    CMD__ip_netns_exec = ['ip', 'netns', 'exec']

    # Matches one interface header line of `ip l show`, e.g.
    # 14: tap-xxxyyyzzz:
    RE__port_in_portlist = re.compile(r"^\s*\d+\:\s+([\w-]+)\:")

    def __init__(self, options, log=None):
        """Store options/logger and load credentials from options['authconf']."""
        self.log = log
        self.auth_data = get_auth_data(cfg_file=options.get('authconf'))
        self.options = options
        # NOTE(review): consulted by _get_agents_by_type as a cache but never
        # filled anywhere in this class.
        self.agents = {}
        self.debug = options.get('debug')
        self.RESCHEDULING_CALLS = {
            'dhcp': self._reschedule_agent_dhcp,
            'l3': self._reschedule_agent_l3,
        }

        self._client = None

    @property
    @retry
    def client(self):
        """Lazily created Neutron API client."""
        if self._client is None:
            self._client = n_client.Client(API_VER, **self.auth_data)
        return self._client

    @retry
    def _get_agents(self, use_cache=True):
        """Return all agents known to Neutron (``use_cache`` is unused)."""
        return self.client.list_agents()['agents']

    @retry
    def _get_routers(self, use_cache=True):
        """Return all routers (``use_cache`` is unused)."""
        return self.client.list_routers()['routers']

    @retry
    def _get_networks(self, use_cache=True):
        """Return all networks (``use_cache`` is unused)."""
        return self.client.list_networks()['networks']

    @retry
    def _list_networks_on_dhcp_agent(self, agent_id):
        """Return the networks hosted by the given DHCP agent."""
        return self.client.list_networks_on_dhcp_agent(
            agent_id)['networks']

    @retry
    def _list_routers_on_l3_agent(self, agent_id):
        """Return the routers hosted by the given L3 agent."""
        return self.client.list_routers_on_l3_agent(
            agent_id)['routers']

    @retry
    def _list_l3_agents_on_router(self, router_id):
        """Return the L3 agents hosting the given router."""
        return self.client.list_l3_agent_hosting_routers(
            router_id)['agents']

    @retry
    def _list_dhcp_agents_on_network(self, network_id):
        """Return the DHCP agents hosting the given network."""
        return self.client.list_dhcp_agent_hosting_networks(
            network_id)['agents']

    def _list_orphaned_networks(self):
        """Return ids of networks that no DHCP agent is hosting."""
        networks = self._get_networks()
        self.log.debug(
            "_list_orphaned_networks:, got list of networks {0}".format(
                json.dumps(networks, indent=4)))
        orphaned_networks = []
        for network in networks:
            if len(self._list_dhcp_agents_on_network(network['id'])) == 0:
                orphaned_networks.append(network['id'])
        self.log.debug(
            "_list_orphaned_networks:, got list of orphaned networks {0}".
            format(orphaned_networks))
        return orphaned_networks

    def _list_orphaned_routers(self):
        """Return ids of routers that no L3 agent is hosting."""
        routers = self._get_routers()
        self.log.debug(
            "_list_orphaned_routers:, got list of routers {0}".format(
                json.dumps(routers, indent=4)))
        orphaned_routers = []
        for router in routers:
            if len(self._list_l3_agents_on_router(router['id'])) == 0:
                orphaned_routers.append(router['id'])
        self.log.debug(
            "_list_orphaned_routers:, got list of orphaned routers {0}".format(
                orphaned_routers))
        return orphaned_routers

    @retry
    def _add_network_to_dhcp_agent(self, agent_id, net_id):
        """Schedule network ``net_id`` onto DHCP agent ``agent_id``."""
        return self.client.add_network_to_dhcp_agent(
            agent_id, {"network_id": net_id})

    @retry
    def _add_router_to_l3_agent(self, agent_id, router_id):
        """Schedule router ``router_id`` onto L3 agent ``agent_id``."""
        return self.client.add_router_to_l3_agent(
            agent_id, {"router_id": router_id})

    @retry
    def _remove_router_from_l3_agent(self, agent_id, router_id):
        """Unschedule router ``router_id`` from L3 agent ``agent_id``."""
        return self.client.remove_router_from_l3_agent(
            agent_id, router_id)

    @retry
    def _delete_agent(self, agent_id):
        """Delete the agent record ``agent_id`` from Neutron."""
        return self.client.delete_agent(agent_id)

    def _get_agents_by_type(self, agent, use_cache=True):
        """Return the agents whose binary matches agent type ``agent``.

        :param agent: key of ``AGENT_BINARY_NAME`` ('dhcp', 'l3', 'ovs').
        :param use_cache: look in ``self.agents`` before querying the API.
        """
        self.log.debug("_get_agents_by_type: start.")
        rv = self.agents.get(agent, []) if use_cache else []
        if not rv:
            agents = self._get_agents(use_cache=use_cache)
            for i in agents:
                if i['binary'] == self.AGENT_BINARY_NAME.get(agent):
                    rv.append(i)
            from_cache = ''
        else:
            from_cache = ' from local cache'
        self.log.debug(
            "_get_agents_by_type: end, {0} rv: {1}".format(
                from_cache, json.dumps(rv, indent=4)))
        return rv

    def _execute(self, cmd):
        """Run ``cmd`` (argv list, no shell) and capture its output.

        :returns: ``(return_code, stdout)``. A non-zero return code is
                  logged as an error; the tuple is still returned so callers
                  that unpack the result can test the code themselves.
        """
        process = subprocess.Popen(
            cmd,
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        (stdout, stderr) = process.communicate()
        ret_code = process.returncode
        if ret_code != 0:
            self.log.error(
                "ERROR (rc={0}) while execution {1}, stderr: {2}".format(
                    ret_code, ' '.join(cmd), stderr))
        return ret_code, stdout

    def __collect_namespaces_for_agent(self, agent):
        """Return local network namespaces whose name matches the agent type."""
        cmd = self.CMD__ip_netns_list[:]
        self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
        ret_code, stdout = self._execute(cmd)
        if ret_code != 0:
            return []
        # filter namespaces by given agent type
        netns = []
        for ns in StringIO.StringIO(stdout):
            ns = ns.strip()
            self.log.debug("Found network namespace '{0}'".format(ns))
            if ns.startswith(self.NS_NAME_PREFIXES[agent]):
                netns.append(ns)
        return netns

    def __collect_ports_for_namespace(self, ns):
        """Return interface names (except 'lo') found inside namespace ``ns``."""
        cmd = self.CMD__ip_netns_exec[:]
        cmd.extend([ns, 'ip', 'l', 'show'])
        self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
        ret_code, stdout = self._execute(cmd)
        if ret_code != 0:
            return []
        ports = []
        for line in StringIO.StringIO(stdout):
            pp = self.RE__port_in_portlist.match(line)
            if not pp:
                continue
            port = pp.group(1)
            if port != 'lo':
                self.log.debug("Found port '{0}'".format(port))
                ports.append(port)
        return ports

    def _cleanup_ports(self, agent):
        """Remove from OVS every port found in the agent's namespaces.

        :returns: always True.
        """
        self.log.debug("_cleanup_ports: start.")

        # get namespaces list
        netns = self.__collect_namespaces_for_agent(agent)

        # collect ports from namespace
        ports = []
        for ns in netns:
            ports.extend(self.__collect_ports_for_namespace(ns))

        # iterate by port_list and remove port from OVS
        for port in ports:
            cmd = self.CMD__remove_ovs_port[:]
            cmd.append(port)
            if self.options.get('noop'):
                self.log.info("NOOP-execution: '{0}'".format(' '.join(cmd)))
            else:
                self.log.debug("Execute command '{0}'".format(' '.join(cmd)))
                self._execute(cmd)
        self.log.debug("_cleanup_ports: end.")

        return True

    def _reschedule_agent_dhcp(self, agent_type):
        """Move networks from dead DHCP agents, and orphaned networks, to the
        first alive DHCP agent."""
        self.log.debug("_reschedule_agent_dhcp: start.")
        agents = {
            'alive': [],
            'dead': []
        }
        # collect networklist from dead DHCP-agents
        dead_networks = []
        for agent in self._get_agents_by_type(agent_type):
            if agent['alive']:
                self.log.info(
                    "found alive DHCP agent: {0}".format(agent['id']))
                agents['alive'].append(agent)
            else:
                # dead agent
                self.log.info(
                    "found dead DHCP agent: {0}".format(agent['id']))
                agents['dead'].append(agent)
                for net in self._list_networks_on_dhcp_agent(agent['id']):
                    dead_networks.append(net)

        if dead_networks and agents['alive']:
            # ids of networks already attached to the alive agent
            lucky_ids = set(
                net['id'] for net in
                self._list_networks_on_dhcp_agent(agents['alive'][0]['id']))
            # add dead networks to alive agent
            for net in dead_networks:
                if net['id'] not in lucky_ids:
                    # attach network to agent
                    self.log.info(
                        "attach network {net} to DHCP agent {agent}".format(
                            net=net['id'],
                            agent=agents['alive'][0]['id']))
                    if not self.options.get('noop'):
                        self._add_network_to_dhcp_agent(
                            agents['alive'][0]['id'], net['id'])

            # remove dead agents if need (and if found alive agent)
            if self.options.get('remove-dead'):
                for agent in agents['dead']:
                    self.log.info(
                        "remove dead DHCP agent: {0}".format(agent['id']))
                    if not self.options.get('noop'):
                        self._delete_agent(agent['id'])
        orphaned_networks = self._list_orphaned_networks()
        self.log.info("_reschedule_agent_dhcp: rescheduling orphaned networks")
        if orphaned_networks and agents['alive']:
            for network in orphaned_networks:
                self.log.info(
                    "_reschedule_agent_dhcp: rescheduling {0} to {1}".format(
                        network, agents['alive'][0]['id']))
                if not self.options.get('noop'):
                    self._add_network_to_dhcp_agent(
                        agents['alive'][0]['id'], network)
        self.log.info(
            "_reschedule_agent_dhcp: ended rescheduling of orphaned networks")
        self.log.debug("_reschedule_agent_dhcp: end.")

    def _reschedule_agent_l3(self, agent_type):
        """Move routers from dead L3 agents, and orphaned routers, to the
        first alive L3 agent; dead agents are deleted in the process."""
        self.log.debug("_reschedule_agent_l3: start.")
        agents = {
            'alive': [],
            'dead': []
        }
        # collect router-list from dead L3-agents
        dead_routers = []  # array of tuples (router, agentID)
        for agent in self._get_agents_by_type(agent_type):
            if agent['alive']:
                self.log.info("found alive L3 agent: {0}".format(agent['id']))
                agents['alive'].append(agent)
            else:
                # dead agent
                self.log.info("found dead L3 agent: {0}".format(agent['id']))
                agents['dead'].append(agent)
                for rou in self._list_routers_on_l3_agent(agent['id']):
                    dead_routers.append((rou, agent['id']))
        self.log.debug(
            "L3 agents in cluster: {0}".format(
                json.dumps(agents, indent=4)))
        self.log.debug("Routers, attached to dead L3 agents: {0}".format(
            json.dumps(dead_routers, indent=4)))

        if dead_routers and agents['alive']:
            # ids of routers already attached to the alive agent
            lucky_ids = set(
                rou['id'] for rou in
                self._list_routers_on_l3_agent(agents['alive'][0]['id']))
            # remove dead agents after rescheduling
            for agent in agents['dead']:
                self.log.info("remove dead L3 agent: {0}".format(agent['id']))
                if not self.options.get('noop'):
                    self._delete_agent(agent['id'])
            # move routers from dead to alive agent
            for rou in dead_routers:
                if rou[0]['id'] in lucky_ids:
                    continue
                self.log.info(
                    "schedule router {0} to L3 agent {1}".format(
                        rou[0]['id'],
                        agents['alive'][0]['id']))
                if not self.options.get('noop'):
                    self._add_router_to_l3_agent(
                        agents['alive'][0]['id'], rou[0]['id'])

        orphaned_routers = self._list_orphaned_routers()
        self.log.info("_reschedule_agent_l3: rescheduling orphaned routers")
        if orphaned_routers and agents['alive']:
            for router in orphaned_routers:
                self.log.info(
                    "_reschedule_agent_l3: rescheduling {0} to {1}".format(
                        router, agents['alive'][0]['id']))
                if not self.options.get('noop'):
                    self._add_router_to_l3_agent(
                        agents['alive'][0]['id'], router)
        self.log.info(
            "_reschedule_agent_l3: ended rescheduling of orphaned routers")
        self.log.debug("_reschedule_agent_l3: end.")

    def _remove_self(self, agent_type):
        """Delete this host's agent record(s) of the given type."""
        self.log.debug("_remove_self: start.")
        for agent in self._get_agents_by_type(agent_type):
            if agent['host'] == socket.gethostname():
                self.log.info(
                    "_remove_self: deleting our own agent {0} of type {1}".
                    format(agent['id'], agent_type))
                if not self.options.get('noop'):
                    self._delete_agent(agent['id'])
        self.log.debug("_remove_self: end.")

    def _reschedule_agent(self, agent):
        """Dispatch to the rescheduling routine for ``agent``, if one exists."""
        self.log.debug("_reschedule_agents: start.")
        task = self.RESCHEDULING_CALLS.get(agent, None)
        if task:
            task(agent)
        self.log.debug("_reschedule_agents: end.")

    def do(self, agent):
        """Run the actions selected in ``options`` for agent type ``agent``."""
        if self.options.get('cleanup-ports'):
            self._cleanup_ports(agent)
        if self.options.get('reschedule'):
            self._reschedule_agent(agent)
        if self.options.get('remove-self'):
            self._remove_self(agent)

    def _test_healthy(self, agent_list, hostname):
        """Return True if ``agent_list`` has an alive agent on ``hostname``."""
        rv = False
        for agent in agent_list:
            if agent['host'] == hostname and agent['alive']:
                return True
        return rv

    def test_healthy(self, agent_type):
        """Return 0 if any host in options['test-hostnames'] runs an alive
        agent of ``agent_type``, otherwise 9."""
        # OCF_FAILED_MASTER,
        # http://www.linux-ha.org/doc/dev-guides/_literal_ocf_failed_master_literal_9.html

        rc = 9
        agentlist = self._get_agents_by_type(agent_type)
        for hostname in self.options.get('test-hostnames'):
            if self._test_healthy(agentlist, hostname):
                return 0
        return rc
|
||||
|
||||
|
||||
# Command-line entry point: parse options, configure logging, then either
# test agent health (--test-alive-for-hostname) or run the selected cleanup
# actions for each --agent. The process exit code is the health-test result,
# or 0 for cleanup runs.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Neutron network node cleaning tool.')
    parser.add_argument(
        "-c",
        "--auth-config",
        dest="authconf",
        default="/etc/neutron/neutron.conf",
        help="Read authconfig from service file",
        metavar="FILE")
    parser.add_argument(
        "-t",
        "--auth-token",
        dest="auth-token",
        default=None,
        help="Authenticating token (instead username/passwd)",
        metavar="TOKEN")
    parser.add_argument(
        "-u",
        "--admin-auth-url",
        dest="admin-auth-url",
        default=None,
        help="Authenticating URL (admin)",
        metavar="URL")
    parser.add_argument(
        "--retries",
        dest="retries",
        type=int,
        default=50,
        help="try NN retries for API call",
        metavar="NN")
    parser.add_argument(
        "--sleep",
        dest="sleep",
        type=int,
        default=2,
        help="sleep seconds between retries",
        metavar="SEC")
    parser.add_argument(
        "-a",
        "--agent",
        dest="agent",
        action="append",
        help="specyfy agents for cleaning",
        required=True)
    parser.add_argument(
        "--cleanup-ports",
        dest="cleanup-ports",
        action="store_true",
        default=False,
        help="cleanup ports for given agents on this node")
    parser.add_argument(
        "--remove-self",
        dest="remove-self",
        action="store_true",
        default=False,
        help="remove ourselves from agent list")
    parser.add_argument(
        "--activeonly",
        dest="activeonly",
        action="store_true",
        default=False,
        help="cleanup only active ports")
    parser.add_argument(
        "--reschedule",
        dest="reschedule",
        action="store_true",
        default=False,
        help="reschedule given agents")
    parser.add_argument(
        "--remove-dead",
        dest="remove-dead",
        action="store_true",
        default=False,
        help="remove dead agents while rescheduling")
    parser.add_argument(
        "--test-alive-for-hostname",
        dest="test-hostnames",
        action="append",
        help="testing agent's healthy for given hostname")
    parser.add_argument(
        "--external-bridge",
        dest="external-bridge",
        default="br-ex",
        help="external bridge name",
        metavar="IFACE")
    parser.add_argument(
        "--integration-bridge",
        dest="integration-bridge",
        default="br-int",
        help="integration bridge name",
        metavar="IFACE")
    parser.add_argument(
        "-l",
        "--log",
        dest="log",
        action="store",
        help="log to file instead of STDOUT")
    parser.add_argument(
        "--noop",
        dest="noop",
        action="store_true",
        default=False,
        help="do not execute, print to log instead")
    parser.add_argument(
        "--debug",
        dest="debug",
        action="store_true",
        default=False,
        help="debug")
    args = parser.parse_args()
    # Module-level settings read by retry() at call time.
    RETRY_COUNT = args.retries
    RETRY_DELAY = args.sleep

    # setup logging
    if args.log:
        # --log means "instead of STDOUT": detach the handler(s) installed at
        # import time so records are not written to both destinations.
        for stale_handler in list(LOG.handlers):
            LOG.removeHandler(stale_handler)
        LOG = make_logger(
            handler=logging.handlers.WatchedFileHandler(args.log))

    if args.debug:
        LOG.setLevel(logging.DEBUG)

    LOG.info("Started: {0}".format(' '.join(sys.argv)))
    # Several dest names contain '-', so options are passed and read as a
    # dict (vars(args)) rather than as attributes.
    cleaner = NeutronCleaner(options=vars(args), log=LOG)
    rc = 0
    if vars(args).get('test-hostnames'):
        # Health test only looks at the first --agent given.
        rc = cleaner.test_healthy(args.agent[0])
    else:
        for i in args.agent:
            cleaner.do(i)
    LOG.debug("End.")
    sys.exit(rc)
|
@ -85,7 +85,6 @@ install -m 0755 %{files_source}/fuel-ha-utils/ocf/ceilometer-alarm-evaluator %{b
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/ocf/nova-compute %{buildroot}/usr/lib/ocf/resource.d/fuel/nova-compute
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/ocf/nova-network %{buildroot}/usr/lib/ocf/resource.d/fuel/nova-network
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/ocf/ceilometer-agent-compute %{buildroot}/usr/lib/ocf/resource.d/fuel/ceilometer-agent-compute
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/tools/q-agent-cleanup.py %{buildroot}/usr/bin/q-agent-cleanup.py
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/tools/galeracheck %{buildroot}/usr/bin/galeracheck
|
||||
install -m 0755 %{files_source}/fuel-ha-utils/tools/swiftcheck %{buildroot}/usr/bin/swiftcheck
|
||||
install -m 0644 %{files_source}/fuel-ha-utils/tools/wsrepclustercheckrc %{buildroot}/etc/wsrepclustercheckrc
|
||||
@ -190,7 +189,6 @@ For further information go to http://wiki.openstack.org/Fuel
|
||||
%files -n fuel-ha-utils
|
||||
%defattr(-,root,root)
|
||||
/usr/lib/ocf/resource.d/fuel
|
||||
/usr/bin/q-agent-cleanup.py
|
||||
/usr/bin/galeracheck
|
||||
/usr/bin/swiftcheck
|
||||
%config(noreplace) /etc/wsrepclustercheckrc
|
||||
|
Loading…
Reference in New Issue
Block a user