Add script to balance load of Neutron DHCP agents

We found that sometimes in large deployments some
DHCP agents can be overloaded when others are almost
empty (especially if random scheduler is used for
DHCP agents).
That can cause issue with agents config syncronization
e.g. after agents restart.

This script can move some networks from overloaded agents
to less loaded and also remove network from some DHCP agents
if it is hosted on too many agents in same time.

Change-Id: Ib9ed1d75100ee66e04143eff4d30ccab1eb72abf
This commit is contained in:
Sławek Kapłoński 2016-11-15 21:18:31 +00:00
parent 29c5746658
commit 38a70fda49
1 changed files with 520 additions and 0 deletions

View File

@ -0,0 +1,520 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2016 OVH SAS
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
The script checks how many neutron dhcp agents are handling one network and
spreads load of networks to be similar on all agents in infra.
"""
import argparse
import multiprocessing
import logging
import os
import random
import sys
from neutronclient.v2_0 import client as neutronclient
OS_PREFIX = "OS_"
OS_REQUIRED_KEYS = [
'username',
'password',
'tenant_name',
'auth_url',
'region_name']
MAX_ATTEMPTS = 3
DHCP_AGENT_TYPE = "DHCP agent"
HOST_ID = "binding:host_id"
RESERVED_DHCP_PORT = "reserved_dhcp_port"
_CLIENT = None
_CREDS = {}
def get_neutron_client():
global _CLIENT
if _CLIENT:
return _CLIENT
credentials = get_credentials()
_CLIENT = neutronclient.Client(**credentials)
return _CLIENT
def get_credentials():
global _CREDS
if _CREDS:
return _CREDS
for key in OS_REQUIRED_KEYS:
env_key = OS_PREFIX + key.upper()
value = os.environ.get(env_key)
if not value:
LOG.error("Missing %s in environment vars."
"Openstack environment vars should be loaded before "
"running this script", env_key)
sys.exit(1)
_CREDS[key] = value
return _CREDS
def get_logger(verbose=False, debug=False, logfile=None, name=None):
logger = logging.getLogger(name)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
loglevel = logging.ERROR
if verbose:
loglevel = logging.INFO
if debug:
loglevel = logging.DEBUG
logger.setLevel(loglevel)
if logfile:
fh = logging.FileHandler(logfile)
fh.setLevel(loglevel)
fh.setFormatter(formatter)
logger.addHandler(fh)
else:
ch = logging.StreamHandler()
ch.setLevel(loglevel)
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
def get_number_of_cores():
try:
return multiprocessing.cpu_count()
except Exception:
LOG.warning("Failed to determine number of cores in the system")
return 1
def parse_args():
def check_positive(value):
ivalue = int(value)
if ivalue <= 0:
raise argparse.ArgumentTypeError(
"%s is an invalid positive int value" % value)
return ivalue
program_description=("This script is working in two stages: \n"
"1. Checking number of DHCP agents for each \n"
" network and removing some agents if there is \n"
" too many assigned for network,\n"
"2. Calculating number of networks which every \n"
" DHCP agent should handle. Balancing networks \n"
" amont agents that each of them handles \n"
" similar number of networks.")
parser = argparse.ArgumentParser(
description=program_description,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--workers", default=None, type=check_positive,
help=("Number of workers to do some operations "
"simultaneously (like removing dead agents) "
"from network. By default number of CPU "
"cores will be taken"))
parser.add_argument("--max_agents_per_network", default=1, type=int,
help=("Maximum number of agents which should host "
"DHCP service for one network"))
parser.add_argument("--debug", action="store_true",
help="Enable debug mode")
parser.add_argument("--verbose", action="store_true",
help="Make script to be more verbose")
parser.add_argument("--log-file", dest='logfile', default=None,
help="Log file path.")
return parser.parse_args()
def remove_unneccessary_agents(number_of_workers):
"""Remove DHCP agents from handle DHCP service if there is more
agents than set in MAX_AGENTS_PER_NETWORK.
:param max_agents_per_network: maximum number of DHCP agents which can
handle DHCP service for network
"""
networks_agents = get_networks_agents(number_of_workers)
if not networks_agents:
return
LOG.info("Cleaning networks from unneccessary DHCP agents")
threads_pool = multiprocessing.Pool(processes=number_of_workers)
threads_pool.map(remove_unneccessary_agents_for_network,
zip(networks_agents.keys(), networks_agents.values()))
LOG.info("All networks cleaned")
def remove_unneccessary_agents_for_network(network_agents):
"""Remove DHCP agents from hosting DHCP service if there is more
agents than set in MAX_AGENTS_PER_NETWORK.
Example: network is assigned to 3 DHCP agents but should be only to
one, the network will be deleted from two agents.
Reserved_dhcp_ports will be deleted from this network as well.
:param network_agents: tuple with network id as first element and
list of agents as second
"""
network_id = network_agents[0]
agents = network_agents[1]
agents_to_stay = 0
for agent in agents:
agent_id = agent['id']
agent_alive = agent['alive']
if not agent_alive:
LOG.info("Removing dead agent %(agent_id)s from network "
"%(network_id)s",
{'agent_id': agent_id, 'network_id': network_id})
remove_network_from_agent(network_id, agent_id)
else:
if agents_to_stay < MAX_AGENTS_PER_NETWORK:
LOG.debug("Agent %(agent_id)s will still handle DHCP for "
"network %(network_id)s",
{'agent_id': agent_id, 'network_id': network_id})
agents_to_stay += 1
else:
LOG.info("Removing agent %(agent_id)s from network "
"%(network_id)s",
{'agent_id': agent_id, 'network_id': network_id})
remove_network_from_agent(network_id, agent_id)
remove_reserved_dhcp_ports(network_id)
def remove_reserved_dhcp_ports(network_id):
"""Remove reserved_dhcp_ports from network
:param network_id: id of network to clean
"""
client = get_neutron_client()
try:
ports = client.list_ports(network_id=network_id,
device_id=RESERVED_DHCP_PORT)
except Exception as e:
LOG.error("Failed to get list of reserved dhcp ports in "
"network %(network_id)s; Error: %(err)s",
{'network_id': network_id, 'err': e})
return
for port in ports['ports']:
LOG.info("Delete port %(port_id)s from network %(net_id)s",
{'port_id': port['id'], 'net_id': network_id})
try:
client.delete_port(port['id'])
except Exception as e:
LOG.error("Failed to remove reserved dhcp port %(port_id)s "
"from network %(network_id)s; Error: %(err)s",
{'port_id': port['id'],
'network_id': network_id,
'err': e})
def balance_load_of_agents():
"""Main function to make balance of networks across DHCP agents
It gets number of all agents from Neutron API and list of network_ids
hanlded by each agent. Then it calculates how many networks should be
handled by agent so all networks will be handled by alive agent(s).
Finally it moves some networks from overloaded_agents to free_agents.
"""
live_dhcp_agents, dead_dhcp_agents = get_dhcp_agents()
dhcp_agents = dict(
list(live_dhcp_agents.items()) + list(dead_dhcp_agents.items())
)
number_of_networks_with_dhcp = get_number_of_networks_with_dhcp(
dhcp_agents)
number_of_live_dhcp_agents = len(live_dhcp_agents)
if number_of_live_dhcp_agents == 0:
LOG.error("No live DHCP agents found")
return
# DHCP slot is network assigned to agent
necessary_dhcp_slots = (
number_of_networks_with_dhcp * MAX_AGENTS_PER_NETWORK)
max_networks_per_agent = int(round(
float(necessary_dhcp_slots) / float(number_of_live_dhcp_agents)
))
overloaded_agents, full_agents, free_agents = split_agents(
live_dhcp_agents, max_networks_per_agent)
LOG.info("Overloaded agents: %s", overloaded_agents.keys())
LOG.info("Full agents: %s", full_agents.keys())
if len(free_agents) == 0:
LOG.info("No any free agents found")
return
LOG.info("Free agents: %s", free_agents.keys())
for overloaded_agent_id, networks in overloaded_agents.iteritems():
networks_to_move = get_networks_to_move(overloaded_agent_id,
max_networks_per_agent)
LOG.info("Networks to move from agent %(agent_id)s: "
"%(networks)s",
{'agent_id': overloaded_agent_id,
'networks': networks_to_move})
for network_id in networks_to_move:
if len(free_agents) == 0:
LOG.info("No any free agents found to move network %s",
network_id)
return
free_agents = move_network_to_new_agent(network_id,
overloaded_agent_id,
free_agents)
def get_dhcp_agents():
"""Get list of alive/dead DHCP agents and networks hosted by each agent
:return agents: dict with ids of alive agents as keys and list of ids of
networks hostsed by agent
:return agents: dict with ids of dead agents as keys and list of ids of
networks hosted by agent
"""
client = get_neutron_client()
live_agents = {}
dead_agents = {}
try:
agents = client.list_agents(agent_type=DHCP_AGENT_TYPE)
except Exception as e:
LOG.error("Failed to get list of agents; Error: %s", e)
return
for agent in agents.get("agents", []):
agent_networks = get_networks_on_agent(agent['id'])
if agent.get('alive') == True:
live_agents[agent['id']] = agent_networks
else:
dead_agents[agent['id']] = agent_networks
return live_agents, dead_agents
def get_networks_agents(number_of_workers):
"""Get list of networks with ids of DHCP agents which hosts DHCP for net
:return networks_agents: dict with network_id as key and list of ids of
DHCP agents which hosts this network as values
"""
client = get_neutron_client()
networks_agents = []
try:
networks = client.list_networks()
networks_ids = [network['id'] for network in networks['networks']]
except Exception as e:
LOG.error("Failed to get list of networks; Error: %s", e)
return
threads_pool = multiprocessing.Pool(processes=number_of_workers)
networks_agents = threads_pool.map(get_agents_handled_network,
networks_ids)
return dict(networks_agents)
def get_agents_handled_network(network_id):
"""Get list agents which handle network with given id
:param network_id: id of network for which agents should be found
:return: tuple with network_id as first value and list of ids of
DHCP agents which hosts this network as second value
"""
client = get_neutron_client()
try:
network_agents = client.list_dhcp_agent_hosting_networks(
network_id)['agents']
return (network_id, network_agents)
except Exception as e:
LOG.error("Failed to get list of DHCP agents for "
"network %(network_id)s; Error: %(err)s",
{'network_id': network_id, 'err': e})
return (network_id, None)
def get_networks_on_agent(agent_id):
"""Get list of networks hosted on DHCP agent
:param agent_id: id of agent to check
"""
client = get_neutron_client()
try:
networks = client.list_networks_on_dhcp_agent(agent_id)['networks']
return [network['id'] for network in networks]
except Exception as e:
LOG.error("Failed to get list of networks hosted by "
"agent %(agent_id)s; Error: %(err)s",
{'agent_id': agent_id, 'err': e})
return []
def add_network_to_agent(network_id, agent_id):
"""Set network to be hosted by DHCP agent
:param network_id: id of network which will be added to agent
:param agent_id: id of agent which will host DHCP for network
:return: True if network will be added to agent or Neutron will return
error that agent is already hosting this network
False if adding network to agent fails
"""
client = get_neutron_client()
LOG.debug("Adding network %(network_id)s to agent "
"%(agent_id)s",
{'network_id': network_id,
'agent_id': agent_id})
try:
client.add_network_to_dhcp_agent(
agent_id, {'network_id': network_id}
)
except neutronclient.common.exceptions.Conflict:
LOG.warning("Network %(network_id)s is already hosted by "
"agent %(agent_id)s",
{'network_id': network_id,
'agent_id': agent_id})
except Exception as e:
LOG.error("Failed to add network %(network_id)s to "
"agent %(agent_id); Error: %(err)s",
{'network_id': network_id,
'agent_id': agent_id,
'err': e})
return False
return True
def remove_network_from_agent(network_id, agent_id):
"""Remove network from DHCP agent
:param network_id: id of network which will be removed from agent
:param agent_id: id of agent which to remove
"""
client = get_neutron_client()
try:
client.remove_network_from_dhcp_agent(agent_id, network_id)
except Exception as e:
LOG.error("Failed to remove network %(network_id)s from "
"agent %(agent_id)s; Error: %(err)s",
{'network_id': network_id,
'agent_id': agent_id,
'err': e})
def get_networks_to_move(agent_id, max_networks_on_agent):
"""Get list of networks which should be moved to other DHCP agents
:param agent_id: id of agent from which networks should be moved
:param max_networks_on_agent: max number of networks which agent should
handle
"""
networks_on_agent = get_networks_on_agent(agent_id)
number_of_networks_to_move = len(networks_on_agent) - max_networks_on_agent
return random.sample(networks_on_agent, number_of_networks_to_move)
def move_network_to_new_agent(network_id, old_agent_id, agents):
"""Move network from one DHCP agent to another one
If adding to new agent will success then network will be also removed from
old agent.
:param network_id: id of network to move
:param old_agent_id: id of existing agent which handles network
:param: agents: list of agents from which new agent will be choosen
:return agents: list of agents with updated list of networks for agents
"""
attempt = 1
while attempt <= MAX_ATTEMPTS:
agent_id = random.choice(agents.keys())
add_network_result = add_network_to_agent(
network_id, agent_id)
if add_network_result:
agents[agent_id] = get_networks_on_agent(agent_id)
remove_network_from_agent(network_id, old_agent_id)
return agents
else:
attempt += 1
return agents
def get_number_of_networks_with_dhcp(agents):
"""Get overall number of networks handled by at least one dhcp agent
:param: dict with agents and networks handled by thoses agents
:return: number of unique networks hosted on dhcp agents
"""
networks = []
for agent_networks in agents.values():
networks += agent_networks
return len(set(networks))
def split_agents(agents, max_networks_on_agent):
"""Divide list of agents into groups: overloaded, full and free
Overloaded agent means that it hosts more networks than
max_networks_on_agent,
Full agent means that it hosts exactly max_networks_on_agent of networks,
Free agent means that it hosts fewer networks than max_networks_on_agent
so there is place for other networks on such agent
:param agents: dict with agent_ids and list of networks hosted by each
agent
:param max_networks_on_agent: max number of networks which can be hosted
on agent
:returns overloaded, full, free: dicts with agent_ids and lists of
networks handled by each agent
"""
overloaded = {}
full = {}
free = {}
for agent, networks in agents.iteritems():
agent_networks = len(networks)
if agent_networks > max_networks_on_agent:
overloaded[agent] = networks
elif agent_networks == max_networks_on_agent:
full[agent] = networks
else:
free[agent] = networks
return overloaded, full, free
if __name__ == '__main__':
global MAX_AGENTS_PER_NETWORK
args = parse_args()
# If debug is set to True then logger name is not set so root logger will
# be used and also messages from neutronclient will be logged
logger_name = None if args.debug else "dhcp_agents_balancer"
LOG = get_logger(args.verbose, args.debug, args.logfile, logger_name)
number_of_workers = args.workers or get_number_of_cores()
MAX_AGENTS_PER_NETWORK = args.max_agents_per_network
remove_unneccessary_agents(number_of_workers)
balance_load_of_agents()