OpenStack host maintenance and upgrade, carried out in interaction with the application running on top of it
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

631 lines
26 KiB

# Copyright (c) 2020 Nokia Corporation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from flask import Flask
from flask import request
import heatclient.client as heatclient
from heatclient.common.template_utils import get_template_contents
from heatclient import exc as heat_excecption
import json
from keystoneauth1 import loading
from keystoneclient import client as ks_client
import logging as lging
from neutronclient.v2_0 import client as neutronclient
import novaclient.client as novaclient
import os
from oslo_config import cfg
from oslo_log import log as logging
import requests
import sys
from threading import Thread
import time
import uuid
import yaml
try:
import fenix.utils.identity_auth as identity_auth
except ValueError:
sys.path.append('../utils')
import identity_auth
# Module-wide logger. A stdout stream handler is attached in addition to
# whatever handlers the logging configuration already provides.
LOG = logging.getLogger(__name__)
streamlog = lging.StreamHandler(sys.stdout)
LOG.logger.addHandler(streamlog)
LOG.logger.setLevel(logging.INFO)

# Options describing where the VNFM REST server is reached. VNFM.stop()
# builds the shutdown URL from both; VNFManager.run() listens on 'port'.
opts = [
    cfg.StrOpt('ip',
               default='127.0.0.1',
               help='the ip of VNFM',
               required=True),
    cfg.IntOpt('port',
               # An integer option gets an integer default instead of
               # relying on the string being coerced.
               default=12348,
               help='the port of VNFM',
               required=True),
]

CONF = cfg.CONF
CONF.register_opts(opts)
# Credentials options are defined by identity_auth and live in their own
# [service_user] group.
CONF.register_opts(identity_auth.os_opts, group='service_user')
class Stack(object):
    """Create, update and delete one Heat stack and wait for the result.

    The instance remembers the name, id, template, parameters and files of
    the stack it last created so that callers can update or delete it later.
    """

    def __init__(self, conf, log, project='demo'):
        """Authenticate against the given project and build a Heat client.

        :param conf: configuration object handed to identity_auth
        :param log: logger used for progress and error messages
        :param project: project name the stack is managed in
        """
        self.conf = conf
        self.log = log
        self.project = project
        self.auth = identity_auth.get_identity_auth(conf, project=self.project)
        self.session = identity_auth.get_session(self.auth)
        self.heat = heatclient.Client(version='1', session=self.session)
        self.stack_name = None
        self.stack_id = None
        self.template = None
        self.parameters = {}
        self.files = {}

    def get_hot_tpl(self, template_file):
        """Load a HOT template through heatclient's template utilities.

        :param template_file: path to the template file
        :returns: whatever ``get_template_contents`` returns; the caller in
                  this file unpacks it as ``files, template``
        :raises Exception: if ``template_file`` is not an existing file
        """
        # standard yaml.load will not work for hot tpl because the date
        # format in heat_template_version is not a string
        if not os.path.isfile(template_file):
            raise Exception('File(%s) does not exist' % template_file)
        return get_template_contents(template_file=template_file)

    def _wait_stack_action_complete(self, action):
        """Poll the stack until ``action`` leaves the IN_PROGRESS state.

        The stack named ``self.stack_name`` is polled every 2 seconds, at
        most 160 times.

        :param action: Heat action prefix, e.g. 'CREATE', 'UPDATE', 'DELETE'
        :raises Exception: on timeout, on ``<action>_FAILED``, on any other
                           unexpected status, or if the stack cannot be
                           fetched during a non-DELETE action
        """
        action_in_progress = '%s_IN_PROGRESS' % action
        action_complete = '%s_COMPLETE' % action
        action_failed = '%s_FAILED' % action

        status = action_in_progress
        stack_retries = 160
        while status == action_in_progress and stack_retries > 0:
            time.sleep(2)
            try:
                stack = self.heat.stacks.get(self.stack_name)
            except heat_excecption.HTTPNotFound:
                if action == 'DELETE':
                    # Might happen you never get status as stack deleted
                    status = action_complete
                    break
                else:
                    raise Exception('unable to get stack')
            status = stack.stack_status
            stack_retries = stack_retries - 1

        if stack_retries == 0 and status != action_complete:
            raise Exception("stack %s not completed within 5min, status:"
                            " %s" % (action, status))
        elif status == action_complete:
            self.log.info('stack %s %s' % (self.stack_name, status))
        elif status == action_failed:
            raise Exception("stack %s failed" % action)
        else:
            self.log.error('stack %s %s' % (self.stack_name, status))
            raise Exception("stack %s unknown result" % action)

    def wait_stack_delete(self):
        """Block until the stack delete has finished."""
        self._wait_stack_action_complete('DELETE')

    def wait_stack_create(self):
        """Block until the stack create has finished."""
        self._wait_stack_action_complete('CREATE')

    def wait_stack_update(self):
        """Block until the stack update has finished."""
        self._wait_stack_action_complete('UPDATE')

    def create(self, stack_name, template, parameters=None, files=None):
        """Create the stack and wait for completion, retrying once.

        If the first attempt does not complete, the stack is deleted and
        created a second time; a failure of the second attempt propagates.

        :param stack_name: name of the stack to create
        :param template: template content passed to Heat
        :param parameters: template parameters (defaults to an empty dict)
        :param files: template files (defaults to an empty dict)
        """
        # Fresh dicts per call: a shared mutable default would be stored on
        # the instance and later mutated by callers.
        parameters = {} if parameters is None else parameters
        files = {} if files is None else files

        self.stack_name = stack_name
        self.template = template
        self.parameters = parameters
        self.files = files
        stack = self.heat.stacks.create(stack_name=self.stack_name,
                                        files=files,
                                        template=template,
                                        parameters=parameters)
        self.stack_id = stack['stack']['id']
        try:
            self.wait_stack_create()
        except Exception:
            # It might not always work at first
            self.log.info('retry creating maintenance stack.......')
            self.delete()
            time.sleep(5)
            stack = self.heat.stacks.create(stack_name=self.stack_name,
                                            files=files,
                                            template=template,
                                            parameters=parameters)
            self.stack_id = stack['stack']['id']
            self.wait_stack_create()

    def update(self, stack_name, stack_id, template, parameters=None,
               files=None):
        """Update a stack and wait for the update to complete.

        :param stack_name: name of the stack to update
        :param stack_id: id of the stack to update
        :param template: template content passed to Heat
        :param parameters: template parameters (defaults to an empty dict)
        :param files: template files (defaults to an empty dict)
        """
        parameters = {} if parameters is None else parameters
        files = {} if files is None else files

        # The wait below polls self.stack_name, so track the stack that is
        # actually being updated rather than whatever was created earlier.
        self.stack_name = stack_name
        self.stack_id = stack_id
        self.heat.stacks.update(stack_name=stack_name,
                                stack_id=stack_id,
                                files=files,
                                template=template,
                                parameters=parameters)
        self.wait_stack_update()

    def delete(self):
        """Delete the tracked stack and wait; log and return if none."""
        if self.stack_id is not None:
            self.heat.stacks.delete(self.stack_name)
            self.wait_stack_delete()
        else:
            self.log.info('no stack to delete')
def get_identity_auth(conf, project=None, username=None, password=None):
    """Build a keystoneauth 'password' plugin from [service_user] options.

    Any of ``project``, ``username`` and ``password`` that is given (and
    truthy) overrides the corresponding ``conf.service_user`` value.

    :param conf: configuration object exposing a ``service_user`` group
    :param project: optional project name override
    :param username: optional user name override
    :param password: optional password override
    :returns: the plugin returned by the loader's ``load_from_options``
    """
    password_loader = loading.get_plugin_loader('password')
    options = dict(
        auth_url=conf.service_user.os_auth_url,
        username=(username or conf.service_user.os_username),
        password=(password or conf.service_user.os_password),
        user_domain_name=conf.service_user.os_user_domain_name,
        # The same project value is handed over under both names.
        project_name=(project or conf.service_user.os_project_name),
        tenant_name=(project or conf.service_user.os_project_name),
        project_domain_name=conf.service_user.os_project_domain_name,
    )
    return password_loader.load_from_options(**options)
class VNFM(object):
    """Owner of the VNFManager thread: starts it and tears it down."""

    def __init__(self, conf, log):
        """Remember configuration and logger; no manager exists yet."""
        self.conf, self.log = conf, log
        self.app = None

    def start(self):
        """Create the VNFManager and start its thread."""
        self.log.info('VNFM start...')
        self.app = VNFManager(self.conf, self.log)
        self.app.start()

    def stop(self):
        """Remove constraints, delete the stack and stop the REST server.

        Does nothing beyond logging when no manager was started.
        """
        self.log.info('VNFM stop...')
        if self.app:
            self.log.info('delete VNF constraints...')
            self.app.delete_constraints()
            self.log.info('VNF delete start...')
            self.app.stack.delete()
            # Ask the manager's own REST server to shut itself down.
            shutdown_url = 'http://%s:%d/shutdown' % (self.conf.ip,
                                                      self.conf.port)
            json_headers = {
                'Content-Type': 'application/json',
                'Accept': 'application/json',
            }
            requests.post(shutdown_url, data='', headers=json_headers)
class VNFManager(Thread):
    """Sample VNF manager reacting to maintenance notifications.

    On construction it creates the VNF through Heat, sorts the project's
    servers into HA and non-HA lists and pushes instance and instance
    group constraints to the maintenance service. When started as a
    thread it serves a small Flask application that receives maintenance
    alarms and replies to them.
    """

    def __init__(self, conf, log, project='demo'):
        """Create the VNF stack and publish its initial constraints.

        :param conf: configuration object (``port`` and the options used
                     by identity_auth are read from it)
        :param log: logger used by this manager
        :param project: project name the VNF is deployed in
        :raises Exception: if no external network or no maintenance
                           service is found, or if stack creation fails
        """
        Thread.__init__(self)
        self.conf = conf
        self.port = self.conf.port
        self.log = log
        self.intance_ids = None
        self.project = project
        self.auth = identity_auth.get_identity_auth(conf, project=self.project)
        self.session = identity_auth.get_session(auth=self.auth)
        self.keystone = ks_client.Client(version='v3', session=self.session)
        # Separate keystone client with the service credentials; it is only
        # used below for looking up the maintenance service endpoint.
        auth = get_identity_auth(conf,
                                 project='service',
                                 username='fenix',
                                 password='admin')
        session = identity_auth.get_session(auth=auth)
        keystone = ks_client.Client(version='v3', session=session)
        self.nova = novaclient.Client(version='2.34', session=self.session)
        self.neutron = neutronclient.Client(session=self.session)
        self.headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'}
        self.project_id = self.session.get_project_id()
        self.stack = Stack(self.conf, self.log, self.project)
        files, template = self.stack.get_hot_tpl('maintenance_hot_tpl.yaml')
        ext_net = self.get_external_network()
        parameters = {'ext_net': ext_net}
        self.log.info('creating VNF...')
        self.log.info('parameters: %s' % parameters)
        self.stack.create('%s_stack' % self.project,
                          template,
                          parameters=parameters,
                          files=files)
        self.headers['X-Auth-Token'] = self.session.get_token()
        self.orig_number_of_instances = self.number_of_instances()
        # List of instances
        self.ha_instances = []
        self.nonha_instances = []
        # Different instance_id specific constraints {instance_id: {},...}
        self.instance_constraints = None
        # Update existing instances to instance lists
        self.update_instances()
        nonha_instances = len(self.nonha_instances)
        if nonha_instances < 7:
            self.scale = 2
        else:
            self.scale = int((nonha_instances) / 2)
        self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' %
                      (nonha_instances, self.scale, nonha_instances - 1))
        # Different instance groups constraints dict
        self.ha_group = None
        self.nonha_group = None
        self.nonha_group_id = str(uuid.uuid4())
        self.ha_group_id = [sg.id for sg in self.nova.server_groups.list()
                            if sg.name == "%s_ha_app_group" % self.project][0]
        # Floating IP used in HA instance
        self.floating_ip = None
        # HA instance_id that is active / has floating IP.
        # NOTE: this rebinds the name on the instance. From here on
        # self.active_instance_id is the id value, not the method.
        self.active_instance_id = self.active_instance_id()

        maint_id = None
        for service in keystone.services.list():
            if service.type == 'maintenance':
                self.log.info('maintenance service: %s:%s type %s'
                              % (service.name, service.id, service.type))
                maint_id = service.id
        if maint_id is None:
            # Fail with a clear message instead of a NameError below.
            raise Exception('maintenance service not found')
        self.maint_endpoint = [ep.url for ep in keystone.endpoints.list()
                               if ep.service_id == maint_id and
                               ep.interface == 'public'][0]
        self.log.info('maintenance endpoint: %s' % self.maint_endpoint)
        self.update_constraints_lock = False
        self.update_constraints()

    def get_external_network(self):
        """Return the name of the first network flagged router:external.

        :raises Exception: if Neutron lists no external network
        """
        for network in self.neutron.list_networks()['networks']:
            if network['router:external']:
                return network['name']
        raise Exception("external network not defined")

    @staticmethod
    def _raise_on_error(ret):
        """Raise with the reply body unless the status is 200 or 204."""
        if ret.status_code not in (200, 204):
            raise Exception(ret.text)

    def delete_remote_instance_constraints(self, instance_id):
        """DELETE the constraints of one instance at the maintenance API."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
        self.log.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        self._raise_on_error(ret)

    def update_remote_instance_constraints(self, instance):
        """PUT one instance constraints dict to the maintenance API."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
        self.log.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance),
                           headers=self.headers)
        self._raise_on_error(ret)

    def delete_remote_group_constraints(self, instance_group):
        """DELETE one instance group's constraints at the maintenance API."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        self.log.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        self._raise_on_error(ret)

    def update_remote_group_constraints(self, instance_group):
        """PUT one instance group constraints dict to the maintenance API."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        self.log.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance_group),
                           headers=self.headers)
        self._raise_on_error(ret)

    def delete_constraints(self):
        """Delete all known instance and group constraints remotely."""
        self.headers['X-Auth-Token'] = self.session.get_token()
        for instance_id in self.instance_constraints:
            self.delete_remote_instance_constraints(instance_id)
        self.delete_remote_group_constraints(self.nonha_group)
        self.delete_remote_group_constraints(self.ha_group)

    def update_constraints(self):
        """Bring the remote constraints in line with the current instances.

        Concurrent callers are serialized through the boolean
        ``update_constraints_lock`` flag, polled once per second.
        """
        while self.update_constraints_lock:
            self.log.info('Waiting update_constraints_lock...')
            time.sleep(1)
        self.update_constraints_lock = True
        try:
            self._update_constraints_locked()
        finally:
            # Always release: a failed REST call must not leave the flag
            # set, otherwise every later caller would wait forever.
            self.update_constraints_lock = False

    def _instance_constraint(self, instance, group_id):
        """Build the constraints dict of one instance in the given group."""
        return {
            "instance_id": instance.id,
            "project_id": self.project_id,
            "group_id": group_id,
            "instance_name": instance.name,
            "max_interruption_time": 120,
            "migration_type": "MIGRATE",
            "resource_mitigation": True,
            "lead_time": 40}

    def _update_constraints_locked(self):
        """Compute constraints and push the differences to the remote API."""
        self.log.info('Update constraints')

        # Nova does not support grouping instances that do not belong to
        # anti-affinity server_groups. Anyhow all instances need grouping
        max_impacted_members = len(self.nonha_instances) - 1
        nonha_group = {
            "group_id": self.nonha_group_id,
            "project_id": self.project_id,
            "group_name": "%s_nonha_app_group" % self.project,
            "anti_affinity_group": False,
            "max_instances_per_host": 0,
            "max_impacted_members": max_impacted_members,
            "recovery_time": 2,
            "resource_mitigation": True}
        self.log.info('create %s_nonha_app_group constraints: %s'
                      % (self.project, nonha_group))

        ha_group = {
            "group_id": self.ha_group_id,
            "project_id": self.project_id,
            "group_name": "%s_ha_app_group" % self.project,
            "anti_affinity_group": True,
            "max_instances_per_host": 1,
            "max_impacted_members": 1,
            "recovery_time": 4,
            "resource_mitigation": True}
        self.log.info('create %s_ha_app_group constraints: %s'
                      % (self.project, ha_group))

        # Groups are only sent when they differ from what was sent last.
        if not self.ha_group or self.ha_group != ha_group:
            self.log.info('ha instance group need update')
            self.update_remote_group_constraints(ha_group)
            self.ha_group = ha_group.copy()
        if not self.nonha_group or self.nonha_group != nonha_group:
            self.log.info('nonha instance group need update')
            self.update_remote_group_constraints(nonha_group)
            self.nonha_group = nonha_group.copy()

        instance_constraints = {}
        for ha_instance in self.ha_instances:
            instance = self._instance_constraint(ha_instance,
                                                 ha_group["group_id"])
            self.log.info('create ha instance constraints: %s'
                          % instance)
            instance_constraints[ha_instance.id] = instance
        for nonha_instance in self.nonha_instances:
            instance = self._instance_constraint(nonha_instance,
                                                 nonha_group["group_id"])
            self.log.info('create nonha instance constraints: %s'
                          % instance)
            instance_constraints[nonha_instance.id] = instance

        if not self.instance_constraints:
            # Initial instance constraints
            self.log.info('create initial instances constraints...')
            for instance in instance_constraints.values():
                self.update_remote_instance_constraints(instance)
            self.instance_constraints = instance_constraints.copy()
            return

        self.log.info('check instances constraints changes...')
        previous = self.instance_constraints
        added = [i for i in instance_constraints if i not in previous]
        deleted = [i for i in previous if i not in instance_constraints]
        modified = [i for i in instance_constraints
                    if i in previous and
                    instance_constraints[i] != previous[i]]
        for instance_id in deleted:
            self.delete_remote_instance_constraints(instance_id)
        updated = added + modified
        for instance_id in updated:
            self.update_remote_instance_constraints(
                instance_constraints[instance_id])
        if updated or deleted:
            # Some instance constraints have changed
            self.instance_constraints = instance_constraints.copy()

    def active_instance_id(self):
        """Return the id of the HA instance that holds a floating IP.

        The first floating address seen is remembered in
        ``self.floating_ip``. The instance list is refreshed and the search
        retried up to 5 times, 2 seconds apart.

        :raises Exception: if no HA instance has a floating IP
        """
        # Need retry as it takes time after heat template done before
        # Floating IP in place
        retry = 5
        while retry > 0:
            for instance in self.ha_instances:
                # Only the first network of the instance is inspected.
                network_interfaces = next(iter(instance.addresses.values()))
                for network_interface in network_interfaces:
                    _type = network_interface.get('OS-EXT-IPS:type')
                    if _type == "floating":
                        if not self.floating_ip:
                            self.floating_ip = network_interface.get('addr')
                        self.log.debug('active_instance: %s %s' %
                                       (instance.name, instance.id))
                        return instance.id
            time.sleep(2)
            self.update_instances()
            retry -= 1
        raise Exception("No active instance found")

    def switch_over_ha_instance(self):
        """Move the floating IP to the first non-active HA instance."""
        for instance in self.ha_instances:
            if instance.id != self.active_instance_id:
                self.log.info('Switch over to: %s %s' % (instance.name,
                                                         instance.id))
                # Deprecated, need to use neutron instead
                # instance.add_floating_ip(self.floating_ip)
                port = self.neutron.list_ports(
                    device_id=instance.id)['ports'][0]['id']
                floating_id = self.neutron.list_floatingips(
                    floating_ip_address=self.floating_ip
                )['floatingips'][0]['id']
                self.neutron.update_floatingip(
                    floating_id, {'floatingip': {'port_id': port}})
                # Have to update ha_instances as floating_ip changed
                self.update_instances()
                self.active_instance_id = instance.id
                break

    def get_instance_ids(self):
        """Return the ids of all servers Nova lists for this session."""
        return [instance.id
                for instance in self.nova.servers.list(detailed=False)]

    def update_instances(self):
        """Refresh the HA and non-HA instance lists from Nova by name."""
        instances = self.nova.servers.list(detailed=True)
        self.ha_instances = [i for i in instances
                             if "%s_ha_app_" % self.project in i.name]
        self.nonha_instances = [i for i in instances
                                if "%s_nonha_app_" % self.project in i.name]

    def _alarm_data_decoder(self, data):
        """Turn a stringified list or dict trait value back into an object.

        Values containing neither '[' nor '{' are returned unchanged.
        """
        if "[" in data or "{" in data:
            # string to list or dict removing unicode.
            # safe_load: the value arrives over HTTP and must not be able
            # to construct arbitrary objects; it also works on PyYAML
            # releases where yaml.load() requires an explicit Loader.
            data = yaml.safe_load(data.replace("u'", "'"))
        return data

    def _alarm_traits_decoder(self, data):
        """Map each event trait name to its decoded value."""
        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
                for t in data['reason_data']['event']['traits']})

    def get_session_instance_ids(self, url, session_id):
        """GET the instance ids of a maintenance session from ``url``.

        ``session_id`` is accepted but not used by this implementation.

        :raises Exception: if the reply status is not 200
        """
        ret = requests.get(url, data=None, headers=self.headers)
        if ret.status_code != 200:
            raise Exception(ret.text)
        self.log.info('get_instance_ids %s' % ret.json())
        return ret.json()['instance_ids']

    def scale_instances(self, number_of_instances):
        """Scale the non-HA instances by ``number_of_instances``.

        Updates the Heat stack, refreshes the instance lists and the
        constraints, then verifies the resulting instance count.

        :param number_of_instances: delta to apply; negative scales in
        :raises Exception: if the resulting count is not the expected one
        """
        # number_of_instances_before = self.number_of_instances()
        number_of_instances_before = len(self.nonha_instances)
        parameters = self.stack.parameters
        parameters['nonha_intances'] = (number_of_instances_before +
                                        number_of_instances)
        self.stack.update(self.stack.stack_name,
                          self.stack.stack_id,
                          self.stack.template,
                          parameters=parameters,
                          files=self.stack.files)

        # number_of_instances_after = self.number_of_instances()
        self.update_instances()
        self.update_constraints()
        number_of_instances_after = len(self.nonha_instances)
        if (number_of_instances_before + number_of_instances !=
                number_of_instances_after):
            self.log.error('scale_instances with: %d from: %d ends up to: %d'
                           % (number_of_instances, number_of_instances_before,
                              number_of_instances_after))
            raise Exception('scale_instances failed')

        self.log.info('scaled nonha_intances from %d to %d' %
                      (number_of_instances_before,
                       number_of_instances_after))

    def number_of_instances(self):
        """Return how many servers Nova lists for this session."""
        return len(self.nova.servers.list(detailed=False))

    def run(self):
        """Thread body: serve the maintenance and shutdown REST endpoints."""
        app = Flask('VNFM')

        @app.route('/maintenance', methods=['POST'])
        def maintenance_alarm():
            data = json.loads(request.data.decode('utf8'))
            try:
                payload = self._alarm_traits_decoder(data)
            except Exception:
                payload = ({t[0]: t[2] for t in
                            data['reason_data']['event']['traits']})
                self.log.error('cannot parse alarm data: %s' % payload)
                raise Exception('VNFM cannot parse alarm.'
                                'Possibly trait data over 256 char')

            self.log.info('VNFM received data = %s' % payload)

            state = payload['state']
            reply_state = None
            reply = dict()

            self.log.info('VNFM state: %s' % state)

            if state == 'MAINTENANCE':
                instance_ids = (self.get_session_instance_ids(
                                payload['instance_ids'],
                                payload['session_id']))
                my_instance_ids = self.get_instance_ids()
                invalid_instances = (
                    [instance_id for instance_id in instance_ids
                     if instance_id not in my_instance_ids])
                if invalid_instances:
                    self.log.error('Invalid instances: %s' % invalid_instances)
                    reply_state = 'NACK_MAINTENANCE'
                else:
                    reply_state = 'ACK_MAINTENANCE'

            elif state == 'SCALE_IN':
                # scale down "self.scale" instances that is VCPUS equaling
                # at least a single compute node
                self.scale_instances(-self.scale)
                reply_state = 'ACK_SCALE_IN'

            elif state == 'MAINTENANCE_COMPLETE':
                # possibly need to upscale
                self.scale_instances(self.scale)
                reply_state = 'ACK_MAINTENANCE_COMPLETE'

            elif state in ('PREPARE_MAINTENANCE', 'PLANNED_MAINTENANCE'):
                # Both states are handled identically; only the ACK differs.
                # TBD from constraints
                if "MIGRATE" not in payload['allowed_actions']:
                    raise Exception('MIGRATE not supported')
                instance_ids = payload['instance_ids'][0]
                self.log.info('VNFM got instance: %s' % instance_ids)
                if instance_ids == self.active_instance_id:
                    self.switch_over_ha_instance()
                # optional also in constraints
                reply['instance_action'] = "MIGRATE"
                reply_state = 'ACK_%s' % state

            elif state == 'INSTANCE_ACTION_DONE':
                # TBD was action done in allowed window
                self.log.info('%s' % payload['instance_ids'])
            else:
                raise Exception('VNFM received event with'
                                ' unknown state %s' % state)

            if reply_state:
                self.headers['X-Auth-Token'] = self.session.get_token()
                reply['state'] = reply_state
                url = payload['reply_url']
                self.log.info('VNFM reply: %s' % reply)
                requests.put(url, data=json.dumps(reply), headers=self.headers)

            return 'OK'

        @app.route('/shutdown', methods=['POST'])
        def shutdown():
            self.log.info('shutdown VNFM server at %s' % time.time())
            # NOTE(review): this hook is only present when served by the
            # Werkzeug development server; confirm that the deployed
            # Werkzeug release still provides it.
            func = request.environ.get('werkzeug.server.shutdown')
            if func is None:
                raise RuntimeError('Not running with the Werkzeug Server')
            func()
            return 'VNFM shutting down...'

        app.run(host="0.0.0.0", port=self.port)
if __name__ == '__main__':
    # Script entry point: bring the VNFM up, then idle in the main thread
    # until the user interrupts or something unexpected is raised.
    vnfm = VNFM(CONF, LOG)
    vnfm.start()
    try:
        LOG.info('Press CTRL + C to quit')
        while True:
            time.sleep(2)
    except KeyboardInterrupt:
        # Orderly teardown: constraints, stack and REST server.
        vnfm.stop()
    except Exception:
        # Anything else: only the stack is removed.
        vnfm.app.stack.delete()