OpenStack host maintenance and upgrade in interaction with application on top of it
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

561 lines
24 KiB

# Copyright (c) 2020 Nokia Corporation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import aodhclient.client as aodhclient
import datetime
from flask import Flask
from flask import request
import json
from keystoneauth1 import loading
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
import logging as lging
from oslo_config import cfg
from oslo_log import log as logging
import requests
import sys
from threading import Thread
import time
import yaml
try:
    import fenix.utils.identity_auth as identity_auth
except ImportError:
    # A missing module raises ImportError (ModuleNotFoundError), not
    # ValueError — catching ValueError here would never trigger this
    # fallback.  Fall back to a local copy of identity_auth when not
    # running inside the Fenix source tree.
    sys.path.append('../utils')
    import identity_auth
# Module logger; mirror oslo.log output to stdout so the demo is easy to
# follow from the console it was started in.
LOG = logging.getLogger(__name__)
_stdout_handler = lging.StreamHandler(sys.stdout)
LOG.logger.addHandler(_stdout_handler)
LOG.logger.setLevel(logging.INFO)
# Command-line / config-file options for the VNFM HTTP endpoint that
# receives AODH maintenance alarms.
opts = [
    cfg.StrOpt('ip',
               default='127.0.0.1',
               help='the ip of VNFM',
               required=True),
    cfg.IntOpt('port',
               # An IntOpt default must be an int; the original string
               # '12348' defeats oslo.config's type checking.
               default=12348,
               help='the port of VNFM',
               required=True),
]

CONF = cfg.CONF
CONF.register_opts(opts)
# OS_* service-user credentials (auth_url, username, ...) under the
# [service_user] group, as declared by identity_auth.
CONF.register_opts(identity_auth.os_opts, group='service_user')
def get_identity_auth(conf, project=None, username=None, password=None):
    """Build a keystone 'password' auth plugin from [service_user] config.

    Any of project/username/password may be overridden per call; values
    left as None fall back to the configured service user.
    """
    su = conf.service_user
    loader = loading.get_plugin_loader('password')
    return loader.load_from_options(
        auth_url=su.os_auth_url,
        username=(username or su.os_username),
        password=(password or su.os_password),
        user_domain_name=su.os_user_domain_name,
        project_name=(project or su.os_project_name),
        tenant_name=(project or su.os_project_name),
        project_domain_name=su.os_project_domain_name)
class VNFM(object):
    """Lifecycle wrapper that owns the VNFManager worker thread."""

    def __init__(self, conf, log):
        self.conf = conf
        self.log = log
        # Created lazily in start(); stop() is a no-op until then.
        self.app = None

    def start(self):
        """Spawn the VNFManager thread serving maintenance events."""
        LOG.info('VNFM start......')
        self.app = VNFManager(self.conf, self.log)
        self.app.start()

    def stop(self):
        """Remove remote constraints and shut down the HTTP server."""
        LOG.info('VNFM stop......')
        if self.app is None:
            return
        # Refresh the token before the teardown REST calls.
        self.app.headers['X-Auth-Token'] = self.app.session.get_token()
        self.app.delete_constraints()
        shutdown_url = 'http://%s:%d/shutdown' % (self.conf.ip,
                                                  self.conf.port)
        requests.post(shutdown_url, data='',
                      headers={'Content-Type': 'application/json',
                               'Accept': 'application/json'})
class VNFManager(Thread):
    """Demo Kubernetes VNFM for the Fenix rolling-maintenance workflow.

    Subscribes to 'maintenance.scheduled' events via an AODH event alarm,
    answers each workflow state over the Fenix REST API, and keeps the
    demo pods (namespace 'demo', ReplicaSets 'demo-ha' / 'demo-nonha')
    and their Fenix instance/group constraints in sync.
    """

    def __init__(self, conf, log):
        Thread.__init__(self)
        self.conf = conf
        self.log = log
        self.port = self.conf.port
        self.instance_ids = None  # fixed typo: was 'intance_ids'
        # VNFM is started with OS_* exported as admin user.
        # We need that to query the Fenix endpoint url.
        # Still we work with our own tenant/project/vnf as "demo".
        self.project = "demo"
        LOG.info('VNFM project: %s' % self.project)
        self.auth = identity_auth.get_identity_auth(conf, project=self.project)
        self.session = identity_auth.get_session(auth=self.auth)
        self.ks = ks_client.Client(version='v3', session=self.session)
        self.aodh = aodhclient.Client(2, self.session)
        # Subscribe to maintenance event alarm from Fenix via AODH.
        self.create_alarm()
        config.load_kube_config()
        self.kaapi = client.AppsV1Api()
        self.kapi = client.CoreV1Api()
        self.headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'}
        self.headers['X-Auth-Token'] = self.session.get_token()
        self.orig_number_of_instances = self.number_of_instances()
        # Lists of pod objects, refreshed by update_instances().
        self.ha_instances = []
        self.nonha_instances = []
        # Per-instance constraints {instance_id: {...}, ...}
        self.instance_constraints = None
        self.update_instances()
        # How many instances need to exist (with current VNF load);
        # max_impacted_members needs to be updated accordingly if the
        # number of instances is scaled. Example for demo-ha:
        # max_impacted_members = len(self.ha_instances) - ha_group_limit
        self.ha_group_limit = 2
        self.nonha_group_limit = 2
        # Group constraint dicts last sent to Fenix.
        self.ha_group = None
        self.nonha_group = None
        # The Fenix endpoint lookup needs the fenix service user in the
        # 'service' project, not the demo tenant.
        auth = get_identity_auth(conf,
                                 project='service',
                                 username='fenix',
                                 password='admin')
        session = identity_auth.get_session(auth=auth)
        keystone = ks_client.Client(version='v3', session=session)
        # VNF project_id (VNF ID)
        self.project_id = self.session.get_project_id()
        # HA instance that is active carries the "active" label; the
        # method labels it and returns its uid.
        self.active_instance_id = self.active_instance_id()
        services = keystone.services.list()
        for service in services:
            if service.type == 'maintenance':
                LOG.info('maintenance service: %s:%s type %s'
                         % (service.name, service.id, service.type))
                maint_id = service.id
        self.maint_endpoint = [ep.url for ep in keystone.endpoints.list()
                               if ep.service_id == maint_id and
                               ep.interface == 'public'][0]
        LOG.info('maintenance endpoint: %s' % self.maint_endpoint)
        self.update_constraints_lock = False
        self.update_constraints()
        # Instances waiting for an action to be done, keyed by uid.
        self.pending_actions = {}

    def create_alarm(self):
        """Create the AODH event alarm for maintenance.scheduled (once)."""
        alarms = {alarm['name']: alarm for alarm in self.aodh.alarm.list()}
        alarm_name = "%s_MAINTENANCE_ALARM" % self.project
        if alarm_name in alarms:
            return
        alarm_request = dict(
            name=alarm_name,
            description=alarm_name,
            enabled=True,
            # AODH will POST the alarm payload to our /maintenance route.
            alarm_actions=[u'http://%s:%d/maintenance'
                           % (self.conf.ip,
                              self.conf.port)],
            repeat_actions=True,
            severity=u'moderate',
            type=u'event',
            event_rule=dict(event_type=u'maintenance.scheduled'))
        self.aodh.alarm.create(alarm_request)

    def delete_remote_instance_constraints(self, instance_id):
        """DELETE one instance constraint from Fenix; 404 is tolerated."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
        LOG.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            if ret.status_code == 404:
                LOG.info('Already deleted: %s' % instance_id)
            else:
                raise Exception(ret.text)

    def update_remote_instance_constraints(self, instance):
        """PUT one instance constraint dict to Fenix."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
        LOG.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance),
                           headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def delete_remote_group_constraints(self, instance_group):
        """DELETE one instance-group constraint from Fenix."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        LOG.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def update_remote_group_constraints(self, instance_group):
        """PUT one instance-group constraint dict to Fenix."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        LOG.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance_group),
                           headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def delete_constraints(self):
        """Remove all instance and group constraints from Fenix."""
        for instance_id in self.instance_constraints:
            self.delete_remote_instance_constraints(instance_id)
        self.delete_remote_group_constraints(self.nonha_group)
        self.delete_remote_group_constraints(self.ha_group)

    def update_constraints(self):
        """Push current group and per-instance constraints to Fenix.

        Serialized with a busy-wait flag since both the alarm handler and
        __init__ may call this; the flag is now released in a finally
        block so a failing REST call can no longer leave it stuck True.
        """
        while self.update_constraints_lock:
            LOG.info('Waiting update_constraints_lock...')
            time.sleep(1)
        self.update_constraints_lock = True
        try:
            self._update_constraints()
        finally:
            self.update_constraints_lock = False

    def _update_constraints(self):
        """Build constraint dicts and sync only the changed ones to Fenix."""
        LOG.info('Update constraints')
        # Pods are grouped by ReplicaSet, so we use its uid as group_id.
        rs = {r.metadata.name: r.metadata.uid for r in
              self.kaapi.list_namespaced_replica_set('demo').items}
        # Keep at least one nonha instance alive at any time.
        max_impacted_members = len(self.nonha_instances) - 1
        nonha_group = {
            "group_id": rs['demo-nonha'],
            "project_id": self.project_id,
            "group_name": "demo-nonha",
            "anti_affinity_group": False,
            "max_instances_per_host": 0,
            "max_impacted_members": max_impacted_members,
            "recovery_time": 10,
            "resource_mitigation": True}
        LOG.info('create demo-nonha constraints: %s'
                 % nonha_group)
        ha_group = {
            "group_id": rs['demo-ha'],
            "project_id": self.project_id,
            "group_name": "demo-ha",
            "anti_affinity_group": True,
            "max_instances_per_host": 1,
            "max_impacted_members": 1,
            "recovery_time": 10,
            "resource_mitigation": True}
        LOG.info('create demo-ha constraints: %s'
                 % ha_group)
        if not self.ha_group or self.ha_group != ha_group:
            LOG.info('ha instance group need update')
            self.update_remote_group_constraints(ha_group)
            self.ha_group = ha_group.copy()
        if not self.nonha_group or self.nonha_group != nonha_group:
            LOG.info('nonha instance group need update')
            self.update_remote_group_constraints(nonha_group)
            self.nonha_group = nonha_group.copy()
        instance_constraints = {}
        for ha_instance in self.ha_instances:
            instance = {
                "instance_id": ha_instance.metadata.uid,
                "project_id": self.project_id,
                "group_id": ha_group["group_id"],
                "instance_name": ha_instance.metadata.name,
                "max_interruption_time": 120,
                "migration_type": "EVICTION",
                "resource_mitigation": True,
                "lead_time": 40}
            LOG.info('create ha instance constraints: %s' % instance)
            instance_constraints[ha_instance.metadata.uid] = instance
        for nonha_instance in self.nonha_instances:
            instance = {
                "instance_id": nonha_instance.metadata.uid,
                "project_id": self.project_id,
                "group_id": nonha_group["group_id"],
                "instance_name": nonha_instance.metadata.name,
                "max_interruption_time": 120,
                "migration_type": "EVICTION",
                "resource_mitigation": True,
                "lead_time": 40}
            LOG.info('create nonha instance constraints: %s' % instance)
            instance_constraints[nonha_instance.metadata.uid] = instance
        if not self.instance_constraints:
            # Initial instance constraints: push everything.
            LOG.info('create initial instances constraints...')
            for instance in [instance_constraints[i] for i
                             in instance_constraints]:
                self.update_remote_instance_constraints(instance)
            self.instance_constraints = instance_constraints.copy()
        else:
            # Diff against what was last sent and sync only the delta.
            LOG.info('check instances constraints changes...')
            added = [i for i in instance_constraints.keys()
                     if i not in self.instance_constraints]
            deleted = [i for i in self.instance_constraints.keys()
                       if i not in instance_constraints]
            modified = [i for i in instance_constraints.keys()
                        if (i not in added and i not in deleted and
                            instance_constraints[i] !=
                            self.instance_constraints[i])]
            for instance_id in deleted:
                self.delete_remote_instance_constraints(instance_id)
            updated = added + modified
            for instance in [instance_constraints[i] for i in updated]:
                self.update_remote_instance_constraints(instance)
            if updated or deleted:
                # Some instance constraints have changed.
                self.instance_constraints = instance_constraints.copy()

    def active_instance_id(self):
        """Label the first HA pod as active and return its uid.

        __init__ rebinds self.active_instance_id to the returned uid, so
        this method is effectively called once.  The original assigned
        the uid to self.active_instance_id and implicitly returned None,
        which the caller then stored — leaving the attribute None.
        """
        # We dictate which instance is active in the beginning.
        instance = self.ha_instances[0]
        LOG.info('Initially Active instance: %s %s' %
                 (instance.metadata.name, instance.metadata.uid))
        name = instance.metadata.name
        namespace = instance.metadata.namespace
        body = {"metadata": {"labels": {"active": "True"}}}
        self.kapi.patch_namespaced_pod(name, namespace, body)
        return instance.metadata.uid

    def switch_over_ha_instance(self, instance_id):
        """Swap active/standby labels if instance_id is the active HA pod."""
        if instance_id == self.active_instance_id:
            # Need to switch over as instance_id is active and will be
            # affected by the maintenance action.
            for instance in self.ha_instances:
                name = instance.metadata.name
                namespace = instance.metadata.namespace
                if instance_id == instance.metadata.uid:
                    LOG.info('Active to Standby: %s %s' %
                             (instance.metadata.name, instance.metadata.uid))
                    # A None label value removes the label.  The original
                    # used client.UNKNOWN_BASE_TYPE(), a generated-doc
                    # placeholder that does not exist in the kubernetes
                    # client, and the label was misspelled "ative".
                    body = {"metadata": {"labels": {"active": None}}}
                    self.kapi.patch_namespaced_pod(name, namespace, body)
                else:
                    LOG.info('Standby to Active: %s %s' %
                             (instance.metadata.name, instance.metadata.uid))
                    body = {"metadata": {"labels": {"active": "True"}}}
                    self.kapi.patch_namespaced_pod(name, namespace, body)
                    self.active_instance_id = instance.metadata.uid
            self.update_instances()

    def get_instance_ids(self):
        """Return uids of all demo-* pods in the 'demo' namespace."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        return [i.metadata.uid for i in instances
                if i.metadata.name.startswith("demo-") and
                i.metadata.namespace == "demo"]

    def update_instances(self):
        """Refresh ha_instances / nonha_instances pod lists."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        self.ha_instances = [i for i in instances
                             if i.metadata.name.startswith("demo-ha") and
                             i.metadata.namespace == "demo"]
        self.nonha_instances = [i for i in instances
                                if i.metadata.name.startswith("demo-nonha") and
                                i.metadata.namespace == "demo"]

    def _alarm_data_decoder(self, data):
        """Decode a trait value; lists/dicts arrive as stringified python."""
        if "[" in data or "{" in data:
            # String to list or dict, removing unicode markers.
            # safe_load: alarm traits are external input; yaml.load
            # without a Loader is deprecated and can construct arbitrary
            # python objects.
            data = yaml.safe_load(data.replace("u'", "'"))
        return data

    def _alarm_traits_decoder(self, data):
        """Turn AODH reason_data event traits into a {name: value} dict."""
        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
                 for t in data['reason_data']['event']['traits']})

    def get_session_instance_ids(self, url, session_id):
        """GET the maintenance session's project instance ids from Fenix.

        session_id is kept for interface compatibility; the url already
        identifies the session.
        """
        ret = requests.get(url, data=None, headers=self.headers)
        if ret.status_code != 200:
            raise Exception(ret.text)
        LOG.info('get_instance_ids %s' % ret.json())
        return ret.json()['instance_ids']

    def scale_instances(self, scale_instances):
        """Scale demo-nonha by scale_instances (negative scales down).

        Raises if the pod count has not changed within ~23 seconds.
        """
        number_of_instances_before = len(self.nonha_instances)
        replicas = number_of_instances_before + scale_instances
        # We only scale nonha apps.
        namespace = "demo"
        name = "demo-nonha"
        body = {'spec': {"replicas": replicas}}
        self.kaapi.patch_namespaced_replica_set_scale(name, namespace, body)
        time.sleep(3)
        # Let's check if the scale has taken effect.
        self.update_instances()
        number_of_instances_after = len(self.nonha_instances)
        check = 20
        while number_of_instances_after == number_of_instances_before:
            if check == 0:
                LOG.error('scale_instances with: %d failed, still %d instances'
                          % (scale_instances, number_of_instances_after))
                raise Exception('scale_instances failed')
            check -= 1
            time.sleep(1)
            self.update_instances()
            number_of_instances_after = len(self.nonha_instances)
        LOG.info('scaled instances from %d to %d' %
                 (number_of_instances_before, number_of_instances_after))

    def number_of_instances(self):
        """Return the count of all demo-* pods (any namespace)."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        return len([i for i in instances
                    if i.metadata.name.startswith("demo-")])

    def instance_action(self, instance_id, allowed_actions):
        """Pick the action for instance_id from Fenix allowed_actions.

        We should keep the instance constraint in our internal structure
        and match instance_id specific allowed action.  For now we assume
        EVICTION; returns None when it is not offered.
        """
        if 'EVICTION' not in allowed_actions:
            LOG.error('Action for %s not found from %s' %
                      (instance_id, allowed_actions))
            return None
        return 'EVICTION'

    def instance_action_started(self, instance_id, action):
        """Record the action start time to police max_interruption_time."""
        time_now = datetime.datetime.utcnow()
        max_interruption_time = (
            self.instance_constraints[instance_id]['max_interruption_time'])
        self.pending_actions[instance_id] = {
            'started': time_now,
            'max_interruption_time': max_interruption_time,
            'action': action}

    def was_instance_action_in_time(self, instance_id):
        """Verify the pending action completed within its time budget.

        Raises when max_interruption_time was exceeded; always removes
        the pending entry on success.
        """
        time_now = datetime.datetime.utcnow()
        started = self.pending_actions[instance_id]['started']
        limit = self.pending_actions[instance_id]['max_interruption_time']
        action = self.pending_actions[instance_id]['action']
        td = time_now - started
        if td.total_seconds() > limit:
            LOG.error('%s %s took too long: %ds' %
                      (instance_id, action, td.total_seconds()))
            LOG.error('%s max_interruption_time %ds might be too short' %
                      (instance_id, limit))
            raise Exception('%s %s took too long: %ds' %
                            (instance_id, action, td.total_seconds()))
        else:
            LOG.info('%s %s with recovery time took %ds' %
                     (instance_id, action, td.total_seconds()))
        del self.pending_actions[instance_id]

    def run(self):
        """Serve the /maintenance (alarm) and /shutdown HTTP endpoints."""
        app = Flask('VNFM')

        @app.route('/maintenance', methods=['POST'])
        def maintenance_alarm():
            data = json.loads(request.data.decode('utf8'))
            try:
                payload = self._alarm_traits_decoder(data)
            except Exception:
                payload = ({t[0]: t[2] for t in
                            data['reason_data']['event']['traits']})
                LOG.error('cannot parse alarm data: %s' % payload)
                raise Exception('VNFM cannot parse alarm.'
                                'Possibly trait data over 256 char')
            LOG.info('VNFM received data = %s' % payload)
            state = payload['state']
            reply_state = None
            reply = dict()
            LOG.info('VNFM state: %s' % state)
            if state == 'MAINTENANCE':
                self.headers['X-Auth-Token'] = self.session.get_token()
                instance_ids = (self.get_session_instance_ids(
                    payload['instance_ids'],
                    payload['session_id']))
                reply['instance_ids'] = instance_ids
                reply_state = 'ACK_MAINTENANCE'
            elif state == 'SCALE_IN':
                # Scale down only nonha instances.  Integer division:
                # '/' in Python 3 would produce a float replica count.
                nonha_instances = len(self.nonha_instances)
                scale_in = nonha_instances // 2
                self.scale_instances(-scale_in)
                self.update_constraints()
                reply['instance_ids'] = self.get_instance_ids()
                reply_state = 'ACK_SCALE_IN'
            elif state == 'MAINTENANCE_COMPLETE':
                # Possibly need to scale back up to the original count.
                number_of_instances = self.number_of_instances()
                if self.orig_number_of_instances > number_of_instances:
                    scale_instances = (self.orig_number_of_instances -
                                       number_of_instances)
                    self.scale_instances(scale_instances)
                self.update_constraints()
                reply_state = 'ACK_MAINTENANCE_COMPLETE'
            elif (state == 'PREPARE_MAINTENANCE' or
                  state == 'PLANNED_MAINTENANCE'):
                instance_id = payload['instance_ids'][0]
                instance_action = (self.instance_action(instance_id,
                                   payload['allowed_actions']))
                if not instance_action:
                    raise Exception('Allowed_actions not supported for %s' %
                                    instance_id)
                LOG.info('VNFM got instance: %s' % instance_id)
                self.switch_over_ha_instance(instance_id)
                reply['instance_action'] = instance_action
                reply_state = 'ACK_%s' % state
                self.instance_action_started(instance_id, instance_action)
            elif state == 'INSTANCE_ACTION_DONE':
                # TBD was action done in max_interruption_time (live migration)
                # NOTE, in EVICTION the reported instance_id was in the
                # evicted node. The new instance_id might be different.
                LOG.info('%s' % payload['instance_ids'])
                self.was_instance_action_in_time(payload['instance_ids'][0])
                self.update_instances()
                self.update_constraints()
            else:
                raise Exception('VNFM received event with'
                                ' unknown state %s' % state)
            if reply_state:
                reply['session_id'] = payload['session_id']
                reply['state'] = reply_state
                url = payload['reply_url']
                LOG.info('VNFM reply: %s' % reply)
                requests.put(url, data=json.dumps(reply), headers=self.headers)
            return 'OK'

        @app.route('/shutdown', methods=['POST'])
        def shutdown():
            LOG.info('shutdown VNFM server at %s' % time.time())
            func = request.environ.get('werkzeug.server.shutdown')
            if func is None:
                raise RuntimeError('Not running with the Werkzeug Server')
            func()
            return 'VNFM shutting down...'

        app.run(host="0.0.0.0", port=self.port)
if __name__ == '__main__':
    # Run the VNFM until the operator interrupts with CTRL + C.
    vnfm = VNFM(CONF, LOG)
    vnfm.start()
    try:
        LOG.info('Press CTRL + C to quit')
        while True:
            time.sleep(2)
    except KeyboardInterrupt:
        vnfm.stop()