OpenStack host maintenance and upgrade in interaction with application on top of it
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

561 lines
24 KiB

# Copyright (c) 2020 Nokia Corporation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import aodhclient.client as aodhclient
import datetime
from flask import Flask
from flask import request
import json
from keystoneauth1 import loading
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
import logging as lging
from oslo_config import cfg
from oslo_log import log as logging
import requests
import sys
from threading import Thread
import time
import yaml
try:
    import fenix.utils.identity_auth as identity_auth
except ImportError:
    # A missing module raises ImportError (ModuleNotFoundError), not
    # ValueError — catching ValueError here would never trigger this
    # fallback.  Fall back to a local copy of identity_auth when not
    # running inside the Fenix source tree.
    sys.path.append('../utils')
    import identity_auth
# Module logger; mirror oslo.log output to stdout so the demo is easy to
# follow from the console it was started in.
LOG = logging.getLogger(__name__)
_stdout_handler = lging.StreamHandler(sys.stdout)
LOG.logger.addHandler(_stdout_handler)
LOG.logger.setLevel(logging.INFO)
# Command-line / config-file options for the VNFM HTTP endpoint that
# receives AODH maintenance alarms.
opts = [
    cfg.StrOpt('ip',
               default='127.0.0.1',
               help='the ip of VNFM',
               required=True),
    cfg.IntOpt('port',
               # An IntOpt default must be an int; the original string
               # '12348' defeats oslo.config's type checking.
               default=12348,
               help='the port of VNFM',
               required=True),
]

CONF = cfg.CONF
CONF.register_opts(opts)
# OS_* service-user credentials (auth_url, username, ...) under the
# [service_user] group, as declared by identity_auth.
CONF.register_opts(identity_auth.os_opts, group='service_user')
def get_identity_auth(conf, project=None, username=None, password=None):
    """Build a keystone 'password' auth plugin from [service_user] config.

    Any of project/username/password may be overridden per call; values
    left as None fall back to the configured service user.
    """
    su = conf.service_user
    loader = loading.get_plugin_loader('password')
    return loader.load_from_options(
        auth_url=su.os_auth_url,
        username=(username or su.os_username),
        password=(password or su.os_password),
        user_domain_name=su.os_user_domain_name,
        project_name=(project or su.os_project_name),
        tenant_name=(project or su.os_project_name),
        project_domain_name=su.os_project_domain_name)
class VNFM(object):
    """Lifecycle wrapper that owns the VNFManager worker thread."""

    def __init__(self, conf, log):
        self.conf = conf
        self.log = log
        # Created lazily in start(); stop() is a no-op until then.
        self.app = None

    def start(self):
        """Spawn the VNFManager thread serving maintenance events."""
        LOG.info('VNFM start......')
        self.app = VNFManager(self.conf, self.log)
        self.app.start()

    def stop(self):
        """Remove remote constraints and shut down the HTTP server."""
        LOG.info('VNFM stop......')
        if self.app is None:
            return
        # Refresh the token before the teardown REST calls.
        self.app.headers['X-Auth-Token'] = self.app.session.get_token()
        self.app.delete_constraints()
        shutdown_url = 'http://%s:%d/shutdown' % (self.conf.ip,
                                                  self.conf.port)
        requests.post(shutdown_url, data='',
                      headers={'Content-Type': 'application/json',
                               'Accept': 'application/json'})
class VNFManager(Thread):
    """Demo Kubernetes VNFM for the Fenix rolling-maintenance workflow.

    Subscribes to 'maintenance.scheduled' events via an AODH event alarm,
    answers each workflow state over the Fenix REST API, and keeps the
    demo pods (namespace 'demo', ReplicaSets 'demo-ha' / 'demo-nonha')
    and their Fenix instance/group constraints in sync.
    """

    def __init__(self, conf, log):
        Thread.__init__(self)
        self.conf = conf
        self.log = log
        self.port = self.conf.port
        self.instance_ids = None  # fixed typo: was 'intance_ids'
        # VNFM is started with OS_* exported as admin user.
        # We need that to query the Fenix endpoint url.
        # Still we work with our own tenant/project/vnf as "demo".
        self.project = "demo"
        LOG.info('VNFM project: %s' % self.project)
        self.auth = identity_auth.get_identity_auth(conf, project=self.project)
        self.session = identity_auth.get_session(auth=self.auth)
        self.ks = ks_client.Client(version='v3', session=self.session)
        self.aodh = aodhclient.Client(2, self.session)
        # Subscribe to maintenance event alarm from Fenix via AODH.
        self.create_alarm()
        config.load_kube_config()
        self.kaapi = client.AppsV1Api()
        self.kapi = client.CoreV1Api()
        self.headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'}
        self.headers['X-Auth-Token'] = self.session.get_token()
        self.orig_number_of_instances = self.number_of_instances()
        # Lists of pod objects, refreshed by update_instances().
        self.ha_instances = []
        self.nonha_instances = []
        # Per-instance constraints {instance_id: {...}, ...}
        self.instance_constraints = None
        self.update_instances()
        # How many instances need to exist (with current VNF load);
        # max_impacted_members needs to be updated accordingly if the
        # number of instances is scaled. Example for demo-ha:
        # max_impacted_members = len(self.ha_instances) - ha_group_limit
        self.ha_group_limit = 2
        self.nonha_group_limit = 2
        # Group constraint dicts last sent to Fenix.
        self.ha_group = None
        self.nonha_group = None
        # The Fenix endpoint lookup needs the fenix service user in the
        # 'service' project, not the demo tenant.
        auth = get_identity_auth(conf,
                                 project='service',
                                 username='fenix',
                                 password='admin')
        session = identity_auth.get_session(auth=auth)
        keystone = ks_client.Client(version='v3', session=session)
        # VNF project_id (VNF ID)
        self.project_id = self.session.get_project_id()
        # HA instance that is active carries the "active" label; the
        # method labels it and returns its uid.
        self.active_instance_id = self.active_instance_id()
        services = keystone.services.list()
        for service in services:
            if service.type == 'maintenance':
                LOG.info('maintenance service: %s:%s type %s'
                         % (service.name, service.id, service.type))
                maint_id = service.id
        self.maint_endpoint = [ep.url for ep in keystone.endpoints.list()
                               if ep.service_id == maint_id and
                               ep.interface == 'public'][0]
        LOG.info('maintenance endpoint: %s' % self.maint_endpoint)
        self.update_constraints_lock = False
        self.update_constraints()
        # Instances waiting for an action to be done, keyed by uid.
        self.pending_actions = {}

    def create_alarm(self):
        """Create the AODH event alarm for maintenance.scheduled (once)."""
        alarms = {alarm['name']: alarm for alarm in self.aodh.alarm.list()}
        alarm_name = "%s_MAINTENANCE_ALARM" % self.project
        if alarm_name in alarms:
            return
        alarm_request = dict(
            name=alarm_name,
            description=alarm_name,
            enabled=True,
            # AODH will POST the alarm payload to our /maintenance route.
            alarm_actions=[u'http://%s:%d/maintenance'
                           % (self.conf.ip,
                              self.conf.port)],
            repeat_actions=True,
            severity=u'moderate',
            type=u'event',
            event_rule=dict(event_type=u'maintenance.scheduled'))
        self.aodh.alarm.create(alarm_request)

    def delete_remote_instance_constraints(self, instance_id):
        """DELETE one instance constraint from Fenix; 404 is tolerated."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
        LOG.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            if ret.status_code == 404:
                LOG.info('Already deleted: %s' % instance_id)
            else:
                raise Exception(ret.text)

    def update_remote_instance_constraints(self, instance):
        """PUT one instance constraint dict to Fenix."""
        url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
        LOG.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance),
                           headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def delete_remote_group_constraints(self, instance_group):
        """DELETE one instance-group constraint from Fenix."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        LOG.info('DELETE: %s' % url)
        ret = requests.delete(url, data=None, headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def update_remote_group_constraints(self, instance_group):
        """PUT one instance-group constraint dict to Fenix."""
        url = "%s/instance_group/%s" % (self.maint_endpoint,
                                        instance_group["group_id"])
        LOG.info('PUT: %s' % url)
        ret = requests.put(url, data=json.dumps(instance_group),
                           headers=self.headers)
        if ret.status_code != 200 and ret.status_code != 204:
            raise Exception(ret.text)

    def delete_constraints(self):
        """Remove all instance and group constraints from Fenix."""
        for instance_id in self.instance_constraints:
            self.delete_remote_instance_constraints(instance_id)
        self.delete_remote_group_constraints(self.nonha_group)
        self.delete_remote_group_constraints(self.ha_group)

    def update_constraints(self):
        """Push current group and per-instance constraints to Fenix.

        Serialized with a busy-wait flag since both the alarm handler and
        __init__ may call this; the flag is now released in a finally
        block so a failing REST call can no longer leave it stuck True.
        """
        while self.update_constraints_lock:
            LOG.info('Waiting update_constraints_lock...')
            time.sleep(1)
        self.update_constraints_lock = True
        try:
            self._update_constraints()
        finally:
            self.update_constraints_lock = False

    def _update_constraints(self):
        """Build constraint dicts and sync only the changed ones to Fenix."""
        LOG.info('Update constraints')
        # Pods are grouped by ReplicaSet, so we use its uid as group_id.
        rs = {r.metadata.name: r.metadata.uid for r in
              self.kaapi.list_namespaced_replica_set('demo').items}
        # Keep at least one nonha instance alive at any time.
        max_impacted_members = len(self.nonha_instances) - 1
        nonha_group = {
            "group_id": rs['demo-nonha'],
            "project_id": self.project_id,
            "group_name": "demo-nonha",
            "anti_affinity_group": False,
            "max_instances_per_host": 0,
            "max_impacted_members": max_impacted_members,
            "recovery_time": 10,
            "resource_mitigation": True}
        LOG.info('create demo-nonha constraints: %s'
                 % nonha_group)
        ha_group = {
            "group_id": rs['demo-ha'],
            "project_id": self.project_id,
            "group_name": "demo-ha",
            "anti_affinity_group": True,
            "max_instances_per_host": 1,
            "max_impacted_members": 1,
            "recovery_time": 10,
            "resource_mitigation": True}
        LOG.info('create demo-ha constraints: %s'
                 % ha_group)
        if not self.ha_group or self.ha_group != ha_group:
            LOG.info('ha instance group need update')
            self.update_remote_group_constraints(ha_group)
            self.ha_group = ha_group.copy()
        if not self.nonha_group or self.nonha_group != nonha_group:
            LOG.info('nonha instance group need update')
            self.update_remote_group_constraints(nonha_group)
            self.nonha_group = nonha_group.copy()
        instance_constraints = {}
        for ha_instance in self.ha_instances:
            instance = {
                "instance_id": ha_instance.metadata.uid,
                "project_id": self.project_id,
                "group_id": ha_group["group_id"],
                "instance_name": ha_instance.metadata.name,
                "max_interruption_time": 120,
                "migration_type": "EVICTION",
                "resource_mitigation": True,
                "lead_time": 40}
            LOG.info('create ha instance constraints: %s' % instance)
            instance_constraints[ha_instance.metadata.uid] = instance
        for nonha_instance in self.nonha_instances:
            instance = {
                "instance_id": nonha_instance.metadata.uid,
                "project_id": self.project_id,
                "group_id": nonha_group["group_id"],
                "instance_name": nonha_instance.metadata.name,
                "max_interruption_time": 120,
                "migration_type": "EVICTION",
                "resource_mitigation": True,
                "lead_time": 40}
            LOG.info('create nonha instance constraints: %s' % instance)
            instance_constraints[nonha_instance.metadata.uid] = instance
        if not self.instance_constraints:
            # Initial instance constraints: push everything.
            LOG.info('create initial instances constraints...')
            for instance in [instance_constraints[i] for i
                             in instance_constraints]:
                self.update_remote_instance_constraints(instance)
            self.instance_constraints = instance_constraints.copy()
        else:
            # Diff against what was last sent and sync only the delta.
            LOG.info('check instances constraints changes...')
            added = [i for i in instance_constraints.keys()
                     if i not in self.instance_constraints]
            deleted = [i for i in self.instance_constraints.keys()
                       if i not in instance_constraints]
            modified = [i for i in instance_constraints.keys()
                        if (i not in added and i not in deleted and
                            instance_constraints[i] !=
                            self.instance_constraints[i])]
            for instance_id in deleted:
                self.delete_remote_instance_constraints(instance_id)
            updated = added + modified
            for instance in [instance_constraints[i] for i in updated]:
                self.update_remote_instance_constraints(instance)
            if updated or deleted:
                # Some instance constraints have changed.
                self.instance_constraints = instance_constraints.copy()

    def active_instance_id(self):
        """Label the first HA pod as active and return its uid.

        __init__ rebinds self.active_instance_id to the returned uid, so
        this method is effectively called once.  The original assigned
        the uid to self.active_instance_id and implicitly returned None,
        which the caller then stored — leaving the attribute None.
        """
        # We dictate which instance is active in the beginning.
        instance = self.ha_instances[0]
        LOG.info('Initially Active instance: %s %s' %
                 (instance.metadata.name, instance.metadata.uid))
        name = instance.metadata.name
        namespace = instance.metadata.namespace
        body = {"metadata": {"labels": {"active": "True"}}}
        self.kapi.patch_namespaced_pod(name, namespace, body)
        return instance.metadata.uid

    def switch_over_ha_instance(self, instance_id):
        """Swap active/standby labels if instance_id is the active HA pod."""
        if instance_id == self.active_instance_id:
            # Need to switch over as instance_id is active and will be
            # affected by the maintenance action.
            for instance in self.ha_instances:
                name = instance.metadata.name
                namespace = instance.metadata.namespace
                if instance_id == instance.metadata.uid:
                    LOG.info('Active to Standby: %s %s' %
                             (instance.metadata.name, instance.metadata.uid))
                    # A None label value removes the label.  The original
                    # used client.UNKNOWN_BASE_TYPE(), a generated-doc
                    # placeholder that does not exist in the kubernetes
                    # client, and the label was misspelled "ative".
                    body = {"metadata": {"labels": {"active": None}}}
                    self.kapi.patch_namespaced_pod(name, namespace, body)
                else:
                    LOG.info('Standby to Active: %s %s' %
                             (instance.metadata.name, instance.metadata.uid))
                    body = {"metadata": {"labels": {"active": "True"}}}
                    self.kapi.patch_namespaced_pod(name, namespace, body)
                    self.active_instance_id = instance.metadata.uid
            self.update_instances()

    def get_instance_ids(self):
        """Return uids of all demo-* pods in the 'demo' namespace."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        return [i.metadata.uid for i in instances
                if i.metadata.name.startswith("demo-") and
                i.metadata.namespace == "demo"]

    def update_instances(self):
        """Refresh ha_instances / nonha_instances pod lists."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        self.ha_instances = [i for i in instances
                             if i.metadata.name.startswith("demo-ha") and
                             i.metadata.namespace == "demo"]
        self.nonha_instances = [i for i in instances
                                if i.metadata.name.startswith("demo-nonha") and
                                i.metadata.namespace == "demo"]

    def _alarm_data_decoder(self, data):
        """Decode a trait value; lists/dicts arrive as stringified python."""
        if "[" in data or "{" in data:
            # String to list or dict, removing unicode markers.
            # safe_load: alarm traits are external input; yaml.load
            # without a Loader is deprecated and can construct arbitrary
            # python objects.
            data = yaml.safe_load(data.replace("u'", "'"))
        return data

    def _alarm_traits_decoder(self, data):
        """Turn AODH reason_data event traits into a {name: value} dict."""
        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
                 for t in data['reason_data']['event']['traits']})

    def get_session_instance_ids(self, url, session_id):
        """GET the maintenance session's project instance ids from Fenix.

        session_id is kept for interface compatibility; the url already
        identifies the session.
        """
        ret = requests.get(url, data=None, headers=self.headers)
        if ret.status_code != 200:
            raise Exception(ret.text)
        LOG.info('get_instance_ids %s' % ret.json())
        return ret.json()['instance_ids']

    def scale_instances(self, scale_instances):
        """Scale demo-nonha by scale_instances (negative scales down).

        Raises if the pod count has not changed within ~23 seconds.
        """
        number_of_instances_before = len(self.nonha_instances)
        replicas = number_of_instances_before + scale_instances
        # We only scale nonha apps.
        namespace = "demo"
        name = "demo-nonha"
        body = {'spec': {"replicas": replicas}}
        self.kaapi.patch_namespaced_replica_set_scale(name, namespace, body)
        time.sleep(3)
        # Let's check if the scale has taken effect.
        self.update_instances()
        number_of_instances_after = len(self.nonha_instances)
        check = 20
        while number_of_instances_after == number_of_instances_before:
            if check == 0:
                LOG.error('scale_instances with: %d failed, still %d instances'
                          % (scale_instances, number_of_instances_after))
                raise Exception('scale_instances failed')
            check -= 1
            time.sleep(1)
            self.update_instances()
            number_of_instances_after = len(self.nonha_instances)
        LOG.info('scaled instances from %d to %d' %
                 (number_of_instances_before, number_of_instances_after))

    def number_of_instances(self):
        """Return the count of all demo-* pods (any namespace)."""
        instances = self.kapi.list_pod_for_all_namespaces().items
        return len([i for i in instances
                    if i.metadata.name.startswith("demo-")])

    def instance_action(self, instance_id, allowed_actions):
        """Pick the action for instance_id from Fenix allowed_actions.

        We should keep the instance constraint in our internal structure
        and match instance_id specific allowed action.  For now we assume
        EVICTION; returns None when it is not offered.
        """
        if 'EVICTION' not in allowed_actions:
            LOG.error('Action for %s not found from %s' %
                      (instance_id, allowed_actions))
            return None
        return 'EVICTION'

    def instance_action_started(self, instance_id, action):
        """Record the action start time to police max_interruption_time."""
        time_now = datetime.datetime.utcnow()
        max_interruption_time = (
            self.instance_constraints[instance_id]['max_interruption_time'])
        self.pending_actions[instance_id] = {
            'started': time_now,
            'max_interruption_time': max_interruption_time,
            'action': action}

    def was_instance_action_in_time(self, instance_id):
        """Verify the pending action completed within its time budget.

        Raises when max_interruption_time was exceeded; always removes
        the pending entry on success.
        """
        time_now = datetime.datetime.utcnow()
        started = self.pending_actions[instance_id]['started']
        limit = self.pending_actions[instance_id]['max_interruption_time']
        action = self.pending_actions[instance_id]['action']
        td = time_now - started
        if td.total_seconds() > limit:
            LOG.error('%s %s took too long: %ds' %
                      (instance_id, action, td.total_seconds()))
            LOG.error('%s max_interruption_time %ds might be too short' %
                      (instance_id, limit))
            raise Exception('%s %s took too long: %ds' %
                            (instance_id, action, td.total_seconds()))
        else:
            LOG.info('%s %s with recovery time took %ds' %
                     (instance_id, action, td.total_seconds()))
        del self.pending_actions[instance_id]

    def run(self):
        """Serve the /maintenance (alarm) and /shutdown HTTP endpoints."""
        app = Flask('VNFM')

        @app.route('/maintenance', methods=['POST'])
        def maintenance_alarm():
            data = json.loads(request.data.decode('utf8'))
            try:
                payload = self._alarm_traits_decoder(data)
            except Exception:
                payload = ({t[0]: t[2] for t in
                            data['reason_data']['event']['traits']})
                LOG.error('cannot parse alarm data: %s' % payload)
                raise Exception('VNFM cannot parse alarm.'
                                'Possibly trait data over 256 char')
            LOG.info('VNFM received data = %s' % payload)
            state = payload['state']
            reply_state = None
            reply = dict()
            LOG.info('VNFM state: %s' % state)
            if state == 'MAINTENANCE':
                self.headers['X-Auth-Token'] = self.session.get_token()
                instance_ids = (self.get_session_instance_ids(
                    payload['instance_ids'],
                    payload['session_id']))
                reply['instance_ids'] = instance_ids
                reply_state = 'ACK_MAINTENANCE'
            elif state == 'SCALE_IN':
                # Scale down only nonha instances.  Integer division:
                # '/' in Python 3 would produce a float replica count.
                nonha_instances = len(self.nonha_instances)
                scale_in = nonha_instances // 2
                self.scale_instances(-scale_in)
                self.update_constraints()
                reply['instance_ids'] = self.get_instance_ids()
                reply_state = 'ACK_SCALE_IN'
            elif state == 'MAINTENANCE_COMPLETE':
                # Possibly need to scale back up to the original count.
                number_of_instances = self.number_of_instances()
                if self.orig_number_of_instances > number_of_instances:
                    scale_instances = (self.orig_number_of_instances -
                                       number_of_instances)
                    self.scale_instances(scale_instances)
                self.update_constraints()
                reply_state = 'ACK_MAINTENANCE_COMPLETE'
            elif (state == 'PREPARE_MAINTENANCE' or
                  state == 'PLANNED_MAINTENANCE'):
                instance_id = payload['instance_ids'][0]
                instance_action = (self.instance_action(instance_id,
                                   payload['allowed_actions']))
                if not instance_action:
                    raise Exception('Allowed_actions not supported for %s' %
                                    instance_id)
                LOG.info('VNFM got instance: %s' % instance_id)
                self.switch_over_ha_instance(instance_id)
                reply['instance_action'] = instance_action
                reply_state = 'ACK_%s' % state
                self.instance_action_started(instance_id, instance_action)
            elif state == 'INSTANCE_ACTION_DONE':
                # TBD was action done in max_interruption_time (live migration)
                # NOTE, in EVICTION the reported instance_id was in the
                # evicted node. The new instance_id might be different.
                LOG.info('%s' % payload['instance_ids'])
                self.was_instance_action_in_time(payload['instance_ids'][0])
                self.update_instances()
                self.update_constraints()
            else:
                raise Exception('VNFM received event with'
                                ' unknown state %s' % state)
            if reply_state:
                reply['session_id'] = payload['session_id']
                reply['state'] = reply_state
                url = payload['reply_url']
                LOG.info('VNFM reply: %s' % reply)
                requests.put(url, data=json.dumps(reply), headers=self.headers)
            return 'OK'

        @app.route('/shutdown', methods=['POST'])
        def shutdown():
            LOG.info('shutdown VNFM server at %s' % time.time())
            func = request.environ.get('werkzeug.server.shutdown')
            if func is None:
                raise RuntimeError('Not running with the Werkzeug Server')
            func()
            return 'VNFM shutting down...'

        app.run(host="0.0.0.0", port=self.port)
if __name__ == '__main__':
    # Run the VNFM until the operator interrupts with CTRL + C.
    vnfm = VNFM(CONF, LOG)
    vnfm.start()
    try:
        LOG.info('Press CTRL + C to quit')
        while True:
            time.sleep(2)
    except KeyboardInterrupt:
        vnfm.stop()