# Copyright 2015 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import datetime
import time
import timeit

from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import excutils
import sqlalchemy
from stevedore import driver as stevedore_driver

from octavia.common import constants
from octavia.common import stats
from octavia.controller.healthmanager.health_drivers import update_base
from octavia.db import api as db_api
from octavia.db import repositories as repo

CONF = cfg.CONF
LOG = logging.getLogger(__name__)


class UpdateHealthDb(update_base.HealthUpdateBase):
    def __init__(self):
        super(UpdateHealthDb, self).__init__()
        # Set up the repositories we need: amphora, amphora health,
        # listener, load balancer, member (node) and pool.
        self.amphora_repo = repo.AmphoraRepository()
        self.amphora_health_repo = repo.AmphoraHealthRepository()
        self.listener_repo = repo.ListenerRepository()
        self.loadbalancer_repo = repo.LoadBalancerRepository()
        self.member_repo = repo.MemberRepository()
        self.pool_repo = repo.PoolRepository()

    def _update_status(self, session, repo, entity_type,
                       entity_id, new_op_status, old_op_status):
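        # NOTE: the 'message' dict below only records the neutron-lbaas
        # style operating status; nothing in this method sends it anywhere.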
        message = {}
        if old_op_status.lower() != new_op_status.lower():
            LOG.debug("%s %s status has changed from %s to "
                      "%s, updating db.",
                      entity_type, entity_id, old_op_status,
                      new_op_status)
            repo.update(session, entity_id, operating_status=new_op_status)
            # Map the status for neutron-lbaas compatibility
            if new_op_status == constants.DRAINING:
                new_op_status = constants.ONLINE
            message.update({constants.OPERATING_STATUS: new_op_status})

    def update_health(self, health, srcaddr):
        # The executor will eat any exceptions from the update_health code
        # so we need to wrap it and log the unhandled exception
        start_time = timeit.default_timer()
        try:
            self._update_health(health, srcaddr)
        except Exception as e:
            LOG.exception('Health update for amphora %(amp)s encountered '
                          'error %(err)s. Skipping health update.',
                          {'amp': health['id'], 'err': e})
        # TODO(johnsom) We need to set a warning threshold here
        LOG.debug('Health Update finished in: %s seconds',
                  timeit.default_timer() - start_time)

    # Health heartbeat messages that pre-date heartbeat versioning, sent by
    # amphorae with UDP listeners, need the expected listener count adjusted.
    # This is for backward compatibility with Rocky pre-versioning
    # heartbeat amphorae.
    def _update_listener_count_for_UDP(self, session, db_lb,
                                       expected_listener_count):
        # The amphora agent does not send health for a UDP listener until
        # the listener's default_pool has its first enabled member. Until
        # then, find those UDP listeners and exclude them from the
        # expected_listener_count.
        for list_id, list_db in db_lb.get('listeners', {}).items():
            need_remove = False
            if list_db['protocol'] == constants.PROTOCOL_UDP:
                listener = self.listener_repo.get(session, id=list_id)
                enabled_members = ([member
                                    for member in
                                    listener.default_pool.members
                                    if member.enabled]
                                   if listener.default_pool else [])
                if listener.default_pool:
                    if not listener.default_pool.members:
                        need_remove = True
                    elif not enabled_members:
                        need_remove = True
                else:
                    need_remove = True

                if need_remove:
                    expected_listener_count = expected_listener_count - 1
        return expected_listener_count

    def _update_health(self, health, srcaddr):
        """Update the database based on the reported amphora health.

        :param health: dict that contains amphora, listener, member info
        :type health: dict
        :returns: None

        The input v1 health data structure is shown below::

            health = {
                "id": self.FAKE_UUID_1,
                "listeners": {
                    "listener-id-1": {
                        "status": constants.OPEN,
                        "pools": {
                            "pool-id-1": {
                                "status": constants.UP,
                                "members": {"member-id-1": constants.ONLINE}
                            }
                        }
                    }
                }
            }

        Example V2 message::

            {"id": "<amphora_id>",
             "seq": 67,
             "listeners": {
               "<listener_id>": {
                 "status": "OPEN",
                 "stats": {
                   "tx": 0,
                   "rx": 0,
                   "conns": 0,
                   "totconns": 0,
                   "ereq": 0
                 }
               }
             },
             "pools": {
               "<pool_id>:<listener_id>": {
                 "status": "UP",
                 "members": {
                   "<member_id>": "no check"
                 }
               }
             },
             "ver": 2
            }

        """
        session = db_api.get_session()

        # We need to see if all of the listeners are reporting in
        db_lb = self.amphora_repo.get_lb_for_health_update(session,
                                                           health['id'])
        ignore_listener_count = False

        if db_lb:
            expected_listener_count = 0
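            # While the load balancer is in a PENDING provisioning state,
            # or while it is administratively disabled, listeners may be in
            # flux, so skip the expected listener count check below.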
            if ('PENDING' in db_lb['provisioning_status'] or
                    not db_lb['enabled']):
                ignore_listener_count = True
            else:
                for key, listener in db_lb.get('listeners', {}).items():
                    # disabled listeners don't report from the amphora
                    if listener['enabled']:
                        expected_listener_count += 1

                # If this is a heartbeat older than versioning, handle
                # UDP special for backward compatibility.
                if 'ver' not in health:
                    udp_listeners = [
                        l for k, l in db_lb.get('listeners', {}).items()
                        if l['protocol'] == constants.PROTOCOL_UDP]
                    if udp_listeners:
                        expected_listener_count = (
                            self._update_listener_count_for_UDP(
                                session, db_lb, expected_listener_count))
        else:
            # If this is not a spare amp, log and skip it.
            amp = self.amphora_repo.get(session, id=health['id'])
            if not amp or amp.load_balancer_id:
                # This is debug and not warning because this can happen under
                # normal deleting operations.
                LOG.debug('Received a health heartbeat from amphora %s with '
                          'IP %s that should not exist. This amphora may be '
                          'in the process of being deleted, in which case you '
                          'will only see this message a few '
                          'times', health['id'], srcaddr)
                if not amp:
                    LOG.warning('The amphora %s with IP %s is missing from '
                                'the DB, so it cannot be automatically '
                                'deleted (the compute_id is unknown). An '
                                'operator must manually delete it from the '
                                'compute service.', health['id'], srcaddr)
                    return
                # Delete the orphaned amphora right here via the configured
                # compute driver.
                try:
                    compute = stevedore_driver.DriverManager(
                        namespace='octavia.compute.drivers',
                        name=CONF.controller_worker.compute_driver,
                        invoke_on_load=True
                    ).driver
                    compute.delete(amp.compute_id)
                    return
                except Exception as e:
                    LOG.info("Error deleting amp %s with IP %s. Error: %s",
                             health['id'], srcaddr, e)
            expected_listener_count = 0

        listeners = health['listeners']

        # Do not update amphora health if the reporting listener count
        # does not match the expected listener count
        if len(listeners) == expected_listener_count or ignore_listener_count:

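            # Use a separate session with autocommit disabled so that the
            # amphora health record replacement below can be committed or
            # rolled back as a single unit.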
            lock_session = db_api.get_session(autocommit=False)

            # if we're running too far behind, warn and bail
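            # 'recv_time' was stamped when the heartbeat was received, so
            # this delay includes any time the message spent queued before
            # this update ran.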
            proc_delay = time.time() - health['recv_time']
            hb_interval = CONF.health_manager.heartbeat_interval
            # TODO(johnsom) We need to set a warning threshold here, and
            #               escalate to critical when it reaches the
            #               heartbeat_interval
            if proc_delay >= hb_interval:
                LOG.warning('Amphora %(id)s health message was processed too '
                            'slowly: %(delay)ss! The system may be overloaded '
                            'or otherwise malfunctioning. This heartbeat has '
                            'been ignored and no update was made to the '
                            'amphora health entry. THIS IS NOT GOOD.',
                            {'id': health['id'], 'delay': proc_delay})
                return

            # if the input amphora is healthy, we update its db info
            try:
                self.amphora_health_repo.replace(
                    lock_session, health['id'],
                    last_update=(datetime.datetime.utcnow()))
                lock_session.commit()
            except Exception:
                with excutils.save_and_reraise_exception():
                    lock_session.rollback()
        else:
            LOG.warning('Amphora %(id)s health message reports %(found)i '
                        'listeners when %(expected)i expected',
                        {'id': health['id'], 'found': len(listeners),
                         'expected': expected_listener_count})

        # Don't try to update status for spares pool amphora
        if not db_lb:
            return

        processed_pools = []
        potential_offline_pools = {}

        # We got a heartbeat so lb is healthy until proven otherwise
        if db_lb[constants.ENABLED] is False:
            lb_status = constants.OFFLINE
        else:
            lb_status = constants.ONLINE

        health_msg_version = health.get('ver', 0)

        for listener_id in db_lb.get(constants.LISTENERS, {}):
            db_listener = db_lb[constants.LISTENERS][listener_id]
            db_op_status = db_listener[constants.OPERATING_STATUS]
            listener_status = None
            listener = None

            if listener_id not in listeners:
                if (db_listener[constants.ENABLED] and
                        db_lb[constants.PROVISIONING_STATUS] ==
                        constants.ACTIVE):
                    listener_status = constants.ERROR
                else:
                    listener_status = constants.OFFLINE
            else:
                listener = listeners[listener_id]

                # OPEN = HAProxy listener status nbconn < maxconn
                if listener.get('status') == constants.OPEN:
                    listener_status = constants.ONLINE
                # FULL = HAProxy listener status not nbconn < maxconn
                elif listener.get('status') == constants.FULL:
                    listener_status = constants.DEGRADED
                    if lb_status == constants.ONLINE:
                        lb_status = constants.DEGRADED
                else:
                    LOG.warning(('Listener %(list)s reported status of '
                                 '%(status)s'),
                                {'list': listener_id,
                                 'status': listener.get('status')})

            try:
                if (listener_status is not None and
                        listener_status != db_op_status):
                    self._update_status(
                        session, self.listener_repo, constants.LISTENER,
                        listener_id, listener_status, db_op_status)
            except sqlalchemy.orm.exc.NoResultFound:
                LOG.error("Listener %s is not in DB", listener_id)

            if not listener:
                continue

            if health_msg_version < 2:
                raw_pools = listener['pools']

                # normalize the pool IDs. Single process listener pools
                # have the listener id appended with an ':' separator.
                # Old multi-process listener pools only have a pool ID.
                # This makes sure the keys are only pool IDs.
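                # (When there is no ':' in the key, rfind() returns -1, so
                # the slice only drops the space appended below and the key
                # is left unchanged.)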
                pools = {(k + ' ')[:k.rfind(':')]: v for k, v in
                         raw_pools.items()}

                for db_pool_id in db_lb.get('pools', {}):
                    # If we saw this pool already on another listener, skip it.
                    if db_pool_id in processed_pools:
                        continue
                    db_pool_dict = db_lb['pools'][db_pool_id]
                    lb_status = self._process_pool_status(
                        session, db_pool_id, db_pool_dict, pools,
                        lb_status, processed_pools, potential_offline_pools)

        if health_msg_version >= 2:
            raw_pools = health['pools']

            # normalize the pool IDs. Single process listener pools
            # have the listener id appended with an ':' separator.
            # Old multi-process listener pools only have a pool ID.
            # This makes sure the keys are only pool IDs.
            pools = {(k + ' ')[:k.rfind(':')]: v for k, v in raw_pools.items()}

            for db_pool_id in db_lb.get('pools', {}):
                # If we saw this pool already, skip it.
                if db_pool_id in processed_pools:
                    continue
                db_pool_dict = db_lb['pools'][db_pool_id]
                lb_status = self._process_pool_status(
                    session, db_pool_id, db_pool_dict, pools,
                    lb_status, processed_pools, potential_offline_pools)

        for pool_id in potential_offline_pools:
            # Skip if we eventually found a status for this pool
            if pool_id in processed_pools:
                continue
            try:
                # If the database doesn't already show the pool offline, update
                if potential_offline_pools[pool_id] != constants.OFFLINE:
                    self._update_status(
                        session, self.pool_repo, constants.POOL,
                        pool_id, constants.OFFLINE,
                        potential_offline_pools[pool_id])
            except sqlalchemy.orm.exc.NoResultFound:
                LOG.error("Pool %s is not in DB", pool_id)

        # Update the load balancer status last
        try:
            if lb_status != db_lb[constants.OPERATING_STATUS]:
                self._update_status(
                    session, self.loadbalancer_repo,
                    constants.LOADBALANCER, db_lb['id'], lb_status,
                    db_lb[constants.OPERATING_STATUS])
        except sqlalchemy.orm.exc.NoResultFound:
            LOG.error("Load balancer %s is not in DB", db_lb['id'])

    def _process_pool_status(
            self, session, pool_id, db_pool_dict, pools, lb_status,
            processed_pools, potential_offline_pools):
        pool_status = None

        if pool_id not in pools:
            # If we don't have a status update for this pool_id
            # add it to the list of potential offline pools and continue.
            # We will check the potential offline pool list after we
            # finish processing the status updates from all of the listeners.
            potential_offline_pools[pool_id] = db_pool_dict['operating_status']
            return lb_status

        pool = pools[pool_id]

        processed_pools.append(pool_id)

        # UP = HAProxy backend has working or no servers
        if pool.get('status') == constants.UP:
            pool_status = constants.ONLINE
        # DOWN = HAProxy backend has no working servers
        elif pool.get('status') == constants.DOWN:
            pool_status = constants.ERROR
            lb_status = constants.ERROR
        else:
            LOG.warning(('Pool %(pool)s reported status of '
                         '%(status)s'),
                        {'pool': pool_id,
                         'status': pool.get('status')})

        # Deal with the members that are reporting from
        # the Amphora
        members = pool['members']
        for member_id in db_pool_dict.get('members', {}):
            member_status = None
            member_db_status = (
                db_pool_dict['members'][member_id]['operating_status'])

            if member_id not in members:
                if member_db_status != constants.NO_MONITOR:
                    member_status = constants.OFFLINE
            else:
                status = members[member_id]

                # Member status can be "UP" or "UP #/#"
                # (transitional)
                if status.startswith(constants.UP):
                    member_status = constants.ONLINE
                # Member status can be "DOWN" or "DOWN #/#"
                # (transitional)
                elif status.startswith(constants.DOWN):
                    member_status = constants.ERROR
                    if pool_status == constants.ONLINE:
                        pool_status = constants.DEGRADED
                        if lb_status == constants.ONLINE:
                            lb_status = constants.DEGRADED
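                # DRAIN, MAINT and "no check" are HAProxy server
                # administrative / no-health-check states.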
                elif status == constants.DRAIN:
                    member_status = constants.DRAINING
                elif status == constants.MAINT:
                    member_status = constants.OFFLINE
                elif status == constants.NO_CHECK:
                    member_status = constants.NO_MONITOR
                else:
                    LOG.warning('Member %(mem)s reported '
                                'status of %(status)s',
                                {'mem': member_id,
                                 'status': status})

            try:
                if (member_status is not None and
                        member_status != member_db_status):
                    self._update_status(
                        session, self.member_repo, constants.MEMBER,
                        member_id, member_status, member_db_status)
            except sqlalchemy.orm.exc.NoResultFound:
                LOG.error("Member %s is not in the DB, unable "
                          "to update it", member_id)

        try:
            if (pool_status is not None and
                    pool_status != db_pool_dict['operating_status']):
                self._update_status(
                    session, self.pool_repo, constants.POOL,
                    pool_id, pool_status, db_pool_dict['operating_status'])
        except sqlalchemy.orm.exc.NoResultFound:
            LOG.error("Pool %s is not in DB", pool_id)

        return lb_status


class UpdateStatsDb(update_base.StatsUpdateBase, stats.StatsMixin):

    def __init__(self):
        super(UpdateStatsDb, self).__init__()
        self.repo_listener = repo.ListenerRepository()

    def update_stats(self, health_message, srcaddr):
        # The executor will eat any exceptions from the update_stats code
        # so we need to wrap it and log the unhandled exception
        try:
            self._update_stats(health_message, srcaddr)
        except Exception:
            LOG.exception('update_stats encountered an unknown error '
                          'processing stats for amphora %s with IP '
                          '%s', health_message['id'], srcaddr)

    def _update_stats(self, health_message, srcaddr):
        """Update the database with the listener stats from a health message.

        :param health_message: The health message containing the listener stats
        :type health_message: dict
        :returns: None

        Example V1 message::

            health = {
                "id": self.FAKE_UUID_1,
                "listeners": {
                    "listener-id-1": {
                        "status": constants.OPEN,
                        "stats": {
                            "ereq": 0,
                            "conns": 0,
                            "totconns": 0,
                            "rx": 0,
                            "tx": 0,
                        },
                        "pools": {
                            "pool-id-1": {
                                "status": constants.UP,
                                "members": {"member-id-1": constants.ONLINE}
                            }
                        }
                    }
                }
            }

        Example V2 message::

            {"id": "<amphora_id>",
             "seq": 67,
             "listeners": {
               "<listener_id>": {
                 "status": "OPEN",
                 "stats": {
                   "tx": 0,
                   "rx": 0,
                   "conns": 0,
                   "totconns": 0,
                   "ereq": 0
                 }
               }
             },
             "pools": {
               "<pool_id>:<listener_id>": {
                 "status": "UP",
                 "members": {
                   "<member_id>": "no check"
                 }
               }
             },
             "ver": 2
            }

        """
        session = db_api.get_session()

        amphora_id = health_message['id']
        listeners = health_message['listeners']
        for listener_id, listener in listeners.items():
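            # Map the raw amphora (HAProxy) counters onto the listener
            # statistics fields used by the repository.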
            stats = listener.get('stats')
            stats = {'bytes_in': stats['rx'], 'bytes_out': stats['tx'],
                     'active_connections': stats['conns'],
                     'total_connections': stats['totconns'],
                     'request_errors': stats['ereq']}
            LOG.debug("Updating listener stats in db and sending event.")
            LOG.debug("Listener %s / Amphora %s stats: %s",
                      listener_id, amphora_id, stats)
            self.listener_stats_repo.replace(
                session, listener_id, amphora_id, **stats)