# fuel-octane/octane/util/maintenance.py

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import re
import time
from xml.etree import ElementTree
from octane import magic_consts
from octane.util import env as env_util
from octane.util import ssh
from octane.util import subprocess
def disable_apis(env):
    """Switch HAProxy on all controllers of *env* into maintenance mode.

    On every controller: rewrite /etc/haproxy/haproxy.cfg to grant admin
    level on the stats socket and ensure a 'backend maintenance' section
    exists, append a 'use_backend maintenance' rule to every file under
    /etc/haproxy/conf.d that contains 'mode tcp' (other files are skipped
    via DontUpdateException), then restart the p_haproxy resource.
    """
    controllers = list(env_util.get_controllers(env))
    maintenance_line = 'backend maintenance'
    # Fix: raw strings so '\s' stays a regex escape instead of a deprecated
    # (SyntaxWarning on modern Python) string escape.
    stats_socket_re = re.compile(r'stats\s+socket\s+/var/lib/haproxy/stats'
                                 r'(?!.*level admin)')
    mode_tcp_re = re.compile(r'mode\s+tcp')
    use_backend_line = ' use_backend maintenance if TRUE'
    for node in controllers:
        sftp = ssh.sftp(node)
        sftp.chdir('/etc/haproxy')
        with ssh.update_file(sftp, 'haproxy.cfg') as (old, new):
            found_maint_line = False
            for line in old:
                if maintenance_line in line:
                    found_maint_line = True
                # Add 'level admin' to the stats socket line unless already
                # present (negative lookahead in the pattern).
                line = stats_socket_re.sub(r'\g<0> level admin', line)
                new.write(line)
            if not found_maint_line:
                # Fix: terminate the appended section with a newline so the
                # rewritten config stays line-oriented.
                new.write(maintenance_line + '\n')
        sftp.chdir('/etc/haproxy/conf.d')
        for f in sftp.listdir():
            with ssh.update_file(sftp, f) as (old, new):
                contents = old.read()
                if not mode_tcp_re.search(contents):
                    # Only files with a 'mode tcp' backend are redirected to
                    # the maintenance backend; leave the rest untouched.
                    raise subprocess.DontUpdateException
                new.write(contents)
                if not contents.endswith('\n'):
                    new.write('\n')
                new.write(use_backend_line)
        ssh.call(['crm', 'resource', 'restart', 'p_haproxy'], node=node)
def enable_apis(env):
    """Undo disable_apis: strip maintenance lines and restart HAProxy.

    Removes the 'backend maintenance' line from haproxy.cfg and the
    'use_backend maintenance' rule from every conf.d file on each
    controller, then restarts the p_haproxy resource.
    """
    maintenance_line = 'backend maintenance'
    use_backend_line = ' use_backend maintenance if TRUE'

    def _strip_marker(sftp, fname, marker):
        # Rewrite *fname* in place, dropping every line containing *marker*.
        with ssh.update_file(sftp, fname) as (old, new):
            for line in old:
                if marker not in line:
                    new.write(line)

    for node in list(env_util.get_controllers(env)):
        sftp = ssh.sftp(node)
        sftp.chdir('/etc/haproxy')
        _strip_marker(sftp, 'haproxy.cfg', maintenance_line)
        sftp.chdir('/etc/haproxy/conf.d')
        for fname in sftp.listdir():
            _strip_marker(sftp, fname, use_backend_line)
        ssh.call(['crm', 'resource', 'restart', 'p_haproxy'], node=node)
# Pacemaker resources that the corosync start/stop helpers must never touch
# (matched by substring against resource ids); they stay under cluster
# control while the remaining resources are stopped or started.
_default_exclude_services = ('p_mysql', 'p_haproxy', 'p_dns', 'p_ntp', 'vip',
                             'p_conntrackd', 'p_rabbitmq-server',
                             'clone_p_vrouter')
def get_crm_services(status_out):
    """Yield the id of every top-level resource in a cibadmin XML dump.

    :param status_out: XML text, e.g. output of
        ``cibadmin --query --scope resources``.
    """
    for child in ElementTree.fromstring(status_out):
        yield child.get('id')
def start_corosync_services(env):
    """Start all non-excluded Pacemaker resources of *env*."""
    manage_corosync_services(env, 'start')
def stop_corosync_services(env):
    """Stop all non-excluded Pacemaker resources of *env*."""
    manage_corosync_services(env, 'stop')
def manage_corosync_services(env, status):
    """Run ``crm resource <status>`` for every non-excluded resource.

    Each crm call is retried until it succeeds; afterwards we block until
    the cluster reports every touched resource in the requested state.

    :param status: 'start' or 'stop'.
    """
    node = env_util.get_one_controller(env)
    status_out = ssh.call_output(['cibadmin', '--query', '--scope',
                                  'resources'], node=node)
    services_list = [
        res for res in get_crm_services(status_out)
        if not any(excl in res for excl in _default_exclude_services)
    ]
    for service in services_list:
        while True:
            try:
                ssh.call(['crm', 'resource', status, service], node=node)
            except subprocess.CalledProcessError:
                # Pacemaker sometimes rejects requests it cannot process
                # yet; back off briefly and retry.
                time.sleep(1)
            else:
                break
    wait_for_corosync_services_sync(env, services_list, status)
def wait_for_corosync_services_sync(env, resource_list, status,
                                    timeout=1200, check_freq=20):
    """Poll crm_mon until every listed resource reaches *status*.

    :param resource_list: resource ids to check.
    :param status: 'start' (wait for active) or anything else (inactive).
    :param timeout: seconds before giving up.
    :param check_freq: seconds between polls.
    :raises Exception: when the cluster has not converged within *timeout*.
    """
    want_active = (status == 'start')
    node = env_util.get_one_controller(env)
    deadline = time.time() + timeout
    while True:
        crm_out = ssh.call_output(['crm_mon', '--as-xml'], node=node)
        if is_resources_synced(resource_list, crm_out, want_active):
            return
        if time.time() >= deadline:
            raise Exception("Timeout waiting for corosync cluster for env %s"
                            " to be synced" % env.id)
        time.sleep(check_freq)
def is_resources_synced(resources, crm_out, status):
    """Check whether every id in *resources* matches the wanted activity.

    :param resources: iterable of resource ids to verify.
    :param crm_out: XML output of ``crm_mon --as-xml``.
    :param status: desired activity as a bool (True means active).
    """
    mon_resources = ElementTree.fromstring(crm_out).find('resources')

    def _lookup(resource_id):
        # Find the monitored element with this id; None when absent.
        for candidate in mon_resources:
            if candidate.get('id') == resource_id:
                return candidate
        return None

    return all(
        is_resource_active(_lookup(res_id)) is status
        for res_id in resources
    )
# Resources are read from 'crm_mon' output; that command omits resources
# that are not started, so an absent (None) resource is treated as stopped.
def is_resource_active(resource):
    """Return True when a crm_mon XML element is fully active."""
    if resource is None:
        # Not reported by crm_mon at all -> considered disabled.
        return False
    if resource.tag == 'resource':
        # A primitive resource: active iff it reports so itself.
        return is_primitive_active(resource)
    # A clone/group: active only when every child primitive is active.
    return all(is_primitive_active(child) for child in resource)
def is_primitive_active(resource):
    """Return True when a crm_mon element carries active="true"."""
    return resource.get('active') == 'true'
def stop_upstart_services(env):
    """Stop OpenStack upstart services (and apache2) on every controller.

    The list of running services is persisted to /root/services_list on
    each node so start_upstart_services can later restart exactly the same
    set; if the file already exists it is reused unchanged.
    """
    controllers = list(env_util.get_controllers(env))
    # Fix: raw string keeps '\s' a regex escape rather than a deprecated
    # string escape (SyntaxWarning on modern Python).
    service_re = re.compile(r"^((?:%s)[^\s]*).*start/running" %
                            ("|".join(magic_consts.OS_SERVICES),),
                            re.MULTILINE)
    for node in controllers:
        sftp = ssh.sftp(node)
        try:
            svc_file = sftp.open('/root/services_list')
        except IOError:
            # First run on this node: discover running services via initctl
            # and record them atomically (write .tmp, then rename).
            with sftp.open('/root/services_list.tmp', 'w') as svc_file:
                initctl_out = ssh.call_output(['initctl', 'list'], node=node)
                to_stop = []
                for match in service_re.finditer(initctl_out):
                    service = match.group(1)
                    to_stop.append(service)
                    svc_file.write(service + '\n')
            sftp.rename('/root/services_list.tmp', '/root/services_list')
        else:
            # A previous (possibly interrupted) run left the list behind;
            # stop the recorded services again.
            with svc_file:
                to_stop = svc_file.read().splitlines()
        for service in to_stop:
            ssh.call(['stop', service], node=node)
        ssh.call(['service', 'apache2', 'stop'], node=node)
def start_upstart_services(env):
    """Start the services recorded in /root/services_list on each controller.

    :raises IOError: when the list file is missing on a node, i.e.
        stop_upstart_services was never run there.
    """
    controllers = list(env_util.get_controllers(env))
    for node in controllers:
        sftp = ssh.sftp(node)
        # Fix: dropped the dead 'except IOError: raise' clause — a bare
        # re-raise is a no-op, so a missing file still propagates IOError.
        with sftp.open('/root/services_list') as svc_file:
            to_start = svc_file.read().splitlines()
        for service in to_start:
            ssh.call(['start', service], node=node)
        ssh.call(['service', 'apache2', 'start'], node=node)
def stop_cluster(env):
    """Kill the Pacemaker/pcs cluster on every controller of *env*."""
    for node in list(env_util.get_controllers(env)):
        ssh.call(['pcs', 'cluster', 'kill'], node=node)
def start_cluster(env):
    """Start the cluster stack on all controllers and wait for core services.

    Fuel releases older than 6 start plain corosync via its init script;
    newer releases use pcs. After starting, block until the always-managed
    resources (those matching _default_exclude_services) are up, because
    the corosync start/stop helpers deliberately never touch them.
    """
    major_version = int(env.data['fuel_version'].split('.')[0])
    # Fix: removed the redundant 'cmds = []' that was immediately
    # overwritten by both branches.
    if major_version < 6:
        cmds = [['service', 'corosync', 'start']]
    else:
        cmds = [['pcs', 'cluster', 'start']]
    controllers = list(env_util.get_controllers(env))
    for node in controllers:
        for cmd in cmds:
            ssh.call(cmd, node=node)
    # When we start the cluster we should wait until the resources from
    # `_default_exclude_services` become up and running; they are never
    # touched by the corosync stop/start helpers at all.
    node = env_util.get_one_controller(env)
    status_out = ssh.call_output(['cibadmin', '--query', '--scope',
                                  'resources'], node=node)
    services_list = [
        res for res in get_crm_services(status_out)
        if any(excl in res for excl in _default_exclude_services)
    ]
    wait_for_corosync_services_sync(env, services_list, 'start')