46d78550bd
ceph-disk is used by stx puppet manifest but it's replaced by ceph-volume in ceph 14.1+, and there might be other incompatible issues, so add version 13.2.2 to align with stx 3.0, the version 14.1 is kept so it may be used by future stx release. This includes the following changes: - Add recipe and patches for version 13.2.2 - Set preferred version to 13.2.2 - Rename ceph_%.bbappend to ceph_14.1.0.bbappend - Update the conf file and scripts from stx 2.0 to stx 3.0 - Remove the useless files in stx-integ fix #483 fix #497 Signed-off-by: Jackie Huang <jackie.huang@windriver.com> Signed-off-by: Babak Sarashki <Babak.SarAshki@windriver.com>
1122 lines
42 KiB
Python
1122 lines
42 KiB
Python
#!/usr/bin/python
|
|
#
|
|
# Copyright (c) 2019 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
|
|
### BEGIN INIT INFO
|
|
# Provides: ceph/mgr RESTful API plugin
|
|
# Required-Start: $ceph
|
|
# Required-Stop: $ceph
|
|
# Default-Start: 2 3 4 5
|
|
# Default-Stop: 0 1 6
|
|
# Short-Description: Ceph MGR RESTful API plugin
|
|
# Description: Ceph MGR RESTful API plugin
|
|
### END INIT INFO
|
|
|
|
import argparse
|
|
import contextlib
|
|
import errno
|
|
import fcntl
|
|
import inspect
|
|
import json
|
|
import logging
|
|
import multiprocessing
|
|
import os
|
|
import shutil
|
|
import signal
|
|
import socket
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
|
|
import daemon
|
|
import psutil
|
|
import requests
|
|
|
|
# GNU coreutils 'timeout' command returns exit status 124 when the
# wrapped command times out (see timeout(1) man page). Used by
# run_with_timeout() to distinguish "timed out" from "failed".
GNU_TIMEOUT_EXPIRED_RETCODE = 124
|
|
|
|
|
|
def psutil_terminate_kill(target, timeout):

    """Extend psutil functionality to stop a process.

    SIGTERM is sent to each target (via psutil.Process.terminate());
    after a grace period of `timeout` seconds SIGKILL is sent to the
    ones that are still running.

    :param target: a psutil.Process or a list of psutil.Process
    :param timeout: seconds to wait after each signal round
    """

    if not isinstance(target, list):
        target = [target]
    # drop processes that have already exited
    _, target = psutil.wait_procs(target, timeout=0)
    # first round: graceful terminate (SIGTERM); second round: kill (SIGKILL)
    for action in [lambda p: p.terminate(), lambda p: p.kill()]:
        for proc in target:
            action(proc)
        _, target = psutil.wait_procs(
            target, timeout=timeout)
|
|
|
|
|
|
class Config(object):

    """Configuration knobs for the ceph-mgr service wrapper.

    All values are currently hard-coded; a later version may read them
    from a configuration file (for example
    /etc/ceph/mgr-restful-plugin.conf).
    """

    def __init__(self):
        # logging destination and verbosity
        self.log_level = logging.INFO
        self.log_dir = '/var/log'

        # ceph-mgr daemon binary, config and runtime locations
        self.ceph_mgr_service = '/usr/bin/ceph-mgr'
        self.ceph_mgr_config = '/etc/ceph/ceph.conf'
        self.ceph_mgr_cluster = 'ceph'
        self.ceph_mgr_rundir = '/var/run/ceph/mgr'
        self.ceph_mgr_confdir = '/var/lib/ceph/mgr'
        self.ceph_mgr_identity = socket.gethostname()

        # wrapper service identity and its runtime files
        self.service_name = 'mgr-restful-plugin'
        self.service_socket = os.path.join(
            self.ceph_mgr_rundir, self.service_name + '.socket')
        self.service_lock = os.path.join(
            self.ceph_mgr_rundir, self.service_name + '.lock')
        self.service_pid_file = os.path.join(
            '/var/run/ceph', self.service_name + '.pid')

        # TCP port the restful plugin listens on
        self.restful_plugin_port = 5001

        # maximum size of a message received/sent via
        # service monitor control socket
        self.service_socket_bufsize = 1024

        # maximum time to wait for ceph cli to exit
        self.ceph_cli_timeout_sec = 30

        # how much time to wait after ceph cli commands fail with timeout
        # before running any other commands
        self.cluster_grace_period_sec = 30

        # after ceph-mgr is started it goes through an internal
        # initialization phase; how much time to wait before querying it
        self.ceph_mgr_grace_period_sec = 15

        # after sending SIGTERM to ceph-mgr how much time to wait before
        # sending SIGKILL (maximum time allowed for ceph-mgr cleanup)
        self.ceph_mgr_kill_delay_sec = 5

        # while a recovery procedure is running the monitor reports
        # status OK even if ceph-mgr is currently down; this caps the
        # number of consecutive ceph-mgr failures before reporting error
        self.ceph_mgr_fail_count_report_error = 3

        # maximum number of consecutive ceph-mgr failures before
        # stopping the mgr-restful-plugin service altogether
        self.ceph_mgr_fail_count_exit = 5

        # maximum time allowed for ceph-mgr to answer a REST API request
        self.rest_api_timeout_sec = 15

        # interval between consecutive REST API ping requests. Smaller
        # values poll the restful plugin more aggressively; larger
        # values slow down recovery when services become unavailable
        self.restful_plugin_ping_delay_sec = 3

        # where to save the self-signed certificate generated by ceph-mgr
        self.restful_plugin_cert_path = os.path.join(
            self.ceph_mgr_rundir, 'restful.crt')

        # time to wait after enabling the restful plugin
        self.restful_plugin_grace_period_sec = 3

        # number of REST API ping failures before restarting ceph-mgr
        self.ping_fail_count_restart_mgr = 3

        # number of REST API ping failures before reporting status
        # error. Until then the monitor reports OK in case the restful
        # plugin recovers on its own
        self.ping_fail_count_report_error = 5

    @staticmethod
    def load():
        """Build a Config instance (future: parse a config file here)."""
        return Config()
|
|
|
|
|
|
def setup_logging(name=None, cleanup_handlers=False):
    """Return a logger that writes to CONFIG.log_dir/<service>.log.

    :param name: logger name; defaults to CONFIG.service_name
    :param cleanup_handlers: when True, flush/close and drop any
        handlers already attached before installing a fresh one
    """
    logger = logging.getLogger(name or CONFIG.service_name)
    logger.setLevel(CONFIG.log_level)
    if cleanup_handlers:
        try:
            for existing in logger.handlers:
                if isinstance(existing, logging.StreamHandler):
                    existing.flush()
                if isinstance(existing, logging.FileHandler):
                    existing.close()
            logger.handlers = []
        except Exception:
            # best-effort cleanup; never fail logger setup over it
            pass
    elif logger.handlers:
        # already configured: reuse as-is
        return logger
    file_handler = logging.FileHandler(
        os.path.join(CONFIG.log_dir, CONFIG.service_name + '.log'))
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(process)s %(levelname)s %(name)s %(message)s'))
    logger.addHandler(file_handler)
    return logger
|
|
|
|
|
|
# Module-wide configuration and logger shared by everything below.
# NOTE: setup_logging() reads CONFIG, so CONFIG must be created first.
CONFIG = Config.load()
LOG = setup_logging(name='init-wrapper')
|
|
|
|
|
|
class ServiceException(Exception):

    """Generic mgr-restful-plugin service exception.

    The final text is produced from the per-class `message` template
    formatted with the constructor's args/kwargs, unless an explicit
    `message` keyword argument overrides it.
    """

    # per-subclass format template
    message = ""

    def __init__(self, *args, **kwargs):
        if "message" in kwargs:
            text = kwargs["message"]
        else:
            try:
                text = self.message.format(*args, **kwargs)
            except Exception:  # noqa
                # fall back to a raw dump when the template
                # placeholders don't match the given arguments
                text = '{}, args:{}, kwargs: {}'.format(
                    self.message, args, kwargs)
        super(ServiceException, self).__init__(text)
|
|
|
|
|
|
class ServiceAlreadyStarted(ServiceException):
    """Raised when another monitor instance already holds the lock."""
    message = 'Service monitor already started'
|
|
|
|
|
|
class ServiceLockFailed(ServiceException):
    """Raised when taking the service lock fails (other than EAGAIN)."""
    message = 'Unable to lock service monitor: reason={reason}'
|
|
|
|
|
|
class ServiceNoSocket(ServiceException):
    """Raised when the control socket cannot be created."""
    message = 'Unable to create service monitor socket: reason={reason}'
|
|
|
|
|
|
class ServiceSocketBindFailed(ServiceException):
    """Raised when binding the control socket path fails."""
    message = 'Failed to bind service monitor socket: path={path}, reason={reason}'
|
|
|
|
|
|
class ServiceNoPidFile(ServiceException):
    """Raised when the service pid file cannot be written."""
    message = 'Failed to update pid file: path={path}, reason={reason}'
|
|
|
|
|
|
class CommandFailed(ServiceException):
    """Raised when an external command exits with a non-zero status."""
    message = 'Command failed: command={command}, reason={reason}, out={out}'
|
|
|
|
|
|
class CommandTimeout(ServiceException):
    """Raised when an external command exceeds its time budget."""
    message = 'Command timeout: command={command}, timeout={timeout}'
|
|
|
|
|
|
class CephMgrStartFailed(ServiceException):
    """Raised when spawning the ceph-mgr daemon fails."""
    message = 'Failed to start ceph_mgr: reason={reason}'
|
|
|
|
|
|
class CephRestfulPluginFailed(ServiceException):
    """Raised when enabling/querying the restful plugin fails."""
    message = 'Failed to start restful plugin: reason={reason}'
|
|
|
|
|
|
class RestApiPingFailed(ServiceException):
    """Raised when the periodic REST API health check fails."""
    message = 'REST API ping failed: reason={reason}'
|
|
|
|
|
|
class ServiceMonitor(object):
|
|
|
|
"""Configure and monitor ceph-mgr and restful plugin (Ceph REST API)
|
|
|
|
1. process init script service requests: status, stop. Requests are
|
|
received via a control socket. Stop has priority over whatever
|
|
the monitor is doing currently. Any ceph command that may be running
|
|
is terminated/killed. Note that while ceph-mgr and restful plugin
|
|
configuration is in progress ServiceMonitor reports status OK to
|
|
avoid being restarted by SM.
|
|
|
|
2. configure ceph-mgr and mgr restful plugin: authentication, REST API
|
|
service port, self signed certificate. This runs as a separate
|
|
process so it can be stopped when init script requests it.
|
|
|
|
3. periodically check (ping) REST API responds to HTTPS requests.
|
|
Recovery actions are taken if REST API fails to respond: restart
|
|
ceph-mgr, wait for cluster to become available again.
|
|
"""
|
|
|
|
def __init__(self):
|
|
# process running configuration & REST API ping loop
|
|
self.monitor = None
|
|
|
|
# command socket used by init script
|
|
self.command = None
|
|
|
|
# ceph-mgr process
|
|
self.ceph_mgr = None
|
|
|
|
# consecutive ceph-mgr/restful-plugin start failures. Service monitor
|
|
# reports failure after CONFIG.ceph_mgr_max_failure_count
|
|
self.ceph_mgr_failure_count = 0
|
|
|
|
# consecutive REST API ping failures. ceph-mgr service is restarted
|
|
# after CONFIG.ping_fail_count_restart_mgr threshold is exceeded
|
|
self.ping_failure_count = 0
|
|
|
|
# REST API url reported by ceph-mgr after enabling restful plugin
|
|
self.restful_plugin_url = ''
|
|
|
|
# REST API self signed certificate generated by restful plugin
|
|
self.certificate = ''
|
|
|
|
def run(self):
|
|
self.disable_certificate_check()
|
|
with self.service_lock(), self.service_socket(), \
|
|
self.service_pid_file():
|
|
self.start_monitor()
|
|
self.server_loop()
|
|
|
|
def disable_certificate_check(self):
|
|
# ceph-mgr restful plugin is configured with a self-signed
|
|
# certificate. Certificate host is hard-coded to "ceph-restful"
|
|
# which causes HTTPS requests to fail because they don't
|
|
# match current host name ("controller-..."). Disable HTTPS
|
|
# certificates check in urllib3
|
|
LOG.warning('Disable urllib3 certifcates check')
|
|
requests.packages.urllib3.disable_warnings()
|
|
|
|
def server_loop(self):
|
|
self.command.listen(2)
|
|
while True:
|
|
try:
|
|
client, _ = self.command.accept()
|
|
request = client.recv(CONFIG.service_socket_bufsize)
|
|
LOG.debug('Monitor command socket: request=%s', str(request))
|
|
cmd = request.split(' ')
|
|
cmd, args = cmd[0], cmd[1:]
|
|
if cmd == 'status':
|
|
self.send_response(client, request, self.status())
|
|
elif cmd == 'stop':
|
|
self.stop()
|
|
self.send_response(client, request, 'OK')
|
|
break
|
|
elif cmd == 'restful-url':
|
|
try:
|
|
self.restful_plugin_url = args[0]
|
|
self.send_response(client, request, 'OK')
|
|
except IndexError:
|
|
LOG.warning('Failed to update restful plugin url: '
|
|
'args=%s', str(args))
|
|
self.send_response(client, request, 'ERR')
|
|
elif cmd == 'certificate':
|
|
try:
|
|
self.certificate = args[0] if args else ''
|
|
self.send_response(client, request, 'OK')
|
|
except IndexError:
|
|
LOG.warning('Failed to update certificate path: '
|
|
'args=%s', str(args))
|
|
self.send_response(client, request, 'ERR')
|
|
elif cmd == 'ceph-mgr-failures':
|
|
try:
|
|
self.ceph_mgr_failure_count = int(args[0])
|
|
self.send_response(client, request, 'OK')
|
|
if self.ceph_mgr_failure_count >= CONFIG.ceph_mgr_fail_count_exit:
|
|
self.stop()
|
|
break
|
|
except (IndexError, ValueError):
|
|
LOG.warning('Failed to update ceph-mgr failures: '
|
|
'args=%s', str(args))
|
|
self.send_response(client, request, 'ERR')
|
|
elif cmd == 'ping-failures':
|
|
try:
|
|
self.ping_failure_count = int(args[0])
|
|
self.send_response(client, request, 'OK')
|
|
except (IndexError, ValueError):
|
|
LOG.warning('Failed to update ping failures: '
|
|
'args=%s', str(args))
|
|
self.send_response(client, request, 'ERR')
|
|
except Exception as err:
|
|
LOG.exception(err)
|
|
|
|
@staticmethod
|
|
def send_response(client, request, response):
|
|
try:
|
|
client.send(response)
|
|
except socket.error as err:
|
|
LOG.warning('Failed to send response back. '
|
|
'request=%s, response=%s, reason=%s',
|
|
request, response, err)
|
|
|
|
def status(self):
|
|
if not self.restful_plugin_url:
|
|
if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \
|
|
and self.ping_failure_count < CONFIG.ping_fail_count_report_error:
|
|
LOG.debug('Monitor is starting services. Report status OK')
|
|
return 'OK'
|
|
LOG.debug('Too many failures: '
|
|
'ceph_mgr=%d < %d, ping=%d < %d. '
|
|
'Report status ERR',
|
|
self.ceph_mgr_failure_count,
|
|
CONFIG.ceph_mgr_fail_count_report_error,
|
|
self.ping_failure_count,
|
|
CONFIG.ping_fail_count_report_error)
|
|
return 'ERR.down'
|
|
try:
|
|
self.restful_plugin_ping()
|
|
LOG.debug('Restful plugin ping successful. Report status OK')
|
|
return 'OK'
|
|
except (CommandFailed, RestApiPingFailed):
|
|
if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \
|
|
and self.ping_failure_count < CONFIG.ping_fail_count_report_error:
|
|
LOG.info('Restful plugin does not respond but failure '
|
|
'count is within acceptable limits: '
|
|
' ceph_mgr=%d < %d, ping=%d < %d. '
|
|
'Report status OK',
|
|
self.ceph_mgr_failure_count,
|
|
CONFIG.ceph_mgr_fail_count_report_error,
|
|
self.ping_failure_count,
|
|
CONFIG.ping_fail_count_report_error)
|
|
return 'OK'
|
|
LOG.debug('Restful does not respond (ping failure count %d). '
|
|
'Report status ERR', self.ping_failure_count)
|
|
return 'ERR.ping_failed'
|
|
|
|
def stop(self):
|
|
if not self.monitor:
|
|
return
|
|
LOG.info('Stop monitor with SIGTERM to process group %d',
|
|
self.monitor.pid)
|
|
try:
|
|
os.killpg(self.monitor.pid, signal.SIGTERM)
|
|
except OSError as err:
|
|
LOG.info('Stop monitor failed: reason=%s', str(err))
|
|
return
|
|
time.sleep(CONFIG.ceph_mgr_kill_delay_sec)
|
|
LOG.info('Stop monitor with SIGKILL to process group %d',
|
|
self.monitor.pid)
|
|
try:
|
|
os.killpg(self.monitor.pid, signal.SIGKILL)
|
|
os.waitpid(self.monitor.pid, 0)
|
|
except OSError as err:
|
|
LOG.info('Stop monitor failed: reason=%s', str(err))
|
|
return
|
|
LOG.info('Monitor stopped: pid=%d', self.monitor.pid)
|
|
|
|
@contextlib.contextmanager
|
|
def service_lock(self):
|
|
LOG.info('Take service lock: path=%s', CONFIG.service_lock)
|
|
try:
|
|
os.makedirs(os.path.dirname(CONFIG.service_lock))
|
|
except OSError:
|
|
pass
|
|
lock_file = open(CONFIG.service_lock, 'w')
|
|
try:
|
|
fcntl.flock(lock_file.fileno(),
|
|
fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
except (IOError, OSError) as err:
|
|
if err.errno == errno.EAGAIN:
|
|
raise ServiceAlreadyStarted()
|
|
else:
|
|
raise ServiceLockFailed(reason=str(err))
|
|
# even if we have the lock here there might be another service manager
|
|
# running whose CONFIG.ceph_mgr_rundir was removed before starting
|
|
# this instance. Make sure there is only one service manager running
|
|
self.stop_other_service_managers()
|
|
try:
|
|
yield
|
|
finally:
|
|
os.unlink(CONFIG.service_lock)
|
|
lock_file.close()
|
|
LOG.info('Release service lock: path=%s', CONFIG.service_lock)
|
|
|
|
def stop_other_service_managers(self):
|
|
service = os.path.join('/etc/init.d', CONFIG.service_name)
|
|
for p in psutil.process_iter():
|
|
if p.cmdline()[:2] not in [[service], ['/usr/bin/python', service]]:
|
|
continue
|
|
if p.pid == os.getpid():
|
|
continue
|
|
p.kill()
|
|
|
|
@contextlib.contextmanager
|
|
def service_socket(self):
|
|
LOG.info('Create service socket')
|
|
try:
|
|
self.command = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET)
|
|
except socket.error as err:
|
|
raise ServiceNoSocket(reason=str(err))
|
|
LOG.info('Remove existing socket files')
|
|
try:
|
|
os.unlink(CONFIG.service_socket)
|
|
except OSError:
|
|
pass
|
|
LOG.info('Bind service socket: path=%s', CONFIG.service_socket)
|
|
try:
|
|
self.command.bind(CONFIG.service_socket)
|
|
except socket.error as err:
|
|
raise ServiceSocketBindFailed(
|
|
path=CONFIG.service_socket, reason=str(err))
|
|
try:
|
|
yield
|
|
finally:
|
|
LOG.info('Close service socket and remove file: path=%s',
|
|
CONFIG.service_socket)
|
|
self.command.close()
|
|
os.unlink(CONFIG.service_socket)
|
|
|
|
@contextlib.contextmanager
|
|
def service_pid_file(self):
|
|
LOG.info('Update service pid file: path=%s', CONFIG.service_pid_file)
|
|
try:
|
|
pid_file = open(CONFIG.service_pid_file, 'w')
|
|
pid_file.write(str(os.getpid()))
|
|
pid_file.flush()
|
|
except OSError as err:
|
|
raise ServiceNoPidFile(
|
|
path=CONFIG.service_pid_file, reason=str(err))
|
|
try:
|
|
yield
|
|
finally:
|
|
LOG.info('Remove service pid file: path=%s',
|
|
CONFIG.service_pid_file)
|
|
try:
|
|
os.unlink(CONFIG.service_pid_file)
|
|
except OSError:
|
|
pass
|
|
|
|
def start_monitor(self):
|
|
LOG.info('Start monitor loop')
|
|
self.monitor = multiprocessing.Process(target=self.monitor_loop)
|
|
self.monitor.start()
|
|
|
|
def stop_unmanaged_ceph_mgr(self):
|
|
LOG.info('Stop unmanaged running ceph-mgr processes')
|
|
service_name = os.path.basename(CONFIG.ceph_mgr_service)
|
|
if self.ceph_mgr:
|
|
psutil_terminate_kill(
|
|
[proc for proc in psutil.process_iter()
|
|
if (proc.name() == service_name
|
|
and proc.pid != self.ceph_mgr.pid)],
|
|
CONFIG.ceph_mgr_kill_delay_sec)
|
|
else:
|
|
psutil_terminate_kill(
|
|
[proc for proc in psutil.process_iter()
|
|
if proc.name() == service_name],
|
|
CONFIG.ceph_mgr_kill_delay_sec)
|
|
|
|
def monitor_loop(self):
|
|
|
|
"""Bring up and monitor ceph-mgr restful plugin.
|
|
|
|
Steps:
|
|
- wait for Ceph cluster to become available
|
|
- configure and start ceph-mgr
|
|
- configure and enable restful plugin
|
|
- send periodic requests to REST API
|
|
- recover from failures
|
|
|
|
Note: because this runs as a separate process it
|
|
must send status updates to service monitor
|
|
via control socket for: ping_failure_count,
|
|
restful_plugin_url and certificate.
|
|
"""
|
|
|
|
# Promote to process group leader so parent (service monitor)
|
|
# can kill the monitor plus processes spawned by it. Otherwise
|
|
# children of monitor_loop() will keep running in background and
|
|
# will be reaped by init when they finish but by then they might
|
|
# interfere with any new service instance.
|
|
os.setpgrp()
|
|
|
|
# Ignoring SIGTERM here ensures process group is not reused by
|
|
# the time parent (service monitor) issues the final SIGKILL.
|
|
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
|
|
while True:
|
|
try:
|
|
# steps to configure/start ceph-mgr and restful plugin
|
|
self.ceph_fsid_get()
|
|
self.ceph_mgr_auth_create()
|
|
self.restful_plugin_set_server_port()
|
|
self.restful_plugin_create_certificate()
|
|
self.ceph_mgr_start()
|
|
self.restful_plugin_enable()
|
|
self.restful_plugin_create_admin_key()
|
|
self.restful_plugin_get_url()
|
|
self.restful_plugin_get_certificate()
|
|
|
|
# REST API should be available now
|
|
# start making periodic requests (ping)
|
|
while True:
|
|
try:
|
|
self.restful_plugin_ping()
|
|
self.ping_failure_count = 0
|
|
self.request_update_ping_failures(
|
|
self.ping_failure_count)
|
|
self.ceph_mgr_failure_count = 0
|
|
self.request_update_ceph_mgr_failures(
|
|
self.ceph_mgr_failure_count)
|
|
time.sleep(CONFIG.restful_plugin_ping_delay_sec)
|
|
continue
|
|
except RestApiPingFailed as err:
|
|
LOG.warning(str(err))
|
|
|
|
LOG.info('REST API ping failure count=%d',
|
|
self.ping_failure_count)
|
|
self.ping_failure_count += 1
|
|
self.request_update_ping_failures(
|
|
self.ping_failure_count)
|
|
|
|
# maybe request failed because ceph-mgr is not running
|
|
if not self.ceph_mgr_is_running():
|
|
self.ceph_mgr_failure_count += 1
|
|
self.request_update_ceph_mgr_failures(
|
|
self.ceph_mgr_failure_count)
|
|
self.ceph_mgr_start()
|
|
time.sleep(CONFIG.ceph_mgr_grace_period_sec)
|
|
continue
|
|
|
|
# maybe request failed because cluster health is not ok
|
|
if not self.ceph_fsid_get():
|
|
LOG.info('Unable to get cluster fsid. '
|
|
'Sleep for a while')
|
|
time.sleep(CONFIG.cluster_grace_period_sec)
|
|
break
|
|
|
|
# too many failures? Restart ceph-mgr and go again
|
|
# through configuration steps
|
|
if (self.ping_failure_count
|
|
% CONFIG.ping_fail_count_restart_mgr == 0):
|
|
LOG.info('Too many consecutive REST API failures. '
|
|
'Restart ceph-mgr. Update service '
|
|
'url and certificate')
|
|
self.ceph_mgr_stop()
|
|
self.restful_plugin_url = ''
|
|
self.request_update_plugin_url(self.restful_plugin_url)
|
|
self.certificate = ''
|
|
self.request_update_certificate(self.certificate)
|
|
break
|
|
|
|
time.sleep(CONFIG.restful_plugin_ping_delay_sec)
|
|
|
|
except CommandFailed as err:
|
|
LOG.warning(str(err))
|
|
time.sleep(CONFIG.cluster_grace_period_sec)
|
|
except CommandTimeout as err:
|
|
LOG.warning(str(err))
|
|
except (CephMgrStartFailed, CephRestfulPluginFailed) as err:
|
|
LOG.warning(str(err))
|
|
self.ceph_mgr_failure_count += 1
|
|
self.request_update_ceph_mgr_failures(
|
|
self.ceph_mgr_failure_count)
|
|
time.sleep(CONFIG.ceph_mgr_grace_period_sec)
|
|
except Exception as err:
|
|
LOG.exception(err)
|
|
time.sleep(CONFIG.cluster_grace_period_sec)
|
|
|
|
@staticmethod
|
|
def run_with_timeout(command, timeout, stderr=subprocess.STDOUT):
|
|
try:
|
|
LOG.info('Run command: %s', ' '.join(command))
|
|
return subprocess.check_output(
|
|
['/usr/bin/timeout', str(timeout)] + command,
|
|
stderr=stderr, shell=False).strip()
|
|
except subprocess.CalledProcessError as err:
|
|
if err.returncode == GNU_TIMEOUT_EXPIRED_RETCODE:
|
|
raise CommandTimeout(command=err.cmd, timeout=timeout)
|
|
raise CommandFailed(command=err.cmd, reason=str(err),
|
|
out=err.output)
|
|
|
|
def ceph_fsid_get(self):
|
|
return self.run_with_timeout(['/usr/bin/ceph', 'fsid'],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
|
|
def ceph_mgr_has_auth(self):
|
|
path = '{}/ceph-{}'.format(
|
|
CONFIG.ceph_mgr_confdir, CONFIG.ceph_mgr_identity)
|
|
try:
|
|
os.makedirs(path)
|
|
except OSError as err:
|
|
pass
|
|
try:
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'auth', 'get',
|
|
'mgr.{}'.format(CONFIG.ceph_mgr_identity),
|
|
'-o', '{}/keyring'.format(path)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
return True
|
|
except CommandFailed as err:
|
|
if 'ENOENT' in str(err):
|
|
return False
|
|
raise
|
|
|
|
def ceph_mgr_auth_create(self):
|
|
if self.ceph_mgr_has_auth():
|
|
return
|
|
LOG.info('Create ceph-mgr authentication')
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'auth', 'get-or-create',
|
|
'mgr.{}'.format(CONFIG.ceph_mgr_identity),
|
|
'mon', 'allow *', 'osd', 'allow *'],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
|
|
def ceph_mgr_is_running(self):
|
|
if not self.ceph_mgr:
|
|
return None
|
|
try:
|
|
self.ceph_mgr.wait(timeout=0)
|
|
except psutil.TimeoutExpired:
|
|
return True
|
|
return False
|
|
|
|
def ceph_mgr_start(self):
|
|
if self.ceph_mgr_is_running():
|
|
return
|
|
self.stop_unmanaged_ceph_mgr()
|
|
LOG.info('Start ceph-mgr daemon')
|
|
try:
|
|
with open(os.devnull, 'wb') as null:
|
|
self.ceph_mgr = psutil.Popen(
|
|
[CONFIG.ceph_mgr_service,
|
|
'--cluster', CONFIG.ceph_mgr_cluster,
|
|
'--conf', CONFIG.ceph_mgr_config,
|
|
'--id', CONFIG.ceph_mgr_identity,
|
|
'-f'],
|
|
close_fds=True,
|
|
stdout=null,
|
|
stderr=null,
|
|
shell=False)
|
|
except (OSError, ValueError) as err:
|
|
raise CephMgrStartFailed(reason=str(err))
|
|
time.sleep(CONFIG.ceph_mgr_grace_period_sec)
|
|
|
|
def ceph_mgr_stop(self):
|
|
if not self.ceph_mgr:
|
|
return
|
|
LOG.info('Stop ceph-mgr')
|
|
psutil_terminate_kill(self.ceph_mgr, CONFIG.ceph_mgr_kill_delay_sec)
|
|
|
|
def restful_plugin_has_server_port(self):
|
|
try:
|
|
with open(os.devnull, 'wb') as null:
|
|
out = self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'mgr/restful/server_port'],
|
|
CONFIG.ceph_cli_timeout_sec, stderr=null)
|
|
if out == str(CONFIG.restful_plugin_port):
|
|
return True
|
|
LOG.warning('Restful plugin port mismatch: '
|
|
'current=%d, expected=%d', out,
|
|
CONFIG.restful_plugin_port)
|
|
except CommandFailed as err:
|
|
LOG.warning('Failed to get restful plugin port: '
|
|
'reason=%s', str(err))
|
|
return False
|
|
|
|
def restful_plugin_set_server_port(self):
|
|
if self.restful_plugin_has_server_port():
|
|
return
|
|
LOG.info('Set restful plugin port=%d', CONFIG.restful_plugin_port)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'set',
|
|
'mgr/restful/server_port', str(CONFIG.restful_plugin_port)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
|
|
def restful_plugin_has_admin_key(self):
|
|
try:
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'mgr/restful/keys/admin'],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
return True
|
|
except CommandFailed:
|
|
pass
|
|
return False
|
|
|
|
def restful_plugin_create_admin_key(self):
|
|
if self.restful_plugin_has_admin_key():
|
|
return
|
|
LOG.info('Create restful plugin admin key')
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'restful',
|
|
'create-key', 'admin'],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
|
|
def restful_plugin_has_certificate(self):
|
|
try:
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'config/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'get',
|
|
'/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity)],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
return True
|
|
except CommandFailed:
|
|
pass
|
|
return False
|
|
|
|
def restful_plugin_create_certificate(self):
|
|
if self.restful_plugin_has_certificate():
|
|
return
|
|
LOG.info('Create restful plugin self signed certificate')
|
|
path = tempfile.mkdtemp()
|
|
try:
|
|
try:
|
|
with tempfile.NamedTemporaryFile() as restful_cnf:
|
|
restful_cnf.write((
|
|
'[req]\n'
|
|
'req_extensions = v3_ca\n'
|
|
'distinguished_name = req_distinguished_name\n'
|
|
'[v3_ca]\n'
|
|
'subjectAltName=DNS:{}\n'
|
|
'basicConstraints = CA:true\n'
|
|
'[ req_distinguished_name ]\n'
|
|
'0.organizationName = IT\n'
|
|
'commonName = ceph-restful\n').format(
|
|
CONFIG.ceph_mgr_identity))
|
|
restful_cnf.flush()
|
|
subprocess.check_call([
|
|
'/usr/bin/openssl', 'req', '-new', '-nodes', '-x509',
|
|
'-subj', '/O=IT/CN=' + CONFIG.ceph_mgr_identity,
|
|
'-days', '3650',
|
|
'-config', restful_cnf.name,
|
|
'-out', os.path.join(path, 'crt'),
|
|
'-keyout', os.path.join(path, 'key'),
|
|
'-extensions', 'v3_ca'])
|
|
except subprocess.CalledProcessError as err:
|
|
raise CommandFailed(
|
|
command=' '.join(err.cmd),
|
|
reason='failed to generate self-signed certificate: {}'.format(str(err)),
|
|
out=err.output)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'set',
|
|
'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity),
|
|
'-i', os.path.join(path, 'crt')],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'set',
|
|
'mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity),
|
|
'-i', os.path.join(path, 'crt')],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'set',
|
|
'config/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity),
|
|
'-i', os.path.join(path, 'key')],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'config-key', 'set',
|
|
'mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity),
|
|
'-i', os.path.join(path, 'key')],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
finally:
|
|
shutil.rmtree(path)
|
|
|
|
def restful_plugin_is_enabled(self):
|
|
command = ['/usr/bin/ceph', 'mgr', 'module', 'ls',
|
|
'--format', 'json']
|
|
with open(os.devnull, 'wb') as null:
|
|
out = self.run_with_timeout(
|
|
command, CONFIG.ceph_cli_timeout_sec, stderr=null)
|
|
try:
|
|
if 'restful' in json.loads(out)['enabled_modules']:
|
|
return True
|
|
except ValueError as err:
|
|
raise CommandFailed(
|
|
command=' '.join(command),
|
|
reason='unable to decode json: {}'.format(err), out=out)
|
|
except KeyError as err:
|
|
raise CommandFailed(
|
|
command=' '.join(command),
|
|
reason='missing expected key: {}'.format(err), out=out)
|
|
return False
|
|
|
|
def restful_plugin_enable(self):
|
|
if not self.restful_plugin_is_enabled():
|
|
LOG.info('Enable restful plugin')
|
|
self.run_with_timeout(
|
|
['/usr/bin/ceph', 'mgr',
|
|
'module', 'enable', 'restful'],
|
|
CONFIG.ceph_cli_timeout_sec)
|
|
time.sleep(CONFIG.restful_plugin_grace_period_sec)
|
|
|
|
def restful_plugin_get_url(self):
|
|
command = ['/usr/bin/ceph', 'mgr', 'services',
|
|
'--format', 'json']
|
|
with open(os.devnull, 'wb') as null:
|
|
out = self.run_with_timeout(
|
|
command, CONFIG.ceph_cli_timeout_sec, stderr=null)
|
|
try:
|
|
self.restful_plugin_url = json.loads(out)['restful']
|
|
except ValueError as err:
|
|
raise CephRestfulPluginFailed(
|
|
reason='unable to decode json: {} output={}'.format(err, out))
|
|
except KeyError as err:
|
|
raise CephRestfulPluginFailed(
|
|
reason='missing expected key: {} in ouput={}'.format(err, out))
|
|
self.request_update_plugin_url(self.restful_plugin_url)
|
|
|
|
def restful_plugin_get_certificate(self):
|
|
command = ['/usr/bin/ceph', 'config-key', 'get',
|
|
'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)]
|
|
with open(os.devnull, 'wb') as null:
|
|
certificate = self.run_with_timeout(
|
|
command, CONFIG.ceph_cli_timeout_sec, stderr=null)
|
|
with open(CONFIG.restful_plugin_cert_path, 'wb') as cert_file:
|
|
cert_file.write(certificate)
|
|
self.certificate = CONFIG.restful_plugin_cert_path
|
|
self.request_update_certificate(
|
|
self.certificate)
|
|
|
|
def restful_plugin_ping(self):
|
|
if not self.restful_plugin_url:
|
|
raise RestApiPingFailed(reason='missing service url')
|
|
if not self.certificate:
|
|
raise RestApiPingFailed(reason='missing certificate')
|
|
LOG.debug('Ping restful plugin: url=%d', self.restful_plugin_url)
|
|
try:
|
|
response = requests.request(
|
|
'GET', self.restful_plugin_url, verify=False,
|
|
timeout=CONFIG.rest_api_timeout_sec)
|
|
if not response.ok:
|
|
raise RestApiPingFailed(
|
|
reason='response not ok ({})'.format(response))
|
|
LOG.debug('Ping restful plugin OK')
|
|
except (requests.ConnectionError,
|
|
requests.Timeout,
|
|
requests.HTTPError) as err:
|
|
raise RestApiPingFailed(reason=str(err))
|
|
|
|
@staticmethod
|
|
def _make_client_socket():
|
|
sock = socket.socket(
|
|
socket.AF_UNIX, socket.SOCK_SEQPACKET)
|
|
sock.settimeout(2 * CONFIG.rest_api_timeout_sec)
|
|
sock.connect(CONFIG.service_socket)
|
|
return sock
|
|
|
|
@staticmethod
|
|
def request_status():
|
|
try:
|
|
with contextlib.closing(
|
|
ServiceMonitor._make_client_socket()) as sock:
|
|
sock.send('status')
|
|
status = sock.recv(CONFIG.service_socket_bufsize)
|
|
LOG.debug('Status %s', status)
|
|
return status.startswith('OK')
|
|
except socket.error as err:
|
|
LOG.error('Status error: reason=%s', err)
|
|
return False
|
|
|
|
@staticmethod
|
|
def request_stop():
|
|
try:
|
|
with contextlib.closing(
|
|
ServiceMonitor._make_client_socket()) as sock:
|
|
sock.send('stop')
|
|
response = sock.recv(CONFIG.service_socket_bufsize)
|
|
LOG.debug('Stop response: %s', response)
|
|
return True
|
|
except socket.error as err:
|
|
LOG.error('Stop error: reason=%s', err)
|
|
return False
|
|
|
|
@staticmethod
def request_update_ceph_mgr_failures(count):
    """Report the ceph-mgr failure count to the monitor daemon.

    :param count: number of ceph-mgr failures observed
    :returns: True if the request was delivered, False on socket error
    """
    try:
        with contextlib.closing(
                ServiceMonitor._make_client_socket()) as sock:
            sock.send('ceph-mgr-failures {}'.format(count))
            sock.recv(CONFIG.service_socket_bufsize)
            return True
    except socket.error as err:
        # Fix copy/paste log text: this is not the 'stop' request
        LOG.error('Update ceph-mgr failures error: reason=%s', err)
        return False
|
@staticmethod
def request_update_ping_failures(count):
    """Report the REST API ping failure count to the monitor daemon.

    :param count: number of ping failures observed
    :returns: True if the request was delivered, False on socket error
    """
    try:
        with contextlib.closing(
                ServiceMonitor._make_client_socket()) as sock:
            sock.send('ping-failures {}'.format(count))
            sock.recv(CONFIG.service_socket_bufsize)
            return True
    except socket.error as err:
        # Fix copy/paste log text: this is not the 'stop' request
        LOG.error('Update ping failures error: reason=%s', err)
        return False
|
@staticmethod
def request_update_plugin_url(url):
    """Send the restful plugin URL to the monitor daemon.

    :param url: the restful plugin service URL reported by ceph-mgr
    :returns: True if the request was delivered, False on socket error
    """
    try:
        with contextlib.closing(
                ServiceMonitor._make_client_socket()) as sock:
            sock.send('restful-url {}'.format(url))
            sock.recv(CONFIG.service_socket_bufsize)
            return True
    except socket.error as err:
        # Fix copy/paste log text: this is not the 'stop' request
        LOG.error('Update restful url error: reason=%s', err)
        return False
|
@staticmethod
def request_update_certificate(path):
    """Send the restful plugin certificate path to the monitor daemon.

    :param path: filesystem path of the restful plugin certificate
    :returns: True if the request was delivered, False on socket error
    """
    try:
        with contextlib.closing(
                ServiceMonitor._make_client_socket()) as sock:
            sock.send('certificate {}'.format(path))
            sock.recv(CONFIG.service_socket_bufsize)
            return True
    except socket.error as err:
        # Fix copy/paste log text: this is not the 'stop' request
        LOG.error('Update certificate error: reason=%s', err)
        return False
|
|
|
class InitWrapper(object):

    """Handle System V init script actions: start, stop, restart, etc. """

    def __init__(self):
        """Dispatch command line action to the corresponding function.

        Candidate action functions are all class methods except ones
        that start with an underscore.
        """
        parser = argparse.ArgumentParser()
        actions = [m[0]
                   for m in inspect.getmembers(self)
                   if (inspect.ismethod(m[1])
                       and not m[0].startswith('_'))]
        parser.add_argument(
            'action',
            choices=actions)
        self.args = parser.parse_args()
        getattr(self, self.args.action)()

    def start(self):
        """Start ServiceMonitor as a daemon unless one is already running.

        Use a pipe to report monitor status back to this process.
        """
        pipe = os.pipe()
        child = os.fork()
        if child == 0:
            # Child: daemonize and run the monitor; report startup
            # status to the parent through the write end of the pipe.
            os.close(pipe[0])
            with daemon.DaemonContext(files_preserve=[pipe[1]]):
                # prevent duplication of messages in log
                global LOG
                LOG = setup_logging(cleanup_handlers=True)
                try:
                    monitor = ServiceMonitor()
                    status = 'OK'
                except ServiceAlreadyStarted:
                    # another monitor already owns the service socket;
                    # report success and exit quietly
                    os.write(pipe[1], 'OK')
                    os.close(pipe[1])
                    return
                except Exception as err:
                    status = str(err)
                os.write(pipe[1], status)
                os.close(pipe[1])
                if status == 'OK':
                    try:
                        monitor.run()
                    except ServiceException as err:
                        LOG.warning(str(err))
                    except Exception as err:
                        LOG.exception('Service monitor error: reason=%s', err)
        else:
            # Parent: wait for the child's startup report and translate
            # it into the init script's exit code.
            os.close(pipe[1])
            try:
                status = os.read(pipe[0], CONFIG.service_socket_bufsize)
                if status == 'OK':
                    sys.exit(0)
                else:
                    LOG.warning('Service monitor failed to start: '
                                'status=%s', status)
            except IOError as err:
                LOG.warning('Failed to read monitor status: reason=%s', err)
            os.close(pipe[0])
            os.waitpid(child, 0)
            sys.exit(1)

    def stop(self):
        """Tell ServiceMonitor daemon to stop running.

        In case request fails stop ServiceMonitor and ceph_mgr processes
        using SIGTERM followed by SIGKILL.
        """
        result = ServiceMonitor.request_stop()
        if not result:
            # Monitor is unresponsive: find and terminate the monitor
            # and ceph-mgr processes directly.
            ceph_mgr = os.path.basename(CONFIG.ceph_mgr_service)
            procs = []
            for proc in psutil.process_iter():
                name = proc.name()
                if name == CONFIG.service_name:
                    procs.append(proc)
                if name == ceph_mgr:
                    procs.append(proc)
            psutil_terminate_kill(procs, CONFIG.ceph_mgr_kill_delay_sec)

    def restart(self):
        """Stop then start the service monitor."""
        self.stop()
        self.start()

    def force_reload(self):
        """Alias for restart; this service has no separate reload path."""
        self.restart()

    def reload(self):
        """Alias for restart; this service has no separate reload path."""
        self.restart()

    def status(self):
        """Report status from ServiceMonitor.

        We don't just try to access REST API here because ServiceMonitor may
        be in the process of starting/configuring ceph-mgr and restful
        plugin in which case we report OK to avoid being restarted by SM.
        """
        status = ServiceMonitor.request_status()
        sys.exit(0 if status is True else 1)
|
|
|
|
# Script entry point: parse the init action from the command line and
# dispatch it via InitWrapper.
if __name__ == '__main__':
    InitWrapper()
|