Merge remote-tracking branch 'starlingx/master' into HEAD

Change-Id: Ic566b87ddbfc0f838dded07306b19b73cd566161
Signed-off-by: Scott Little <scott.little@windriver.com>

commit d6a1fd98d6
@@ -100,6 +100,7 @@ monitoring/influxdb-extensions
 kubernetes/kubernetes
 kubernetes/docker-distribution
 kubernetes/helm
+kubernetes/registry-token-server
 logging/logmgmt
 filesystem/filesystem-scripts
 utilities/branding
@@ -1,4 +1,5 @@
 SRC_DIR="$CGCS_BASE/git/ceph"
+COPY_LIST="files/*"
 TIS_BASE_SRCREV=3f07f7ff1a5c7bfa8d0de12c966594d5fb7cf4ec
 TIS_PATCH_VER=GITREVCOUNT
 BUILD_IS_BIG=40
@@ -1 +0,0 @@
-../../../../git/ceph/ceph.spec

ceph/ceph/centos/ceph.spec (new file, 1884 lines)
File diff suppressed because it is too large.

ceph/ceph/files/ceph-init-wrapper.sh (new executable file, 282 lines)
@@ -0,0 +1,282 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script is a helper wrapper for pmon monitoring of ceph
# processes. The "/etc/init.d/ceph" script does not know if ceph is
# running on the node. For example, when the node is locked, ceph
# processes are not running. In that case we do not want pmond to
# monitor these processes.
#
# The script "/etc/services.d/<node>/ceph.sh" creates the file
# "/var/run/.ceph_started" when ceph is running and removes it when
# it is not.
#
# The script also extracts one or more ceph process names that are
# reported as 'not running', 'dead' or 'failed' by '/etc/init.d/ceph status'
# and writes the names to a text file, /tmp/ceph_status_failure.txt, for
# pmond to access. Pmond adds the text to its logs and alarms. Examples of
# text written to the file by this script are:
# 'osd.1'
# 'osd.1, osd.2'
# 'mon.storage-0'
# 'mon.storage-0, osd.2'
#
# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
# the script will try to increase their logging to 'debug' for a configurable
# interval. With logging increased it dumps a few stack traces, then, at the
# end of this interval, it triggers a core dump of the process and kills it.
#
# Return values:
# zero     - /etc/init.d/ceph returned success or ceph is not running on the node
# non-zero - /etc/init.d/ceph returned a failure or invalid syntax
#

source /usr/bin/tsconfig
source /etc/platform/platform.conf

CEPH_SCRIPT="/etc/init.d/ceph"
CEPH_FILE="$VOLATILE_PATH/.ceph_started"
CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting"
CEPH_GET_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_status"
CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"

BINDIR=/usr/bin
SBINDIR=/usr/sbin
LIBDIR=/usr/lib64/ceph
ETCDIR=/etc/ceph
source $LIBDIR/ceph_common.sh

LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
LOG_LEVEL=NORMAL # DEBUG
verbose=0

DATA_PATH=$VOLATILE_PATH/ceph_hang    # folder where we keep state information
mkdir -p $DATA_PATH                   # make sure folder exists

MONITORING_INTERVAL=15
TRACE_LOOP_INTERVAL=5
GET_STATUS_TIMEOUT=120
CEPH_STATUS_TIMEOUT=20

WAIT_FOR_CMD=1

RC=0

args=("$@")

if [ ! -z $ARGS ]; then
    IFS=";" read -r -a new_args <<< "$ARGS"
    args+=("${new_args[@]}")
fi

wait_for_status ()
{
    timeout=$GET_STATUS_TIMEOUT  # wait for status no more than $timeout seconds
    while [ -f ${CEPH_GET_STATUS_FILE} ] && [ $timeout -gt 0 ]; do
        sleep 1
        let timeout-=1
    done
    if [ $timeout -eq 0 ]; then
        wlog "-" "WARN" "Getting status takes more than ${GET_STATUS_TIMEOUT}s, continuing"
        rm -f $CEPH_GET_STATUS_FILE
    fi
}

start ()
{
    if [ -f ${CEPH_FILE} ]; then
        wait_for_status
        ${CEPH_SCRIPT} start $1
        RC=$?
    else
        # Ceph is not running on this node, return success
        exit 0
    fi
}

stop ()
{
    wait_for_status
    ${CEPH_SCRIPT} stop $1
}

restart ()
{
    if [ -f ${CEPH_FILE} ]; then
        wait_for_status
        touch $CEPH_RESTARTING_FILE
        ${CEPH_SCRIPT} restart $1
        rm -f $CEPH_RESTARTING_FILE
    else
        # Ceph is not running on this node, return success
        exit 0
    fi
}

log_and_restart_blocked_osds ()
{
    # Log info about the blocked osd daemons and then restart them
    local names=$1
    for name in $names; do
        wlog $name "INFO" "Restarting OSD with blocked operations"
        ${CEPH_SCRIPT} restart $name
    done
}

log_and_kill_hung_procs ()
{
    # Log info about the hung processes and then kill them; later on pmon will restart them
    local names=$1
    for name in $names; do
        type=`echo $name | cut -c 1-3`  # e.g. 'mon', if $item is 'mon1'
        id=`echo $name | cut -c 4- | sed 's/^\\.//'`
        get_conf run_dir "/var/run/ceph" "run dir"
        get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
        pid=$(cat $pid_file)
        wlog $name "INFO" "Dealing with hung process (pid:$pid)"

        # monitoring interval
        wlog $name "INFO" "Increasing log level"
        execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
        monitoring=$MONITORING_INTERVAL
        while [ $monitoring -gt 0 ]; do
            if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
                date=$(date "+%Y-%m-%d_%H-%M-%S")
                log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
                wlog $name "INFO" "Dumping stack trace to: $log_file"
                $(pstack $pid >$log_file) &
            fi
            let monitoring-=1
            sleep 1
        done
        wlog $name "INFO" "Trigger core dump"
        kill -ABRT $pid &>/dev/null
        rm -f $pid_file  # process is dead, core dump is archiving, preparing for restart
        # Wait for pending systemd core dumps
        sleep 2  # hope systemd_coredump has started meanwhile
        deadline=$(( $(date '+%s') + 300 ))
        while [[ $(date '+%s') -lt "${deadline}" ]]; do
            systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
            [[ -z "${systemd_coredump_pid}" ]] && break
            wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
            sleep 2
        done
        kill -KILL $pid &>/dev/null
    done
}

status ()
{
    if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
        timeout $CEPH_STATUS_TIMEOUT ceph -s
        if [ "$?" -ne 0 ]; then
            # Ceph cluster is not accessible. Don't panic, controller swact
            # may be in progress.
            wlog "-" INFO "Ceph is down, ignoring OSD status."
            exit 0
        fi
    fi

    if [ -f ${CEPH_RESTARTING_FILE} ]; then
        # Ceph is restarting, we don't report state changes on the first pass
        rm -f ${CEPH_RESTARTING_FILE}
        exit 0
    fi
    if [ -f ${CEPH_FILE} ]; then
        # Make sure the script does not 'exit' between here and the 'rm -f' below
        # or the checkpoint file will be left behind
        touch -f ${CEPH_GET_STATUS_FILE}
        result=`${CEPH_SCRIPT} status $1`
        RC=$?
        if [ "$RC" -ne 0 ]; then
            erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            invalid=0
            host=`hostname`
            if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
                # On 2 node configuration we have a floating monitor
                host="controller"
            fi
            for i in $(echo $erred_procs $hung_procs); do
                if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
                    continue
                else
                    invalid=1
                fi
            done

            log_and_restart_blocked_osds $blocked_ops_procs
            log_and_kill_hung_procs $hung_procs

            hung_procs_text=""
            for i in $(echo $hung_procs); do
                hung_procs_text+="$i(process hung) "
            done

            rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
            if [ $invalid -eq 0 ]; then
                text=""
                for i in $erred_procs; do
                    text+="$i, "
                done
                for i in $hung_procs; do
                    text+="$i (process hang), "
                done
                echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
            else
                echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
                echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
            fi
        fi

        rm -f ${CEPH_GET_STATUS_FILE}

        if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
            # SM needs exit code != 0 from the 'status mon' argument of the init script on
            # the standby controller, otherwise it thinks that the monitor is running and
            # tries to stop it.
            # '/etc/init.d/ceph status mon' checks the status of monitors configured in
            # /etc/ceph/ceph.conf and whether one should be running on the current host.
            # If it should not be running it just exits with code 0. This is what
            # happens on the standby controller.
            # When the floating monitor is running on the active controller, /var/lib/ceph/mon
            # of the standby is not mounted (the Ceph monitor partition is DRBD synced).
            test -e "/var/lib/ceph/mon/ceph-controller"
            if [ "$?" -ne 0 ]; then
                exit 3
            fi
        fi
    else
        # Ceph is not running on this node, return success
        exit 0
    fi
}

case "${args[0]}" in
    start)
        start ${args[1]}
        ;;
    stop)
        stop ${args[1]}
        ;;
    restart)
        restart ${args[1]}
        ;;
    status)
        status ${args[1]}
        ;;
    *)
        echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
        exit 1
        ;;
esac

exit $RC
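A note on invocation, sketched from the args/ARGS handling at the top of the wrapper: arguments may be passed on the command line or packed into the ARGS environment variable, ';'-separated, and the two lists are concatenated. For example (the daemon names below are illustrative):

    /etc/init.d/ceph-init-wrapper status osd
    ARGS="restart;mon.controller-0" /etc/init.d/ceph-init-wrapper    # same as: restart mon.controller-0
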
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 #
-# Copyright (c) 2016 Wind River Systems, Inc.
+# Copyright (c) 2019 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -12,6 +12,7 @@ import re
 import subprocess
 import sys
 
+DEVICE_NAME_NVME = "nvme"
 
 #########
 # Utils #
@@ -85,7 +86,11 @@ def is_partitioning_correct(disk_path, partition_sizes):
     partition_index = 1
     for size in partition_sizes:
         # Check that each partition size matches the one in input
-        partition_node = disk_node + str(partition_index)
+        if DEVICE_NAME_NVME in disk_node:
+            partition_node = '{}p{}'.format(disk_node, str(partition_index))
+        else:
+            partition_node = '{}{}'.format(disk_node, str(partition_index))
 
         output, _, _ = command(["udevadm", "settle", "-E", partition_node])
         cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
         output, _, _ = command(cmd)
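The NVMe branch added above reflects kernel block-device naming: SCSI/SATA disks append the partition index directly, while NVMe namespaces insert a 'p' separator before it. A quick shell illustration (device names are hypothetical):

    disk=/dev/sdb;     echo "${disk}1"     # -> /dev/sdb1
    disk=/dev/nvme0n1; echo "${disk}p1"    # -> /dev/nvme0n1p1
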
ceph/ceph/files/ceph-radosgw.service (new file, 18 lines)
@@ -0,0 +1,18 @@
[Unit]
Description=radosgw RESTful rados gateway
After=network.target
#After=remote-fs.target nss-lookup.target network-online.target time-sync.target
#Wants=network-online.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-radosgw start
ExecStop=/etc/rc.d/init.d/ceph-radosgw stop
ExecReload=/etc/rc.d/init.d/ceph-radosgw reload

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph-rest-api (new file, 92 lines)
@@ -0,0 +1,92 @@
#!/bin/sh

### BEGIN INIT INFO
# Provides:          ceph-rest-api
# Required-Start:    $ceph
# Required-Stop:     $ceph
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Ceph REST API daemon
# Description:       Ceph REST API daemon
### END INIT INFO

DESC="ceph-rest-api"
DAEMON="/usr/bin/ceph-rest-api"
RUNDIR="/var/run/ceph"
PIDFILE="${RUNDIR}/ceph-rest-api.pid"

start()
{
    if [ -e $PIDFILE ]; then
        PIDDIR=/proc/$(cat $PIDFILE)
        if [ -d ${PIDDIR} ]; then
            echo "$DESC already running."
            exit 0
        else
            echo "Removing stale PID file $PIDFILE"
            rm -f $PIDFILE
        fi
    fi

    echo -n "Starting $DESC..."
    mkdir -p $RUNDIR
    start-stop-daemon --start --quiet --background \
        --pidfile ${PIDFILE} --make-pidfile --exec ${DAEMON}

    if [ $? -eq 0 ]; then
        echo "done."
    else
        echo "failed."
        exit 1
    fi
}

stop()
{
    echo -n "Stopping $DESC..."
    start-stop-daemon --stop --quiet --pidfile $PIDFILE
    if [ $? -eq 0 ]; then
        echo "done."
    else
        echo "failed."
    fi
    rm -f $PIDFILE
}

status()
{
    pid=`cat $PIDFILE 2>/dev/null`
    if [ -n "$pid" ]; then
        if ps -p $pid &>/dev/null ; then
            echo "$DESC is running"
            exit 0
        else
            echo "$DESC is not running but has pid file"
            exit 1
        fi
    fi
    echo "$DESC is not running"
    exit 3
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart|force-reload|reload)
        stop
        start
        ;;
    status)
        status
        ;;
    *)
        echo "Usage: $0 {start|stop|force-reload|restart|reload|status}"
        exit 1
        ;;
esac

exit 0
ceph/ceph/files/ceph-rest-api.service (new file, 16 lines)
@@ -0,0 +1,16 @@
[Unit]
Description=Ceph REST API
After=network.target ceph.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-rest-api start
ExecStop=/etc/rc.d/init.d/ceph-rest-api stop
ExecReload=/etc/rc.d/init.d/ceph-rest-api reload

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph.conf (new file, 50 lines)
@@ -0,0 +1,50 @@
[global]
# Unique ID for the cluster.
fsid = %CLUSTER_UUID%
# Public network where the monitor is connected to, e.g., 128.224.0.0/16
#public network = 127.0.0.1/24
# For version 0.55 and beyond, you must explicitly enable
# or disable authentication with "auth" entries in [global].
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
osd_journal_size = 1024

# Uncomment the following line if you are mounting with ext4
# filestore xattr use omap = true

# Number of replicas of objects. Write an object 2 times.
# Cluster cannot reach an active + clean state until there are enough OSDs
# to handle the number of copies of an object. In this case, it requires
# at least 2 OSDs
osd_pool_default_size = 2

# Allow writing one copy in a degraded state.
osd_pool_default_min_size = 1

# Ensure you have a realistic number of placement groups. We recommend
# approximately 100 per OSD. E.g., total number of OSDs multiplied by 100
# divided by the number of replicas (i.e., osd pool default size). So for
# 2 OSDs and osd pool default size = 2, we'd recommend approximately
# (100 * 2) / 2 = 100.
osd_pool_default_pg_num = 64
osd_pool_default_pgp_num = 64
osd_crush_chooseleaf_type = 1
setuser match path = /var/lib/ceph/$type/$cluster-$id

# Override Jewel default of 2 reporters. StarlingX has replication factor 2
mon_osd_min_down_reporters = 1

# Use Hammer's report interval default value
osd_mon_report_interval_max = 120

[osd]
osd_mkfs_type = xfs
osd_mkfs_options_xfs = "-f"
osd_mount_options_xfs = "rw,noatime,inode64,logbufs=8,logbsize=256k"

[mon]
mon warn on legacy crush tunables = false
# Quiet new warnings on move to Hammer
mon pg warn max per osd = 2048
mon pg warn max object skew = 0
ceph/ceph/files/ceph.conf.pmon (new file, 26 lines)
@@ -0,0 +1,26 @@
[process]
process  = ceph
script   = /etc/init.d/ceph-init-wrapper

style    = lsb
severity = major    ; minor, major, critical
restarts = 3        ; restart retries before error assertion
interval = 30       ; number of seconds to wait between restarts

mode     = status   ; Monitoring mode: passive (default) or active
                    ; passive: process death monitoring (default: always)
                    ; active : heartbeat monitoring, i.e. request / response messaging
                    ; status : determine process health by executing the "status" command
                    ;          "start" is used to start the process(es) again
                    ; ignore : do not monitor or stop monitoring

; Status and Active Monitoring Options

period   = 30       ; monitor period in seconds
timeout  = 120      ; for active mode, messaging timeout period in seconds, must be shorter than period
                    ; for status mode, max amount of time for a command to execute

; Status Monitoring Options
start_arg           = start                             ; start argument for the script
status_arg          = status                            ; status argument for the script
status_failure_text = /tmp/ceph_status_failure.txt      ; text to be added to alarms or logs (optional)
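In status mode, a pmond health check reduces to running the configured script with status_arg and, on failure, reading the status_failure_text file. A rough shell equivalent of one monitoring pass (illustrative only; pmond itself is not a shell script):

    if ! /etc/init.d/ceph-init-wrapper status; then
        # the wrapper leaves failed daemon names here for pmond to log and alarm on
        cat /tmp/ceph_status_failure.txt 2>/dev/null
    fi
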
ceph/ceph/files/ceph.service (new file, 16 lines)
@@ -0,0 +1,16 @@
[Unit]
Description=StarlingX Ceph Startup
After=network.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph start
ExecStop=/etc/rc.d/init.d/ceph stop
PIDFile=/var/run/ceph/ceph.pid

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph.sh (new executable file, 77 lines)
@@ -0,0 +1,77 @@
#!/bin/bash

INITDIR=/etc/init.d
LOGFILE=/var/log/ceph/ceph-init.log
CEPH_FILE=/var/run/.ceph_started

# Get our nodetype
. /etc/platform/platform.conf

# Exit immediately if ceph not configured (i.e. no mon in the config file)
if ! grep -q "mon\." /etc/ceph/ceph.conf
then
    exit 0
fi

logecho ()
{
    echo $1
    date >> ${LOGFILE}
    echo $1 >> ${LOGFILE}
}

start ()
{
    if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
        logecho "Starting ceph services..."
        ${INITDIR}/ceph start >> ${LOGFILE} 2>&1
        RC=$?

        if [ ! -f ${CEPH_FILE} ]; then
            touch ${CEPH_FILE}
        fi
    else
        logecho "No ceph services on ${nodetype} node"
        exit 0
    fi
}

stop ()
{
    if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
        if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "simplex" ]]; then
            logecho "Ceph services will continue to run on node"
            exit 0
        fi

        logecho "Stopping ceph services..."

        if [ -f ${CEPH_FILE} ]; then
            rm -f ${CEPH_FILE}
        fi

        ${INITDIR}/ceph stop >> ${LOGFILE} 2>&1
        RC=$?
    else
        logecho "No ceph services on ${nodetype} node"
        exit 0
    fi
}

RC=0

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    *)
        echo "Usage: $0 {start|stop}"
        exit 1
        ;;
esac

logecho "RC was: $RC"
exit $RC
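The checkpoint file is what ties ceph.sh to ceph-init-wrapper: only after ceph.sh touches /var/run/.ceph_started does the wrapper perform real checks; otherwise it exits 0 so pmond leaves locked nodes alone. A sketch of the sequence (the services.d path assumes a controller node, per the wrapper's header comment):

    /etc/services.d/controller/ceph.sh start    # starts ceph and creates /var/run/.ceph_started
    /etc/init.d/ceph-init-wrapper status mon    # now runs an actual status check
    /etc/services.d/controller/ceph.sh stop     # removes the flag; the wrapper reverts to 'exit 0'
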
ceph/ceph/files/osd-wait-status.py (new file, 246 lines)
@@ -0,0 +1,246 @@
#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Wait for one or a group of OSDs to match one or a group of statuses
# as reported by "ceph osd tree".
#
# Examples:
# - wait for osd 0 to be up:
#   osd-wait-status -o 0 -s up
#
# - wait for osd 0 and osd 1 to be up:
#   osd-wait-status -o 0 1 -s up
#
# The amount of time spent waiting for OSDs to match a status can
# be limited by specifying:
#
# - the maximum retry count; the script will exit if the status doesn't
#   match the desired one after more than retry count attempts.
#   The interval between attempts is controlled by the "-i" flag.
#   Example:
#     osd-wait-status -o 0 -s up -c 2 -i 3
#   will call "ceph osd tree" once to get the status of osd 0 and if
#   it's not "up" then it will try one more time after 3 seconds.
#
# - a deadline as the maximum interval of time the script is looping
#   waiting for OSDs to match status. The interval between attempts
#   is controlled by the "-i" flag.
#   Example:
#     osd-wait-status -o 0 -s up -d 10 -i 3
#   will call "ceph osd tree" until either osd 0 status is "up" or
#   no more than 10 seconds have passed; that's 3-4 attempts depending
#   on how much time it takes to run "ceph osd tree".
#
# Status match can be reversed by using the "-n" flag.
# Example:
#   osd-wait-status -o 0 -n -s up
# waits until osd 0 status is NOT up.
#
# osd-wait-status does not allow matching arbitrary combinations of
# OSDs and statuses. For example: "osd 0 up and osd 1 down" is not
# supported.
#
# Return code is 0 if OSDs match the expected status before the
# retry count*interval / deadline limits are reached.

import argparse
import json
import logging
import retrying
import subprocess
import sys
import time

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger('osd-wait-status')

CEPH_BINARY_PATH = '/usr/bin/ceph'
RETRY_INTERVAL_SEC = 1
RETRY_FOREVER = 0
NO_DEADLINE = 0


class OsdException(Exception):
    def __init__(self, message, restartable=False):
        super(OsdException, self).__init__(message)
        self.restartable = restartable


def get_osd_tree():
    command = [CEPH_BINARY_PATH,
               'osd', 'tree', '--format', 'json']
    try:
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode != 0:
            raise OsdException(
                ('Command failed: command="{}", '
                 'returncode={}, output="{}"').format(
                    ' '.join(command),
                    p.returncode,
                    output, error),
                restartable=True)
    except OSError as e:
        raise OsdException(
            ('Command failed: command="{}", '
             'reason="{}"').format(command, str(e)))
    try:
        return json.loads(output)
    except ValueError as e:
        raise OsdException(
            ('JSON decode failed: '
             'data="{}", error="{}"').format(
                output, e))


def osd_match_status(target_osd, target_status,
                     reverse_logic):
    LOG.info(('Match status: '
              'target_osd={}, '
              'target status={}, '
              'reverse_logic={}').format(
        target_osd, target_status, reverse_logic))
    tree = get_osd_tree()
    osd_status = {}
    for node in tree.get('nodes'):
        name = node.get('name')
        if name in target_osd:
            osd_status[name] = node.get('status')
        if len(osd_status) == len(target_osd):
            break
    LOG.info('Current OSD(s) status: {}'.format(osd_status))
    for name in target_osd:
        if name not in osd_status:
            raise OsdException(
                ('Unable to retrieve status '
                 'for "{}"').format(
                    name))
        if reverse_logic:
            if osd_status[name] not in target_status:
                del osd_status[name]
        else:
            if osd_status[name] in target_status:
                del osd_status[name]
    if len(osd_status) == 0:
        LOG.info('OSD(s) status target reached.')
        return True
    else:
        LOG.info('OSD(s) {}matching status {}: {}'.format(
            '' if reverse_logic else 'not ',
            target_status,
            osd_status.keys()))
        return False


def osd_wait_status(target_osd, target_status,
                    reverse_logic,
                    retry_count, retry_interval,
                    deadline):

    def retry_if_false(result):
        return (result is False)

    def retry_if_restartable(exception):
        return (isinstance(exception, OsdException)
                and exception.restartable)

    LOG.info(('Wait options: '
              'target_osd={}, '
              'target_status={}, '
              'reverse_logic={}, '
              'retry_count={}, '
              'retry_interval={}, '
              'deadline={}').format(
        target_osd, target_status, reverse_logic,
        retry_count, retry_interval, deadline))
    kwargs = {
        'retry_on_result': retry_if_false,
        'retry_on_exception': retry_if_restartable}
    if retry_count != RETRY_FOREVER:
        kwargs['stop_max_attempt_number'] = retry_count
    if deadline != NO_DEADLINE:
        kwargs['stop_max_delay'] = deadline * 1000
    if retry_interval != 0:
        kwargs['wait_fixed'] = retry_interval * 1000
    if not len(target_osd):
        return
    retrying.Retrying(**kwargs).call(
        osd_match_status,
        target_osd, target_status,
        reverse_logic)


def non_negative_integer(value):
    value = int(value)
    if value < 0:
        raise argparse.ArgumentTypeError(
            '{} is a negative integer value'.format(value))
    return value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Wait for OSD status match')
    parser.add_argument(
        '-o', '--osd',
        nargs='*',
        help='osd id',
        type=non_negative_integer,
        required=True)
    parser.add_argument(
        '-n', '--not',
        dest='reverse_logic',
        help='reverse logic: wait for status NOT to match',
        action='store_true',
        default=False)
    parser.add_argument(
        '-s', '--status',
        nargs='+',
        help='status',
        type=str,
        required=True)
    parser.add_argument(
        '-c', '--retry-count',
        help='retry count',
        type=non_negative_integer,
        default=RETRY_FOREVER)
    parser.add_argument(
        '-i', '--retry-interval',
        help='retry interval (seconds)',
        type=non_negative_integer,
        default=RETRY_INTERVAL_SEC)
    parser.add_argument(
        '-d', '--deadline',
        help='deadline (seconds)',
        type=non_negative_integer,
        default=NO_DEADLINE)
    args = parser.parse_args()
    start = time.time()
    try:
        osd_wait_status(
            ['osd.{}'.format(o) for o in args.osd],
            args.status,
            args.reverse_logic,
            args.retry_count,
            args.retry_interval,
            args.deadline)
        LOG.info('Elapsed time: {:.02f} seconds'.format(
            time.time() - start))
        sys.exit(0)
    except retrying.RetryError as e:
        LOG.warn(
            ('Retry error: {}. '
             'Elapsed time: {:.02f} seconds'.format(
                e, time.time() - start)))
    except OsdException as e:
        LOG.warn(
            ('OSD wait error: {}. '
             'Elapsed time: {:.02f} seconds').format(
                e, time.time() - start))
    sys.exit(1)
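Since the exit code reports whether the match was reached, the script can gate follow-up steps in maintenance scripting. A hedged example using only the flags documented in the header (the install path is an assumption):

    # wait up to 30 seconds for osd.0 and osd.1 to leave the "up" state
    osd-wait-status.py -o 0 1 -n -s up -d 30 -i 3 && echo "OSDs are down, safe to proceed"
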
ceph/ceph/files/stx_git_version (new file, 2 lines)
@@ -0,0 +1,2 @@
656b5b63ed7c43bd014bcafd81b001959d5f089f
v10.2.6
@@ -1 +1 @@
-TIS_PATCH_VER=6
+TIS_PATCH_VER=7
@@ -0,0 +1,32 @@
From b590f06d6f6ce2bd71d4d0389b6d51a78e225c19 Mon Sep 17 00:00:00 2001
From: Ovidiu Poncea <ovidiu.poncea@windriver.com>
Date: Thu, 20 Dec 2018 08:07:15 -0500
Subject: [PATCH] Add-StarlingX-specific-restart-command-for-Ceph-moni patch

---
 SPECS/puppet-ceph.spec | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SPECS/puppet-ceph.spec b/SPECS/puppet-ceph.spec
index 0b728a1..e5cc64c 100644
--- a/SPECS/puppet-ceph.spec
+++ b/SPECS/puppet-ceph.spec
@@ -14,6 +14,7 @@ Patch0003: 0003-Ceph-Jewel-rebase.patch
 Patch0004: 0004-US92424-Add-OSD-support-for-persistent-naming.patch
 Patch0005: 0005-Remove-puppetlabs-apt-as-ceph-requirement.patch
 Patch0006: 0006-ceph-disk-prepare-invalid-data-disk-value.patch
+Patch0007: 0007-Add-StarlingX-specific-restart-command-for-Ceph-moni.patch

 BuildArch: noarch

@@ -35,6 +36,7 @@ Community Developed Ceph Module
 %patch0004 -p1
 %patch0005 -p1
 %patch0006 -p1
+%patch0007 -p1

 find . -type f -name ".*" -exec rm {} +
 find . -size 0 -exec rm {} +
--
1.8.3.1
@@ -4,3 +4,4 @@
 0004-Add-OSD-support-for-persistent-naming.patch
 0005-meta-patch-for-patch5.patch
 0006-add-ceph-disk-prepare-invalid-data-disk-value-patch.patch
+0007-Add-StarlingX-specific-restart-command-for-Ceph-moni.patch
@@ -0,0 +1,35 @@
From a364f37cacab78cdaad5ebd23ab24cf400a3fa40 Mon Sep 17 00:00:00 2001
From: Ovidiu Poncea <ovidiu.poncea@windriver.com>
Date: Thu, 20 Dec 2018 07:18:55 -0500
Subject: [PATCH] Add StarlingX specific restart command for Ceph monitors

Since we don't use systemd to manage Ceph and we have pmon monitoring we
have to make sure that:
1. Restarting is properly handled, as "systemctl restart" will return an
   error and the manifest will fail;
2. Pmon does not check ceph-mon status during restart. Otherwise we risk
   getting into a race condition between the puppet restart and pmon
   detecting that ceph is down and trying a restart.

Both are resolved when using /etc/init.d/ceph-init-wrapper restart

Signed-off-by: Ovidiu Poncea <Ovidiu.Poncea@windriver.com>
---
 manifests/mon.pp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/manifests/mon.pp b/manifests/mon.pp
index 17cb925..62d5059 100644
--- a/manifests/mon.pp
+++ b/manifests/mon.pp
@@ -106,6 +106,7 @@ define ceph::mon (
     start   => "service ceph start mon.${id}",
     stop    => "service ceph stop mon.${id}",
     status  => "service ceph status mon.${id}",
+    restart => "/etc/init.d/ceph-init-wrapper restart mon.${id}",
     enable  => $mon_enable,
   }
 }
--
1.8.3.1
@@ -49,31 +49,4 @@ if [ $REINDEX -eq 1 ]; then
     /usr/sbin/helm repo index $REPO_DIR
 fi
 
-if [ ! -f "/etc/platform/simplex" ]; then
-    # We're not a one node system, copy the files to the other
-    # controller if we can
-    if [ $HOSTNAME == "controller-0" ]; then
-        TARGET="controller-1"
-    else
-        TARGET="controller-0"
-    fi
-
-    # We've modified etc/rsyncd.conf to allow access to /www/helm_charts
-    # To avoid races, copy over the index file last.
-    rsync -acv --exclude=index.yaml ${REPO_DIR}/ rsync://${TARGET}/helm_charts
-    if [ $? -ne 0 ]; then
-        echo Problem syncing helm charts to $TARGET
-        RETVAL=1
-    fi
-
-    rsync -acv ${REPO_DIR}/index.yaml rsync://${TARGET}/helm_charts
-    if [ $? -ne 0 ]; then
-        echo Problem syncing helm chart index file to $TARGET
-        RETVAL=1
-    fi
-fi
-
-# We also need to sync the helm charts on node startup
-# in case they were added while the node was down.
-
 exit $RETVAL
@@ -1,4 +1,12 @@
 TAR_NAME="registry-token-server"
 SRC_DIR="$PKG_BASE/src"
-COPY_LIST="$FILES_BASE/*"
-TIS_PATCH_VER=0
+COPY_LIST=" \
+    $FILES_BASE/* \
+    $STX_BASE/downloads/Sirupsen-logrus-55eb11d21d2a31a3cc93838241d04800f52e823d.tar.gz \
+    $STX_BASE/downloads/docker-distribution-48294d928ced5dd9b378f7fd7c6f5da3ff3f2c89.tar.gz \
+    $STX_BASE/downloads/docker-libtrust-fa567046d9b14f6aa788882a950d69651d230b21.tar.gz \
+    $STX_BASE/downloads/gophercloud-gophercloud-aa00757ee3ab58e53520b6cb910ca0543116400a.tar.gz \
+    $STX_BASE/downloads/gorilla-context-08b5f424b9271eedf6f9f0ce86cb9396ed337a42.tar.gz \
+    $STX_BASE/downloads/gorilla-mux-456bcfa82d672db7cae587c9b541463f65bc2718.tar.gz \
+    "
+TIS_PATCH_VER=1
@@ -11,13 +11,20 @@ Source0: registry-token-server-%{version}.tar.gz
 Source1: %{name}.service
 Source2: token_server.conf
 
+# Go dependencies downloaded as tarballs
+Source10: Sirupsen-logrus-55eb11d21d2a31a3cc93838241d04800f52e823d.tar.gz
+Source11: docker-distribution-48294d928ced5dd9b378f7fd7c6f5da3ff3f2c89.tar.gz
+Source12: docker-libtrust-fa567046d9b14f6aa788882a950d69651d230b21.tar.gz
+Source13: gophercloud-gophercloud-aa00757ee3ab58e53520b6cb910ca0543116400a.tar.gz
+Source14: gorilla-context-08b5f424b9271eedf6f9f0ce86cb9396ed337a42.tar.gz
+Source15: gorilla-mux-456bcfa82d672db7cae587c9b541463f65bc2718.tar.gz
 
 BuildRequires: systemd
 Requires(post): systemd
 Requires(preun): systemd
 Requires(postun): systemd
 
 BuildRequires: golang >= 1.6
-BuildRequires: golang-dep
 ExclusiveArch: %{?go_arches:%{go_arches}}%{!?go_arches:%{ix86} x86_64 %{arm}}
 
 %description
@@ -26,13 +33,26 @@ ExclusiveArch: %{?go_arches:%{go_arches}}%{!?go_arches:%{ix86} x86_64 %{arm}}
 %prep
 %setup -q -n registry-token-server-%{version}
+
+# Extract other go dependencies
+%setup -T -D -a 10
+%setup -T -D -a 11
+%setup -T -D -a 12
+%setup -T -D -a 13
+%setup -T -D -a 14
+%setup -T -D -a 15
+mkdir -p _build/src/github.com/gorilla/ && mv gorilla-mux _build/src/github.com/gorilla/mux
+mkdir -p _build/src/github.com/docker/ && mv docker-distribution _build/src/github.com/docker/distribution
+mkdir -p _build/src/github.com/docker/ && mv docker-libtrust _build/src/github.com/docker/libtrust
+mkdir -p _build/src/github.com/docker/distribution/ && mv gorilla-context _build/src/github.com/docker/distribution/context
+mkdir -p _build/src/github.com/Sirupsen/ && mv Sirupsen-logrus _build/src/github.com/Sirupsen/logrus
+mkdir -p _build/src/github.com/gophercloud && mv gophercloud-gophercloud _build/src/github.com/gophercloud/gophercloud
+
 %build
 mkdir -p ./_build/src/
 ln -s $(pwd) ./_build/src/registry-token-server
 export GOPATH=$(pwd)/_build:%{gopath}
 
 cd ./_build/src/registry-token-server
-dep ensure
 %gobuild -o bin/registry-token-server registry-token-server
 
 %install
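The %setup/mkdir/mv sequence above replaces 'dep ensure' with pre-vendored tarballs: each dependency is unpacked and moved to the import path the Go toolchain expects under the build-local GOPATH. The resulting layout, read straight off the mv commands, is:

    _build/src/github.com/Sirupsen/logrus
    _build/src/github.com/docker/distribution
    _build/src/github.com/docker/distribution/context
    _build/src/github.com/docker/libtrust
    _build/src/github.com/gophercloud/gophercloud
    _build/src/github.com/gorilla/mux
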
@@ -16,4 +16,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
     $PKG_BASE/src/example.py \
     $PKG_BASE/src/example.conf"
 
-TIS_PATCH_VER=5
+TIS_PATCH_VER=6
@@ -222,7 +222,6 @@ def _raise_alarm(ip=None):
 def _clear_base_alarm():
     """ Clear the NTP base alarm """
 
-    if api.get_fault(PLUGIN_ALARMID, obj.base_eid) is not None:
     if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False:
         collectd.error("%s failed to clear alarm %s:%s" %
                        (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
@@ -263,8 +262,8 @@ def _remove_ip_from_unreachable_list(ip):
     if ip and ip in obj.unreachable_servers:
         eid = obj.base_eid + '=' + ip
         collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid))
 
         # clear the alarm if its asserted
-        if api.get_fault(PLUGIN_ALARMID, eid) is not None:
         if api.clear_fault(PLUGIN_ALARMID, eid) is True:
             collectd.info("%s cleared %s:%s alarm" %
                           (PLUGIN, PLUGIN_ALARMID, eid))
@@ -277,9 +276,6 @@ def _remove_ip_from_unreachable_list(ip):
             collectd.error("%s failed alarm clear %s:%s" %
                           (PLUGIN, PLUGIN_ALARMID, eid))
             return True
-        else:
-            obj.unreachable_servers.remove(ip)
-            collectd.info("%s alarm %s not raised" % (PLUGIN, eid))
 
     return False
Reference in New Issue
Block a user