Merge "Adding AIO-SX to AIO-DX migration steps patching existing PVs"
This commit is contained in:
commit
d85ac964db
|
@ -117,6 +117,7 @@ install -p -D -m 755 scripts/validate-platform-backup.sh %{buildroot}%{local_bin
|
|||
install -p -D -m 755 scripts/manage-partitions %{buildroot}%{local_bindir}/manage-partitions
|
||||
install -p -D -m 755 scripts/query_pci_id %{buildroot}%{local_bindir}/query_pci_id
|
||||
install -p -D -m 700 scripts/kube-cert-rotation.sh %{buildroot}%{local_bindir}/kube-cert-rotation.sh
|
||||
install -p -D -m 755 scripts/ceph_k8s_update_monitors.sh %{buildroot}%{local_bindir}/ceph_k8s_update_monitors.sh
|
||||
|
||||
%clean
|
||||
echo "CLEAN CALLED"
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Utility for patching Kubernetes Persistent Volumes during
|
||||
# AIO-SX to AIO-DX migration.
|
||||
#
|
||||
# This is required because Ceph-mon IP address changes
|
||||
# from controller-0 to floating controller IP. Therefore,
|
||||
# existing PV claims backed by cephfs or RBD will fail to
|
||||
# mount due to previous monitor being inaccessible.
|
||||
|
||||
# Logging info.
|
||||
NAME=$(basename $0)
|
||||
|
||||
# Log a message to syslog and to stdout.
# The syslog entry uses facility/priority local1.info; per the original note
# in this script that ends up in /var/log/platform.log.
# Globals:   NAME (read) - script name used as the syslog prefix
# Arguments: $1 - message text
# Outputs:   the message followed by a newline on stdout
function log {
    logger -p local1.info "$NAME: $1"
    # printf rather than echo: a message such as "-n" or "-e" must be printed
    # verbatim instead of being interpreted as an echo option.
    printf '%s\n' "$1"
}
|
||||
|
||||
# Print the usage text for this utility on stdout.
# Globals: NAME (read) - script name shown in the syntax line
# Outputs: six lines of usage text (the second and the last are blank)
function help {
    printf '%s\n' \
        "Utility for patching Kubernetes Persistent Volumes during AIO-SX to AIO-DX migration" \
        "" \
        "Syntax: $NAME [-h] CONTROLLER_0_MGMT_IP FLOATING_CONTROLLER_MGMT_IP" \
        "options:" \
        "h Prints this Help." \
        ""
}
|
||||
|
||||
# Parse command-line options. Only -h is supported; anything else is a
# usage error.
while getopts ":h" option; do
    case "$option" in
        h)
            help
            exit 0
            ;;
        \?)
            log "Error: Invalid option"
            # Exit non-zero so the caller can tell the invocation failed
            # (a bare "exit" here would return the status of the log call).
            exit 1
            ;;
    esac
done
# Drop whatever getopts consumed so that only the positional arguments
# remain (this also lets "-- IP1 IP2" work).
shift $((OPTIND - 1))

# Exactly two positional arguments are required.
if [ $# -ne 2 ]; then
    log "Error: Wrong number of arguments"
    log "Run $NAME -h for help"
    exit 1
fi

# read input arguments
CONTROLLER_0_IP=$1
CONTROLLER_FLOATING_IP=$2
|
||||
|
||||
# Tell whether a PersistentVolume still references the controller-0 monitor.
# Globals:   CONTROLLER_0_IP (read) - address to look for
# Arguments: $1 - PersistentVolume name
# Returns:   0 if the PV's monitor list contains CONTROLLER_0_IP,
#            non-zero otherwise (including when kubectl returns nothing)
function check_pv_need_migration {
    local mon
    mon=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$1" -o jsonpath='{.spec.*.monitors}')
    # -F: treat the address literally, so "." is not a regex wildcard.
    # -w: match the whole address only, so 192.168.204.2 does not match
    #     inside 192.168.204.20.
    grep -qwF -- "$CONTROLLER_0_IP" <<< "$mon"
}
|
||||
|
||||
# Wait for the Kubernetes API to answer before touching any resource.
# The probe is attempted while ITER <= MAX_ITER, 30 seconds apart.
ITER=0
MAX_ITER=5
while [[ $ITER -le $MAX_ITER ]]; do
    if kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces > /dev/null; then
        break
    fi
    log "kubernetes api is not available. Retry ${ITER} of ${MAX_ITER}"
    ITER=$((ITER + 1))
    sleep 30
done

if [[ $ITER -gt $MAX_ITER ]]; then
    log "kubernetes api is not available. Exiting with failure"
    exit 1
fi

# Names of the storage classes whose listing mentions the cephfs or rbd
# provisioner.
STORAGE_CLASSES=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces | \
    grep -E "ceph.com/cephfs|ceph.com/rbd" | awk '{print $1}')
# Despite the variable name, these are PersistentVolume names.
EXISTING_PVCS=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume --all-namespaces --no-headers | awk '{print $1}')

# The old address is used as a sed pattern below; escape its dots so they
# match literally instead of matching any character.
CONTROLLER_0_IP_REGEX=${CONTROLLER_0_IP//./\\.}

for PVC in $EXISTING_PVCS; do
    # Ask the API for the field directly instead of scraping the JSON dump.
    PVC_SC=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" \
        -o jsonpath='{.spec.storageClassName}')

    # Only PVs that belong to one of the Ceph storage classes are candidates.
    IS_CEPH_PV=false
    for SC in ${STORAGE_CLASSES}; do
        if [ "$SC" == "$PVC_SC" ]; then
            IS_CEPH_PV=true
            break
        fi
    done
    if [ "$IS_CEPH_PV" != true ]; then
        continue
    fi

    # Replace the Persistent Volume with a copy whose Ceph monitor address
    # is changed. Kubernetes does not allow the monitor IP to be updated in
    # place, so the PV has to be re-created. The replace command blocks
    # because the pv-protection finalizer waits for the bound PVC to be
    # removed, but the PV must be replaced without removing the bound PVC.
    # Therefore the replace runs in the background and a patch removes the
    # finalizers so that the replace can complete.
    if ! check_pv_need_migration "$PVC"; then
        log "skipping PersistentVolume/${PVC} - already patched"
        continue
    fi

    log "Started patching PersistentVolume/${PVC}"
    kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" -o yaml | \
        sed "s/${CONTROLLER_0_IP_REGEX}/${CONTROLLER_FLOATING_IP}/g" | \
        kubectl --kubeconfig=/etc/kubernetes/admin.conf replace --cascade=false --force -f - >/dev/null &
    # $! is the last command of the background pipeline, i.e. the replace.
    REPLACE_PID=$!
    sleep 1

    # Poll until the PV carries a deletion timestamp, which shows that the
    # background replace has issued its delete and is now blocked.
    TIMEOUT=4
    DELAY=0
    while [[ $DELAY -lt $TIMEOUT ]]; do
        timestamp=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" \
            -o jsonpath='{.metadata.deletionTimestamp}')
        if [[ -n "${timestamp}" ]]; then
            break
        fi
        sleep 1
        DELAY=$((DELAY + 1))
    done

    if [[ $DELAY -lt $TIMEOUT ]]; then
        kubectl --kubeconfig=/etc/kubernetes/admin.conf patch PersistentVolume "${PVC}" -p '{"metadata":{"finalizers":null}}' --type=merge
        # Report success only if the background replace actually succeeded.
        if wait "$REPLACE_PID"; then
            log "PersistentVolume/${PVC} replaced"
        else
            log "Failed to replace PersistentVolume/${PVC}"
            exit 1
        fi
    else
        log "Timed out waiting to patch PersistentVolume/${PVC}"
        exit 1
    fi
done

exit 0
|
|
@ -303,7 +303,7 @@ class ConductorManager(service.PeriodicService):
|
|||
|
||||
self._handle_restore_in_progress()
|
||||
|
||||
self._reset_simplex_to_duplex_flag(system)
|
||||
self._sx_to_dx_post_migration_actions(system)
|
||||
|
||||
LOG.info("sysinv-conductor start committed system=%s" %
|
||||
system.as_dict())
|
||||
|
@ -410,21 +410,86 @@ class ConductorManager(service.PeriodicService):
|
|||
self._create_default_service_parameter()
|
||||
return system
|
||||
|
||||
def _reset_simplex_to_duplex_flag(self, system):
|
||||
def _update_pvc_migration_alarm(self, alarm_state=None):
|
||||
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_K8S,
|
||||
"PV-migration-failed")
|
||||
reason_text = "Failed to patch Persistent Volumes backed by CEPH "\
|
||||
"during AIO-SX to AIO-DX migration"
|
||||
|
||||
# Skip if the flag is not set or if the system mode is not set to duplex
|
||||
if (not system.capabilities.get('simplex_to_duplex_migration') or
|
||||
system.system_mode != constants.SYSTEM_MODE_DUPLEX):
|
||||
return
|
||||
if alarm_state == fm_constants.FM_ALARM_STATE_SET:
|
||||
fault = fm_api.Fault(
|
||||
alarm_id=fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV,
|
||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_K8S,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||
reason_text=reason_text,
|
||||
alarm_type=fm_constants.FM_ALARM_TYPE_3,
|
||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6,
|
||||
proposed_repair_action=_("Manually execute /usr/bin/ceph_k8s_update_monitors.sh "
|
||||
"to confirm PVs are updated, then lock/unlock to clear "
|
||||
"alarms. If problem persists, contact next level of "
|
||||
"support."),
|
||||
service_affecting=False)
|
||||
|
||||
self.fm_api.set_fault(fault)
|
||||
else:
|
||||
alarms = self.fm_api.get_faults(entity_instance_id)
|
||||
if alarms:
|
||||
self.fm_api.clear_all(entity_instance_id)
|
||||
|
||||
def _pvc_monitor_migration(self):
    """Run the PV monitor-address patch script and update the alarm.

    Looks up the controller-0 and floating management addresses and passes
    them to /usr/bin/ceph_k8s_update_monitors.sh. After a successful run
    self._update_pvc_migration_alarm() is called without a state; when the
    script raises ProcessExecutionError it is called with
    FM_ALARM_STATE_SET.

    :returns: True if no ceph backend is configured or the script ran
              successfully, False if the script execution failed.
    """
    if not StorageBackendConfig.get_backend(self.dbapi,
                                            constants.SB_TYPE_CEPH):
        # No ceph backend enabled, so there is nothing to migrate.
        return True

    # Old monitor address (controller-0) and new one (floating mgmt IP).
    old_mon_address = self.dbapi.address_get_by_name(
        constants.CONTROLLER_0_MGMT).address
    floating_name = cutils.format_address_name(
        constants.CONTROLLER_HOSTNAME, constants.NETWORK_TYPE_MGMT)
    new_mon_address = self.dbapi.address_get_by_name(floating_name).address

    try:
        _stdout, _stderr = cutils.execute(
            "/usr/bin/ceph_k8s_update_monitors.sh",
            old_mon_address,
            new_mon_address,
            run_as_root=True)

        LOG.info(
            "Updated ceph-mon address from {} to {} on existing Persistent Volumes."
            .format(old_mon_address, new_mon_address))
        self._update_pvc_migration_alarm()
    except exception.ProcessExecutionError:
        failure_text = (
            "Failed to patch Kubernetes Persistent Volume resources. "
            "ceph-mon address changed from {} to {}"
        ).format(old_mon_address, new_mon_address)
        LOG.error(failure_text)

        # raise alarm
        self._update_pvc_migration_alarm(fm_constants.FM_ALARM_STATE_SET)
        return False

    return True
|
||||
|
||||
def _sx_to_dx_post_migration_actions(self, system):
|
||||
host = self.dbapi.ihost_get(self.host_uuid)
|
||||
if host.administrative != constants.ADMIN_UNLOCKED:
|
||||
# Skip if the system mode is not set to duplex or it is not unlocked
|
||||
if (system.system_mode != constants.SYSTEM_MODE_DUPLEX or
|
||||
host.administrative != constants.ADMIN_UNLOCKED):
|
||||
return
|
||||
|
||||
if system.capabilities.get('simplex_to_duplex_migration'):
|
||||
system_dict = system.as_dict()
|
||||
del system_dict['capabilities']['simplex_to_duplex_migration']
|
||||
self.dbapi.isystem_update(system.uuid, system_dict)
|
||||
|
||||
greenthread.spawn(self._pvc_monitor_migration)
|
||||
elif self.fm_api.get_faults_by_id(fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV):
|
||||
greenthread.spawn(self._pvc_monitor_migration)
|
||||
|
||||
def _upgrade_init_actions(self):
|
||||
""" Perform any upgrade related startup actions"""
|
||||
try:
|
||||
|
|
|
@ -625,6 +625,7 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
|||
mock.patch.object(ceph_utils, 'fix_crushmap')) as (mock_fsid, mock_fix_crushmap):
|
||||
mock_fix_crushmap.return_value = True
|
||||
mock_fsid.return_value = (mock.MagicMock(ok=False), None)
|
||||
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||
self.service.start()
|
||||
self.service._init_ceph_cluster_info()
|
||||
mock_fsid.assert_called()
|
||||
|
|
|
@ -61,6 +61,8 @@ class UpdateCephCluster(base.DbTestCase):
|
|||
self.mock_fix_crushmap = self.fix_crushmap_patcher.start()
|
||||
self.mock_fix_crushmap.return_value = True
|
||||
|
||||
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||
|
||||
def tearDown(self):
|
||||
super(UpdateCephCluster, self).tearDown()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
|
|
@ -239,6 +239,7 @@ class ManagerTestCase(base.DbTestCase):
|
|||
self.service._allocate_addresses_for_host = mock.Mock()
|
||||
self.service._update_pxe_config = mock.Mock()
|
||||
self.service._ceph_mon_create = mock.Mock()
|
||||
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||
self.alarm_raised = False
|
||||
|
||||
def tearDown(self):
|
||||
|
|
Loading…
Reference in New Issue