Merge "Adding AIO-SX to AIO-DX migration steps patching existing PVs"
This commit is contained in:
commit
d85ac964db
|
@ -117,6 +117,7 @@ install -p -D -m 755 scripts/validate-platform-backup.sh %{buildroot}%{local_bin
|
||||||
install -p -D -m 755 scripts/manage-partitions %{buildroot}%{local_bindir}/manage-partitions
|
install -p -D -m 755 scripts/manage-partitions %{buildroot}%{local_bindir}/manage-partitions
|
||||||
install -p -D -m 755 scripts/query_pci_id %{buildroot}%{local_bindir}/query_pci_id
|
install -p -D -m 755 scripts/query_pci_id %{buildroot}%{local_bindir}/query_pci_id
|
||||||
install -p -D -m 700 scripts/kube-cert-rotation.sh %{buildroot}%{local_bindir}/kube-cert-rotation.sh
|
install -p -D -m 700 scripts/kube-cert-rotation.sh %{buildroot}%{local_bindir}/kube-cert-rotation.sh
|
||||||
|
install -p -D -m 755 scripts/ceph_k8s_update_monitors.sh %{buildroot}%{local_bindir}/ceph_k8s_update_monitors.sh
|
||||||
|
|
||||||
%clean
|
%clean
|
||||||
echo "CLEAN CALLED"
|
echo "CLEAN CALLED"
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Copyright (c) 2021 Wind River Systems, Inc.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
# Utility for patching Kubernetes Persistent Volumes during
|
||||||
|
# AIO-SX to AIO-DX migration.
|
||||||
|
#
|
||||||
|
# This is required because the Ceph-mon IP address changes
|
||||||
|
# from controller-0 to floating controller IP. Therefore,
|
||||||
|
# existing PV claims backed by cephfs or RBD will fail to
|
||||||
|
# mount due to previous monitor being inaccessible.
|
||||||
|
|
||||||
|
# Logging info.
|
||||||
|
# Script name used as the prefix of every syslog entry.
NAME=$(basename -- "$0")

# This will log to /var/log/platform.log
# and stdout
#
# Arguments: $1 - message to record
# Outputs:   the message on stdout and, via logger, on the local1.info
#            syslog facility
function log {
    logger -p local1.info "$NAME: $1"
    # printf instead of echo so a message such as "-n" or "-e" is printed
    # literally rather than being consumed as an echo option.
    printf '%s\n' "$1"
}
|
||||||
|
|
||||||
|
# Print the usage text on stdout.
# Globals: NAME (read) - script name shown in the syntax line
function help {
    cat <<EOF
Utility for patching Kubernetes Persistent Volumes during AIO-SX to AIO-DX migration

Syntax: $NAME [-h] CONTROLLER_0_MGMT_IP FLOATING_CONTROLLER_MGMT_IP
options:
h Prints this Help.

EOF
}
|
||||||
|
|
||||||
|
# Parse command-line options. Only -h is supported; the leading ':' in the
# optstring puts getopts in silent mode so unknown options reach the \?) arm.
while getopts ":h" option; do
    case "$option" in
        h)
            help
            exit 0
            ;;
        \?)
            log "Error: Invalid option"
            # An invalid invocation must not report success to the caller.
            exit 1
            ;;
    esac
done
# Drop the parsed options (and a possible "--") so that only the positional
# arguments remain for the argument-count check that follows.
shift $((OPTIND - 1))
|
||||||
|
|
||||||
|
# Exactly two positional arguments are expected; anything else is a usage error.
if (( $# != 2 )); then
    log "Error: Wrong number of arguments"
    log "Run $NAME -h for help"
    exit 1
fi

# read input arguments
CONTROLLER_0_IP="$1"
CONTROLLER_FLOATING_IP="$2"
|
||||||
|
|
||||||
|
# Tell whether a Persistent Volume still references the controller-0 monitor.
# Globals:   CONTROLLER_0_IP (read)
# Arguments: $1 - PersistentVolume name
# Returns:   0 if the PV's monitor list contains CONTROLLER_0_IP,
#            non-zero otherwise (including when no monitors are returned)
function check_pv_need_migration {
    local mon
    mon=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$1" -o jsonpath='{.spec.*.monitors}')
    # -F: treat the address literally (the dots are not regex wildcards).
    # -w: require a whole-token match so 10.0.0.1 does not match 10.0.0.10.
    grep -qFw -- "$CONTROLLER_0_IP" <<<"$mon"
}
|
||||||
|
|
||||||
|
# Wait for the Kubernetes API to answer before touching any resource.
# One initial attempt plus up to MAX_ITER retries, 30 seconds apart. The
# script gives up immediately after the last failed attempt instead of
# sleeping once more, and retries are numbered from 1.
ITER=0
MAX_ITER=5
until kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces > /dev/null; do
    if [[ $ITER -ge $MAX_ITER ]]; then
        log "kubernetes api is not available. Exiting with failure"
        exit 1
    fi
    ITER=$((ITER + 1))
    log "kubernetes api is not available. Retry ${ITER} of ${MAX_ITER}"
    sleep 30
done
|
||||||
|
|
||||||
|
# Names of the StorageClasses whose listing mentions the cephfs or RBD
# provisioner.
STORAGE_CLASSES=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces | \
    grep -E "ceph.com/cephfs|ceph.com/rbd" | awk '{print $1}')
# NOTE: despite the variable name, these are PersistentVolume names.
EXISTING_PVCS=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume --all-namespaces --no-headers | awk '{print $1}')

# Escape the dots of the old address so that sed matches it literally
# instead of treating each dot as "any character".
CONTROLLER_0_IP_REGEX=${CONTROLLER_0_IP//./\\.}

# Word splitting of the two lists below is intentional: each entry is a
# Kubernetes resource name and therefore contains no whitespace.
for PVC in $EXISTING_PVCS; do
    # Ask the API for the field directly rather than scraping the JSON text.
    PVC_SC=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" \
        -o jsonpath='{.spec.storageClassName}')

    for SC in ${STORAGE_CLASSES}; do
        if [ "$SC" != "$PVC_SC" ]; then
            continue
        fi

        # Replace the Persistent Volume, changing the CEPH monitor ip address.
        # This is required because updating the monitor ip is not allowed by
        # kubernetes and therefore we need to re-create the PV. The replace
        # command will block due to the pv-protection finalizer waiting for
        # the bound PVC to be removed, but we want to replace the PV without
        # removing the bound PVC. Therefore, we run the replace command in
        # the background and run a patch removing the pv-protection finalizer
        # so that the replace command completes.
        if ! check_pv_need_migration "$PVC"; then
            log "skipping PersistentVolume/${PVC} - already patched"
            break
        fi

        log "Started patching PersistentVolume/${PVC}"
        kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" -o yaml | \
            sed "s/$CONTROLLER_0_IP_REGEX/$CONTROLLER_FLOATING_IP/g" | \
            kubectl --kubeconfig=/etc/kubernetes/admin.conf replace --cascade=false --force -f - >/dev/null &
        # PID of the last pipeline stage (kubectl replace); its exit status is
        # collected below once the finalizer has been removed.
        REPLACE_PID=$!
        sleep 1

        # Poll until the PV carries a deletionTimestamp, i.e. the replace
        # command has issued the delete and is now blocked on the finalizer.
        TIMEOUT=4
        DELAY=0
        while [[ $DELAY -lt $TIMEOUT ]]; do
            timestamp=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume "$PVC" -o jsonpath='{.metadata.deletionTimestamp}')
            if [ -n "${timestamp}" ]; then
                break
            fi
            sleep 1
            DELAY=$((DELAY + 1))
        done

        if [[ $DELAY -ge $TIMEOUT ]]; then
            log "Timed out waiting to patch PersistentVolume/${PVC}"
            exit 1
        fi

        if ! kubectl --kubeconfig=/etc/kubernetes/admin.conf patch PersistentVolume "${PVC}" -p '{"metadata":{"finalizers":null}}' --type=merge; then
            log "Failed to remove finalizers from PersistentVolume/${PVC}"
            exit 1
        fi

        # Only report success when the background replace really succeeded.
        if ! wait "$REPLACE_PID"; then
            log "Failed to replace PersistentVolume/${PVC}"
            exit 1
        fi
        log "PersistentVolume/${PVC} replaced"
        # The PV's storage class has been handled; no need to test the rest.
        break
    done
done

exit 0
|
|
@ -303,7 +303,7 @@ class ConductorManager(service.PeriodicService):
|
||||||
|
|
||||||
self._handle_restore_in_progress()
|
self._handle_restore_in_progress()
|
||||||
|
|
||||||
self._reset_simplex_to_duplex_flag(system)
|
self._sx_to_dx_post_migration_actions(system)
|
||||||
|
|
||||||
LOG.info("sysinv-conductor start committed system=%s" %
|
LOG.info("sysinv-conductor start committed system=%s" %
|
||||||
system.as_dict())
|
system.as_dict())
|
||||||
|
@ -410,20 +410,85 @@ class ConductorManager(service.PeriodicService):
|
||||||
self._create_default_service_parameter()
|
self._create_default_service_parameter()
|
||||||
return system
|
return system
|
||||||
|
|
||||||
def _reset_simplex_to_duplex_flag(self, system):
|
def _update_pvc_migration_alarm(self, alarm_state=None):
|
||||||
|
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_K8S,
|
||||||
|
"PV-migration-failed")
|
||||||
|
reason_text = "Failed to patch Persistent Volumes backed by CEPH "\
|
||||||
|
"during AIO-SX to AIO-DX migration"
|
||||||
|
|
||||||
# Skip if the flag is not set or if the system mode is not set to duplex
|
if alarm_state == fm_constants.FM_ALARM_STATE_SET:
|
||||||
if (not system.capabilities.get('simplex_to_duplex_migration') or
|
fault = fm_api.Fault(
|
||||||
system.system_mode != constants.SYSTEM_MODE_DUPLEX):
|
alarm_id=fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV,
|
||||||
return
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||||
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_K8S,
|
||||||
|
entity_instance_id=entity_instance_id,
|
||||||
|
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||||
|
reason_text=reason_text,
|
||||||
|
alarm_type=fm_constants.FM_ALARM_TYPE_3,
|
||||||
|
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6,
|
||||||
|
proposed_repair_action=_("Manually execute /usr/bin/ceph_k8s_update_monitors.sh "
|
||||||
|
"to confirm PVs are updated, then lock/unlock to clear "
|
||||||
|
"alarms. If problem persists, contact next level of "
|
||||||
|
"support."),
|
||||||
|
service_affecting=False)
|
||||||
|
|
||||||
|
self.fm_api.set_fault(fault)
|
||||||
|
else:
|
||||||
|
alarms = self.fm_api.get_faults(entity_instance_id)
|
||||||
|
if alarms:
|
||||||
|
self.fm_api.clear_all(entity_instance_id)
|
||||||
|
|
||||||
|
def _pvc_monitor_migration(self):
    """Patch existing Persistent Volumes to use the floating ceph-mon address.

    Runs /usr/bin/ceph_k8s_update_monitors.sh with the controller-0 and the
    floating management addresses. On success the PV migration alarm is
    cleared; if the script fails the alarm is raised.

    :returns: True when there is no ceph backend or the script succeeded,
              False when the script raised ProcessExecutionError.
    """
    if not StorageBackendConfig.get_backend(self.dbapi,
                                            constants.SB_TYPE_CEPH):
        # Without a ceph backend there is nothing to migrate.
        return True

    # Old (controller-0) and new (floating) management addresses.
    old_mon_address = self.dbapi.address_get_by_name(
        constants.CONTROLLER_0_MGMT).address
    floating_name = cutils.format_address_name(
        constants.CONTROLLER_HOSTNAME, constants.NETWORK_TYPE_MGMT)
    new_mon_address = self.dbapi.address_get_by_name(floating_name).address

    try:
        _stdout, _stderr = cutils.execute(
            "/usr/bin/ceph_k8s_update_monitors.sh",
            old_mon_address,
            new_mon_address,
            run_as_root=True)

        LOG.info("Updated ceph-mon address from {} to {} on existing Persistent Volumes."
                 .format(old_mon_address, new_mon_address))
        # Called without a state: clears a previously raised alarm, if any.
        self._update_pvc_migration_alarm()
    except exception.ProcessExecutionError:
        LOG.error("Failed to patch Kubernetes Persistent Volume resources. "
                  "ceph-mon address changed from {} to {}".format(
                      old_mon_address, new_mon_address))

        # raise alarm
        self._update_pvc_migration_alarm(fm_constants.FM_ALARM_STATE_SET)
        return False

    return True
|
||||||
|
|
||||||
|
def _sx_to_dx_post_migration_actions(self, system):
|
||||||
host = self.dbapi.ihost_get(self.host_uuid)
|
host = self.dbapi.ihost_get(self.host_uuid)
|
||||||
if host.administrative != constants.ADMIN_UNLOCKED:
|
# Skip if the system mode is not set to duplex or it is not unlocked
|
||||||
|
if (system.system_mode != constants.SYSTEM_MODE_DUPLEX or
|
||||||
|
host.administrative != constants.ADMIN_UNLOCKED):
|
||||||
return
|
return
|
||||||
|
|
||||||
system_dict = system.as_dict()
|
if system.capabilities.get('simplex_to_duplex_migration'):
|
||||||
del system_dict['capabilities']['simplex_to_duplex_migration']
|
system_dict = system.as_dict()
|
||||||
self.dbapi.isystem_update(system.uuid, system_dict)
|
del system_dict['capabilities']['simplex_to_duplex_migration']
|
||||||
|
self.dbapi.isystem_update(system.uuid, system_dict)
|
||||||
|
|
||||||
|
greenthread.spawn(self._pvc_monitor_migration)
|
||||||
|
elif self.fm_api.get_faults_by_id(fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV):
|
||||||
|
greenthread.spawn(self._pvc_monitor_migration)
|
||||||
|
|
||||||
def _upgrade_init_actions(self):
|
def _upgrade_init_actions(self):
|
||||||
""" Perform any upgrade related startup actions"""
|
""" Perform any upgrade related startup actions"""
|
||||||
|
|
|
@ -625,6 +625,7 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
||||||
mock.patch.object(ceph_utils, 'fix_crushmap')) as (mock_fsid, mock_fix_crushmap):
|
mock.patch.object(ceph_utils, 'fix_crushmap')) as (mock_fsid, mock_fix_crushmap):
|
||||||
mock_fix_crushmap.return_value = True
|
mock_fix_crushmap.return_value = True
|
||||||
mock_fsid.return_value = (mock.MagicMock(ok=False), None)
|
mock_fsid.return_value = (mock.MagicMock(ok=False), None)
|
||||||
|
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||||
self.service.start()
|
self.service.start()
|
||||||
self.service._init_ceph_cluster_info()
|
self.service._init_ceph_cluster_info()
|
||||||
mock_fsid.assert_called()
|
mock_fsid.assert_called()
|
||||||
|
|
|
@ -61,6 +61,8 @@ class UpdateCephCluster(base.DbTestCase):
|
||||||
self.mock_fix_crushmap = self.fix_crushmap_patcher.start()
|
self.mock_fix_crushmap = self.fix_crushmap_patcher.start()
|
||||||
self.mock_fix_crushmap.return_value = True
|
self.mock_fix_crushmap.return_value = True
|
||||||
|
|
||||||
|
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
super(UpdateCephCluster, self).tearDown()
|
super(UpdateCephCluster, self).tearDown()
|
||||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||||
|
|
|
@ -239,6 +239,7 @@ class ManagerTestCase(base.DbTestCase):
|
||||||
self.service._allocate_addresses_for_host = mock.Mock()
|
self.service._allocate_addresses_for_host = mock.Mock()
|
||||||
self.service._update_pxe_config = mock.Mock()
|
self.service._update_pxe_config = mock.Mock()
|
||||||
self.service._ceph_mon_create = mock.Mock()
|
self.service._ceph_mon_create = mock.Mock()
|
||||||
|
self.service._sx_to_dx_post_migration_actions = mock.Mock()
|
||||||
self.alarm_raised = False
|
self.alarm_raised = False
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
|
|
Loading…
Reference in New Issue