Add AIO-SX to AIO-DX migration step to patch existing PVs

Kubelet and kube-apiserver are no longer available while the puppet
manifests run during unlock. Therefore, the patching of Persistent
Volumes is moved from puppet to sysinv-conductor, which performs it
as a post-migration step during its start-up.

Closes-Bug: 1927224
Depends-On: https://review.opendev.org/c/starlingx/stx-puppet/+/789844
Depends-On: https://review.opendev.org/c/starlingx/fault/+/790183
Change-Id: I9745b7f8547c82485353130156011650f2655317
Signed-off-by: Pedro Henrique Linhares <PedroHenriqueLinhares.Silva@windriver.com>
Pedro Henrique Linhares 2021-05-05 11:34:47 -03:00 committed by Pedro Linhares
parent f755b2efd4
commit 6df2034a4e
6 changed files with 209 additions and 10 deletions
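
As background for the change below, the Ceph monitor addresses that existing PersistentVolumes reference can be listed with the same jsonpath query the new script uses. A minimal sketch, assuming access to the cluster admin kubeconfig at /etc/kubernetes/admin.conf:

for pv in $(kubectl --kubeconfig=/etc/kubernetes/admin.conf get pv --no-headers | awk '{print $1}'); do
    # Print each PV name and the Ceph monitor address(es) it currently references
    echo "${pv}: $(kubectl --kubeconfig=/etc/kubernetes/admin.conf get pv ${pv} -o jsonpath='{.spec.*.monitors}')"
done

A PV that still lists the controller-0 management IP here is one this change is meant to patch.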

View File

@@ -117,6 +117,7 @@ install -p -D -m 755 scripts/validate-platform-backup.sh %{buildroot}%{local_bindir}
 install -p -D -m 755 scripts/manage-partitions %{buildroot}%{local_bindir}/manage-partitions
 install -p -D -m 755 scripts/query_pci_id %{buildroot}%{local_bindir}/query_pci_id
 install -p -D -m 700 scripts/kube-cert-rotation.sh %{buildroot}%{local_bindir}/kube-cert-rotation.sh
+install -p -D -m 755 scripts/ceph_k8s_update_monitors.sh %{buildroot}%{local_bindir}/ceph_k8s_update_monitors.sh

 %clean
 echo "CLEAN CALLED"

View File

@@ -0,0 +1,129 @@
#!/bin/bash
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Utility for patching Kubernetes Persistent Volumes during
# AIO-SX to AIO-DX migration.
#
# This is required because the Ceph-mon IP address changes
# from the controller-0 address to the floating controller IP.
# Existing PV claims backed by CephFS or RBD would therefore fail
# to mount because the previous monitor is no longer reachable.

# Logging info.
NAME=$(basename $0)

# This will log to /var/log/platform.log
# and stdout
function log {
    logger -p local1.info "$NAME: $1"
    echo "$1"
}

function help {
    echo "Utility for patching Kubernetes Persistent Volumes during AIO-SX to AIO-DX migration"
    echo
    echo "Syntax: $NAME [-h] CONTROLLER_0_MGMT_IP FLOATING_CONTROLLER_MGMT_IP"
    echo "options:"
    echo "h     Prints this Help."
    echo
}

while getopts ":h" option; do
    case $option in
        h)
            help
            exit;;
        \?)
            log "Error: Invalid option"
            exit;;
    esac
done

if [ $# -ne 2 ]; then
    log "Error: Wrong number of arguments"
    log "Run $NAME -h for help"
    exit 1
fi

# read input arguments
CONTROLLER_0_IP=$1
CONTROLLER_FLOATING_IP=$2

# Succeeds (returns 0) when the PV still references the controller-0
# monitor address and therefore needs to be migrated.
function check_pv_need_migration {
    local mon
    mon=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume $1 -o jsonpath='{.spec.*.monitors}')
    echo $mon | grep -q $CONTROLLER_0_IP
}

# Wait for the Kubernetes API to become available, retrying every 30 seconds.
ITER=0
MAX_ITER=5
while [[ $ITER -le $MAX_ITER ]]; do
    kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces > /dev/null
    if [ $? -ne 0 ]; then
        log "kubernetes api is not available. Retry ${ITER} of ${MAX_ITER}"
        ITER=$((ITER + 1))
        sleep 30
    else
        break
    fi
done

if [[ $ITER -gt $MAX_ITER ]]; then
    log "kubernetes api is not available. Exiting with failure"
    exit 1
fi

STORAGE_CLASSES=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get StorageClass --all-namespaces | \
    grep -E "ceph.com/cephfs|ceph.com/rbd" | awk '{print $1}')
EXISTING_PVCS=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume --all-namespaces --no-headers | awk '{print $1}')

for PVC in $EXISTING_PVCS; do
    PVC_SC=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume $PVC -o json | \
        grep -Eo '"storageClassName"[^,]*' | awk '{print $2}' | sed 's/"//g')
    for SC in ${STORAGE_CLASSES}
    do
        if [ "$SC" == "$PVC_SC" ]; then
            # Replace the Persistent Volume with a copy pointing at the new CEPH monitor IP address.
            # This is required because Kubernetes does not allow updating the monitor IP in place, so the
            # PV has to be re-created. The replace command blocks on the pv-protection finalizer, waiting
            # for the bound PVC to be removed, but we want to replace the PV without removing the bound PVC.
            # Therefore, the replace command runs in the background while a patch removes the pv-protection
            # finalizer so that the replace command can complete.
            check_pv_need_migration $PVC
            if [ $? -ne 0 ]; then
                log "skipping PersistentVolume/${PVC} - already patched"
                continue
            fi

            log "Started patching PersistentVolume/${PVC}"
            kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume $PVC -o yaml | sed "s/$CONTROLLER_0_IP/$CONTROLLER_FLOATING_IP/g" | \
                kubectl --kubeconfig=/etc/kubernetes/admin.conf replace --cascade=false --force -f - >/dev/null &
            sleep 1

            # Wait for the replaced PV to be marked for deletion (deletionTimestamp set).
            TIMEOUT=4
            DELAY=0
            while [[ $DELAY -lt $TIMEOUT ]]; do
                timestamp=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf get PersistentVolume $PVC -o jsonpath='{.metadata.deletionTimestamp}')
                if [ ! -z "${timestamp}" ]; then
                    break
                else
                    sleep 1
                    DELAY=$((DELAY + 1))
                fi
            done

            if [[ $DELAY -lt $TIMEOUT ]]; then
                # Remove the pv-protection finalizer so the background replace can complete.
                kubectl --kubeconfig=/etc/kubernetes/admin.conf patch PersistentVolume ${PVC} -p '{"metadata":{"finalizers":null}}' --type=merge
                wait
                log "PersistentVolume/${PVC} replaced"
            else
                log "Timed out waiting to patch PersistentVolume/${PVC}"
                exit 1
            fi
        fi
    done
done
exit 0
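
A usage sketch of the script as invoked by sysinv-conductor; the two management IP addresses below are placeholders, not values taken from this change:

# Run as root on the active controller after the AIO-SX to AIO-DX unlock.
# Arguments: CONTROLLER_0_MGMT_IP FLOATING_CONTROLLER_MGMT_IP (example values shown).
/usr/bin/ceph_k8s_update_monitors.sh 192.168.204.3 192.168.204.2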

View File

@@ -303,7 +303,7 @@ class ConductorManager(service.PeriodicService):
         self._handle_restore_in_progress()

-        self._reset_simplex_to_duplex_flag(system)
+        self._sx_to_dx_post_migration_actions(system)

         LOG.info("sysinv-conductor start committed system=%s" %
                  system.as_dict())
@@ -410,20 +410,85 @@ class ConductorManager(service.PeriodicService):
         self._create_default_service_parameter()
         return system

-    def _reset_simplex_to_duplex_flag(self, system):
-        # Skip if the flag is not set or if the system mode is not set to duplex
-        if (not system.capabilities.get('simplex_to_duplex_migration') or
-                system.system_mode != constants.SYSTEM_MODE_DUPLEX):
-            return
+    def _update_pvc_migration_alarm(self, alarm_state=None):
+        entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_K8S,
+                                        "PV-migration-failed")
+        reason_text = "Failed to patch Persistent Volumes backed by CEPH "\
+                      "during AIO-SX to AIO-DX migration"
+
+        if alarm_state == fm_constants.FM_ALARM_STATE_SET:
+            fault = fm_api.Fault(
+                alarm_id=fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV,
+                alarm_state=fm_constants.FM_ALARM_STATE_SET,
+                entity_type_id=fm_constants.FM_ENTITY_TYPE_K8S,
+                entity_instance_id=entity_instance_id,
+                severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
+                reason_text=reason_text,
+                alarm_type=fm_constants.FM_ALARM_TYPE_3,
+                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6,
+                proposed_repair_action=_("Manually execute /usr/bin/ceph_k8s_update_monitors.sh "
+                                         "to confirm PVs are updated, then lock/unlock to clear "
+                                         "alarms. If problem persists, contact next level of "
+                                         "support."),
+                service_affecting=False)
+            self.fm_api.set_fault(fault)
+        else:
+            alarms = self.fm_api.get_faults(entity_instance_id)
+            if alarms:
+                self.fm_api.clear_all(entity_instance_id)
+
+    def _pvc_monitor_migration(self):
+        ceph_backend_enabled = StorageBackendConfig.get_backend(
+            self.dbapi,
+            constants.SB_TYPE_CEPH)
+        if not ceph_backend_enabled:
+            # if it does not have ceph backend enabled there is
+            # nothing to migrate
+            return True
+
+        # get the controller-0 and floating management IP address
+        controller_0_address = self.dbapi.address_get_by_name(
+            constants.CONTROLLER_0_MGMT).address
+        floating_address = self.dbapi.address_get_by_name(
+            cutils.format_address_name(constants.CONTROLLER_HOSTNAME,
+                                       constants.NETWORK_TYPE_MGMT)).address
+
+        try:
+            cmd = ["/usr/bin/ceph_k8s_update_monitors.sh",
+                   controller_0_address,
+                   floating_address]
+            __, __ = cutils.execute(*cmd, run_as_root=True)
+            LOG.info("Updated ceph-mon address from {} to {} on existing Persistent Volumes."
+                     .format(controller_0_address, floating_address))
+            self._update_pvc_migration_alarm()
+        except exception.ProcessExecutionError:
+            error_msg = "Failed to patch Kubernetes Persistent Volume resources. "\
+                        "ceph-mon address changed from {} to {}".format(
+                            controller_0_address, floating_address)
+            LOG.error(error_msg)
+            # raise alarm
+            self._update_pvc_migration_alarm(fm_constants.FM_ALARM_STATE_SET)
+            return False
+        return True
+
+    def _sx_to_dx_post_migration_actions(self, system):
         host = self.dbapi.ihost_get(self.host_uuid)
-        if host.administrative != constants.ADMIN_UNLOCKED:
+
+        # Skip if the system mode is not set to duplex or it is not unlocked
+        if (system.system_mode != constants.SYSTEM_MODE_DUPLEX or
+                host.administrative != constants.ADMIN_UNLOCKED):
             return
-        system_dict = system.as_dict()
-        del system_dict['capabilities']['simplex_to_duplex_migration']
-        self.dbapi.isystem_update(system.uuid, system_dict)
+
+        if system.capabilities.get('simplex_to_duplex_migration'):
+            system_dict = system.as_dict()
+            del system_dict['capabilities']['simplex_to_duplex_migration']
+            self.dbapi.isystem_update(system.uuid, system_dict)
+            greenthread.spawn(self._pvc_monitor_migration)
+        elif self.fm_api.get_faults_by_id(fm_constants.FM_ALARM_ID_K8S_RESOURCE_PV):
+            greenthread.spawn(self._pvc_monitor_migration)

     def _upgrade_init_actions(self):
         """ Perform any upgrade related startup actions"""

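If _pvc_monitor_migration fails, the alarm's proposed repair action can be followed by hand. A sketch of that manual path, assuming a standard StarlingX shell; the IP addresses and host name are placeholders:

source /etc/platform/openrc
fm alarm-list                                                            # confirm the PV-migration-failed alarm is raised
sudo /usr/bin/ceph_k8s_update_monitors.sh 192.168.204.3 192.168.204.2    # re-run the patch script manually
system host-lock <host> && system host-unlock <host>                     # lock/unlock to clear the alarm, per the repair action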
View File

@@ -625,6 +625,7 @@ class StorageTierDependentTCs(base.FunctionalTest):
                 mock.patch.object(ceph_utils, 'fix_crushmap')) as (mock_fsid, mock_fix_crushmap):
             mock_fix_crushmap.return_value = True
             mock_fsid.return_value = (mock.MagicMock(ok=False), None)
+            self.service._sx_to_dx_post_migration_actions = mock.Mock()
             self.service.start()
             self.service._init_ceph_cluster_info()
             mock_fsid.assert_called()

View File

@@ -61,6 +61,8 @@ class UpdateCephCluster(base.DbTestCase):
         self.mock_fix_crushmap = self.fix_crushmap_patcher.start()
         self.mock_fix_crushmap.return_value = True

+        self.service._sx_to_dx_post_migration_actions = mock.Mock()
+
     def tearDown(self):
         super(UpdateCephCluster, self).tearDown()
         self.upgrade_downgrade_kube_components_patcher.stop()

View File

@@ -239,6 +239,7 @@ class ManagerTestCase(base.DbTestCase):
         self.service._allocate_addresses_for_host = mock.Mock()
         self.service._update_pxe_config = mock.Mock()
         self.service._ceph_mon_create = mock.Mock()
+        self.service._sx_to_dx_post_migration_actions = mock.Mock()

         self.alarm_raised = False

     def tearDown(self):