From b0066dcd270117cd12360ee6469f1e7ac0f4d4e7 Mon Sep 17 00:00:00 2001 From: Robert Church Date: Tue, 29 Nov 2022 00:34:53 -0600 Subject: [PATCH] Remove all volume groups by UUID In cases when wipedisk isn't run or isn't working correctly, pre-existing volume groups, physical volumes, and logical volumes will be present on the root disk. Depending on the sizes and layout of the previous install along with partial or aborted cleanup activities, this may lead to [unknown] PVs with duplicate volume group names. Adjust the cleanup logic to: - Discover existing volume groups by UUID so that duplicate volume groups (i.e. two occurrences of cgts-vg) can be handled individually. - Ignore [unknown] physical volumes in a volume group as they cannot be removed. Cleaning up existing physical volumes across all volume groups will resolve any [unknown] physical volumes. In addition, unify if/then for/do syntax in the %pre-part hook Test Plan: PASS - create a scenario with multiple partitions along with a nova-local and cgts-vg volume group that result in an [unknown] physical volume and a duplicate cgts-vg. Do not wipe the disks and install an ISO with the above changes. Observe proper cleanup and install. PASS - Perform consecutive installs without wipedisk and observe proper cleanup and install Change-Id: Idf845cf00ca3c009d72dedef0805a77d94fa3d97 Partial-Bug: #1998204 Signed-off-by: Robert Church --- kickstart/files/kickstart.cfg | 105 ++++++++++++++++------------------ kickstart/files/miniboot.cfg | 63 +++++++++----------- 2 files changed, 75 insertions(+), 93 deletions(-) diff --git a/kickstart/files/kickstart.cfg b/kickstart/files/kickstart.cfg index 5e7a74a3..264b1fe1 100644 --- a/kickstart/files/kickstart.cfg +++ b/kickstart/files/kickstart.cfg @@ -1102,10 +1102,10 @@ EOF if [ -e /usr/sbin/ntpd ] ; then /usr/sbin/ntpd -g -q -n -c /etc/ntp_kickstart.conf rc=$? 
- if [ ${rc} -eq 0 ]; then + if [ ${rc} -eq 0 ] ; then if [ -e /sbin/hwclock ] ; then /sbin/hwclock --systohc --utc - if [ $? -ne 0 ]; then + if [ $? -ne 0 ] ; then wlog "failed hwclock command ; /sbin/hwclock --systohc --utc" else ilog "ntpd and hwclock set ok" @@ -1142,7 +1142,7 @@ fi # operations. ilog "Detected storage devices:" STOR_DEVS="" -for f in /dev/disk/by-path/*; do +for f in /dev/disk/by-path/* ; do dev=$(readlink -f $f) # dlog "found device ${f}" exec_retry 2 0.5 "lsblk --nodeps --pairs $dev" | grep -q 'TYPE="disk"' @@ -1168,7 +1168,7 @@ fi # devices. Since udev events are asynchronous this could lead to a case # where /dev/ links for existing partitions are briefly missing. # Missing /dev links leads to command execution failures. -for dev in $STOR_DEVS; do +for dev in $STOR_DEVS ; do exec {fd}>$dev || report_failure_with_msg "Error creating file descriptor for $dev." flock -n "$fd" || report_failure_with_msg "Can't get a lock on fd $fd of device $dev." STOR_DEV_FDS="$STOR_DEV_FDS $fd" @@ -1176,7 +1176,7 @@ for dev in $STOR_DEVS; do done # Log info about system state at beginning of partitioning operation -for dev in $STOR_DEVS; do +for dev in $STOR_DEVS ; do ilog "Initial partition table for $dev is:" # log "Initial partition table for $dev is:" parted -s $dev unit mib print @@ -1189,7 +1189,7 @@ display_mount_info # Consider removing since LAT already handles this failure mode # Ensure specified device is not a USB drive udevadm info --query=property --name=${INSTDEV} |grep -q '^ID_BUS=usb' -if [ $? -eq 0 ]; then +if [ $? -eq 0 ] ; then report_failure_with_msg "Specified installation ($INSTDEV) device is a USB drive." 
fi @@ -1200,41 +1200,35 @@ ilog "Volume Groups : ${VOLUME_GROUPS} ; $STOR_DEV_FDS" sed -i "s#obtain_device_list_from_udev = 1#obtain_device_list_from_udev = 0#" /etc/lvm/lvm.conf # Deactivate existing volume groups to avoid Anaconda issues with pre-existing groups -# TODO: May not need this Anaconda specific behavior work around -vgs=$(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name 2>/dev/null") -if [ -z ${vgs} ] ; then +vgs=( $(exec_no_fds "$STOR_DEV_FDS" "vgdisplay -C --noheadings -o vg_uuid 2>/dev/null") ) +if [ ${#vgs[@]} -eq 0 ] ; then ilog "No volume groups found" else - ilog "Found '${vgs}' volume groups" - for vg in $vgs; do - ilog "... disabling $vg" - exec_no_fds "$STOR_DEV_FDS" "vgchange -an $vg 2>/dev/null" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to disable $vg." + for vg in ${vgs[@]} ; do + ilog "Disable volume group ${vg}" + exec_no_fds "$STOR_DEV_FDS" "vgchange -an --select vg_uuid=${vg} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to disable ${vg}." done - # Remove the volume groups that have physical volumes on the root disk - for vg in $(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name"); do - exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name" | grep -q "${INSTDEV}" - if [ $? -ne 0 ]; then - wlog "Found $vg with no PV on rootfs, ignoring." - continue - fi - ilog "Removing LVs on $vg." - exec_no_fds "$STOR_DEV_FDS" "lvremove --force $vg" 5 0.5 || wlog "WARNING: Failed to remove lvs on $vg." - pvs=$(exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name") - wlog "VG $vg has PVs: $(echo $pvs), removing them." - for pv in $pvs; do - ilog "Removing PV $pv." - exec_no_fds "$STOR_DEV_FDS" "pvremove --force --force --yes $pv" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to remove PV." 
- done - # VG should no longer be present - vg_check=$(exec_no_fds "$STOR_DEV_FDS" "vgs --select \"vg_name=$vg\" --noheadings -o vg_name") - if [ -n "$vg_check" ]; then - wlog "WARNING: VG $vg is still present after removing PVs! Removing it by force." - exec_no_fds "$STOR_DEV_FDS" "vgremove --force $vg" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to remove VG." + for vg in ${vgs[@]} ; do + vg_name=$(exec_no_fds "$STOR_DEV_FDS" "vgdisplay -C --noheadings --select vg_uuid=${vg} -o vg_name 2>/dev/null" | xargs) + + pvs=( $(exec_no_fds "$STOR_DEV_FDS" "pvs --select vg_uuid=${vg} --noheadings -o pv_name 2>/dev/null" | grep -v unknown) ) + if [ ${#pvs[@]} -ne 0 ] ; then + ilog "Remove logical volumes from ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "lvremove --force --select vg_uuid=${vg} 2>/dev/null" 5 0.5 || wlog "WARNING: Failed to remove lvs on ${vg_name} (${vg})." + + for pv in ${pvs[@]} ; do + ilog "Remove physical volume ${pv} from ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "pvremove --force --force --yes ${pv} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to remove ${pv}." + done fi + + ilog "Force remove volume group ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "vgremove --force --select vg_uuid=${vg} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to remove ${vg_name} (${vg})." 
+ done fi @@ -1255,7 +1249,7 @@ part_type_flags_str="Attribute flags" hostname="hostname" -if [ "$(curl -sf http://pxecontroller:6385/v1/upgrade/${hostname}/in_upgrade 2>/dev/null)" = "true" ]; then +if [ "$(curl -sf http://pxecontroller:6385/v1/upgrade/${hostname}/in_upgrade 2>/dev/null)" = "true" ] ; then # In an upgrade, only wipe the disk with the rootfs and boot partition wlog "In upgrade, wiping only ${INSTDEV}" @@ -1273,7 +1267,7 @@ else CEPH_JOURNAL_GUID="45B0969E-9B03-4F30-B4C6-B4B80CEFF106" # Check if we wipe OSDs - if [ "$(curl -sf http://pxecontroller:6385/v1/ihosts/wipe_osds 2>/dev/null)" = "true" ]; then + if [ "$(curl -sf http://pxecontroller:6385/v1/ihosts/wipe_osds 2>/dev/null)" = "true" ] ; then ilog "Wipe OSD data" WIPE_CEPH_OSDS="true" else @@ -1281,8 +1275,7 @@ else WIPE_CEPH_OSDS="false" fi - for dev in $STOR_DEVS - do + for dev in $STOR_DEVS ; do # TODO: Allowing the install dev 'in' results in a failure mode where # every second install fails with the following error string # and unrecoverable mount failure. @@ -1307,11 +1300,11 @@ else udevadm info --query=property --name=$dev |grep -q '^ID_BUS=usb' && continue # Avoid wiping ceph osds if sysinv tells us so - if [ ${WIPE_CEPH_OSDS} == "false" ]; then + if [ ${WIPE_CEPH_OSDS} == "false" ] ; then wipe_dev="true" exec_no_fds "$STOR_DEV_FDS" "pvs" | grep -q "$dev *ceph" - if [ $? -eq 0 ]; then + if [ $? 
-eq 0 ] ; then wlog "skip rook provisoned disk $dev" continue fi @@ -1319,33 +1312,33 @@ else part_numbers=( `parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}'` ) # Scanning the partitions looking for CEPH OSDs and # skipping any disk found with such partitions - for part_number in "${part_numbers[@]}"; do + for part_number in "${part_numbers[@]}" ; do sgdisk_part_info=$(sgdisk -i $part_number $dev) part_type_guid=$(echo "$sgdisk_part_info" | grep "$part_type_guid_str" | awk '{print $4;}') - if [ "$part_type_guid" == $CEPH_OSD_GUID ]; then + if [ "$part_type_guid" == $CEPH_OSD_GUID ] ; then wlog "OSD found on $dev, skipping wipe" wipe_dev="false" break fi exec_no_fds "$STOR_DEV_FDS" "pvs" | grep -q -e "${dev}${part_number} *ceph" -e "${dev}p${part_number} *ceph" - if [ $? -eq 0 ]; then + if [ $? -eq 0 ] ; then wlog "Rook OSD found on $dev$part_number, skip wipe" wipe_dev="false" break fi done - if [ "$wipe_dev" == "false" ]; then + if [ "$wipe_dev" == "false" ] ; then continue fi fi # Add device to the wipe list devname=$(basename $dev) - if [ -e $dev -a "$ISO_DEV" != "../../$devname" -a "$USB_DEV" != "../../$devname" ]; then + if [ -e $dev -a "$ISO_DEV" != "../../$devname" -a "$USB_DEV" != "../../$devname" ] ; then ilog "Adding ${dev} to list of disks to be wiped" - if [ -n "$WIPE_HDD" ]; then + if [ -n "$WIPE_HDD" ] ; then ilog "WIPE_HDD=$WIPE_HDD,$dev" WIPE_HDD=$WIPE_HDD,$dev else @@ -1362,14 +1355,13 @@ ilog "WIPE DISKs: ${WIPE_HDD}" ilog "===========" by_dev=${INSTDEV} # TODO: Avoid this loop if the INSTDEV does not have by-path in its name -for f in /dev/disk/by-path/*; do +for f in /dev/disk/by-path/* ; do if [ "${f}" == "${INSTDEV}" ] ; then by_dev=$(get_disk "${INSTDEV}") break fi done -for dev in ${WIPE_HDD//,/ } -do +for dev in ${WIPE_HDD//,/ } ; do ilog "Wiping $dev" # Clear previous GPT tables or LVM data on each disk. 
@@ -1383,7 +1375,7 @@ do part_numbers=( $(parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}') ) # For each '/dev/${dev}${part_number} apply wipe rules - for part_number in "${part_numbers[@]}"; do + for part_number in "${part_numbers[@]}" ; do sgdisk_part_info=$(sgdisk -i $part_number $dev) part_name=$(echo "$sgdisk_part_info" | grep "$part_type_name_str" | awk '{print $3;}') @@ -1452,7 +1444,7 @@ do # and contains a proper filesystem if [ "${part_number}" == "${BACKUP_PART_NO}" ] ; then part_fstype=$(exec_retry 5 0.5 "blkid -s TYPE -o value $part") - if [ "${part_fstype}" == "ext4" ]; then + if [ "${part_fstype}" == "ext4" ] ; then ilog "Discovered persistent backup partition, ${part}, is in the expected location and is formatted correctly. Maintaining..." BACKUP_PART_FOUND=1 continue @@ -1485,7 +1477,7 @@ do fi fi - if [ $WIPE_CEPH_OSDS == "true" -a "$part_type_guid" == $CEPH_JOURNAL_GUID ]; then + if [ $WIPE_CEPH_OSDS == "true" -a "$part_type_guid" == $CEPH_JOURNAL_GUID ] ; then # Journal partitions require additional wiping. Based on the ceph-manage-journal.py # script in the integ repo (at the ceph/ceph/files/ceph-manage-journal.py location) # wiping 100MB of data at the beginning of the partition should be enough. We also @@ -1500,7 +1492,7 @@ do fi done - if [ ${BACKUP_PART_FOUND} -eq 0 -o "${dev}" != "${by_dev}" ]; then + if [ ${BACKUP_PART_FOUND} -eq 0 -o "${dev}" != "${by_dev}" ] ; then ilog "Creating disk label for $dev" parted -s $dev mktable gpt ilog "... 
done" @@ -1508,8 +1500,7 @@ do done ilog "Ensure any LAT installer root/boot partitions are zapped/wiped" -for oldrootlabel in otaroot otaroot_1 otaroot_b otaroot_b_1 -do +for oldrootlabel in otaroot otaroot_1 otaroot_b otaroot_b_1 ; do oldrootpart=$(blkid --label $oldrootlabel) [ -z "$oldrootpart" ] && continue diff --git a/kickstart/files/miniboot.cfg b/kickstart/files/miniboot.cfg index 1e0e4457..adedd2bb 100644 --- a/kickstart/files/miniboot.cfg +++ b/kickstart/files/miniboot.cfg @@ -1190,41 +1190,35 @@ ilog "Volume Groups : ${VOLUME_GROUPS} ; $STOR_DEV_FDS" sed -i "s#obtain_device_list_from_udev = 1#obtain_device_list_from_udev = 0#" /etc/lvm/lvm.conf # Deactivate existing volume groups to avoid Anaconda issues with pre-existing groups -# TODO: May not need this Anaconda specific behavior work around -vgs=$(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name 2>/dev/null") -if [ -z ${vgs} ] ; then +vgs=( $(exec_no_fds "$STOR_DEV_FDS" "vgdisplay -C --noheadings -o vg_uuid 2>/dev/null") ) +if [ ${#vgs[@]} -eq 0 ] ; then ilog "No volume groups found" else - ilog "Found '${vgs}' volume groups" - for vg in $vgs; do - ilog "... disabling $vg" - exec_no_fds "$STOR_DEV_FDS" "vgchange -an $vg 2>/dev/null" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to disable $vg." + for vg in ${vgs[@]}; do + ilog "Disable volume group ${vg}" + exec_no_fds "$STOR_DEV_FDS" "vgchange -an --select vg_uuid=${vg} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to disable ${vg}." done - # Remove the volume groups that have physical volumes on the root disk - for vg in $(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name"); do - exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name" | grep -q "${INSTDEV}" - if [ $? -ne 0 ]; then - wlog "Found $vg with no PV on rootfs, ignoring." - continue - fi - ilog "Removing LVs on $vg." 
- exec_no_fds "$STOR_DEV_FDS" "lvremove --force $vg" 5 0.5 || wlog "WARNING: Failed to remove lvs on $vg." - pvs=$(exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name") - wlog "VG $vg has PVs: $(echo $pvs), removing them." - for pv in $pvs; do - ilog "Removing PV $pv." - exec_no_fds "$STOR_DEV_FDS" "pvremove --force --force --yes $pv" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to remove PV." - done - # VG should no longer be present - vg_check=$(exec_no_fds "$STOR_DEV_FDS" "vgs --select \"vg_name=$vg\" --noheadings -o vg_name") - if [ -n "$vg_check" ]; then - wlog "WARNING: VG $vg is still present after removing PVs! Removing it by force." - exec_no_fds "$STOR_DEV_FDS" "vgremove --force $vg" 5 0.5 - [ $? -ne 0 ] && report_failure_with_msg "Failed to remove VG." + for vg in ${vgs[@]}; do + vg_name=$(exec_no_fds "$STOR_DEV_FDS" "vgdisplay -C --noheadings --select vg_uuid=${vg} -o vg_name 2>/dev/null" | xargs) + + pvs=( $(exec_no_fds "$STOR_DEV_FDS" "pvs --select vg_uuid=${vg} --noheadings -o pv_name 2>/dev/null" | grep -v unknown) ) + if [ ${#pvs[@]} -ne 0 ] ; then + ilog "Remove logical volumes from ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "lvremove --force --select vg_uuid=${vg} 2>/dev/null" 5 0.5 || wlog "WARNING: Failed to remove lvs on ${vg_name} (${vg})." + + for pv in ${pvs[@]}; do + ilog "Remove physical volume ${pv} from ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "pvremove --force --force --yes ${pv} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to remove ${pv}." + done fi + + ilog "Force remove volume group ${vg_name} (${vg})" + exec_no_fds "$STOR_DEV_FDS" "vgremove --force --select vg_uuid=${vg} 2>/dev/null" 5 0.5 + [ $? -ne 0 ] && report_failure_with_msg "Failed to remove ${vg_name} (${vg})." 
+ done fi @@ -1271,8 +1265,7 @@ else WIPE_CEPH_OSDS="false" fi - for dev in $STOR_DEVS - do + for dev in $STOR_DEVS ; do # TODO: Allowing the install dev 'in' results in a failure mode where # every second install fails with the following error string # and unrecoverable mount failure. @@ -1358,8 +1351,7 @@ for f in /dev/disk/by-path/*; do break fi done -for dev in ${WIPE_HDD//,/ } -do +for dev in ${WIPE_HDD//,/ } ; do ilog "Wiping $dev" # Clear previous GPT tables or LVM data on each disk. @@ -1502,8 +1494,7 @@ do done ilog "Ensure any LAT installer root/boot partitions are zapped/wiped" -for oldrootlabel in otaroot otaroot_1 otaroot_b otaroot_b_1 -do +for oldrootlabel in otaroot otaroot_1 otaroot_b otaroot_b_1 ; do oldrootpart=$(blkid --label $oldrootlabel) [ -z "$oldrootpart" ] && continue