Replace etcd cloning with a symlink

Previously, the etcd to-release directory was a clone of the
from-release database. With this change a symlink is created
instead, so the etcd database stays the same throughout the
whole upgrade procedure. During deploy delete, the symlink is
removed and the from-release directory is renamed to to-release.

This change is proposed to fix an issue occurring in multinode
systems where there is a mismatch between etcd (kubectl) and
the pods actually running on a host (crictl), ultimately
leading to deploy activate failures.

Test Plan
PASS: AIO-SX e2e stx-10 -> stx-11 upgrade
PASS: AIO-DX e2e stx-10 -> stx-11 upgrade
PASS: AIO-DX orchestrated stx-10 -> stx-11 upgrade

Closes-bug: 2111588

Change-Id: I19bdffbbe7325e3edd9c45751dcac4af66acdf97
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
This commit is contained in:
Heitor Matsui
2025-05-22 18:19:21 -03:00
committed by Heitor Matsui
parent 4f3d32a9e9
commit 28ab0e7aa1
2 changed files with 29 additions and 4 deletions
+6 -3
View File
@@ -157,11 +157,14 @@ class DataMigration(object):
etcd_to_dir = os.path.join(ETCD_PATH, self.to_release)
etcd_from_dir = os.path.join(ETCD_PATH, self.from_release)
shutil.rmtree(etcd_to_dir, ignore_errors=True)
if os.path.islink(etcd_to_dir):
os.unlink(etcd_to_dir)
elif os.path.isdir(etcd_to_dir):
shutil.rmtree(etcd_to_dir, ignore_errors=True)
try:
shutil.copytree(etcd_from_dir, etcd_to_dir)
LOG.info("Copied etcd from %s to %s completed", etcd_from_dir, etcd_to_dir)
os.symlink(etcd_from_dir, etcd_to_dir, target_is_directory=True)
LOG.info("Symlink etcd from %s to %s completed", etcd_from_dir, etcd_to_dir)
except Exception as e:
LOG.exception("Failed to copy etcd from %s to %s. Error: %s.",
etcd_from_dir, etcd_to_dir, e.output)
+23 -1
View File
@@ -1553,11 +1553,33 @@ def clean_up_deployment_data(major_release):
os.path.join(constants.POSTGRES_PATH, constants.UPGRADE),
os.path.join(constants.POSTGRES_PATH, major_release),
os.path.join(constants.RABBIT_PATH, major_release),
os.path.join(constants.ETCD_PATH, major_release),
]
for folder in upgrade_folders:
shutil.rmtree(folder, ignore_errors=True)
# etcd has different cleanup procedure:
# - remove the to-release symlink
# - rename from-release directory to to-release
# - restart etcd process
etcd_from_path = os.path.join(constants.ETCD_PATH, major_release)
etcd_to_path = os.path.join(constants.ETCD_PATH, SW_VERSION)
if utils.compare_release_version(SW_VERSION, major_release):
if os.path.islink(etcd_to_path):
os.unlink(etcd_to_path)
LOG.info("Removed %s symlink", etcd_to_path)
os.rename(etcd_from_path, etcd_to_path)
LOG.info("Renamed %s directory to %s", etcd_from_path, etcd_to_path)
try:
subprocess.run(["/usr/bin/sm-restart-safe", "service", "etcd"], check=True)
LOG.info("Restarted etcd service")
except subprocess.CalledProcessError as e:
LOG.error("Error restarting etcd: %s", str(e))
# on rollback, only the symlink needs to be removed
else:
if os.path.islink(etcd_from_path):
os.unlink(etcd_from_path)
LOG.info("Removed %s symlink", etcd_from_path)
def remove_major_release_deployment_flags():
"""