Merge "Allow rehome related data update when subcloud migration fails"
This commit is contained in:
commit
510f5df953
@ -305,7 +305,9 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController):
|
||||
(system_leader_id and
|
||||
system_leader_id != group.system_leader_id) or
|
||||
(system_leader_name and
|
||||
system_leader_name != group.system_leader_name)))
|
||||
system_leader_name != group.system_leader_name) or
|
||||
(migration_status and
|
||||
migration_status != group.migration_status)))
|
||||
if not any_update:
|
||||
return db_api.subcloud_peer_group_db_model_to_dict(group)
|
||||
|
||||
|
@ -343,13 +343,19 @@ class SubcloudsController(object):
|
||||
else dccommon_consts.DEPLOY_CONFIG_UP_TO_DATE
|
||||
return sync_status
|
||||
|
||||
def _validate_rehome_pending(self, subcloud, management_state):
|
||||
def _validate_rehome_pending(self, subcloud, management_state, request):
|
||||
unmanaged = dccommon_consts.MANAGEMENT_UNMANAGED
|
||||
error_msg = None
|
||||
|
||||
# Can only set the subcloud to rehome-pending
|
||||
# if the deployment is done
|
||||
if subcloud.deploy_status != consts.DEPLOY_STATE_DONE:
|
||||
# if the deployment is done or the request is from another site.
|
||||
# The reason that we skip the validation if the request is from
|
||||
# another site is when migrating the subcloud back to a peer site,
|
||||
# the site will attempt to set the remote subcloud's deploy status
|
||||
# to "rehome-pending." However, the remote subcloud might be in a
|
||||
# "rehome-failed" state from a previous failed rehoming attempt.
|
||||
if (subcloud.deploy_status != consts.DEPLOY_STATE_DONE and
|
||||
not utils.is_req_from_another_dc(request)):
|
||||
error_msg = (
|
||||
"The deploy status can only be updated to "
|
||||
f"'{consts.DEPLOY_STATE_REHOME_PENDING}' if the current "
|
||||
@ -656,13 +662,16 @@ class SubcloudsController(object):
|
||||
req_from_another_dc = utils.is_req_from_another_dc(request)
|
||||
original_pgrp = None
|
||||
leader_on_local_site = False
|
||||
peer_site_available = True
|
||||
pga = None
|
||||
update_in_non_primary_site = False
|
||||
if subcloud.peer_group_id is not None:
|
||||
# Get the original peer group of the subcloud
|
||||
original_pgrp = db_api.subcloud_peer_group_get(
|
||||
context, subcloud.peer_group_id)
|
||||
leader_on_local_site = utils.is_leader_on_local_site(original_pgrp)
|
||||
# A sync command is required after updating a subcloud
|
||||
# in an SPG that is already associated with a PGA on the primary
|
||||
# in an SPG that is already associated with a PGA in the primary
|
||||
# and leader site. The existence of the PGA will be checked
|
||||
# by the update_association_sync_status method later.
|
||||
if (original_pgrp.group_priority == 0 and
|
||||
@ -670,6 +679,18 @@ class SubcloudsController(object):
|
||||
not req_from_another_dc):
|
||||
sync_peer_groups.add(subcloud.peer_group_id)
|
||||
|
||||
# Get the peer site availability and PGA sync status
|
||||
# TODO(lzhu1): support multiple sites
|
||||
associations = db_api.peer_group_association_get_by_peer_group_id(
|
||||
context, original_pgrp.id)
|
||||
for association in associations:
|
||||
pga = association
|
||||
system_peer = db_api.system_peer_get(
|
||||
context, association.system_peer_id)
|
||||
peer_site_available = \
|
||||
system_peer.availability_state == \
|
||||
consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
|
||||
|
||||
peer_group = payload.get('peer_group')
|
||||
# Verify the peer_group is valid
|
||||
peer_group_id = None
|
||||
@ -690,22 +711,13 @@ class SubcloudsController(object):
|
||||
pecan.abort(400, _("Removing subcloud from a "
|
||||
"peer group not led by the "
|
||||
"current site is prohibited."))
|
||||
# Get associations by peer group id
|
||||
associations = db_api.\
|
||||
peer_group_association_get_by_peer_group_id(
|
||||
context, original_pgrp.id)
|
||||
for association in associations:
|
||||
system_peer = db_api.system_peer_get(
|
||||
context, association.system_peer_id)
|
||||
# If system peer is available, then does not allow
|
||||
# to remove the subcloud from secondary peer group
|
||||
if system_peer.availability_state == consts.\
|
||||
SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
|
||||
and original_pgrp.group_priority > 0:
|
||||
pecan.abort(400, _(
|
||||
"Removing subcloud from a peer group "
|
||||
"associated with an available system peer "
|
||||
"is prohibited."))
|
||||
# If the system peer is available, it is not allowed
|
||||
# to remove the subcloud from secondary peer group
|
||||
if peer_site_available and original_pgrp.group_priority > 0:
|
||||
pecan.abort(400, _(
|
||||
"Removing subcloud from a peer group "
|
||||
"associated with an available system peer "
|
||||
"is prohibited."))
|
||||
peer_group_id = 'none'
|
||||
else:
|
||||
if not (subcloud.rehome_data or (
|
||||
@ -744,16 +756,39 @@ class SubcloudsController(object):
|
||||
sync_peer_groups.add(pgrp.id)
|
||||
peer_group_id = pgrp.id
|
||||
|
||||
bootstrap_values = payload.get('bootstrap_values')
|
||||
bootstrap_address = payload.get('bootstrap_address')
|
||||
|
||||
# Subcloud can only be updated while it is managed in
|
||||
# the primary site because the sync command can only be issued
|
||||
# in the site where the SPG was created.
|
||||
# in the site where the SPG was created. However, bootstrap
|
||||
# values or address update is an exception.
|
||||
if original_pgrp and peer_group_id is None and not req_from_another_dc:
|
||||
if original_pgrp.group_priority > 0:
|
||||
pecan.abort(400, _("Subcloud update is only allowed when "
|
||||
"its peer group priority value is 0."))
|
||||
if bootstrap_values or bootstrap_address:
|
||||
if any(field not in
|
||||
('bootstrap_values', 'bootstrap_address')
|
||||
for field in payload):
|
||||
pecan.abort(400,
|
||||
_("Only bootstrap values and address "
|
||||
"can be updated in the non-primary site"))
|
||||
if (subcloud.deploy_status ==
|
||||
consts.DEPLOY_STATE_REHOME_FAILED and
|
||||
not peer_site_available):
|
||||
update_in_non_primary_site = True
|
||||
else:
|
||||
pecan.abort(400,
|
||||
_("Subcloud bootstrap values or address "
|
||||
"update in the non-primary site is only "
|
||||
"allowed when rehome failed and the "
|
||||
"primary site is unavailable."))
|
||||
if not update_in_non_primary_site:
|
||||
pecan.abort(400, _("Subcloud update is only allowed when "
|
||||
"its peer group priority value is 0."))
|
||||
|
||||
# Updating a subcloud under the peer group on primary site
|
||||
# that the peer group should be led by the primary site.
|
||||
if not leader_on_local_site:
|
||||
if not leader_on_local_site and not update_in_non_primary_site:
|
||||
pecan.abort(400, _("Updating subcloud from a "
|
||||
"peer group not led by the "
|
||||
"current site is prohibited."))
|
||||
@ -845,15 +880,13 @@ class SubcloudsController(object):
|
||||
group_id = payload.get('group_id')
|
||||
description = payload.get('description')
|
||||
location = payload.get('location')
|
||||
bootstrap_values = payload.get('bootstrap_values')
|
||||
bootstrap_address = payload.get('bootstrap_address')
|
||||
|
||||
# If the migrate flag is present we need to update the deploy status
|
||||
# to consts.DEPLOY_STATE_REHOME_PENDING
|
||||
deploy_status = None
|
||||
if (payload.get('migrate') == 'true' and subcloud.deploy_status !=
|
||||
consts.DEPLOY_STATE_REHOME_PENDING):
|
||||
self._validate_rehome_pending(subcloud, management_state)
|
||||
self._validate_rehome_pending(subcloud, management_state, request)
|
||||
deploy_status = consts.DEPLOY_STATE_REHOME_PENDING
|
||||
|
||||
# Syntax checking
|
||||
@ -917,7 +950,23 @@ class SubcloudsController(object):
|
||||
bootstrap_address=bootstrap_address,
|
||||
deploy_status=deploy_status)
|
||||
|
||||
if sync_peer_groups:
|
||||
# Update the PGA sync_status to out-of-sync locally
|
||||
# in the non-primary site. This only occurs when the primary site
|
||||
# is unavailable and rehome fails due to the issue with bootstrap
|
||||
# values or address.
|
||||
if (update_in_non_primary_site and
|
||||
pga.sync_status !=
|
||||
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC):
|
||||
db_api.peer_group_association_update(
|
||||
context,
|
||||
pga.id,
|
||||
sync_status=consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)
|
||||
LOG.debug(
|
||||
f"Updated Local Peer Group Association {pga.id} "
|
||||
f"sync_status to out-of-sync.")
|
||||
# Sync the PGA out-of-sync status across all sites launched by
|
||||
# the primary site.
|
||||
elif sync_peer_groups:
|
||||
# Collect the affected peer group association IDs.
|
||||
association_ids = set()
|
||||
for pg_id in sync_peer_groups:
|
||||
|
@ -43,10 +43,10 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
self.thread_lock = threading.Lock()
|
||||
|
||||
def _get_subclouds_by_peer_group_from_system_peer(self,
|
||||
dc_client,
|
||||
system_peer,
|
||||
peer_group_name):
|
||||
try:
|
||||
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
|
||||
subclouds = dc_client.get_subcloud_list_by_peer_group(
|
||||
peer_group_name)
|
||||
return subclouds
|
||||
@ -55,6 +55,22 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
f"{peer_group_name} from DC: "
|
||||
f"{system_peer.peer_name}")
|
||||
|
||||
@staticmethod
def _get_association_sync_status_from_peer_site(dc_client,
                                                system_peer,
                                                peer_group_id):
    """Fetch a peer group association's sync status from the peer site.

    First asks the peer site for its system peer entry matching the local
    system's uuid, then asks for the association identified by that system
    peer's id together with ``peer_group_id``.

    :param dc_client: client for the peer site; must provide
        ``get_system_peer`` and
        ``get_peer_group_association_with_peer_id_and_pg_id``
    :param system_peer: system peer object; only ``peer_name`` is read
        here, for the failure log message
    :param peer_group_id: peer group id passed to the association lookup
        (presumably the id as known on the peer site -- confirm with
        the caller)
    :return: the association's "sync-status" value, or None when any
        step of the lookup raised
    """
    try:
        # Get peer site system peer
        dc_peer_system_peer = dc_client.get_system_peer(
            utils.get_local_system().uuid)
        association = dc_client. \
            get_peer_group_association_with_peer_id_and_pg_id(
                dc_peer_system_peer.get('id'), peer_group_id)
        return association.get("sync-status")
    except Exception:
        # Best effort: log with traceback and report "unknown" (None)
        # instead of propagating the error to the caller.
        LOG.exception(f"Failed to get association sync status of peer "
                      f"group {peer_group_id} from DC: "
                      f"{system_peer.peer_name}")
        return None
|
||||
|
||||
def _update_remote_peer_group_migration_status(self,
|
||||
system_peer,
|
||||
peer_group_name,
|
||||
@ -71,9 +87,11 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
|
||||
def _get_local_subclouds_to_update_and_delete(self,
|
||||
local_peer_group,
|
||||
remote_subclouds):
|
||||
remote_subclouds,
|
||||
remote_sync_status):
|
||||
local_subclouds_to_update = list()
|
||||
local_subclouds_to_delete = list()
|
||||
any_rehome_failed = False
|
||||
remote_subclouds_dict = {remote_subcloud.get('region-name'):
|
||||
remote_subcloud for remote_subcloud
|
||||
in remote_subclouds}
|
||||
@ -92,10 +110,30 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
not utils.subcloud_is_secondary_state(
|
||||
local_subcloud.deploy_status)):
|
||||
local_subclouds_to_update.append(local_subcloud)
|
||||
# Sync rehome_data from remote to local subcloud if the remote
|
||||
# PGA sync_status is out-of-sync once migration completes,
|
||||
# indicating any bootstrap values/address updates to
|
||||
# the subcloud on the remote site.
|
||||
if remote_sync_status == \
|
||||
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
|
||||
self._sync_rehome_data(
|
||||
local_subcloud.id, remote_subcloud.get('rehome_data'))
|
||||
elif remote_subcloud.get('deploy-status') in \
|
||||
(consts.DEPLOY_STATE_REHOME_FAILED,
|
||||
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
|
||||
# Set local subcloud to rehome-failed if the remote is
|
||||
# rehome-failed or rehome-prep-failed, otherwise, the
|
||||
# deploy_status will remain rehome-pending, which will
|
||||
# block the correction of the bootstrap values/address.
|
||||
db_api.subcloud_update(
|
||||
self.context, local_subcloud.id,
|
||||
deploy_status=consts.DEPLOY_STATE_REHOME_FAILED)
|
||||
any_rehome_failed = True
|
||||
else:
|
||||
local_subclouds_to_delete.append(local_subcloud)
|
||||
|
||||
return local_subclouds_to_update, local_subclouds_to_delete
|
||||
return local_subclouds_to_update, local_subclouds_to_delete, \
|
||||
any_rehome_failed
|
||||
|
||||
def _set_local_subcloud_to_secondary(self, subcloud):
|
||||
try:
|
||||
@ -118,6 +156,9 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
f"and offline subcloud [{subcloud.name}], err: {e}")
|
||||
raise e
|
||||
|
||||
def _sync_rehome_data(self, subcloud_id, rehome_data):
    """Store ``rehome_data`` on the subcloud identified by ``subcloud_id``.

    Delegates to ``db_api.subcloud_update`` using this manager's context.
    """
    db_api.subcloud_update(
        self.context,
        subcloud_id,
        rehome_data=rehome_data,
    )
|
||||
|
||||
def audit(self, system_peer, remote_peer_group, local_peer_group):
|
||||
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
|
||||
LOG.info("Local peer group in migrating state, quit audit")
|
||||
@ -187,14 +228,22 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
# set 'unmanaged+secondary' to local on same subclouds
|
||||
elif remote_peer_group.get("migration_status") == \
|
||||
consts.PEER_GROUP_MIGRATION_COMPLETE:
|
||||
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
|
||||
remote_subclouds = \
|
||||
self._get_subclouds_by_peer_group_from_system_peer(
|
||||
dc_client,
|
||||
system_peer,
|
||||
remote_peer_group.get("peer_group_name"))
|
||||
remote_sync_status = \
|
||||
self._get_association_sync_status_from_peer_site(
|
||||
dc_client,
|
||||
system_peer,
|
||||
remote_peer_group.get("id"))
|
||||
|
||||
local_subclouds_to_update, local_subclouds_to_delete = \
|
||||
local_subclouds_to_update, local_subclouds_to_delete, \
|
||||
any_rehome_failed = \
|
||||
self._get_local_subclouds_to_update_and_delete(
|
||||
local_peer_group, remote_subclouds)
|
||||
local_peer_group, remote_subclouds, remote_sync_status)
|
||||
|
||||
for subcloud in local_subclouds_to_update:
|
||||
self._set_local_subcloud_to_secondary(subcloud)
|
||||
@ -218,7 +267,7 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
f"peer site, err: {e}")
|
||||
raise e
|
||||
|
||||
if local_subclouds_to_update or local_subclouds_to_delete:
|
||||
if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid:
|
||||
self._clear_or_raise_alarm(system_peer,
|
||||
local_peer_group,
|
||||
remote_peer_group)
|
||||
@ -232,10 +281,13 @@ class PeerGroupAuditManager(manager.Manager):
|
||||
system_peer,
|
||||
remote_peer_group.get("peer_group_name"),
|
||||
None)
|
||||
SystemPeerManager.update_sync_status(
|
||||
self.context, system_peer,
|
||||
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
|
||||
local_peer_group, remote_peer_group)
|
||||
|
||||
if not (remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
|
||||
and any_rehome_failed):
|
||||
SystemPeerManager.update_sync_status(
|
||||
self.context, system_peer,
|
||||
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
|
||||
local_peer_group, remote_peer_group)
|
||||
self.require_audit_flag = False
|
||||
else:
|
||||
# If remote peer group migration_status is 'None'
|
||||
|
@ -236,10 +236,14 @@ class SystemPeerManager(manager.Manager):
|
||||
f"it doesn't exist.")
|
||||
return
|
||||
|
||||
is_secondary = SystemPeerManager.is_subcloud_secondary(peer_subcloud)
|
||||
if not is_secondary:
|
||||
if SystemPeerManager.get_subcloud_deploy_status(peer_subcloud) not in (
|
||||
consts.DEPLOY_STATE_SECONDARY_FAILED,
|
||||
consts.DEPLOY_STATE_SECONDARY,
|
||||
consts.DEPLOY_STATE_REHOME_FAILED,
|
||||
consts.DEPLOY_STATE_REHOME_PREP_FAILED
|
||||
):
|
||||
LOG.info(f"Ignoring delete Peer Site Subcloud {subcloud_ref} "
|
||||
f"as is not in secondary state.")
|
||||
f"as is not in secondary or rehome failed state.")
|
||||
return
|
||||
|
||||
dc_client.delete_subcloud(subcloud_ref)
|
||||
@ -340,7 +344,10 @@ class SystemPeerManager(manager.Manager):
|
||||
# should be recorded as a failure.
|
||||
peer_subcloud_deploy_status = self.get_subcloud_deploy_status(
|
||||
peer_subcloud)
|
||||
if peer_subcloud_deploy_status != consts.DEPLOY_STATE_SECONDARY:
|
||||
if peer_subcloud_deploy_status not in \
|
||||
(consts.DEPLOY_STATE_SECONDARY,
|
||||
consts.DEPLOY_STATE_REHOME_FAILED,
|
||||
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
|
||||
subcloud.msg = "Subcloud's deploy status not correct: %s" \
|
||||
% peer_subcloud_deploy_status
|
||||
return subcloud, False
|
||||
@ -427,6 +434,9 @@ class SystemPeerManager(manager.Manager):
|
||||
continue
|
||||
|
||||
try:
|
||||
# TODO(lzhu1): Sending requests to fetch the subcloud one by one
|
||||
# should be optimized to fetch them all with one request by calling
|
||||
# the "get_subcloud_list_by_peer_group" method
|
||||
peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name)
|
||||
if not peer_subcloud:
|
||||
LOG.info(f"Subcloud {subcloud_name} (region_name: "
|
||||
@ -434,10 +444,12 @@ class SystemPeerManager(manager.Manager):
|
||||
valid_subclouds.append(subcloud)
|
||||
continue
|
||||
|
||||
if not self.is_subcloud_secondary(peer_subcloud):
|
||||
msg = "Ignoring update Peer Site Subcloud " + \
|
||||
f"{subcloud_name} (region_name: {region_name})" + \
|
||||
" as is not in secondary state."
|
||||
if not self.is_subcloud_secondary(peer_subcloud) and \
|
||||
self.get_subcloud_deploy_status(peer_subcloud) not in \
|
||||
(consts.DEPLOY_STATE_REHOME_FAILED,
|
||||
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
|
||||
msg = (f"Subcloud {subcloud_name} is not in the right state "
|
||||
f"for sync.")
|
||||
LOG.info(msg)
|
||||
error_msg[subcloud_name] = msg
|
||||
continue
|
||||
|
Loading…
Reference in New Issue
Block a user