Fix: DC orch re-enters completed/failed state sometimes

Sometimes DC orchestration can re-enter the same state after it has either failed or completed. This issue can only be observed when orchestration is performed on a large number of subclouds. As the problematic code is in orch_thread.py, it affects all types of DC orchestration. This commit fixes the issue by recording, for each subcloud worker, the state it was started for, and by removing the worker entry only once its strategy step is complete, failed or aborted (rather than whenever the worker greenthread exits). Test: Successfully completed load import (as a part of duplex subcloud orchestration) with a large number of subclouds without re-entering the same state. Change-Id: I57802a07009ff50d300146869efa3ceb4c9a2749 Signed-off-by: Jessica Castelino <jessica.castelino@windriver.com> Closes-Bug: 1953519
2021-12-07 05:23:32 -05:00
parent ed5423d076
commit 6e204a8e05
1 changed files with 37 additions and 15 deletions
--- a/distributedcloud/dcmanager/orchestrator/orch_thread.py
+++ b/distributedcloud/dcmanager/orchestrator/orch_thread.py
@@ -162,6 +162,14 @@ class OrchThread(threading.Thread):
                                           started_at=started_at,
                                           finished_at=finished_at)

+    def _delete_subcloud_worker(self, region):
+        if region in self.subcloud_workers:
+            # The orchestration for this subcloud has either
+            # completed/failed/aborted, remove it from the
+            # dictionary.
+            LOG.debug("Remove %s from subcloud_workers dict" % region)
+            del self.subcloud_workers[region]
+
    def run_orch(self):
        while not self.stopped():
            try:
@@ -211,13 +219,16 @@ class OrchThread(threading.Thread):
        for strategy_step in strategy_steps:
            if strategy_step.state == consts.STRATEGY_STATE_COMPLETE:
                # This step is complete
+                self._delete_subcloud_worker(strategy_step.subcloud.name)
                continue
            elif strategy_step.state == consts.STRATEGY_STATE_ABORTED:
                # This step was aborted
+                self._delete_subcloud_worker(strategy_step.subcloud.name)
                abort_detected = True
                continue
            elif strategy_step.state == consts.STRATEGY_STATE_FAILED:
                failure_detected = True
+                self._delete_subcloud_worker(strategy_step.subcloud.name)
                # This step has failed and needs no further action
                if strategy_step.subcloud_id is None:
                    # Strategy on SystemController failed. We are done.
@@ -269,6 +280,8 @@ class OrchThread(threading.Thread):
                        self.context,
                        state=consts.SW_UPDATE_STATE_COMPLETE,
                        update_type=self.update_type)
+
+            self.subcloud_workers.clear()
            # Trigger audit to update the sync status for each subcloud.
            self.trigger_audit()
            return
@@ -306,16 +319,19 @@ class OrchThread(threading.Thread):
                if self.stopped():
                    LOG.info("(%s) Exiting because task is stopped"
                             % self.update_type)
+                    self.subcloud_workers.clear()
                    return
                if strategy_step.state == \
                        consts.STRATEGY_STATE_FAILED:
                    LOG.debug("(%s) Intermediate step is failed"
                              % self.update_type)
+                    self._delete_subcloud_worker(region)
                    continue
                elif strategy_step.state == \
                        consts.STRATEGY_STATE_COMPLETE:
                    LOG.debug("(%s) Intermediate step is complete"
                              % self.update_type)
+                    self._delete_subcloud_worker(region)
                    continue
                elif strategy_step.state == \
                        consts.STRATEGY_STATE_INITIAL:
@@ -479,19 +495,30 @@ class OrchThread(threading.Thread):
    def process_update_step(self, region, strategy_step, log_error=False):
        """manage the green thread for calling perform_state_action"""
        if region in self.subcloud_workers:
-            # A worker already exists. Let it finish whatever it was doing.
-            if log_error:
-                LOG.error("(%s) Worker should not exist for %s."
-                          % (self.update_type, region))
+            if self.subcloud_workers[region][0] == strategy_step.state:
+                # A worker already exists. Let it finish whatever it was doing.
+                if log_error:
+                    LOG.error("(%s) Worker should not exist for %s."
+                              % (self.update_type, region))
+                else:
+                    LOG.debug("(%s) Update worker exists for %s."
+                              % (self.update_type, region))
            else:
-                LOG.debug("(%s) Update worker exists for %s."
-                          % (self.update_type, region))
+                LOG.debug("Starting a new worker for region %s at state %s (update)"
+                          % (region, strategy_step.state))
+                # Advance to the next state. The previous greenthread has exited,
+                # create a new one.
+                self.subcloud_workers[region] = \
+                    (strategy_step.state, self.thread_group_manager.start(
+                     self.perform_state_action, strategy_step))
        else:
-            # Create a greenthread to start processing the update for the
-            # subcloud and invoke the perform_state_action method
+            # This is the first state. create a greenthread to start processing
+            # the update for the subcloud and invoke the perform_state_action method.
+            LOG.debug("Starting a new worker for region %s at state %s"
+                      % (region, strategy_step.state))
            self.subcloud_workers[region] = \
-                self.thread_group_manager.start(self.perform_state_action,
-                                                strategy_step)
+                (strategy_step.state, self.thread_group_manager.start(
+                 self.perform_state_action, strategy_step))

    def perform_state_action(self, strategy_step):
        """Extensible state handler for processing and transitioning states """
@@ -519,8 +546,3 @@ class OrchThread(threading.Thread):
            self.strategy_step_update(strategy_step.subcloud_id,
                                      state=consts.STRATEGY_STATE_FAILED,
                                      details=details)
-        finally:
-            # The worker is done.
-            region = self.get_region_name(strategy_step)
-            if region in self.subcloud_workers:
-                del self.subcloud_workers[region]