Fix race condition in cluster-show

There is a race condition in showing a cluster wherein the server gets the list of instances from the db and then iterates over the list to get the server information. If a shrink operation is in progress, it can happen that one of the instances is no longer present when trying to retrieve the server info, and this causes the show command to throw a NotFound error. This is now trapped and the 'missing' server excluded from the list. Change-Id: I54edc4acac09ca2278f525c08ad0d87576f0549e Closes-Bug: 1643002
2016-11-18 11:42:56 -05:00
parent fef848fbb2
commit 1b114c8735
2 changed files with 17 additions and 3 deletions
--- a/releasenotes/notes/fix-cluster-show-346798b3e3.yaml
+++ b/releasenotes/notes/fix-cluster-show-346798b3e3.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - Fix race condition in cluster-show that returned
+    erroneous not found error.
+    Bug 1643002
--- a/trove/instance/models.py
+++ b/trove/instance/models.py
@@ -1359,9 +1359,18 @@ class Instances(object):
    def load_all_by_cluster_id(context, cluster_id, load_servers=True):
        db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                           deleted=False)
-        return [load_any_instance(context, db_inst.id,
-                                  load_server=load_servers)
-                for db_inst in db_instances]
+        db_insts = []
+        for db_instance in db_instances:
+            try:
+                db_inst = load_any_instance(
+                    context, db_instance.id, load_server=load_servers)
+                db_insts.append(db_inst)
+            except exception.NotFound:
+                # The instance may be gone if we're in the middle of a
+                # shrink operation, so just log and continue
+                LOG.debug("Instance %s is no longer available, skipping." %
+                          db_instance.id)
+        return db_insts

    @staticmethod
    def _load_servers_status(load_instance, context, db_items, find_server):