Browse Source

Merge "Remediate releases stuck in non-DEPLOYED statuses"

Zuul 2 months ago
parent
commit
52f29ddf73

+ 16
- 12
armada/exceptions/armada_exceptions.py View File

@@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException):
31 31
 
32 32
 class ProtectedReleaseException(ArmadaException):
33 33
     '''
34
-    Exception that occurs when Armada encounters a FAILED release that is
35
-    designated `protected` in the Chart and `continue_processing` is False.
34
+    Exception that occurs when Armada encounters a release with status other
35
+    than DEPLOYED that is designated `protected` in the Chart and
36
+    `continue_processing` is False.
36 37
     '''
37 38
 
38
-    def __init__(self, reason):
39
+    def __init__(self, release, status):
39 40
         self._message = (
40
-            'Armada encountered protected release %s in FAILED status' %
41
-            reason)
41
+            'Armada encountered protected release {} in {} status'.format(
42
+                release, status))
42 43
         super(ProtectedReleaseException, self).__init__(self._message)
43 44
 
44 45
 
@@ -88,13 +89,16 @@ class WaitException(ArmadaException):
88 89
         super(WaitException, self).__init__(message)
89 90
 
90 91
 
91
-class UnexpectedReleaseStatusException(ArmadaException):
92
+class DeploymentLikelyPendingException(ArmadaException):
92 93
     '''
93
-    Exception that occurs when armada encounters an existing release for a
94
-    chart with an unexpected status which armada does not know what to do with.
94
+    Exception that occurs when it is detected that an existing release
95
+    operation (e.g. install, update, rollback, delete) is likely still pending.
95 96
     '''
96 97
 
97
-    def __init__(self, release_name, status):
98
-        self._message = "Found release {} in unexpected status {}".format(
99
-            release_name, status)
100
-        super(UnexpectedReleaseStatusException, self).__init__(self._message)
98
+    def __init__(self, release, status, last_deployment_age, timeout):
99
+        self._message = (
100
+            'Existing deployment likely pending '
101
+            'release={}, status={}, '
102
+            '(last deployment age={}s) < (chart wait timeout={}s)'.format(
103
+                release, status, last_deployment_age, timeout))
104
+        super(DeploymentLikelyPendingException, self).__init__(self._message)

+ 56
- 30
armada/handlers/chart_deploy.py View File

@@ -52,19 +52,12 @@ class ChartDeploy(object):
52 52
 
53 53
         result = {}
54 54
 
55
-        protected = chart.get('protected', {})
56
-        p_continue = protected.get('continue_processing', False)
57
-
58 55
         old_release = self.find_chart_release(known_releases, release_name)
59 56
 
60 57
         status = None
61 58
         if old_release:
62 59
             status = r.get_release_status(old_release)
63 60
 
64
-            if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]:
65
-                raise armada_exceptions.UnexpectedReleaseStatusException(
66
-                    release_name, status)
67
-
68 61
         chart_wait = ChartWait(
69 62
             self.tiller.k8s,
70 63
             release_name,
@@ -82,29 +75,6 @@ class ChartDeploy(object):
82 75
         chartbuilder = ChartBuilder(chart)
83 76
         new_chart = chartbuilder.get_helm_chart()
84 77
 
85
-        # Check for existing FAILED release, and purge
86
-        if status == const.STATUS_FAILED:
87
-            LOG.info('Purging FAILED release %s before deployment.',
88
-                     release_name)
89
-            if protected:
90
-                if p_continue:
91
-                    LOG.warn(
92
-                        'Release %s is `protected`, '
93
-                        'continue_processing=True. Operator must '
94
-                        'handle FAILED release manually.', release_name)
95
-                    result['protected'] = release_name
96
-                    return result
97
-                else:
98
-                    LOG.error(
99
-                        'Release %s is `protected`, '
100
-                        'continue_processing=False.', release_name)
101
-                    raise armada_exceptions.ProtectedReleaseException(
102
-                        release_name)
103
-            else:
104
-                # Purge the release
105
-                self.tiller.uninstall_release(release_name)
106
-                result['purge'] = release_name
107
-
108 78
         # TODO(mark-burnett): It may be more robust to directly call
109 79
         # tiller status to decide whether to install/upgrade rather
110 80
         # than checking for list membership.
@@ -181,6 +151,62 @@ class ChartDeploy(object):
181 151
                          tiller_result.__dict__)
182 152
                 result['upgrade'] = release_name
183 153
         else:
154
+            # Check for release with status other than DEPLOYED
155
+            if status:
156
+                if status != const.STATUS_FAILED:
157
+                    LOG.warn(
158
+                        'Unexpected release status encountered '
159
+                        'release=%s, status=%s', release_name, status)
160
+
161
+                    # Make a best effort to determine whether a deployment is
162
+                    # likely pending, by checking if the last deployment
163
+                    # was started within the timeout window of the chart.
164
+                    last_deployment_age = r.get_last_deployment_age(
165
+                        old_release)
166
+                    wait_timeout = chart_wait.get_timeout()
167
+                    likely_pending = last_deployment_age <= wait_timeout
168
+                    if likely_pending:
169
+                        # Give up if a deployment is likely pending; we do not
170
+                        # want to have multiple operations going on for the
171
+                        # same release at the same time.
172
+                        raise armada_exceptions.\
173
+                            DeploymentLikelyPendingException(
174
+                                release_name, status, last_deployment_age,
175
+                                wait_timeout)
176
+                    else:
177
+                        # Release is likely stuck in an unintended (by tiller)
178
+                        # state. Log and continue on with remediation steps
179
+                        # below.
180
+                        LOG.info(
181
+                            'Old release %s likely stuck in status %s, '
182
+                            '(last deployment age=%ss) >= '
183
+                            '(chart wait timeout=%ss)', release, status,
184
+                            last_deployment_age, wait_timeout)
185
+
186
+                protected = chart.get('protected', {})
187
+                if protected:
188
+                    p_continue = protected.get('continue_processing', False)
189
+                    if p_continue:
190
+                        LOG.warn(
191
+                            'Release %s is `protected`, '
192
+                            'continue_processing=True. Operator must '
193
+                            'handle %s release manually.', release_name,
194
+                            status)
195
+                        result['protected'] = release_name
196
+                        return result
197
+                    else:
198
+                        LOG.error(
199
+                            'Release %s is `protected`, '
200
+                            'continue_processing=False.', release_name)
201
+                        raise armada_exceptions.ProtectedReleaseException(
202
+                            release_name, status)
203
+                else:
204
+                    # Purge the release
205
+                    LOG.info('Purging release %s with status %s', release_name,
206
+                             status)
207
+                    self.tiller.uninstall_release(release_name)
208
+                    result['purge'] = release_name
209
+
184 210
             timer = int(round(deadline - time.time()))
185 211
             LOG.info(
186 212
                 "Installing release %s in namespace %s, wait=%s, "

+ 16
- 0
armada/utils/release.py View File

@@ -14,6 +14,8 @@
14 14
 
15 15
 from armada.handlers.test import Test
16 16
 
17
+import time
18
+
17 19
 
18 20
 def release_prefixer(prefix, release):
19 21
     '''
@@ -53,3 +55,17 @@ def get_last_test_result(release):
53 55
     if not status.HasField('last_test_suite_run'):
54 56
         return None
55 57
     return Test.get_test_suite_run_success(status.last_test_suite_run)
58
+
59
+
60
+def get_last_deployment_age(release):
61
+    """
62
+    :param release: protobuf release object
63
+
64
+    :return: age in seconds of last deployment of release
65
+    """
66
+
67
+    last_deployed = release.info.last_deployed.seconds
68
+    now = int(time.time())
69
+    last_deployment_age = now - last_deployed
70
+
71
+    return last_deployment_age

+ 1
- 1
doc/source/operations/exceptions/armada-exceptions.inc View File

@@ -47,7 +47,7 @@ Armada Exceptions
47 47
    :show-inheritance:
48 48
    :undoc-members:
49 49
 
50
-.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException
50
+.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException
51 51
    :members:
52 52
    :show-inheritance:
53 53
    :undoc-members:

Loading…
Cancel
Save