Browse Source

Remediate releases stuck in non-DEPLOYED statuses

Armada remediates releases stuck in FAILED status, if not protected,
by purging and re-installing them. This change implements the same for
other non-DEPLOYED statuses. For these statuses, remediation is guarded
by a best-effort determination of whether a previous deployment of the
release, either through Armada or the helm CLI, is likely still
pending, based on whether it was last deployed within the chart's wait
timeout. If a deployment is deemed likely pending, an error is raised;
however, this condition will eventually expire on future runs, allowing
for eventual remediation.

Reasons why a release may get stuck in statuses other than DEPLOYED
or FAILED include:

1. tiller crashed mid-deployment
2. tiller could not reach kubernetes to update the release state
3. `helm delete <rel>` was run without --purge (results in DELETED status)

Change-Id: Ia89cd59f056103dde47980a149c07a2984c4bbb4
Sean Eagan 3 months ago
parent
commit
2310ddbc2c

+ 16
- 12
armada/exceptions/armada_exceptions.py View File

@@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException):
31 31
 
32 32
 class ProtectedReleaseException(ArmadaException):
33 33
     '''
34
-    Exception that occurs when Armada encounters a FAILED release that is
35
-    designated `protected` in the Chart and `continue_processing` is False.
34
+    Exception that occurs when Armada encounters a release with status other
35
+    than DEPLOYED that is designated `protected` in the Chart and
36
+    `continue_processing` is False.
36 37
     '''
37 38
 
38
-    def __init__(self, reason):
39
+    def __init__(self, release, status):
39 40
         self._message = (
40
-            'Armada encountered protected release %s in FAILED status' %
41
-            reason)
41
+            'Armada encountered protected release {} in {} status'.format(
42
+                release, status))
42 43
         super(ProtectedReleaseException, self).__init__(self._message)
43 44
 
44 45
 
@@ -88,13 +89,16 @@ class WaitException(ArmadaException):
88 89
         super(WaitException, self).__init__(message)
89 90
 
90 91
 
91
-class UnexpectedReleaseStatusException(ArmadaException):
92
+class DeploymentLikelyPendingException(ArmadaException):
92 93
     '''
93
-    Exception that occurs when armada encounters an existing release for a
94
-    chart with an unexpected status which armada does not know what to do with.
94
+    Exception that occurs when it is detected that an existing release
95
+    operation (e.g. install, update, rollback, delete) is likely still pending.
95 96
     '''
96 97
 
97
-    def __init__(self, release_name, status):
98
-        self._message = "Found release {} in unexpected status {}".format(
99
-            release_name, status)
100
-        super(UnexpectedReleaseStatusException, self).__init__(self._message)
98
+    def __init__(self, release, status, last_deployment_age, timeout):
99
+        self._message = (
100
+            'Existing deployment likely pending '
101
+            'release={}, status={}, '
102
+            '(last deployment age={}s) < (chart wait timeout={}s)'.format(
103
+                release, status, last_deployment_age, timeout))
104
+        super(DeploymentLikelyPendingException, self).__init__(self._message)

+ 56
- 30
armada/handlers/chart_deploy.py View File

@@ -52,19 +52,12 @@ class ChartDeploy(object):
52 52
 
53 53
         result = {}
54 54
 
55
-        protected = chart.get('protected', {})
56
-        p_continue = protected.get('continue_processing', False)
57
-
58 55
         old_release = self.find_chart_release(known_releases, release_name)
59 56
 
60 57
         status = None
61 58
         if old_release:
62 59
             status = r.get_release_status(old_release)
63 60
 
64
-            if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]:
65
-                raise armada_exceptions.UnexpectedReleaseStatusException(
66
-                    release_name, status)
67
-
68 61
         chart_wait = ChartWait(
69 62
             self.tiller.k8s,
70 63
             release_name,
@@ -82,29 +75,6 @@ class ChartDeploy(object):
82 75
         chartbuilder = ChartBuilder(chart)
83 76
         new_chart = chartbuilder.get_helm_chart()
84 77
 
85
-        # Check for existing FAILED release, and purge
86
-        if status == const.STATUS_FAILED:
87
-            LOG.info('Purging FAILED release %s before deployment.',
88
-                     release_name)
89
-            if protected:
90
-                if p_continue:
91
-                    LOG.warn(
92
-                        'Release %s is `protected`, '
93
-                        'continue_processing=True. Operator must '
94
-                        'handle FAILED release manually.', release_name)
95
-                    result['protected'] = release_name
96
-                    return result
97
-                else:
98
-                    LOG.error(
99
-                        'Release %s is `protected`, '
100
-                        'continue_processing=False.', release_name)
101
-                    raise armada_exceptions.ProtectedReleaseException(
102
-                        release_name)
103
-            else:
104
-                # Purge the release
105
-                self.tiller.uninstall_release(release_name)
106
-                result['purge'] = release_name
107
-
108 78
         # TODO(mark-burnett): It may be more robust to directly call
109 79
         # tiller status to decide whether to install/upgrade rather
110 80
         # than checking for list membership.
@@ -181,6 +151,62 @@ class ChartDeploy(object):
181 151
                          tiller_result.__dict__)
182 152
                 result['upgrade'] = release_name
183 153
         else:
154
+            # Check for release with status other than DEPLOYED
155
+            if status:
156
+                if status != const.STATUS_FAILED:
157
+                    LOG.warn(
158
+                        'Unexpected release status encountered '
159
+                        'release=%s, status=%s', release_name, status)
160
+
161
+                    # Make best effort to determine whether a deployment is
162
+                    # likely pending, by checking if the last deployment
163
+                    # was started within the timeout window of the chart.
164
+                    last_deployment_age = r.get_last_deployment_age(
165
+                        old_release)
166
+                    wait_timeout = chart_wait.get_timeout()
167
+                    likely_pending = last_deployment_age <= wait_timeout
168
+                    if likely_pending:
169
+                        # Give up if a deployment is likely pending, we do not
170
+                        # want to have multiple operations going on for the
171
+                        # same release at the same time.
172
+                        raise armada_exceptions.\
173
+                            DeploymentLikelyPendingException(
174
+                                release_name, status, last_deployment_age,
175
+                                wait_timeout)
176
+                    else:
177
+                        # Release is likely stuck in an unintended (by tiller)
178
+                        # state. Log and continue on with remediation steps
179
+                        # below.
180
+                        LOG.info(
181
+                            'Old release %s likely stuck in status %s, '
182
+                            '(last deployment age=%ss) >= '
183
+                            '(chart wait timeout=%ss)', release, status,
184
+                            last_deployment_age, wait_timeout)
185
+
186
+                protected = chart.get('protected', {})
187
+                if protected:
188
+                    p_continue = protected.get('continue_processing', False)
189
+                    if p_continue:
190
+                        LOG.warn(
191
+                            'Release %s is `protected`, '
192
+                            'continue_processing=True. Operator must '
193
+                            'handle %s release manually.', release_name,
194
+                            status)
195
+                        result['protected'] = release_name
196
+                        return result
197
+                    else:
198
+                        LOG.error(
199
+                            'Release %s is `protected`, '
200
+                            'continue_processing=False.', release_name)
201
+                        raise armada_exceptions.ProtectedReleaseException(
202
+                            release_name, status)
203
+                else:
204
+                    # Purge the release
205
+                    LOG.info('Purging release %s with status %s', release_name,
206
+                             status)
207
+                    self.tiller.uninstall_release(release_name)
208
+                    result['purge'] = release_name
209
+
184 210
             timer = int(round(deadline - time.time()))
185 211
             LOG.info(
186 212
                 "Installing release %s in namespace %s, wait=%s, "

+ 16
- 0
armada/utils/release.py View File

@@ -14,6 +14,8 @@
14 14
 
15 15
 from armada.handlers.test import Test
16 16
 
17
+import time
18
+
17 19
 
18 20
 def release_prefixer(prefix, release):
19 21
     '''
@@ -53,3 +55,17 @@ def get_last_test_result(release):
53 55
     if not status.HasField('last_test_suite_run'):
54 56
         return None
55 57
     return Test.get_test_suite_run_success(status.last_test_suite_run)
58
+
59
+
60
+def get_last_deployment_age(release):
61
+    """
62
+    :param release: protobuf release object
63
+
64
+    :return: age in seconds of last deployment of release
65
+    """
66
+
67
+    last_deployed = release.info.last_deployed.seconds
68
+    now = int(time.time())
69
+    last_deployment_age = now - last_deployed
70
+
71
+    return last_deployment_age

+ 1
- 1
doc/source/operations/exceptions/armada-exceptions.inc View File

@@ -47,7 +47,7 @@ Armada Exceptions
47 47
    :show-inheritance:
48 48
    :undoc-members:
49 49
 
50
-.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException
50
+.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException
51 51
    :members:
52 52
    :show-inheritance:
53 53
    :undoc-members:

Loading…
Cancel
Save