Reestablish watch and retry wait for some errors

Armada uses a Kubernetes watch to implement its chart wait logic. This can be a fairly long-lived connection to the Kubernetes API server, and it is vulnerable to disruption (for example, if the Kubernetes apiserver chart is being upgraded). This change allows Armada to retry the wait, which includes establishing a new watch, when certain specific errors occur, until the overall chart timeout is reached. Related upstream issue: https://github.com/kubernetes-client/python/issues/972 Example error: urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read)) Change-Id: I3e68a54becadd5b2a2343960a120bdc3de8e8515
2020-04-19 01:35:09 -07:00 · 2020-04-19 01:35:09 -07:00 · 6b2c7245de
commit 6b2c7245de
parent ae1281d874
2 changed files with 13 additions and 0 deletions
--- a/armada/handlers/wait.py
+++ b/armada/handlers/wait.py
@ -21,6 +21,8 @@ import time

 from kubernetes import watch
 from oslo_log import log as logging
+from retry import retry
+import urllib3.exceptions

 from armada import const
 from armada.exceptions import k8s_exceptions
@ -318,6 +320,16 @@ class ResourceWait(ABC):
        else:
            self._wait(deadline)

+    # The Kubernetes Python Client does not always recover from broken
+    # connections to the k8s apiserver, and the resulting uncaught exceptions
+    # in the Watch.stream method cause the chart installation to fail. As long
+    # as the wait deadline has not passed, it is better to retry the entire
+    # wait operation.
+    @retry(
+        exceptions=(
+            urllib3.exceptions.ProtocolError,
+            urllib3.exceptions.MaxRetryError),
+        delay=1)
    def _wait(self, deadline):
        '''
        Waits for resources to become ready.
--- a/requirements.txt
+++ b/requirements.txt
@ -10,6 +10,7 @@ PasteDeploy>=1.5.2
 protobuf>=3.4.0
 PyYAML==3.12
 requests
+retry
 prometheus_client==0.7.0

 # API