From 8b1d46d24636ff845a5323a3f46f614933b1ed89 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 30 Jan 2025 13:11:19 -0800 Subject: [PATCH] Retry nodescan connections during the key phase The nodescan state machine has 4 main states: * initial connection * initial key negotiation * key connection * key negotiation We expect many connection failures during the "initial connection" phase, but if we encounter a hung connection during the "key connection" phase, then we will wait for the complete boot timeout duration and then fail. In an attempt to reduce nodescan failures due to initial flaky connections, retry the connection attempt for the same key if we encounter a failure during connection (but not negotiation -- a negotiation failure will still proceed to the next key). In addition, allocate 10 seconds for each key connection attempt before timing out and retrying the connection. This matches an existing 10-second connection timeout we have in the initial connection attempt. The _connect() method is updated so that it never sets the next state; instead, we set that at the call sites. It is important that once we pass the negotiating_init state we don't go back to it, since we may have started to restrict the set of keys. Setting the next state at the call sites makes that explicit. 
Change-Id: I34691af325ce806e2bf0ae658da09aed0239d82d --- zuul/launcher/server.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/server.py b/zuul/launcher/server.py index 0bb13073e5..d54f122910 100644 --- a/zuul/launcher/server.py +++ b/zuul/launcher/server.py @@ -377,7 +377,7 @@ class NodescanRequest: try: self.sock.connect(self.sockaddr) except BlockingIOError: - self.state = self.CONNECTING_INIT + pass self.connect_start_time = time.monotonic() self.worker.registerDescriptor(self.sock) @@ -419,6 +419,7 @@ class NodescanRequest: else: self.init_connection_attempts += 1 self._connect() + self.state = self.CONNECTING_INIT if self.state == self.CONNECTING_INIT: if not socket_ready: @@ -465,6 +466,7 @@ class NodescanRequest: self.state = self.START self._checkTimeout() self._connect() + self.state = self.CONNECTING_INIT return # This is our first successful connection. Now that # we've done it, start again specifying the first key @@ -477,13 +479,24 @@ class NodescanRequest: if self.state == self.CONNECTING_KEY: if not socket_ready: self._checkTimeout() + # If we're still here, then don't let any individual + # connection attempt last more than 10 seconds: + if time.monotonic() - self.connect_start_time >= 10: + # Restart the connection attempt for this key (not + # the whole series). + self.key_connection_failures += 1 + self._close() + self._connect() + self.state = self.CONNECTING_KEY return eno = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR) if eno: self.log.error( f"Error {eno} connecting to {self.ip} on port {self.port}") self.key_connection_failures += 1 - self._nextKey() + self._close() + self._connect() + self.state = self.CONNECTING_KEY return self._start() self.state = self.NEGOTIATING_KEY