Cleanup leaked build request parameters

Now that build request parameters are sharded outside of a transaction,
we might write them but fail to write the actual build request.  To
address that, add a periodic checke for leaked parameters which is run
after the search for lost build requests.

Change-Id: I0787d94513572f90ae66616a6619d488c9592a5d
This commit is contained in:
James E. Blair 2021-08-02 13:34:51 -07:00
parent 965f25b8d9
commit a99b2dace3
4 changed files with 98 additions and 16 deletions

View File

@ -3173,28 +3173,15 @@ class TestingExecutorApi(HoldableExecutorApi):
# As this method is used for assertions in the tests, it
# should look up the build requests directly from ZooKeeper
# and not from a cache layer.
zones = []
if self.zone_filter:
zones = self.zone_filter
else:
try:
# Get all available zones from ZooKeeper
zones = self.kazoo_client.get_children(
'/'.join([self.BUILD_REQUEST_ROOT, 'zones']))
zones.append(None)
except kazoo.exceptions.NoNodeError:
zones = [None]
zones = self._getAllZones()
all_builds = []
for zone in zones:
try:
zone_path = self._getZoneRoot(zone)
builds = self.kazoo_client.get_children(zone_path)
except kazoo.exceptions.NoNodeError:
# Skip this zone as it doesn't have any builds
continue
for build_uuid in builds:
zone_path = self._getZoneRoot(zone)
for build_uuid in self._getAllBuildIds([zone]):
build = self.get("/".join([zone_path, build_uuid]))
if build and (not states or build.state in states):
all_builds.append(build)

View File

@ -629,6 +629,29 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
self.assertEqual(2, len(lost_build_requests))
self.assertEqual(b.path, lost_build_requests[0].path)
def test_lost_build_request_params(self):
# Test cleaning up orphaned request parameters
executor_api = ExecutorApi(self.zk_client)
path_a = executor_api.submit(
"A", "tenant", "pipeline", {}, "zone", '1')
params_root = executor_api.BUILD_PARAMS_ROOT
self.assertEqual(len(executor_api._getAllBuildIds()), 1)
self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 1)
# Delete the request but not the params
self.zk_client.client.delete(path_a)
self.assertEqual(len(executor_api._getAllBuildIds()), 0)
self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 1)
# Clean up leaked params
executor_api.cleanup(0)
self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 0)
def test_existing_build_request(self):
# Test that an executor sees an existing build request when
# coming online

View File

@ -229,6 +229,7 @@ class ExecutorClient(object):
self.cleanupLostBuildRequest(build_request)
except Exception:
self.log.exception("Exception cleaning up lost build request:")
self.executor_api.cleanup()
def cleanupLostBuildRequest(self, build_request):
result = {"result": "ABORTED"}

View File

@ -12,6 +12,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import time
import json
import logging
from contextlib import suppress
@ -424,6 +425,76 @@ class ExecutorApi(ZooKeeperSimpleBase):
self.inState(BuildRequest.RUNNING, BuildRequest.PAUSED),
)
def _getAllZones(self):
# Get a list of all zones without using the cache.
try:
# Get all available zones from ZooKeeper
zones = self.kazoo_client.get_children(
'/'.join([self.BUILD_REQUEST_ROOT, 'zones']))
zones.append(None)
except NoNodeError:
zones = [None]
return zones
def _getAllBuildIds(self, zones=None):
# Get a list of all build uuids without using the cache.
if zones is None:
zones = self._getAllZones()
all_builds = set()
for zone in zones:
try:
zone_path = self._getZoneRoot(zone)
all_builds.update(self.kazoo_client.get_children(zone_path))
except NoNodeError:
# Skip this zone as it doesn't have any builds
continue
return all_builds
def _findLostParams(self, age):
# Get data nodes which are older than the specified age (we
# don't want to delete nodes which are just being written
# slowly).
# Convert to MS
now = int(time.time() * 1000)
age = age * 1000
data_nodes = dict()
for data_id in self.kazoo_client.get_children(self.BUILD_PARAMS_ROOT):
data_path = self._getParamsPath(data_id)
data_zstat = self.kazoo_client.exists(data_path)
if now - data_zstat.mtime > age:
data_nodes[data_id] = data_path
# If there are no candidate data nodes, we don't need to
# filter them by known requests.
if not data_nodes:
return data_nodes.values()
# Remove current request uuids
for request_id in self._getAllBuildIds():
if request_id in data_nodes:
del data_nodes[request_id]
# Return the paths
return data_nodes.values()
def cleanup(self, age=300):
# Delete build request params which are not associated with
# any current build requests. Note, this does not clean up
# lost build requests themselves; the executor client takes
# care of that.
try:
for path in self._findLostParams(age):
try:
self.log.error("Removing build request params: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
self.log.execption(
"Unable to delete build request params %s", path)
except Exception:
self.log.exception(
"Error cleaning up build request queue %s", self)
@staticmethod
def _bytesToDict(data):
return json.loads(data.decode("utf-8"))