Refactor Merger/Executor API

The merger and executor APIs have a lot in common, but they behave
slightly differently.  A merger sometimes needs to return results.
An executor needs a separate queue for each zone and must be able to
pause or cancel jobs.

This refactors both into a common class that can handle job state
changes (like pause/cancel) and return results when requested.
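
As a rough sketch of the resulting submit flow, adapted from the
updated tests in this change (zk_client stands in for an existing
ZooKeeperClient connection), a caller now passes a request object plus
a separate params dictionary and may ask for a result future:

    from zuul.model import MergeRequest
    from zuul.zk.merger import MergerApi

    # zk_client: an existing zuul.zk.ZooKeeperClient connection
    merger_api = MergerApi(zk_client)
    future = merger_api.submit(MergeRequest(
        uuid='A',
        job_type=MergeRequest.MERGE,
        build_set_uuid='AA',
        tenant_name='tenant',
        pipeline_name='check',
        event_id='1',
    ), {'merge': 'test'}, needs_result=True)

    # Once a merger has run the request and reported a result, the
    # future resolves with the reported data.
    if future.wait():
        result = future.data   # e.g. {'result': 'ok'}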

The MergerApi can subclass this fairly trivially.
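
As an illustration only (the MergerApi definition itself is not part of
this diff, so the constructor details are assumed from how
ExecutorQueue calls the base class below), the subclass plausibly
reduces to choosing a root path and request class:

    import logging

    from zuul.model import MergeRequest
    from zuul.zk.job_request_queue import JobRequestQueue

    class MergerApi(JobRequestQueue):
        # Hypothetical sketch: assumes JobRequestQueue.__init__(client, root,
        # request_callback=None, event_callback=None).
        log = logging.getLogger("zuul.MergerApi")
        request_class = MergeRequest

        def __init__(self, client, merge_request_callback=None):
            super().__init__(client, '/zuul/merger', merge_request_callback)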

The ExecutorApi adds an intermediate layer which uses a
DefaultKeyDict to maintain a distinct queue for every zone and then
transparently dispatches method calls to the queue object for
that zone.
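
The pattern looks roughly like the following simplified sketch
(DefaultKeyDict is Zuul's defaultdict variant whose factory receives
the missing key; only two forwarded methods are shown, and the class
name here is made up):

    from zuul.lib.collections import DefaultKeyDict
    from zuul.model import BuildRequest
    from zuul.zk.executor import ExecutorQueue

    class ZonedDispatchSketch:
        """Illustrative per-zone dispatch; not the full ExecutorApi."""

        def __init__(self, client):
            self.client = client
            # Looking up an unknown zone lazily creates its queue.
            self.zone_queues = DefaultKeyDict(
                lambda zone: ExecutorQueue(
                    self.client,
                    self._getZoneRoot(zone),
                    self._getInitialState))

        def _getZoneRoot(self, zone):
            if zone is None:
                return '/zuul/executor/unzoned'
            return f'/zuul/executor/zones/{zone}'

        def _getInitialState(self):
            return BuildRequest.REQUESTED

        # Per-request operations forward to the queue for the request's zone.
        def update(self, request):
            return self.zone_queues[request.zone].update(request)

        def getParams(self, request):
            return self.zone_queues[request.zone].getParams(request)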

The ZK paths for both are significantly altered in this change.
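
For reference, the roots visible in this diff now look roughly like
this (the per-zone executor sub-nodes and the merger
request/param/lock roots are inferred from the same pattern rather
than shown directly); they replace the old /zuul/build-requests,
/zuul/build-params, /zuul/build-request-locks and /zuul/merge-* trees:

    /zuul/executor/unzoned/{requests,params,locks,results,result-data,waiters}
    /zuul/executor/zones/<zone>/...    (same sub-nodes per zone; inferred)
    /zuul/merger/{results,result-data,waiters}
    /zuul/merger/{requests,params,locks}    (inferred)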

Change-Id: I3adedcc4ea293e43070ba6ef0fe29e7889a0b502
James E. Blair 2021-08-04 16:40:14 -07:00
parent 8038f9f75c
commit a729d6c6e8
14 changed files with 941 additions and 1247 deletions


@ -3165,14 +3165,11 @@ class HoldableMergerApi(MergerApi):
self.hold_in_queue = False self.hold_in_queue = False
self.history = {} self.history = {}
def submit(self, uuid, job_type, build_set_uuid, tenant_name, def submit(self, request, params, needs_result=False):
pipeline_name, params, event_id, precedence=0, self.log.debug("Appending merge job to history: %s", request.uuid)
needs_result=False): self.history[request.uuid] = FakeMergeRequest(
self.log.debug("Appending merge job to history: %s", uuid) request.uuid, request.job_type, params)
self.history[uuid] = FakeMergeRequest(uuid, job_type, params) return super().submit(request, params, needs_result)
return super().submit(
uuid, job_type, build_set_uuid, tenant_name, pipeline_name, params,
event_id, precedence, needs_result)
@property @property
def initial_state(self): def initial_state(self):
@ -3190,15 +3187,13 @@ class TestingMergerApi(HoldableMergerApi):
# the merge requests directly from ZooKeeper and not from a cache # the merge requests directly from ZooKeeper and not from a cache
# layer. # layer.
all_merge_requests = [] all_merge_requests = []
for merge_uuid in self.kazoo_client.get_children( for merge_uuid in self._getAllRequestIds():
self.MERGE_REQUEST_ROOT
):
merge_request = self.get("/".join( merge_request = self.get("/".join(
[self.MERGE_REQUEST_ROOT, merge_uuid])) [self.REQUEST_ROOT, merge_uuid]))
if merge_request and (not states or merge_request.state in states): if merge_request and (not states or merge_request.state in states):
all_merge_requests.append(merge_request) all_merge_requests.append(merge_request)
return all_merge_requests return sorted(all_merge_requests)
def release(self, merge_request=None): def release(self, merge_request=None):
""" """
@ -3251,11 +3246,10 @@ class RecordingMergeClient(zuul.merger.client.MergeClient):
class HoldableExecutorApi(ExecutorApi): class HoldableExecutorApi(ExecutorApi):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hold_in_queue = False self.hold_in_queue = False
super().__init__(*args, **kwargs)
@property def _getInitialState(self):
def initial_state(self):
if self.hold_in_queue: if self.hold_in_queue:
return BuildRequest.HOLD return BuildRequest.HOLD
return BuildRequest.REQUESTED return BuildRequest.REQUESTED
@ -3268,16 +3262,12 @@ class TestingExecutorApi(HoldableExecutorApi):
# As this method is used for assertions in the tests, it # As this method is used for assertions in the tests, it
# should look up the build requests directly from ZooKeeper # should look up the build requests directly from ZooKeeper
# and not from a cache layer. # and not from a cache layer.
if self.zone_filter:
zones = self.zone_filter
else:
zones = self._getAllZones()
all_builds = [] all_builds = []
for zone in zones: for zone in self._getAllZones():
zone_path = self._getZoneRoot(zone) queue = self.zone_queues[zone]
for build_uuid in self._getAllBuildIds([zone]): for build_uuid in queue._getAllRequestIds():
build = self.get("/".join([zone_path, build_uuid])) build = queue.get(f'{queue.REQUEST_ROOT}/{build_uuid}')
if build and (not states or build.state in states): if build and (not states or build.state in states):
all_builds.append(build) all_builds.append(build)
@ -3293,7 +3283,7 @@ class TestingExecutorApi(HoldableExecutorApi):
self._test_build_request_job_map = {} self._test_build_request_job_map = {}
if build_request.uuid in self._test_build_request_job_map: if build_request.uuid in self._test_build_request_job_map:
return self._test_build_request_job_map[build_request.uuid] return self._test_build_request_job_map[build_request.uuid]
d = self.getBuildParams(build_request) d = self.getParams(build_request)
if d: if d:
data = d.get('job', '') data = d.get('job', '')
else: else:


@ -1025,17 +1025,16 @@ class TestMerger(ZuulTestCase):
merger_api = MergerApi(self.zk_client) merger_api = MergerApi(self.zk_client)
payload = {'merge': 'test'} payload = {'merge': 'test'}
merger_api.submit( merger_api.submit(MergeRequest(
uuid='B', uuid='B',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='BB', build_set_uuid='BB',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
b = merger_api.get(f"{merger_api.MERGE_REQUEST_ROOT}/B") b = merger_api.get(f"{merger_api.REQUEST_ROOT}/B")
b.state = MergeRequest.RUNNING b.state = MergeRequest.RUNNING
merger_api.update(b) merger_api.update(b)
@ -1046,14 +1045,14 @@ class TestMerger(ZuulTestCase):
# DataWatch might be triggered for the correct event, but the cache # DataWatch might be triggered for the correct event, but the cache
# might still be outdated as the DataWatch that updates the cache # might still be outdated as the DataWatch that updates the cache
# itself wasn't triggered yet. # itself wasn't triggered yet.
cache = merger_api._cached_merge_requests cache = merger_api._cached_requests
for _ in iterate_timeout(30, "cache to be up-to-date"): for _ in iterate_timeout(30, "cache to be up-to-date"):
if (cache and cache[b.path].state == MergeRequest.RUNNING): if (cache and cache[b.path].state == MergeRequest.RUNNING):
break break
# The lost_merges method should only return merges which are running # The lost_merges method should only return merges which are running
# but not locked by any merger, in this case merge b # but not locked by any merger, in this case merge b
lost_merge_requests = list(merger_api.lostMergeRequests()) lost_merge_requests = list(merger_api.lostRequests())
self.assertEqual(1, len(lost_merge_requests)) self.assertEqual(1, len(lost_merge_requests))
self.assertEqual(b.path, lost_merge_requests[0].path) self.assertEqual(b.path, lost_merge_requests[0].path)
@ -1062,7 +1061,7 @@ class TestMerger(ZuulTestCase):
merger_client = self.scheds.first.sched.merger merger_client = self.scheds.first.sched.merger
merger_client.cleanupLostMergeRequests() merger_client.cleanupLostMergeRequests()
lost_merge_requests = list(merger_api.lostMergeRequests()) lost_merge_requests = list(merger_api.lostRequests())
self.assertEqual(0, len(lost_merge_requests)) self.assertEqual(0, len(lost_merge_requests))


@ -1015,8 +1015,7 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(len(self.builds), 0) self.assertEqual(len(self.builds), 0)
self.assertEqual(len(queue), 1) self.assertEqual(len(queue), 1)
self.assertEqual(queue[0].zone, None) self.assertEqual(queue[0].zone, None)
params = self.executor_server.executor_api.getBuildParams( params = self.executor_server.executor_api.getParams(queue[0])
queue[0])
self.assertEqual(params['job'], 'project-merge') self.assertEqual(params['job'], 'project-merge')
self.assertEqual(params['items'][0]['number'], '%d' % A.number) self.assertEqual(params['items'][0]['number'], '%d' % A.number)
@ -1027,8 +1026,8 @@ class TestScheduler(ZuulTestCase):
self.executor_api.release('.*-merge') self.executor_api.release('.*-merge')
self.waitUntilSettled() self.waitUntilSettled()
queue = list(self.executor_api.queued()) queue = list(self.executor_api.queued())
params = [self.executor_server.executor_api.getBuildParams( params = [self.executor_server.executor_api.getParams(item)
item) for item in queue] for item in queue]
self.assertEqual(len(self.builds), 0) self.assertEqual(len(self.builds), 0)
self.assertEqual(len(queue), 6) self.assertEqual(len(queue), 6)


@ -22,7 +22,8 @@ from zuul.model import BuildRequest, HoldRequest, MergeRequest
from zuul.zk import ZooKeeperClient from zuul.zk import ZooKeeperClient
from zuul.zk.config_cache import SystemConfigCache, UnparsedConfigCache from zuul.zk.config_cache import SystemConfigCache, UnparsedConfigCache
from zuul.zk.exceptions import LockException from zuul.zk.exceptions import LockException
from zuul.zk.executor import ExecutorApi, BuildRequestEvent from zuul.zk.executor import ExecutorApi
from zuul.zk.job_request_queue import JobRequestEvent
from zuul.zk.merger import MergerApi from zuul.zk.merger import MergerApi
from zuul.zk.layout import LayoutStateStore, LayoutState from zuul.zk.layout import LayoutStateStore, LayoutState
from zuul.zk.locks import locked from zuul.zk.locks import locked
@ -386,7 +387,8 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
build_event_callback=eq_put) build_event_callback=eq_put)
# Scheduler submits request # Scheduler submits request
client.submit("A", "tenant", "pipeline", {'job': 'test'}, None, '1') request = BuildRequest("A", None, "tenant", "pipeline", '1')
client.submit(request, {'job': 'test'})
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Executor receives request # Executor receives request
@ -394,10 +396,10 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
self.assertEqual(len(reqs), 1) self.assertEqual(len(reqs), 1)
a = reqs[0] a = reqs[0]
self.assertEqual(a.uuid, 'A') self.assertEqual(a.uuid, 'A')
params = client.getBuildParams(a) params = client.getParams(a)
self.assertEqual(params, {'job': 'test'}) self.assertEqual(params, {'job': 'test'})
client.clearBuildParams(a) client.clearParams(a)
params = client.getBuildParams(a) params = client.getParams(a)
self.assertIsNone(params) self.assertIsNone(params)
# Executor locks request # Executor locks request
@ -421,7 +423,7 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
client.requestResume(sched_a) client.requestResume(sched_a)
(build_request, event) = event_queue.get(timeout=30) (build_request, event) = event_queue.get(timeout=30)
self.assertEqual(build_request, a) self.assertEqual(build_request, a)
self.assertEqual(event, BuildRequestEvent.RESUMED) self.assertEqual(event, JobRequestEvent.RESUMED)
# Executor resumes build # Executor resumes build
a.state = BuildRequest.RUNNING a.state = BuildRequest.RUNNING
@ -435,7 +437,7 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
client.requestCancel(sched_a) client.requestCancel(sched_a)
(build_request, event) = event_queue.get(timeout=30) (build_request, event) = event_queue.get(timeout=30)
self.assertEqual(build_request, a) self.assertEqual(build_request, a)
self.assertEqual(event, BuildRequestEvent.CANCELED) self.assertEqual(event, JobRequestEvent.CANCELED)
# Executor aborts build # Executor aborts build
a.state = BuildRequest.COMPLETED a.state = BuildRequest.COMPLETED
@ -450,12 +452,14 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# Scheduler removes build request on completion # Scheduler removes build request on completion
client.remove(sched_a) client.remove(sched_a)
self.assertEqual(set(self._get_zk_tree(client.BUILD_REQUEST_ROOT)), self.assertEqual(set(self._get_zk_tree('/zuul/executor')),
set(['/zuul/build-requests/unzoned', set(['/zuul/executor/unzoned',
'/zuul/build-requests/zones'])) '/zuul/executor/unzoned/locks',
self.assertEqual(self._get_zk_tree( '/zuul/executor/unzoned/params',
client.BUILD_REQUEST_ROOT + '/zones'), []) '/zuul/executor/unzoned/requests',
self.assertEqual(self._get_zk_tree(client.LOCK_ROOT), []) '/zuul/executor/unzoned/result-data',
'/zuul/executor/unzoned/results',
'/zuul/executor/unzoned/waiters']))
self.assertEqual(self._get_watches(), {}) self.assertEqual(self._get_watches(), {})
def test_build_request_remove(self): def test_build_request_remove(self):
@ -478,7 +482,8 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
build_event_callback=eq_put) build_event_callback=eq_put)
# Scheduler submits request # Scheduler submits request
client.submit("A", "tenant", "pipeline", {}, None, '1') request = BuildRequest("A", None, "tenant", "pipeline", '1')
client.submit(request, {})
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Executor receives request # Executor receives request
@ -505,7 +510,7 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# Make sure it shows up as deleted # Make sure it shows up as deleted
(build_request, event) = event_queue.get(timeout=30) (build_request, event) = event_queue.get(timeout=30)
self.assertEqual(build_request, a) self.assertEqual(build_request, a)
self.assertEqual(event, BuildRequestEvent.DELETED) self.assertEqual(event, JobRequestEvent.DELETED)
# Executor should not write anything else since the request # Executor should not write anything else since the request
# was deleted. # was deleted.
@ -530,7 +535,8 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
build_event_callback=eq_put) build_event_callback=eq_put)
# Scheduler submits request # Scheduler submits request
a_path = client.submit("A", "tenant", "pipeline", {}, None, '1') request = BuildRequest("A", None, "tenant", "pipeline", '1')
client.submit(request, {})
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Executor receives nothing # Executor receives nothing
@ -538,7 +544,7 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
self.assertEqual(len(reqs), 0) self.assertEqual(len(reqs), 0)
# Test releases hold # Test releases hold
a = client.get(a_path) a = client.get(request.path)
self.assertEqual(a.uuid, 'A') self.assertEqual(a.uuid, 'A')
a.state = BuildRequest.REQUESTED a.state = BuildRequest.REQUESTED
client.update(a) client.update(a)
@ -566,15 +572,16 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
client = ExecutorApi(self.zk_client) client = ExecutorApi(self.zk_client)
# Scheduler submits request # Scheduler submits request
a_path = client.submit("A", "tenant", "pipeline", {}, None, '1') request = BuildRequest("A", None, "tenant", "pipeline", '1')
sched_a = client.get(a_path) client.submit(request, {})
sched_a = client.get(request.path)
# Simulate the server side # Simulate the server side
server = ExecutorApi(self.zk_client, server = ExecutorApi(self.zk_client,
build_request_callback=rq_put, build_request_callback=rq_put,
build_event_callback=eq_put) build_event_callback=eq_put)
exec_a = server.get(a_path) exec_a = server.get(request.path)
client.remove(sched_a) client.remove(sched_a)
# Try to lock a request that was just removed # Try to lock a request that was just removed
@ -585,15 +592,24 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# requests # requests
executor_api = ExecutorApi(self.zk_client) executor_api = ExecutorApi(self.zk_client)
executor_api.submit("A", "tenant", "pipeline", {}, "zone", '1') br = BuildRequest("A", "zone", "tenant", "pipeline", '1')
path_b = executor_api.submit("B", "tenant", "pipeline", {}, executor_api.submit(br, {})
None, '1')
path_c = executor_api.submit("C", "tenant", "pipeline", {}, br = BuildRequest("B", None, "tenant", "pipeline", '1')
"zone", '1') executor_api.submit(br, {})
path_d = executor_api.submit("D", "tenant", "pipeline", {}, path_b = br.path
"zone", '1')
path_e = executor_api.submit("E", "tenant", "pipeline", {}, br = BuildRequest("C", "zone", "tenant", "pipeline", '1')
"zone", '1') executor_api.submit(br, {})
path_c = br.path
br = BuildRequest("D", "zone", "tenant", "pipeline", '1')
executor_api.submit(br, {})
path_d = br.path
br = BuildRequest("E", "zone", "tenant", "pipeline", '1')
executor_api.submit(br, {})
path_e = br.path
b = executor_api.get(path_b) b = executor_api.get(path_b)
c = executor_api.get(path_c) c = executor_api.get(path_c)
@ -619,15 +635,16 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# DataWatch might be triggered for the correct event, but the cache # DataWatch might be triggered for the correct event, but the cache
# might still be outdated as the DataWatch that updates the cache # might still be outdated as the DataWatch that updates the cache
# itself wasn't triggered yet. # itself wasn't triggered yet.
cache = executor_api._cached_build_requests b_cache = executor_api.zone_queues[None]._cached_requests
e_cache = executor_api.zone_queues['zone']._cached_requests
for _ in iterate_timeout(30, "cache to be up-to-date"): for _ in iterate_timeout(30, "cache to be up-to-date"):
if (cache[path_b].state == BuildRequest.RUNNING and if (b_cache[path_b].state == BuildRequest.RUNNING and
cache[path_e].state == BuildRequest.PAUSED): e_cache[path_e].state == BuildRequest.PAUSED):
break break
# The lost_builds method should only return builds which are running or # The lost_builds method should only return builds which are running or
# paused, but not locked by any executor, in this case build b and e. # paused, but not locked by any executor, in this case build b and e.
lost_build_requests = list(executor_api.lostBuildRequests()) lost_build_requests = list(executor_api.lostRequests())
self.assertEqual(2, len(lost_build_requests)) self.assertEqual(2, len(lost_build_requests))
self.assertEqual(b.path, lost_build_requests[0].path) self.assertEqual(b.path, lost_build_requests[0].path)
@ -636,17 +653,17 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# Test cleaning up orphaned request parameters # Test cleaning up orphaned request parameters
executor_api = ExecutorApi(self.zk_client) executor_api = ExecutorApi(self.zk_client)
path_a = executor_api.submit( br = BuildRequest("A", "zone", "tenant", "pipeline", '1')
"A", "tenant", "pipeline", {}, "zone", '1') executor_api.submit(br, {})
params_root = executor_api.BUILD_PARAMS_ROOT params_root = executor_api.zone_queues['zone'].PARAM_ROOT
self.assertEqual(len(executor_api._getAllBuildIds()), 1) self.assertEqual(len(executor_api._getAllRequestIds()), 1)
self.assertEqual(len( self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 1) self.zk_client.client.get_children(params_root)), 1)
# Delete the request but not the params # Delete the request but not the params
self.zk_client.client.delete(path_a) self.zk_client.client.delete(br.path)
self.assertEqual(len(executor_api._getAllBuildIds()), 0) self.assertEqual(len(executor_api._getAllRequestIds()), 0)
self.assertEqual(len( self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 1) self.zk_client.client.get_children(params_root)), 1)
@ -673,7 +690,8 @@ class TestExecutorApi(ZooKeeperBaseTestCase):
# Simulate the client side # Simulate the client side
client = ExecutorApi(self.zk_client) client = ExecutorApi(self.zk_client)
client.submit("A", "tenant", "pipeline", {}, None, '1') client.submit(
BuildRequest("A", None, "tenant", "pipeline", '1'), {})
# Simulate the server side # Simulate the server side
server = ExecutorApi(self.zk_client, server = ExecutorApi(self.zk_client,
@ -720,14 +738,13 @@ class TestMergerApi(ZooKeeperBaseTestCase):
sessions = None sessions = None
return ret return ret
def _assertEmptyRoots(self): def _assertEmptyRoots(self, client):
api = MergerApi self.assertEqual(self._get_zk_tree(client.REQUEST_ROOT), [])
self.assertEqual(self._get_zk_tree(api.MERGE_REQUEST_ROOT), []) self.assertEqual(self._get_zk_tree(client.PARAM_ROOT), [])
self.assertEqual(self._get_zk_tree(api.MERGE_PARAMS_ROOT), []) self.assertEqual(self._get_zk_tree(client.RESULT_ROOT), [])
self.assertEqual(self._get_zk_tree(api.MERGE_RESULT_ROOT), []) self.assertEqual(self._get_zk_tree(client.RESULT_DATA_ROOT), [])
self.assertEqual(self._get_zk_tree(api.MERGE_RESULT_DATA_ROOT), []) self.assertEqual(self._get_zk_tree(client.WAITER_ROOT), [])
self.assertEqual(self._get_zk_tree(api.MERGE_WAITER_ROOT), []) self.assertEqual(self._get_zk_tree(client.LOCK_ROOT), [])
self.assertEqual(self._get_zk_tree(api.LOCK_ROOT), [])
self.assertEqual(self._get_watches(), {}) self.assertEqual(self._get_watches(), {})
def test_merge_request(self): def test_merge_request(self):
@ -746,15 +763,15 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
client.submit( request = MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) )
client.submit(request, payload)
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Merger receives request # Merger receives request
@ -762,10 +779,10 @@ class TestMergerApi(ZooKeeperBaseTestCase):
self.assertEqual(len(reqs), 1) self.assertEqual(len(reqs), 1)
a = reqs[0] a = reqs[0]
self.assertEqual(a.uuid, 'A') self.assertEqual(a.uuid, 'A')
params = client.getMergeParams(a) params = client.getParams(a)
self.assertEqual(params, payload) self.assertEqual(params, payload)
client.clearMergeParams(a) client.clearParams(a)
params = client.getMergeParams(a) params = client.getParams(a)
self.assertIsNone(params) self.assertIsNone(params)
# Merger locks request # Merger locks request
@ -782,7 +799,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
server.remove(a) server.remove(a)
server.unlock(a) server.unlock(a)
self._assertEmptyRoots() self._assertEmptyRoots(client)
def test_merge_request_hold(self): def test_merge_request_hold(self):
# Test that we can hold a merge request in "queue" # Test that we can hold a merge request in "queue"
@ -800,15 +817,14 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
client.submit( client.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Merger receives nothing # Merger receives nothing
@ -817,7 +833,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Test releases hold # Test releases hold
# We have to get a new merge_request object to update it. # We have to get a new merge_request object to update it.
a = client.get(f"{client.MERGE_REQUEST_ROOT}/A") a = client.get(f"{client.REQUEST_ROOT}/A")
self.assertEqual(a.uuid, 'A') self.assertEqual(a.uuid, 'A')
a.state = MergeRequest.REQUESTED a.state = MergeRequest.REQUESTED
client.update(a) client.update(a)
@ -831,7 +847,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
server.remove(a) server.remove(a)
# The rest is redundant. # The rest is redundant.
self._assertEmptyRoots() self._assertEmptyRoots(client)
def test_merge_request_result(self): def test_merge_request_result(self):
# Test the lifecycle of a merge request # Test the lifecycle of a merge request
@ -849,16 +865,14 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
future = client.submit( future = client.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
needs_result=True, ), payload, needs_result=True)
)
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Merger receives request # Merger receives request
@ -877,13 +891,13 @@ class TestMergerApi(ZooKeeperBaseTestCase):
result_data = {'result': 'ok'} result_data = {'result': 'ok'}
server.reportResult(a, result_data) server.reportResult(a, result_data)
self.assertEqual(set(self._get_zk_tree(client.MERGE_RESULT_ROOT)), self.assertEqual(set(self._get_zk_tree(client.RESULT_ROOT)),
set(['/zuul/merge-results/A'])) set(['/zuul/merger/results/A']))
self.assertEqual(set(self._get_zk_tree(client.MERGE_RESULT_DATA_ROOT)), self.assertEqual(set(self._get_zk_tree(client.RESULT_DATA_ROOT)),
set(['/zuul/merge-result-data/A', set(['/zuul/merger/result-data/A',
'/zuul/merge-result-data/A/0000000000'])) '/zuul/merger/result-data/A/0000000000']))
self.assertEqual(self._get_zk_tree(client.MERGE_WAITER_ROOT), self.assertEqual(self._get_zk_tree(client.WAITER_ROOT),
['/zuul/merge-waiters/A']) ['/zuul/merger/waiters/A'])
# Merger removes and unlocks merge request on completion # Merger removes and unlocks merge request on completion
server.remove(a) server.remove(a)
@ -893,7 +907,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
self.assertTrue(future.wait()) self.assertTrue(future.wait())
self.assertEqual(future.data, result_data) self.assertEqual(future.data, result_data)
self._assertEmptyRoots() self._assertEmptyRoots(client)
def test_lost_merge_request_params(self): def test_lost_merge_request_params(self):
# Test cleaning up orphaned request parameters # Test cleaning up orphaned request parameters
@ -901,18 +915,17 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
merger_api.submit( merger_api.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
path_a = '/'.join([merger_api.MERGE_REQUEST_ROOT, 'A']) path_a = '/'.join([merger_api.REQUEST_ROOT, 'A'])
params_root = merger_api.MERGE_PARAMS_ROOT params_root = merger_api.PARAM_ROOT
self.assertEqual(len(merger_api._getAllRequestIds()), 1) self.assertEqual(len(merger_api._getAllRequestIds()), 1)
self.assertEqual(len( self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 1) self.zk_client.client.get_children(params_root)), 1)
@ -928,7 +941,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
self.assertEqual(len( self.assertEqual(len(
self.zk_client.client.get_children(params_root)), 0) self.zk_client.client.get_children(params_root)), 0)
self._assertEmptyRoots() self._assertEmptyRoots(merger_api)
def test_lost_merge_request_result(self): def test_lost_merge_request_result(self):
# Test that we can clean up orphaned merge results # Test that we can clean up orphaned merge results
@ -946,16 +959,15 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
future = client.submit( future = client.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
needs_result=True, ), payload, needs_result=True)
)
request_queue.get(timeout=30) request_queue.get(timeout=30)
# Merger receives request # Merger receives request
@ -978,13 +990,13 @@ class TestMergerApi(ZooKeeperBaseTestCase):
server.remove(a) server.remove(a)
server.unlock(a) server.unlock(a)
self.assertEqual(set(self._get_zk_tree(client.MERGE_RESULT_ROOT)), self.assertEqual(set(self._get_zk_tree(client.RESULT_ROOT)),
set(['/zuul/merge-results/A'])) set(['/zuul/merger/results/A']))
self.assertEqual(set(self._get_zk_tree(client.MERGE_RESULT_DATA_ROOT)), self.assertEqual(set(self._get_zk_tree(client.RESULT_DATA_ROOT)),
set(['/zuul/merge-result-data/A', set(['/zuul/merger/result-data/A',
'/zuul/merge-result-data/A/0000000000'])) '/zuul/merger/result-data/A/0000000000']))
self.assertEqual(self._get_zk_tree(client.MERGE_WAITER_ROOT), self.assertEqual(self._get_zk_tree(client.WAITER_ROOT),
['/zuul/merge-waiters/A']) ['/zuul/merger/waiters/A'])
# Scheduler "disconnects" # Scheduler "disconnects"
self.zk_client.client.delete(future._waiter_path) self.zk_client.client.delete(future._waiter_path)
@ -992,7 +1004,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Find orphaned results # Find orphaned results
client.cleanup(age=0) client.cleanup(age=0)
self._assertEmptyRoots() self._assertEmptyRoots(client)
def test_nonexistent_lock(self): def test_nonexistent_lock(self):
request_queue = queue.Queue() request_queue = queue.Queue()
@ -1005,16 +1017,15 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Scheduler submits request # Scheduler submits request
payload = {'merge': 'test'} payload = {'merge': 'test'}
client.submit( client.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
client_a = client.get(f"{client.MERGE_REQUEST_ROOT}/A") client_a = client.get(f"{client.REQUEST_ROOT}/A")
# Simulate the server side # Simulate the server side
server = MergerApi(self.zk_client, server = MergerApi(self.zk_client,
@ -1025,7 +1036,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Try to lock a request that was just removed # Try to lock a request that was just removed
self.assertFalse(server.lock(server_a)) self.assertFalse(server.lock(server_a))
self._assertEmptyRoots() self._assertEmptyRoots(client)
def test_lost_merge_requests(self): def test_lost_merge_requests(self):
# Test that lostMergeRequests() returns unlocked running merge # Test that lostMergeRequests() returns unlocked running merge
@ -1033,46 +1044,42 @@ class TestMergerApi(ZooKeeperBaseTestCase):
merger_api = MergerApi(self.zk_client) merger_api = MergerApi(self.zk_client)
payload = {'merge': 'test'} payload = {'merge': 'test'}
merger_api.submit( merger_api.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
merger_api.submit( merger_api.submit(MergeRequest(
uuid='B', uuid='B',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='BB', build_set_uuid='BB',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
merger_api.submit( merger_api.submit(MergeRequest(
uuid='C', uuid='C',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='CC', build_set_uuid='CC',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
merger_api.submit( merger_api.submit(MergeRequest(
uuid='D', uuid='D',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='DD', build_set_uuid='DD',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
b = merger_api.get(f"{merger_api.MERGE_REQUEST_ROOT}/B") b = merger_api.get(f"{merger_api.REQUEST_ROOT}/B")
c = merger_api.get(f"{merger_api.MERGE_REQUEST_ROOT}/C") c = merger_api.get(f"{merger_api.REQUEST_ROOT}/C")
d = merger_api.get(f"{merger_api.MERGE_REQUEST_ROOT}/D") d = merger_api.get(f"{merger_api.REQUEST_ROOT}/D")
b.state = MergeRequest.RUNNING b.state = MergeRequest.RUNNING
merger_api.update(b) merger_api.update(b)
@ -1090,7 +1097,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# DataWatch might be triggered for the correct event, but the cache # DataWatch might be triggered for the correct event, but the cache
# might still be outdated as the DataWatch that updates the cache # might still be outdated as the DataWatch that updates the cache
# itself wasn't triggered yet. # itself wasn't triggered yet.
cache = merger_api._cached_merge_requests cache = merger_api._cached_requests
for _ in iterate_timeout(30, "cache to be up-to-date"): for _ in iterate_timeout(30, "cache to be up-to-date"):
if (cache[b.path].state == MergeRequest.RUNNING and if (cache[b.path].state == MergeRequest.RUNNING and
cache[c.path].state == MergeRequest.RUNNING): cache[c.path].state == MergeRequest.RUNNING):
@ -1098,7 +1105,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# The lost_merges method should only return merges which are running # The lost_merges method should only return merges which are running
# but not locked by any merger, in this case merge b # but not locked by any merger, in this case merge b
lost_merge_requests = list(merger_api.lostMergeRequests()) lost_merge_requests = list(merger_api.lostRequests())
self.assertEqual(1, len(lost_merge_requests)) self.assertEqual(1, len(lost_merge_requests))
self.assertEqual(b.path, lost_merge_requests[0].path) self.assertEqual(b.path, lost_merge_requests[0].path)
@ -1119,15 +1126,14 @@ class TestMergerApi(ZooKeeperBaseTestCase):
# Simulate the client side # Simulate the client side
client = MergerApi(self.zk_client) client = MergerApi(self.zk_client)
payload = {'merge': 'test'} payload = {'merge': 'test'}
client.submit( client.submit(MergeRequest(
uuid='A', uuid='A',
job_type=MergeRequest.MERGE, job_type=MergeRequest.MERGE,
build_set_uuid='AA', build_set_uuid='AA',
tenant_name='tenant', tenant_name='tenant',
pipeline_name='check', pipeline_name='check',
params=payload,
event_id='1', event_id='1',
) ), payload)
# Simulate the server side # Simulate the server side
server = MergerApi(self.zk_client, server = MergerApi(self.zk_client,
@ -1143,7 +1149,7 @@ class TestMergerApi(ZooKeeperBaseTestCase):
self.assertEqual(a.uuid, 'A') self.assertEqual(a.uuid, 'A')
client.remove(a) client.remove(a)
self._assertEmptyRoots() self._assertEmptyRoots(client)
class TestLocks(ZooKeeperBaseTestCase): class TestLocks(ZooKeeperBaseTestCase):


@ -27,7 +27,7 @@ from zuul.model import (
) )
from zuul.zk.event_queues import PipelineResultEventQueue from zuul.zk.event_queues import PipelineResultEventQueue
from zuul.zk.executor import ExecutorApi from zuul.zk.executor import ExecutorApi
from zuul.zk.exceptions import BuildRequestNotFound from zuul.zk.exceptions import JobRequestNotFound
from kazoo.exceptions import BadVersionError from kazoo.exceptions import BadVersionError
@ -131,15 +131,16 @@ class ExecutorClient(object):
# Fall back to the default zone # Fall back to the default zone
executor_zone = None executor_zone = None
build.build_request_ref = self.executor_api.submit( request = BuildRequest(
uuid=uuid, uuid=uuid,
tenant_name=build.build_set.item.pipeline.tenant.name, tenant_name=build.build_set.item.pipeline.tenant.name,
pipeline_name=build.build_set.item.pipeline.name, pipeline_name=build.build_set.item.pipeline.name,
params=params,
zone=executor_zone, zone=executor_zone,
event_id=item.event.zuul_event_id, event_id=item.event.zuul_event_id,
precedence=PRIORITY_MAP[pipeline.precedence], precedence=PRIORITY_MAP[pipeline.precedence]
) )
self.executor_api.submit(request, params)
build.build_request_ref = request.path
def cancel(self, build): def cancel(self, build):
log = get_annotated_logger(self.log, build.zuul_event_id, log = get_annotated_logger(self.log, build.zuul_event_id,
@ -224,7 +225,7 @@ class ExecutorClient(object):
del self.builds[build.uuid] del self.builds[build.uuid]
def cleanupLostBuildRequests(self): def cleanupLostBuildRequests(self):
for build_request in self.executor_api.lostBuildRequests(): for build_request in self.executor_api.lostRequests():
try: try:
self.cleanupLostBuildRequest(build_request) self.cleanupLostBuildRequest(build_request)
except Exception: except Exception:
@ -241,7 +242,7 @@ class ExecutorClient(object):
build_request.state = BuildRequest.COMPLETED build_request.state = BuildRequest.COMPLETED
try: try:
self.executor_api.update(build_request) self.executor_api.update(build_request)
except BuildRequestNotFound as e: except JobRequestNotFound as e:
self.log.warning("Could not complete build: %s", str(e)) self.log.warning("Could not complete build: %s", str(e))
return return
except BadVersionError: except BadVersionError:


@ -66,8 +66,9 @@ import zuul.model
from zuul.nodepool import Nodepool from zuul.nodepool import Nodepool
from zuul.zk.event_queues import PipelineResultEventQueue from zuul.zk.event_queues import PipelineResultEventQueue
from zuul.zk.components import ExecutorComponent from zuul.zk.components import ExecutorComponent
from zuul.zk.exceptions import BuildRequestNotFound from zuul.zk.exceptions import JobRequestNotFound
from zuul.zk.executor import BuildRequestEvent, ExecutorApi from zuul.zk.executor import ExecutorApi
from zuul.zk.job_request_queue import JobRequestEvent
BUFFER_LINES_FOR_SYNTAX = 200 BUFFER_LINES_FOR_SYNTAX = 200
COMMANDS = ['stop', 'pause', 'unpause', 'graceful', 'verbose', COMMANDS = ['stop', 'pause', 'unpause', 'graceful', 'verbose',
@ -3511,13 +3512,13 @@ class ExecutorServer(BaseMergeServer):
# not we could avoid this ZK update. The cancel request can anyway # not we could avoid this ZK update. The cancel request can anyway
# only be fulfilled by the executor that executes the job. So, if # only be fulfilled by the executor that executes the job. So, if
# that executor died, no other can pick up the request. # that executor died, no other can pick up the request.
if build_event == BuildRequestEvent.CANCELED: if build_event == JobRequestEvent.CANCELED:
self.executor_api.fulfillCancel(build_request) self.executor_api.fulfillCancel(build_request)
self.stopJob(build_request) self.stopJob(build_request)
elif build_event == BuildRequestEvent.RESUMED: elif build_event == JobRequestEvent.RESUMED:
self.executor_api.fulfillResume(build_request) self.executor_api.fulfillResume(build_request)
self.resumeJob(build_request) self.resumeJob(build_request)
elif build_event == BuildRequestEvent.DELETED: elif build_event == JobRequestEvent.DELETED:
self.stopJob(build_request) self.stopJob(build_request)
def runBuildWorker(self): def runBuildWorker(self):
@ -3558,8 +3559,8 @@ class ExecutorServer(BaseMergeServer):
return return
build_request.state = BuildRequest.RUNNING build_request.state = BuildRequest.RUNNING
params = self.executor_api.getBuildParams(build_request) params = self.executor_api.getParams(build_request)
self.executor_api.clearBuildParams(build_request) self.executor_api.clearParams(build_request)
# Directly update the build in ZooKeeper, so we don't # Directly update the build in ZooKeeper, so we don't
# loop over and try to lock it again and again. # loop over and try to lock it again and again.
self.executor_api.update(build_request) self.executor_api.update(build_request)
@ -3809,7 +3810,7 @@ class ExecutorServer(BaseMergeServer):
build_request.state = BuildRequest.PAUSED build_request.state = BuildRequest.PAUSED
try: try:
self.executor_api.update(build_request) self.executor_api.update(build_request)
except BuildRequestNotFound as e: except JobRequestNotFound as e:
self.log.warning("Could not pause build: %s", str(e)) self.log.warning("Could not pause build: %s", str(e))
return return
@ -3821,7 +3822,7 @@ class ExecutorServer(BaseMergeServer):
build_request.state = BuildRequest.RUNNING build_request.state = BuildRequest.RUNNING
try: try:
self.executor_api.update(build_request) self.executor_api.update(build_request)
except BuildRequestNotFound as e: except JobRequestNotFound as e:
self.log.warning("Could not resume build: %s", str(e)) self.log.warning("Could not resume build: %s", str(e))
return return
@ -3864,7 +3865,7 @@ class ExecutorServer(BaseMergeServer):
build_request.state = BuildRequest.COMPLETED build_request.state = BuildRequest.COMPLETED
try: try:
self.executor_api.update(build_request) self.executor_api.update(build_request)
except BuildRequestNotFound as e: except JobRequestNotFound as e:
self.log.warning("Could not complete build: %s", str(e)) self.log.warning("Could not complete build: %s", str(e))
return return


@ -19,7 +19,7 @@ from zuul.lib.config import get_default
from zuul.lib.logutil import get_annotated_logger from zuul.lib.logutil import get_annotated_logger
from zuul.model import MergeRequest, PRECEDENCE_HIGH, PRECEDENCE_NORMAL from zuul.model import MergeRequest, PRECEDENCE_HIGH, PRECEDENCE_NORMAL
from zuul.zk.merger import MergerApi from zuul.zk.merger import MergerApi
from zuul.zk.exceptions import MergeRequestNotFound from zuul.zk.exceptions import JobRequestNotFound
from kazoo.exceptions import BadVersionError from kazoo.exceptions import BadVersionError
@ -64,17 +64,17 @@ class MergeClient(object):
log = get_annotated_logger(self.log, event) log = get_annotated_logger(self.log, event)
log.debug("Submitting job %s with data %s", uuid, data) log.debug("Submitting job %s with data %s", uuid, data)
return self.merger_api.submit( request = MergeRequest(
uuid=uuid, uuid=uuid,
job_type=job_type, job_type=job_type,
build_set_uuid=build_set_uuid, build_set_uuid=build_set_uuid,
tenant_name=tenant_name, tenant_name=tenant_name,
pipeline_name=pipeline_name, pipeline_name=pipeline_name,
params=data,
event_id=event.zuul_event_id if event else None, event_id=event.zuul_event_id if event else None,
precedence=precedence, precedence=precedence
needs_result=needs_result,
) )
return self.merger_api.submit(request, data,
needs_result=needs_result)
def mergeChanges(self, items, build_set, files=None, dirs=None, def mergeChanges(self, items, build_set, files=None, dirs=None,
repo_state=None, precedence=PRECEDENCE_NORMAL, repo_state=None, precedence=PRECEDENCE_NORMAL,
@ -128,7 +128,7 @@ class MergeClient(object):
return job return job
def cleanupLostMergeRequests(self): def cleanupLostMergeRequests(self):
for merge_request in self.merger_api.lostMergeRequests(): for merge_request in self.merger_api.lostRequests():
try: try:
self.cleanupLostMergeRequest(merge_request) self.cleanupLostMergeRequest(merge_request)
except Exception: except Exception:
@ -142,7 +142,7 @@ class MergeClient(object):
# TODO (felix): If we want to optimize ZK requests, we could only # TODO (felix): If we want to optimize ZK requests, we could only
# call the remove() here. # call the remove() here.
self.merger_api.remove(merge_request) self.merger_api.remove(merge_request)
except MergeRequestNotFound as e: except JobRequestNotFound as e:
self.log.warning("Could not complete merge: %s", str(e)) self.log.warning("Could not complete merge: %s", str(e))
return return
except BadVersionError: except BadVersionError:


@ -97,9 +97,6 @@ class BaseMergeServer(metaclass=ABCMeta):
) )
self.merger_loop_wake_event = threading.Event() self.merger_loop_wake_event = threading.Event()
self.merger_cleanup_election = self.zk_client.client.Election(
f"{MergerApi.MERGE_REQUEST_ROOT}/election"
)
self.merger_api = MergerApi( self.merger_api = MergerApi(
self.zk_client, self.zk_client,
@ -172,7 +169,6 @@ class BaseMergeServer(metaclass=ABCMeta):
self.log.debug('Stopping merger') self.log.debug('Stopping merger')
self._merger_running = False self._merger_running = False
self.merger_loop_wake_event.set() self.merger_loop_wake_event.set()
self.merger_cleanup_election.cancel()
self.zk_client.disconnect() self.zk_client.disconnect()
def join(self): def join(self):
@ -215,8 +211,8 @@ class BaseMergeServer(metaclass=ABCMeta):
return return
merge_request.state = MergeRequest.RUNNING merge_request.state = MergeRequest.RUNNING
params = self.merger_api.getMergeParams(merge_request) params = self.merger_api.getParams(merge_request)
self.merger_api.clearMergeParams(merge_request) self.merger_api.clearParams(merge_request)
# Directly update the merge request in ZooKeeper, so we don't loop over # Directly update the merge request in ZooKeeper, so we don't loop over
# and try to lock it again and again. # and try to lock it again and again.
self.merger_api.update(merge_request) self.merger_api.update(merge_request)


@ -2069,130 +2069,31 @@ class JobGraph(object):
@total_ordering @total_ordering
class MergeRequest: class JobRequest:
# States: # States:
UNSUBMITTED = "unsubmitted"
REQUESTED = "requested" REQUESTED = "requested"
HOLD = "hold" # Used by tests to stall processing HOLD = "hold" # Used by tests to stall processing
RUNNING = "running" RUNNING = "running"
COMPLETED = "completed" COMPLETED = "completed"
ALL_STATES = (REQUESTED, HOLD, RUNNING, COMPLETED) ALL_STATES = (UNSUBMITTED, REQUESTED, HOLD, RUNNING, COMPLETED)
# Types: def __init__(self, uuid, precedence=None, state=None, result_path=None):
MERGE = "merge"
CAT = "cat"
REF_STATE = "refstate"
FILES_CHANGES = "fileschanges"
def __init__(
self,
uuid,
state,
job_type,
precedence,
build_set_uuid,
tenant_name,
pipeline_name,
event_id
):
self.uuid = uuid self.uuid = uuid
self.state = state if precedence is None:
self.job_type = job_type self.precedence = 0
self.precedence = precedence
self.build_set_uuid = build_set_uuid
self.tenant_name = tenant_name
self.pipeline_name = pipeline_name
self.event_id = event_id
# Path to the future result if requested
self.result_path = None
# ZK related data
self.path = None
self._zstat = None
self.lock = None
def toDict(self):
return {
"uuid": self.uuid,
"state": self.state,
"job_type": self.job_type,
"precedence": self.precedence,
"build_set_uuid": self.build_set_uuid,
"tenant_name": self.tenant_name,
"pipeline_name": self.pipeline_name,
"result_path": self.result_path,
"event_id": self.event_id,
}
@classmethod
def fromDict(cls, data):
job = cls(
data["uuid"],
data["state"],
data["job_type"],
data["precedence"],
data["build_set_uuid"],
data["tenant_name"],
data["pipeline_name"],
data["event_id"],
)
job.result_path = data.get("result_path")
return job
def __lt__(self, other):
# Sort jobs by precedence and their creation time in ZooKeeper in
# ascending order to prevent older jobs from starving.
if self.precedence == other.precedence:
if self._zstat and other._zstat:
return self._zstat.ctime < other._zstat.ctime
return self.uuid < other.uuid
return self.precedence < other.precedence
def __eq__(self, other):
same_prec = self.precedence == other.precedence
if self._zstat and other._zstat:
same_ctime = self._zstat.ctime == other._zstat.ctime
else: else:
same_ctime = self.uuid == other.uuid
return same_prec and same_ctime
def __repr__(self):
return (
f"<MergeRequest {self.uuid}, job_type={self.job_type}, "
f"state={self.state}, path={self.path}>"
)
@total_ordering
class BuildRequest:
"""A request for a build in a specific zone"""
# States:
# Waiting
REQUESTED = 'requested'
HOLD = 'hold' # Used by tests to stall processing
# Running
RUNNING = 'running'
PAUSED = 'paused'
# Finished
COMPLETED = 'completed'
ALL_STATES = (REQUESTED, HOLD, RUNNING, PAUSED, COMPLETED)
def __init__(self, uuid, state, precedence, zone,
tenant_name, pipeline_name, event_id):
self.uuid = uuid
self.state = state
self.precedence = precedence self.precedence = precedence
self.zone = zone
self.tenant_name = tenant_name
self.pipeline_name = pipeline_name
self.event_id = event_id
# ZK related data if state is None:
self.state = self.UNSUBMITTED
else:
self.state = state
# Path to the future result if requested
self.result_path = result_path
# ZK related data not serialized
self.path = None self.path = None
self._zstat = None self._zstat = None
self.lock = None self.lock = None
@ -2202,34 +2103,27 @@ class BuildRequest:
"uuid": self.uuid, "uuid": self.uuid,
"state": self.state, "state": self.state,
"precedence": self.precedence, "precedence": self.precedence,
"zone": self.zone, "result_path": self.result_path,
"tenant_name": self.tenant_name,
"pipeline_name": self.pipeline_name,
"event_id": self.event_id,
} }
@classmethod @classmethod
def fromDict(cls, data): def fromDict(cls, data):
build_request = cls( return cls(
data["uuid"], data["uuid"],
data["state"], precedence=data["precedence"],
data["precedence"], state=data["state"],
data["zone"], result_path=data["result_path"]
data["tenant_name"],
data["pipeline_name"],
data["event_id"],
) )
return build_request
def __lt__(self, other): def __lt__(self, other):
# Sort build requests by precedence and their creation time in # Sort requests by precedence and their creation time in
# ZooKeeper in ascending order to prevent older builds from starving. # ZooKeeper in ascending order to prevent older requests from
# starving.
if self.precedence == other.precedence: if self.precedence == other.precedence:
if self._zstat and other._zstat: if self._zstat and other._zstat:
return self._zstat.ctime < other._zstat.ctime return self._zstat.ctime < other._zstat.ctime
# NOTE (felix): As the _zstat should always be set when retrieving # NOTE (felix): As the _zstat should always be set when retrieving
# the build request from ZooKeeper, this branch shouldn't matter # the request from ZooKeeper, this branch shouldn't matter
# much. It's just there, because the _zstat could - theoretically - # much. It's just there, because the _zstat could - theoretically -
# be None. # be None.
return self.uuid < other.uuid return self.uuid < other.uuid
@ -2241,9 +2135,103 @@ class BuildRequest:
same_ctime = self._zstat.ctime == other._zstat.ctime same_ctime = self._zstat.ctime == other._zstat.ctime
else: else:
same_ctime = self.uuid == other.uuid same_ctime = self.uuid == other.uuid
return same_prec and same_ctime return same_prec and same_ctime
def __repr__(self):
return (f"<JobRequest {self.uuid}, state={self.state}, "
f"path={self.path} zone={self.zone}>")
class MergeRequest(JobRequest):
# Types:
MERGE = "merge"
CAT = "cat"
REF_STATE = "refstate"
FILES_CHANGES = "fileschanges"
def __init__(self, uuid, job_type, build_set_uuid, tenant_name,
pipeline_name, event_id, precedence=None, state=None,
result_path=None):
super().__init__(uuid, precedence, state, result_path)
self.job_type = job_type
self.build_set_uuid = build_set_uuid
self.tenant_name = tenant_name
self.pipeline_name = pipeline_name
self.event_id = event_id
def toDict(self):
d = super().toDict()
d.update({
"job_type": self.job_type,
"build_set_uuid": self.build_set_uuid,
"tenant_name": self.tenant_name,
"pipeline_name": self.pipeline_name,
"event_id": self.event_id,
})
return d
@classmethod
def fromDict(cls, data):
return cls(
data["uuid"],
data["job_type"],
data["build_set_uuid"],
data["tenant_name"],
data["pipeline_name"],
data["event_id"],
precedence=data["precedence"],
state=data["state"],
result_path=data["result_path"]
)
def __repr__(self):
return (
f"<MergeRequest {self.uuid}, job_type={self.job_type}, "
f"state={self.state}, path={self.path}>"
)
class BuildRequest(JobRequest):
"""A request for a build in a specific zone"""
# States:
PAUSED = 'paused'
ALL_STATES = JobRequest.ALL_STATES + (PAUSED,) # type: ignore
def __init__(self, uuid, zone,
tenant_name, pipeline_name, event_id,
precedence=None, state=None, result_path=None):
super().__init__(uuid, precedence, state, result_path)
self.zone = zone
self.tenant_name = tenant_name
self.pipeline_name = pipeline_name
self.event_id = event_id
def toDict(self):
d = super().toDict()
d.update({
"zone": self.zone,
"tenant_name": self.tenant_name,
"pipeline_name": self.pipeline_name,
"event_id": self.event_id,
})
return d
@classmethod
def fromDict(cls, data):
return cls(
data["uuid"],
data["zone"],
data["tenant_name"],
data["pipeline_name"],
data["event_id"],
precedence=data["precedence"],
state=data["state"],
result_path=data["result_path"]
)
def __repr__(self): def __repr__(self):
return ( return (
f"<BuildRequest {self.uuid}, state={self.state}, " f"<BuildRequest {self.uuid}, state={self.state}, "


@ -441,9 +441,9 @@ class EventResultFuture(ZooKeeperSimpleBase):
return True return True
class MergerEventResultFuture(EventResultFuture): class JobResultFuture(EventResultFuture):
log = logging.getLogger("zuul.zk.event_queues.ManagementEventResultFuture") log = logging.getLogger("zuul.JobResultFuture")
def __init__(self, client, result_path, waiter_path): def __init__(self, client, result_path, waiter_path):
super().__init__(client, result_path) super().__init__(client, result_path)


@ -27,9 +27,5 @@ class NoClientException(ZuulZooKeeperException):
super().__init__("No zookeeper client!") super().__init__("No zookeeper client!")
class BuildRequestNotFound(ZuulZooKeeperException): class JobRequestNotFound(ZuulZooKeeperException):
pass
class MergeRequestNotFound(ZuulZooKeeperException):
pass pass


@ -12,513 +12,170 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
import time
import json
import logging import logging
from contextlib import suppress
from enum import Enum
from kazoo.exceptions import LockTimeout, NoNodeError from kazoo.exceptions import NoNodeError
from kazoo.protocol.states import EventType
from kazoo.recipe.lock import Lock
from zuul.lib.jsonutil import json_dumps from zuul.lib.collections import DefaultKeyDict
from zuul.lib.logutil import get_annotated_logger
from zuul.model import BuildRequest from zuul.model import BuildRequest
from zuul.zk.job_request_queue import JobRequestQueue


class ExecutorQueue(JobRequestQueue):
    log = logging.getLogger("zuul.ExecutorQueue")
    request_class = BuildRequest  # type: ignore

    def __init__(self, client, root,
                 initial_state_getter,
                 request_callback=None,
                 event_callback=None):
        self.log.debug("Creating executor queue at root %s", root)
        self._initial_state_getter = initial_state_getter
        super().__init__(client, root, request_callback, event_callback)

    @property
    def initial_state(self):
        # This supports holding requests in tests
        return self._initial_state_getter()

    def lostRequests(self):
        # Get a list of requests which are running but not locked by
        # any client.
        yield from filter(
            lambda b: not self.isLocked(b),
            self.inState(self.request_class.RUNNING,
                         self.request_class.PAUSED),
        )


class ExecutorApi:
    log = logging.getLogger("zuul.ExecutorApi")

    def __init__(self, client, zone_filter=None,
                 build_request_callback=None,
                 build_event_callback=None):
        self.client = client
        self.request_callback = build_request_callback
        self.event_callback = build_event_callback
        self.zone_filter = zone_filter
        self._watched_zones = set()
        self.root = '/zuul/executor'
        self.unzoned_root = f"{self.root}/unzoned"
        self.zones_root = f"{self.root}/zones"
        self.zone_queues = DefaultKeyDict(
            lambda zone: ExecutorQueue(
                self.client,
                self._getZoneRoot(zone),
                self._getInitialState,
                self.request_callback,
                self.event_callback))

        if zone_filter is None:
            self.registerAllZones()
        else:
            for zone in zone_filter:
                # For the side effect of creating a queue
                self.zone_queues[zone]

    def _getInitialState(self):
        return BuildRequest.REQUESTED

    def _getZoneRoot(self, zone):
        if zone is None:
            return self.unzoned_root
        else:
            return f"{self.zones_root}/{zone}"

    def registerAllZones(self):
        # Register a child watch that listens to new zones and automatically
        # registers to them.
        def watch_zones(children):
            for zone in children:
                # For the side effect of creating a queue
                self.zone_queues[zone]

        self.client.client.ChildrenWatch(self.zones_root, watch_zones)
        # For the side effect of creating a queue
        self.zone_queues[None]


from zuul.zk import ZooKeeperSimpleBase
from zuul.zk.exceptions import BuildRequestNotFound
from zuul.zk import sharding
from zuul.zk.vendor.watchers import ExistingDataWatch


class BuildRequestEvent(Enum):
    CREATED = 0
    UPDATED = 1
    RESUMED = 2
    CANCELED = 3
    DELETED = 4


class ExecutorApi(ZooKeeperSimpleBase):
    BUILD_REQUEST_ROOT = "/zuul/build-requests"
    BUILD_PARAMS_ROOT = "/zuul/build-params"
    LOCK_ROOT = "/zuul/build-request-locks"

    log = logging.getLogger("zuul.zk.executor.ExecutorApi")

    def __init__(self, client, zone_filter=None,
                 build_request_callback=None,
                 build_event_callback=None):
        super().__init__(client)

        self.zone_filter = zone_filter
        self._watched_zones = set()
        self.build_request_callback = build_request_callback
        self.build_event_callback = build_event_callback

        # path -> build request
        self._cached_build_requests = {}

        self.kazoo_client.ensure_path(self.BUILD_PARAMS_ROOT)
        if zone_filter is None:
            self.registerAllZones()
        else:
            for zone in zone_filter:
                self.registerZone(zone)

    @property
    def initial_state(self):
        # This supports holding build requests in tests
        return BuildRequest.REQUESTED

    def _getZoneRoot(self, zone):
        if zone is None:
            return "/".join([self.BUILD_REQUEST_ROOT, 'unzoned'])
        else:
            return "/".join([self.BUILD_REQUEST_ROOT, 'zones', zone])

    def registerZone(self, zone):
        if zone in self._watched_zones:
            return
        zone_root = self._getZoneRoot(zone)
        self.log.debug("Registering for zone %s at %s", zone, zone_root)
        self.kazoo_client.ensure_path(zone_root)

        self.kazoo_client.ChildrenWatch(
            zone_root, self._makeBuildRequestWatcher(zone_root),
            send_event=True
        )
        self._watched_zones.add(zone)

    def registerAllZones(self):
        self.kazoo_client.ensure_path(self.BUILD_REQUEST_ROOT)

        # Register a child watch that listens to new zones and automatically
        # registers to them.
        def watch_zones(children):
            for zone in children:
                self.registerZone(zone)

        zones_root = "/".join([self.BUILD_REQUEST_ROOT, 'zones'])
        self.kazoo_client.ensure_path(zones_root)
        self.kazoo_client.ChildrenWatch(zones_root, watch_zones)

        self.registerZone(None)
def _makeBuildStateWatcher(self, path):
def watch(data, stat, event=None):
return self._watchBuildState(path, data, stat, event)
return watch
def _watchBuildState(self, path, data, stat, event=None):
if not event or event.type == EventType.CHANGED:
# Don't process change events w/o any data. This can happen when
# a "slow" change watch tried to retrieve the data of a znode that
# was deleted in the meantime.
if data is None:
return
# As we already get the data and the stat value, we can directly
# use it without asking ZooKeeper for the data again.
content = self._bytesToDict(data)
if not content:
return
# We need this one for the HOLD -> REQUESTED check further down
old_build_request = self._cached_build_requests.get(path)
build_request = BuildRequest.fromDict(content)
build_request.path = path
build_request._zstat = stat
self._cached_build_requests[path] = build_request
# NOTE (felix): This is a test-specific condition: For test cases
# which are using hold_jobs_in_queue the state change on the build
# request from HOLD to REQUESTED is done outside of the executor.
# Thus, we must also set the wake event (the callback) so the
# executor can pick up those builds after they are released. To not
# cause a thundering herd problem in production for each cache
# update, the callback is only called under this very specific
# condition that can only occur in the tests.
if (
self.build_request_callback
and old_build_request
and old_build_request.state == BuildRequest.HOLD
and build_request.state == BuildRequest.REQUESTED
):
self.build_request_callback()
elif event.type == EventType.DELETED:
build_request = self._cached_build_requests.get(path)
with suppress(KeyError):
del self._cached_build_requests[path]
if build_request and self.build_event_callback:
self.build_event_callback(
build_request, BuildRequestEvent.DELETED
)
# Return False to stop the datawatch as the build got deleted.
return False
def _makeBuildRequestWatcher(self, path):
def watch(build_requests, event=None):
return self._watchBuildRequests(path, build_requests, event)
return watch
def _watchBuildRequests(self, path, build_requests, event=None):
# The build_requests list always contains all active children. Thus, we
# first have to find the new ones by calculating the delta between the
# build_requests list and our current cache entries.
# NOTE (felix): We could also use this list to determine the deleted
# build requests, but it's easier to do this in the DataWatch for the
# single build request instead. Otherwise we have to deal with race
# conditions between the children and the data watch as one watch might
# update a cache entry while the other tries to remove it.
build_request_paths = {
f"{path}/{uuid}" for uuid in build_requests
}
new_build_requests = build_request_paths - set(
self._cached_build_requests.keys()
)
for req_path in new_build_requests:
ExistingDataWatch(self.kazoo_client,
req_path,
self._makeBuildStateWatcher(req_path))
# Notify the user about new build requests if a callback is provided,
# but only if there are new requests (we don't want to fire on the
# initial callback from kazoo from registering the datawatch).
if new_build_requests and self.build_request_callback:
self.build_request_callback()
def _iterBuildRequests(self):
# As the entries in the cache dictionary are added and removed via
# data and children watches, we can't simply iterate over it in here,
# as the values might change during iteration.
for key in list(self._cached_build_requests.keys()):
try:
build_request = self._cached_build_requests[key]
except KeyError:
continue
yield build_request
def inState(self, *states):
if not states:
# If no states are provided, build a tuple containing all available
# ones to always match. We need a tuple to be compliant to the
# type of *states above.
states = BuildRequest.ALL_STATES
build_requests = list(
filter(lambda b: b.state in states, self._iterBuildRequests())
)
# Sort the list of builds by precedence and their creation time in
# ZooKeeper in ascending order to prevent older builds from starving.
return (b for b in sorted(build_requests))
def next(self):
yield from self.inState(BuildRequest.REQUESTED)
def submit(self, uuid, tenant_name, pipeline_name, params, zone,
event_id, precedence=200):
log = get_annotated_logger(self.log, event=None, build=uuid)
zone_root = self._getZoneRoot(zone)
path = "/".join([zone_root, uuid])
build_request = BuildRequest(
uuid,
self.initial_state,
precedence,
zone,
tenant_name,
pipeline_name,
event_id,
)
log.debug("Submitting build request to ZooKeeper %s", build_request)
self.kazoo_client.ensure_path(zone_root)
params_path = self._getParamsPath(uuid)
with sharding.BufferedShardWriter(
self.kazoo_client, params_path) as stream:
stream.write(self._dictToBytes(params))
return self.kazoo_client.create(
path, self._dictToBytes(build_request.toDict()))
# We use child nodes here so that we don't need to lock the build
# request node.
def requestResume(self, build_request):
self.kazoo_client.ensure_path(f"{build_request.path}/resume")
def requestCancel(self, build_request):
self.kazoo_client.ensure_path(f"{build_request.path}/cancel")
def fulfillResume(self, build_request):
self.kazoo_client.delete(f"{build_request.path}/resume")
def fulfillCancel(self, build_request):
self.kazoo_client.delete(f"{build_request.path}/cancel")
def update(self, build_request):
log = get_annotated_logger(
self.log, event=None, build=build_request.uuid
)
log.debug("Updating build request %s", build_request)
if build_request._zstat is None:
log.debug(
"Cannot update build request %s: Missing version information.",
build_request.uuid,
)
return
try:
zstat = self.kazoo_client.set(
build_request.path,
self._dictToBytes(build_request.toDict()),
version=build_request._zstat.version,
)
# Update the zstat on the item after updating the ZK node
build_request._zstat = zstat
except NoNodeError:
raise BuildRequestNotFound(
f"Could not update {build_request.path}"
)
def get(self, path):
"""Get a build request
Note: do not mix get with iteration; iteration returns cached
BuildRequests while get returns a newly created object each
time. If you lock a BuildRequest, you must use the same
object to unlock it.
"""
try:
data, zstat = self.kazoo_client.get(path)
except NoNodeError:
return None
if not data:
return None
content = self._bytesToDict(data)
build_request = BuildRequest.fromDict(content)
build_request.path = path
build_request._zstat = zstat
return build_request
def remove(self, build_request):
log = get_annotated_logger(
self.log, event=None, build=build_request.uuid
)
log.debug("Removing build request %s", build_request)
try:
# As the build node might contain children (result, data, ...) we
# must delete it recursively.
self.kazoo_client.delete(build_request.path, recursive=True)
except NoNodeError:
# Nothing to do if the node is already deleted
pass
self.clearBuildParams(build_request)
try:
# Delete the lock parent node as well.
path = "/".join([self.LOCK_ROOT, build_request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
def _watchBuildEvents(self, actions, event=None):
if event is None:
return
build_event = None
if "cancel" in actions:
build_event = BuildRequestEvent.CANCELED
elif "resume" in actions:
build_event = BuildRequestEvent.RESUMED
if build_event and self.build_event_callback:
build_request = self._cached_build_requests.get(event.path)
self.build_event_callback(build_request, build_event)
def lock(self, build_request, blocking=True, timeout=None):
# Keep the lock nodes in a different path to keep the build request
# subnode structure clean. Otherwise, the lock node will be in between
# the cancel and resume requests.
path = "/".join([self.LOCK_ROOT, build_request.uuid])
have_lock = False
lock = None
try:
lock = Lock(self.kazoo_client, path)
have_lock = lock.acquire(blocking, timeout)
except LockTimeout:
have_lock = False
self.log.error(
"Timeout trying to acquire lock: %s", build_request.uuid
)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
if not have_lock:
return False
if not self.kazoo_client.exists(build_request.path):
lock.release()
self.log.error(
"Build not found for locking: %s", build_request.uuid
)
# We may have just re-created the lock parent node just
# after the scheduler deleted it; therefore we should
# (re-) delete it.
try:
# Delete the lock parent node as well.
path = "/".join([self.LOCK_ROOT, build_request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
return False
build_request.lock = lock
# Create the children watch to listen for cancel/resume actions on this
# build request.
self.kazoo_client.ChildrenWatch(
build_request.path, self._watchBuildEvents, send_event=True
)
return True
def unlock(self, build_request):
if build_request.lock is None:
self.log.warning(
"BuildRequest %s does not hold a lock", build_request
)
else:
build_request.lock.release()
build_request.lock = None
def isLocked(self, build_request):
path = "/".join([self.LOCK_ROOT, build_request.uuid])
lock = Lock(self.kazoo_client, path)
is_locked = len(lock.contenders()) > 0
return is_locked
def lostBuildRequests(self):
# Get a list of builds which are running but not locked by any executor
yield from filter(
lambda b: not self.isLocked(b),
self.inState(BuildRequest.RUNNING, BuildRequest.PAUSED),
)
    def _getAllZones(self):
        # Get a list of all zones without using the cache.
        try:
            # Get all available zones from ZooKeeper
            zones = self.kazoo_client.get_children(
                '/'.join([self.BUILD_REQUEST_ROOT, 'zones']))
            zones.append(None)
        except NoNodeError:
            zones = [None]
        return zones

    def _getAllBuildIds(self, zones=None):
        # Get a list of all build uuids without using the cache.
        if zones is None:
            zones = self._getAllZones()

        all_builds = set()
        for zone in zones:
            try:
                zone_path = self._getZoneRoot(zone)
                all_builds.update(self.kazoo_client.get_children(zone_path))
            except NoNodeError:
                # Skip this zone as it doesn't have any builds
                continue
        return all_builds

    def _findLostParams(self, age):
        # Get data nodes which are older than the specified age (we
        # don't want to delete nodes which are just being written
        # slowly).
        # Convert to MS
        now = int(time.time() * 1000)
        age = age * 1000
        data_nodes = dict()
        for data_id in self.kazoo_client.get_children(self.BUILD_PARAMS_ROOT):
            data_path = self._getParamsPath(data_id)
            data_zstat = self.kazoo_client.exists(data_path)
            if now - data_zstat.mtime > age:
                data_nodes[data_id] = data_path

        # If there are no candidate data nodes, we don't need to
        # filter them by known requests.
        if not data_nodes:
            return data_nodes.values()

        # Remove current request uuids
        for request_id in self._getAllBuildIds():
            if request_id in data_nodes:
                del data_nodes[request_id]

        # Return the paths
        return data_nodes.values()

    def cleanup(self, age=300):
        # Delete build request params which are not associated with
        # any current build requests. Note, this does not clean up
        # lost build requests themselves; the executor client takes
        # care of that.
        try:
            for path in self._findLostParams(age):
                try:
                    self.log.error("Removing build request params: %s", path)
                    self.kazoo_client.delete(path, recursive=True)
                except Exception:
                    self.log.exception(
                        "Unable to delete build request params %s", path)
        except Exception:
            self.log.exception(
                "Error cleaning up build request queue %s", self)

    @staticmethod
    def _bytesToDict(data):
        return json.loads(data.decode("utf-8"))

    @staticmethod
    def _dictToBytes(data):
        # The custom json_dumps() will also serialize MappingProxyType objects
        return json_dumps(data).encode("utf-8")

    def _getParamsPath(self, build_uuid):
        return '/'.join([self.BUILD_PARAMS_ROOT, build_uuid])

    def clearBuildParams(self, build_request):
        """Erase the build parameters from ZK to save space"""
        self.kazoo_client.delete(self._getParamsPath(build_request.uuid),
                                 recursive=True)

    def getBuildParams(self, build_request):
        """Return the parameters for a build request, if they exist.
        Once a build request is accepted by an executor, the params
        may be erased from ZK; this will return None in that case.
        """
        with sharding.BufferedShardReader(
                self.kazoo_client,
                self._getParamsPath(build_request.uuid)) as stream:
            data = stream.read()
            if not data:
                return None
            return self._bytesToDict(data)


    def _getAllZones(self):
        # Get a list of all zones without using the cache.
        try:
            # Get all available zones from ZooKeeper
            zones = self.client.client.get_children(self.zones_root)
            zones.append(None)
        except NoNodeError:
            zones = [None]
        return zones

    # Override JobRequestQueue methods to accommodate the zone dict.

    def inState(self, *states):
        requests = []
        for queue in self.zone_queues.values():
            requests.extend(queue.inState(*states))
        return sorted(requests)

    def next(self):
        yield from self.inState(BuildRequest.REQUESTED)

    def submit(self, request, params):
        return self.zone_queues[request.zone].submit(request, params)

    def update(self, request):
        return self.zone_queues[request.zone].update(request)

    def reportResult(self, request, result):
        return self.zone_queues[request.zone].reportResult(request, result)

    def get(self, path):
        if path.startswith(self.zones_root):
            # The zone name is the first path component below zones_root.
            zone = path[len(self.zones_root):].strip("/").split("/")[0]
        else:
            zone = None
        return self.zone_queues[zone].get(path)

    def remove(self, request):
        return self.zone_queues[request.zone].remove(request)

    def requestResume(self, request):
        return self.zone_queues[request.zone].requestResume(request)

    def requestCancel(self, request):
        return self.zone_queues[request.zone].requestCancel(request)

    def fulfillResume(self, request):
        return self.zone_queues[request.zone].fulfillResume(request)

    def fulfillCancel(self, request):
        return self.zone_queues[request.zone].fulfillCancel(request)

    def lock(self, request, *args, **kw):
        return self.zone_queues[request.zone].lock(request, *args, **kw)

    def unlock(self, request):
        return self.zone_queues[request.zone].unlock(request)

    def isLocked(self, request):
        return self.zone_queues[request.zone].isLocked(request)

    def lostRequests(self):
        for queue in self.zone_queues.values():
            yield from queue.lostRequests()

    def cleanup(self, age=300):
        for queue in self.zone_queues.values():
            queue.cleanup(age)

    def clearParams(self, request):
        return self.zone_queues[request.zone].clearParams(request)

    def getParams(self, request):
        return self.zone_queues[request.zone].getParams(request)

    def _getAllRequestIds(self):
        ret = []
        for queue in self.zone_queues.values():
            ret.extend(queue._getAllRequestIds())
        return ret
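
# Illustrative sketch only (not part of this change): how a caller is
# expected to drive the zone-aware ExecutorApi above.  "zk_client",
# "request" and "params" are assumed to be supplied by the scheduler or
# executor code; every ExecutorApi call used here is defined above.  A
# request with zone=None lands under /zuul/executor/unzoned/requests/<uuid>,
# a zoned one under /zuul/executor/zones/<zone>/requests/<uuid>.
def _example_zone_dispatch(zk_client, request, params):
    api = ExecutorApi(zk_client)
    api.submit(request, params)        # request.zone selects the queue
    for req in api.next():             # merged, sorted view across all zones
        if api.lock(req, blocking=False):
            return req
    return None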


@ -0,0 +1,528 @@
# Copyright 2021 BMW Group
# Copyright 2021 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
import logging
import time
from contextlib import suppress
from enum import Enum
from kazoo.exceptions import LockTimeout, NoNodeError
from kazoo.protocol.states import EventType
from kazoo.recipe.lock import Lock
from zuul.lib.jsonutil import json_dumps
from zuul.lib.logutil import get_annotated_logger
from zuul.model import JobRequest
from zuul.zk import ZooKeeperSimpleBase, sharding
from zuul.zk.event_queues import JobResultFuture
from zuul.zk.exceptions import JobRequestNotFound
from zuul.zk.vendor.watchers import ExistingDataWatch
class JobRequestEvent(Enum):
CREATED = 0
UPDATED = 1
RESUMED = 2
CANCELED = 3
DELETED = 4
class JobRequestQueue(ZooKeeperSimpleBase):
log = logging.getLogger("zuul.JobRequestQueue")
request_class = JobRequest
def __init__(self, client, root,
request_callback=None, event_callback=None):
super().__init__(client)
self.REQUEST_ROOT = f"{root}/requests"
self.LOCK_ROOT = f"{root}/locks"
self.PARAM_ROOT = f"{root}/params"
self.RESULT_ROOT = f"{root}/results"
self.RESULT_DATA_ROOT = f"{root}/result-data"
self.WAITER_ROOT = f"{root}/waiters"
self.request_callback = request_callback
self.event_callback = event_callback
# path -> request
self._cached_requests = {}
self.kazoo_client.ensure_path(self.REQUEST_ROOT)
self.kazoo_client.ensure_path(self.PARAM_ROOT)
self.kazoo_client.ensure_path(self.RESULT_ROOT)
self.kazoo_client.ensure_path(self.RESULT_DATA_ROOT)
self.kazoo_client.ensure_path(self.WAITER_ROOT)
self.kazoo_client.ensure_path(self.LOCK_ROOT)
self.register()
@property
def initial_state(self):
# This supports holding requests in tests
return self.request_class.REQUESTED
def register(self):
# Register a child watch that listens for new requests
self.kazoo_client.ChildrenWatch(
self.REQUEST_ROOT,
self._makeRequestWatcher(self.REQUEST_ROOT),
send_event=True,
)
def _makeRequestWatcher(self, path):
def watch(requests, event=None):
return self._watchRequests(path, requests)
return watch
def _watchRequests(self, path, requests):
# The requests list always contains all active children. Thus,
# we first have to find the new ones by calculating the delta
# between the requests list and our current cache entries.
# NOTE (felix): We could also use this list to determine the
# deleted requests, but it's easier to do this in the
# DataWatch for the single request instead. Otherwise we have
# to deal with race conditions between the children and the
# data watch as one watch might update a cache entry while the
# other tries to remove it.
request_paths = {
f"{path}/{uuid}" for uuid in requests
}
new_requests = request_paths - set(
self._cached_requests.keys()
)
for req_path in new_requests:
ExistingDataWatch(self.kazoo_client,
req_path,
self._makeStateWatcher(req_path))
# Notify the user about new requests if a callback is provided.
# When we register the data watch, we will receive an initial
# callback immediately. The list of children may be empty in
# that case, so we should not fire our callback since there
# are no requests to handle.
if new_requests and self.request_callback:
self.request_callback()
def _makeStateWatcher(self, path):
def watch(data, stat, event=None):
return self._watchState(path, data, stat, event)
return watch
def _watchState(self, path, data, stat, event=None):
if not event or event.type == EventType.CHANGED:
# Don't process change events w/o any data. This can happen when a
# "slow" change watch tried to retrieve the data of a znode that
# was deleted in the meantime.
if not data:
return
# As we already get the data and the stat value, we can directly
# use it without asking ZooKeeper for the data again.
content = self._bytesToDict(data)
if not content:
return
# We need this one for the HOLD -> REQUESTED check further down
old_request = self._cached_requests.get(path)
request = self.request_class.fromDict(content)
request.path = path
request._zstat = stat
self._cached_requests[path] = request
# NOTE (felix): This is a test-specific condition: For test cases
# which are using hold_*_jobs_in_queue the state change on the
# request from HOLD to REQUESTED is done outside of the server.
# Thus, we must also set the wake event (the callback) so the
            # server can pick up those jobs after they are released. To not
# cause a thundering herd problem in production for each cache
# update, the callback is only called under this very specific
# condition that can only occur in the tests.
if (
self.request_callback
and old_request
and old_request.state == self.request_class.HOLD
and request.state == self.request_class.REQUESTED
):
self.request_callback()
elif event.type == EventType.DELETED:
request = self._cached_requests.get(path)
with suppress(KeyError):
del self._cached_requests[path]
if request and self.event_callback:
self.event_callback(request, JobRequestEvent.DELETED)
# Return False to stop the datawatch as the build got deleted.
return False
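
    # Taken together, the two watch layers above work as follows: the
    # ChildrenWatch registered in register() notices new request znodes and
    # attaches an ExistingDataWatch to each of them; that data watch keeps
    # _cached_requests up to date on every state change and, once the znode
    # is deleted, drops the cache entry, fires the DELETED event and returns
    # False to cancel itself.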
def inState(self, *states):
if not states:
# If no states are provided, build a tuple containing all available
# ones to always match. We need a tuple to be compliant to the
# type of *states above.
states = self.request_class.ALL_STATES
requests = [
req for req in self._cached_requests.values()
if req.state in states
]
# Sort the list of requests by precedence and their creation time
# in ZooKeeper in ascending order to prevent older requests from
# starving.
return sorted(requests)
def next(self):
yield from self.inState(self.request_class.REQUESTED)
def submit(self, request, params, needs_result=False):
log = get_annotated_logger(self.log, event=request.event_id)
path = "/".join([self.REQUEST_ROOT, request.uuid])
request.path = path
assert isinstance(request, self.request_class)
assert request.state == self.request_class.UNSUBMITTED
request.state = self.initial_state
result = None
# If a result is needed, create the result_path with the same
# UUID and store it on the request, so the server can store
# the result there.
if needs_result:
result_path = "/".join(
[self.RESULT_ROOT, request.uuid]
)
waiter_path = "/".join(
[self.WAITER_ROOT, request.uuid]
)
self.kazoo_client.create(waiter_path, ephemeral=True)
result = JobResultFuture(self.client, result_path, waiter_path)
request.result_path = result_path
log.debug("Submitting job request to ZooKeeper %s", request)
params_path = self._getParamsPath(request.uuid)
with sharding.BufferedShardWriter(
self.kazoo_client, params_path
) as stream:
stream.write(self._dictToBytes(params))
self.kazoo_client.create(path, self._dictToBytes(request.toDict()))
return result
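
    # Illustrative sketch only (assumed usage): a submitter that needs the
    # outcome asks for a future and blocks on it.  The wait()/data interface
    # of JobResultFuture is an assumption based on how result futures are
    # consumed elsewhere in Zuul; "request" and "params" come from the
    # caller.
    #
    #     future = queue.submit(request, params, needs_result=True)
    #     if future and future.wait(timeout=300):
    #         handle_result(future.data)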
def update(self, request):
log = get_annotated_logger(
self.log, event=request.event_id, build=request.uuid
)
log.debug("Updating request %s", request)
if request._zstat is None:
log.debug(
"Cannot update request %s: Missing version information.",
request.uuid,
)
return
try:
zstat = self.kazoo_client.set(
request.path,
self._dictToBytes(request.toDict()),
version=request._zstat.version,
)
# Update the zstat on the item after updating the ZK node
request._zstat = zstat
except NoNodeError:
raise JobRequestNotFound(
f"Could not update {request.path}"
)
def reportResult(self, request, result):
# Write the result data first since it may be multiple nodes.
result_data_path = "/".join(
[self.RESULT_DATA_ROOT, request.uuid]
)
with sharding.BufferedShardWriter(
self.kazoo_client, result_data_path) as stream:
stream.write(self._dictToBytes(result))
# Then write the result node to signify it's ready.
data = {'result_data_path': result_data_path}
self.kazoo_client.create(request.result_path,
self._dictToBytes(data))
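
    # Because the result znode is created only after the sharded payload has
    # been written completely, the future created in submit() (which is
    # wired to result_path) can never observe a partially written result;
    # the small result node merely points at the sharded data under
    # RESULT_DATA_ROOT.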
def get(self, path):
"""Get a request
Note: do not mix get with iteration; iteration returns cached
requests while get returns a newly created object each
time. If you lock a request, you must use the same object to
unlock it.
"""
try:
data, zstat = self.kazoo_client.get(path)
except NoNodeError:
return None
if not data:
return None
content = self._bytesToDict(data)
request = self.request_class.fromDict(content)
request.path = path
request._zstat = zstat
return request
def remove(self, request):
self.log.debug("Removing request %s", request)
try:
self.kazoo_client.delete(request.path, recursive=True)
except NoNodeError:
# Nothing to do if the node is already deleted
pass
self.clearParams(request)
try:
# Delete the lock parent node as well
path = "/".join([self.LOCK_ROOT, request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
# We use child nodes here so that we don't need to lock the
# request node.
def requestResume(self, request):
self.kazoo_client.ensure_path(f"{request.path}/resume")
def requestCancel(self, request):
self.kazoo_client.ensure_path(f"{request.path}/cancel")
def fulfillResume(self, request):
self.kazoo_client.delete(f"{request.path}/resume")
def fulfillCancel(self, request):
self.kazoo_client.delete(f"{request.path}/cancel")
def _watchEvents(self, actions, event=None):
if event is None:
return
job_event = None
if "cancel" in actions:
job_event = JobRequestEvent.CANCELED
elif "resume" in actions:
job_event = JobRequestEvent.RESUMED
if job_event:
request = self._cached_requests.get(event.path)
self.event_callback(request, job_event)
def lock(self, request, blocking=True, timeout=None):
path = "/".join([self.LOCK_ROOT, request.uuid])
have_lock = False
lock = None
try:
lock = Lock(self.kazoo_client, path)
have_lock = lock.acquire(blocking, timeout)
except LockTimeout:
have_lock = False
self.log.error(
"Timeout trying to acquire lock: %s", request.uuid
)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
if not have_lock:
return False
if not self.kazoo_client.exists(request.path):
lock.release()
self.log.error(
"Request not found for locking: %s", request.uuid
)
# We may have just re-created the lock parent node just after the
# scheduler deleted it; therefore we should (re-) delete it.
try:
# Delete the lock parent node as well.
path = "/".join([self.LOCK_ROOT, request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
return False
request.lock = lock
# Create the children watch to listen for cancel/resume actions on this
# build request.
if self.event_callback:
self.kazoo_client.ChildrenWatch(
request.path, self._watchEvents, send_event=True)
return True
def unlock(self, request):
if request.lock is None:
self.log.warning(
"Request %s does not hold a lock", request
)
else:
request.lock.release()
request.lock = None
def isLocked(self, request):
path = "/".join([self.LOCK_ROOT, request.uuid])
lock = Lock(self.kazoo_client, path)
is_locked = len(lock.contenders()) > 0
return is_locked
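
    # Illustrative sketch only (assumed worker flow): a worker claims a
    # request by locking it, marks it RUNNING via update(), and unlocks it
    # when finished.  lostRequests() below then reports requests whose
    # worker died while holding them (running but no longer locked).
    #
    #     if queue.lock(request, blocking=False):
    #         request.state = queue.request_class.RUNNING
    #         queue.update(request)
    #         ...
    #         queue.unlock(request)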
def lostRequests(self):
# Get a list of requests which are running but not locked by
# any client.
yield from filter(
lambda b: not self.isLocked(b),
self.inState(self.request_class.RUNNING),
)
def _getAllRequestIds(self):
# Get a list of all request ids without using the cache.
return self.kazoo_client.get_children(self.REQUEST_ROOT)
def _findLostParams(self, age):
# Get data nodes which are older than the specified age (we
# don't want to delete nodes which are just being written
# slowly).
# Convert to MS
now = int(time.time() * 1000)
age = age * 1000
data_nodes = dict()
for data_id in self.kazoo_client.get_children(self.PARAM_ROOT):
data_path = self._getParamsPath(data_id)
data_zstat = self.kazoo_client.exists(data_path)
if now - data_zstat.mtime > age:
data_nodes[data_id] = data_path
# If there are no candidate data nodes, we don't need to
# filter them by known requests.
if not data_nodes:
return data_nodes.values()
# Remove current request uuids
for request_id in self._getAllRequestIds():
if request_id in data_nodes:
del data_nodes[request_id]
# Return the paths
return data_nodes.values()
def _findLostResults(self):
# Get a list of results which don't have a connection waiting for
# them. As the results and waiters are not part of our cache, we have
# to look them up directly from ZK.
        # Read the waiters both before and after the results so that a
        # result created in between (whose waiter would otherwise be missing
        # from a single snapshot) is not misclassified as lost.
        waiters1 = set(self.kazoo_client.get_children(self.WAITER_ROOT))
        results = set(self.kazoo_client.get_children(self.RESULT_ROOT))
        result_data = set(self.kazoo_client.get_children(
            self.RESULT_DATA_ROOT))
        waiters2 = set(self.kazoo_client.get_children(self.WAITER_ROOT))
        waiters = waiters1.union(waiters2)
lost_results = results - waiters
lost_data = result_data - waiters
return lost_results, lost_data
def cleanup(self, age=300):
        # Delete request params which are not associated with
        # any current requests. Note, this does not clean up
# lost requests themselves; the client takes care of that.
try:
for path in self._findLostParams(age):
try:
self.log.error("Removing request params: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
"Unable to delete request params %s", path)
except Exception:
self.log.exception(
"Error cleaning up request queue %s", self)
try:
lost_results, lost_data = self._findLostResults()
for result_id in lost_results:
try:
path = '/'.join([self.RESULT_ROOT, result_id])
self.log.error("Removing request result: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
                        "Unable to delete request result %s", result_id)
for result_id in lost_data:
try:
path = '/'.join([self.RESULT_DATA_ROOT, result_id])
self.log.error(
"Removing request result data: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
                        "Unable to delete request result data %s", result_id)
except Exception:
self.log.exception(
"Error cleaning up result queue %s", self)
@staticmethod
def _bytesToDict(data):
return json.loads(data.decode("utf-8"))
@staticmethod
def _dictToBytes(data):
# The custom json_dumps() will also serialize MappingProxyType objects
return json_dumps(data).encode("utf-8")
def _getParamsPath(self, uuid):
return '/'.join([self.PARAM_ROOT, uuid])
def clearParams(self, request):
"""Erase the parameters from ZK to save space"""
self.kazoo_client.delete(self._getParamsPath(request.uuid),
recursive=True)
def getParams(self, request):
"""Return the parameters for a request, if they exist.
Once a request is accepted by an executor, the params
may be erased from ZK; this will return None in that case.
"""
with sharding.BufferedShardReader(
self.kazoo_client, self._getParamsPath(request.uuid)
) as stream:
data = stream.read()
if not data:
return None
return self._bytesToDict(data)
def deleteResult(self, path):
with suppress(NoNodeError):
self.kazoo_client.delete(path, recursive=True)
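
# Illustrative sketch only (not part of this change): the life cycle of a
# request as seen through this queue.  "zk_client" is an assumed ZooKeeper
# client wrapper, "request" a freshly constructed (UNSUBMITTED) request
# object and "params" a dict; every queue call below is defined above, and
# the division of labor between submitter and server is simplified.
def _example_lifecycle(zk_client, request, params):
    queue = JobRequestQueue(zk_client, "/zuul/example")
    future = queue.submit(request, params, needs_result=True)  # submitter
    for req in queue.next():                                    # server side
        if not queue.lock(req, blocking=False):
            continue
        job_params = queue.getParams(req)   # sharded params, or None
        queue.clearParams(req)              # free the space once read
        req.state = queue.request_class.RUNNING
        queue.update(req)
        if getattr(req, "result_path", None):
            queue.reportResult(req, {"result": "SUCCESS",
                                     "had_params": job_params is not None})
        queue.unlock(req)
        queue.remove(req)                   # normally the submitter's job
    return future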


@ -12,483 +12,16 @@
# License for the specific language governing permissions and limitations
# under the License.

import logging

from zuul.model import MergeRequest
from zuul.zk.job_request_queue import JobRequestQueue


class MergerApi(JobRequestQueue):
    log = logging.getLogger("zuul.MergerApi")
    request_class = MergeRequest  # type: ignore

    def __init__(self, client, merge_request_callback=None):
        root = '/zuul/merger'
        super().__init__(client, root, merge_request_callback)


import json
import logging
import time
from contextlib import suppress

from kazoo.exceptions import LockTimeout, NoNodeError
from kazoo.protocol.states import EventType
from kazoo.recipe.lock import Lock

from zuul.lib.jsonutil import json_dumps
from zuul.lib.logutil import get_annotated_logger
from zuul.model import MergeRequest
from zuul.zk import ZooKeeperSimpleBase, sharding
from zuul.zk.event_queues import MergerEventResultFuture
from zuul.zk.exceptions import MergeRequestNotFound
from zuul.zk.vendor.watchers import ExistingDataWatch


class MergerApi(ZooKeeperSimpleBase):
    MERGE_REQUEST_ROOT = "/zuul/merge-requests"
    MERGE_PARAMS_ROOT = "/zuul/merge-params"
    MERGE_RESULT_ROOT = "/zuul/merge-results"
    MERGE_RESULT_DATA_ROOT = "/zuul/merge-result-data"
    MERGE_WAITER_ROOT = "/zuul/merge-waiters"
    LOCK_ROOT = "/zuul/merge-request-locks"

    log = logging.getLogger("zuul.zk.merger.MergerApi")

    def __init__(self, client, merge_request_callback=None):
        super().__init__(client)
self.merge_request_callback = merge_request_callback
# path -> merge request
self._cached_merge_requests = {}
self.register()
@property
def initial_state(self):
# This supports holding merge requests in tests
return MergeRequest.REQUESTED
def register(self):
self.kazoo_client.ensure_path(self.MERGE_REQUEST_ROOT)
self.kazoo_client.ensure_path(self.MERGE_PARAMS_ROOT)
self.kazoo_client.ensure_path(self.MERGE_RESULT_ROOT)
self.kazoo_client.ensure_path(self.MERGE_RESULT_DATA_ROOT)
self.kazoo_client.ensure_path(self.MERGE_WAITER_ROOT)
self.kazoo_client.ensure_path(self.LOCK_ROOT)
# Register a child watch that listens for new merge requests
self.kazoo_client.ChildrenWatch(
self.MERGE_REQUEST_ROOT,
self._makeMergeRequestWatcher(self.MERGE_REQUEST_ROOT),
send_event=True,
)
def _makeMergeStateWatcher(self, path):
def watch(data, stat, event=None):
return self._watchMergeState(path, data, stat, event)
return watch
def _watchMergeState(self, path, data, stat, event=None):
if not event or event.type == EventType.CHANGED:
# Don't process change events w/o any data. This can happen when a
# "slow" change watch tried to retrieve the data of a znode that
# was deleted in the meantime.
if not data:
return
# As we already get the data and the stat value, we can directly
# use it without asking ZooKeeper for the data again.
content = self._bytesToDict(data)
if not content:
return
# We need this one for the HOLD -> REQUESTED check further down
old_merge_request = self._cached_merge_requests.get(path)
merge_request = MergeRequest.fromDict(content)
merge_request.path = path
merge_request._zstat = stat
self._cached_merge_requests[path] = merge_request
# NOTE (felix): This is a test-specific condition: For test cases
# which are using hold_merge_jobs_in_queue the state change on the
# merge request from HOLD to REQUESTED is done outside of the
# merger.
# Thus, we must also set the wake event (the callback) so the
# merger can pick up those jobs after they are released. To not
# cause a thundering herd problem in production for each cache
# update, the callback is only called under this very specific
# condition that can only occur in the tests.
if (
self.merge_request_callback
and old_merge_request
and old_merge_request.state == MergeRequest.HOLD
and merge_request.state == MergeRequest.REQUESTED
):
self.merge_request_callback()
elif event.type == EventType.DELETED:
with suppress(KeyError):
del self._cached_merge_requests[path]
# Return False to stop the datawatch as the build got deleted.
return False
def _makeMergeRequestWatcher(self, path):
def watch(merge_requests, event=None):
return self._watchMergeRequests(path, merge_requests)
return watch
def _watchMergeRequests(self, path, merge_requests):
# The merge_requests list always contains all active children. Thus, we
# first have to find the new ones by calculating the delta between the
# merge_requests list and our current cache entries.
# NOTE (felix): We could also use this list to determine the deleted
# merge requests, but it's easier to do this in the DataWatch for the
# single merge request instead. Otherwise we have to deal with race
# conditions between the children and the data watch as one watch might
# update a cache entry while the other tries to remove it.
merge_request_paths = {
f"{path}/{uuid}" for uuid in merge_requests
}
new_merge_requests = merge_request_paths - set(
self._cached_merge_requests.keys()
)
for req_path in new_merge_requests:
ExistingDataWatch(self.kazoo_client,
req_path,
self._makeMergeStateWatcher(req_path))
# Notify the user about new merge requests if a callback is provided.
# When we register the data watch, we will receive an initial
# callback immediately. The list of children may be empty in
# that case, so we should not fire our callback since there
# are no merge requests to handle.
if new_merge_requests and self.merge_request_callback:
self.merge_request_callback()
def _iterMergeRequests(self):
# As the entries in the cache dictionary are added and removed via
# data and children watches, we can't simply iterate over it in here,
# as the values might change during iteration.
for key in list(self._cached_merge_requests.keys()):
try:
merge_request = self._cached_merge_requests[key]
except KeyError:
continue
yield merge_request
def inState(self, *states):
if not states:
# If no states are provided, build a tuple containing all available
# ones to always match. We need a tuple to be compliant to the
# type of *states above.
states = MergeRequest.ALL_STATES
merge_requests = list(
filter(lambda b: b.state in states, self._iterMergeRequests())
)
# Sort the list of merge requests by precedence and their creation time
# in ZooKeeper in ascending order to prevent older requests from
# starving.
return (b for b in sorted(merge_requests))
def next(self):
yield from self.inState(MergeRequest.REQUESTED)
def submit(self, uuid, job_type, build_set_uuid, tenant_name,
pipeline_name, params, event_id, precedence=0,
needs_result=False):
log = get_annotated_logger(self.log, event=event_id)
path = "/".join([self.MERGE_REQUEST_ROOT, uuid])
merge_request = MergeRequest(
uuid,
self.initial_state,
job_type,
precedence,
build_set_uuid,
tenant_name,
pipeline_name,
event_id,
)
result = None
# If a result is needed, create the result_path with the same UUID and
# store it on the merge request, so the merger server can store the
# result there.
if needs_result:
result_path = "/".join(
[self.MERGE_RESULT_ROOT, merge_request.uuid]
)
waiter_path = "/".join(
[self.MERGE_WAITER_ROOT, merge_request.uuid]
)
self.kazoo_client.create(waiter_path, ephemeral=True)
result = MergerEventResultFuture(self.client, result_path,
waiter_path)
merge_request.result_path = result_path
log.debug("Submitting merge request to ZooKeeper %s", merge_request)
params_path = self._getParamsPath(uuid)
with sharding.BufferedShardWriter(
self.kazoo_client, params_path
) as stream:
stream.write(self._dictToBytes(params))
self.kazoo_client.create(
path, self._dictToBytes(merge_request.toDict()))
return result
def update(self, merge_request):
log = get_annotated_logger(
self.log, event=None, build=merge_request.uuid
)
log.debug("Updating merge request %s", merge_request)
if merge_request._zstat is None:
log.debug(
"Cannot update merge request %s: Missing version information.",
merge_request.uuid,
)
return
try:
zstat = self.kazoo_client.set(
merge_request.path,
self._dictToBytes(merge_request.toDict()),
version=merge_request._zstat.version,
)
# Update the zstat on the item after updating the ZK node
merge_request._zstat = zstat
except NoNodeError:
raise MergeRequestNotFound(
f"Could not update {merge_request.path}"
)
def reportResult(self, merge_request, result):
# Write the result data first since it may be multiple nodes.
result_data_path = "/".join(
[self.MERGE_RESULT_DATA_ROOT, merge_request.uuid]
)
with sharding.BufferedShardWriter(
self.kazoo_client, result_data_path) as stream:
stream.write(self._dictToBytes(result))
# Then write the (empty) result note to signify it's ready.
data = {'result_data_path': result_data_path}
self.kazoo_client.create(merge_request.result_path,
self._dictToBytes(data))
def get(self, path):
"""Get a merge request
Note: do not mix get with iteration; iteration returns cached
MergeRequests while get returns a newly created object each time. If
you lock a MergeRequest, you must use the same object to unlock it.
"""
try:
data, zstat = self.kazoo_client.get(path)
except NoNodeError:
return None
if not data:
return None
content = self._bytesToDict(data)
merge_request = MergeRequest.fromDict(content)
merge_request.path = path
merge_request._zstat = zstat
return merge_request
def remove(self, merge_request):
self.log.debug("Removing merge request %s", merge_request)
try:
self.kazoo_client.delete(merge_request.path, recursive=True)
except NoNodeError:
# Nothing to do if the node is already deleted
pass
self.clearMergeParams(merge_request)
try:
# Delete the lock parent node as well
path = "/".join([self.LOCK_ROOT, merge_request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
def lock(self, merge_request, blocking=True, timeout=None):
path = "/".join([self.LOCK_ROOT, merge_request.uuid])
have_lock = False
lock = None
try:
lock = Lock(self.kazoo_client, path)
have_lock = lock.acquire(blocking, timeout)
except LockTimeout:
have_lock = False
self.log.error(
"Timeout trying to acquire lock: %s", merge_request.uuid
)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
if not have_lock:
return False
if not self.kazoo_client.exists(merge_request.path):
lock.release()
self.log.error(
"Merge not found for locking: %s", merge_request.uuid
)
# We may have just re-created the lock parent node just after the
# scheduler deleted it; therefore we should (re-) delete it.
try:
# Delete the lock parent node as well.
path = "/".join([self.LOCK_ROOT, merge_request.uuid])
self.kazoo_client.delete(path, recursive=True)
except NoNodeError:
pass
return False
merge_request.lock = lock
return True
def unlock(self, merge_request):
if merge_request.lock is None:
self.log.warning(
"MergeRequest %s does not hold a lock", merge_request
)
else:
merge_request.lock.release()
merge_request.lock = None
def isLocked(self, merge_request):
path = "/".join([self.LOCK_ROOT, merge_request.uuid])
lock = Lock(self.kazoo_client, path)
is_locked = len(lock.contenders()) > 0
return is_locked
def lostMergeRequests(self):
# Get a list of merge requests which are running but not locked by any
# merger.
yield from filter(
lambda b: not self.isLocked(b),
self.inState(MergeRequest.RUNNING),
)
def _getAllRequestIds(self):
# Get a list of all request ids without using the cache.
return self.kazoo_client.get_children(self.MERGE_REQUEST_ROOT)
def _findLostParams(self, age):
# Get data nodes which are older than the specified age (we
# don't want to delete nodes which are just being written
# slowly).
# Convert to MS
now = int(time.time() * 1000)
age = age * 1000
data_nodes = dict()
for data_id in self.kazoo_client.get_children(self.MERGE_PARAMS_ROOT):
data_path = self._getParamsPath(data_id)
data_zstat = self.kazoo_client.exists(data_path)
if now - data_zstat.mtime > age:
data_nodes[data_id] = data_path
# If there are no candidate data nodes, we don't need to
# filter them by known requests.
if not data_nodes:
return data_nodes.values()
# Remove current request uuids
for request_id in self._getAllRequestIds():
if request_id in data_nodes:
del data_nodes[request_id]
# Return the paths
return data_nodes.values()
def _findLostMergeResults(self):
# Get a list of merge results which don't have a connection waiting for
# them. As the results and waiters are not part of our cache, we have
# to look them up directly from ZK.
waiters1 = set(self.kazoo_client.get_children(self.MERGE_WAITER_ROOT))
results = set(self.kazoo_client.get_children(self.MERGE_RESULT_ROOT))
result_data = set(self.kazoo_client.get_children(
self.MERGE_RESULT_DATA_ROOT))
waiters2 = set(self.kazoo_client.get_children(self.MERGE_WAITER_ROOT))
waiters = waiters1.union(waiters2)
lost_results = results - waiters
lost_data = result_data - waiters
return lost_results, lost_data
def cleanup(self, age=300):
# Delete build request params which are not associated with
# any current build requests. Note, this does not clean up
# lost build requests themselves; the merger client takes
# care of that.
try:
for path in self._findLostParams(age):
try:
self.log.error("Removing merge request params: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
"Unable to delete merge request params %s", path)
except Exception:
self.log.exception(
"Error cleaning up merge request queue %s", self)
try:
lost_results, lost_data = self._findLostMergeResults()
for result_id in lost_results:
try:
path = '/'.join([self.MERGE_RESULT_ROOT, result_id])
self.log.error("Removing merge request result: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
"Unable to delete merge request params %s", result_id)
for result_id in lost_data:
try:
path = '/'.join([self.MERGE_RESULT_DATA_ROOT, result_id])
self.log.error(
"Removing merge request result data: %s", path)
self.kazoo_client.delete(path, recursive=True)
except Exception:
                    self.log.exception(
"Unable to delete merge request params %s", result_id)
except Exception:
self.log.exception(
"Error cleaning up merge result queue %s", self)
@staticmethod
def _bytesToDict(data):
return json.loads(data.decode("utf-8"))
@staticmethod
def _dictToBytes(data):
# The custom json_dumps() will also serialize MappingProxyType objects
return json_dumps(data).encode("utf-8")
def _getParamsPath(self, uuid):
return '/'.join([self.MERGE_PARAMS_ROOT, uuid])
def clearMergeParams(self, merge_request):
"""Erase the merge parameters from ZK to save space"""
self.kazoo_client.delete(self._getParamsPath(merge_request.uuid),
recursive=True)
def getMergeParams(self, merge_request):
"""Return the parameters for a merge request, if they exist.
Once a merge request is accepted by an executor, the params
may be erased from ZK; this will return None in that case.
"""
with sharding.BufferedShardReader(
self.kazoo_client, self._getParamsPath(merge_request.uuid)
) as stream:
data = stream.read()
if not data:
return None
return self._bytesToDict(data)
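
# Illustrative sketch only (not part of this change): with the refactor, the
# merger-specific ZooKeeper plumbing above is replaced by the shared queue
# implementation, so a merger client only builds the MergeRequest (assumed
# to be constructed elsewhere, in the UNSUBMITTED state) and submits it.
def _example_merger_submit(zk_client, merge_request, params):
    api = MergerApi(zk_client)   # roots everything under /zuul/merger
    return api.submit(merge_request, params, needs_result=True)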