Better event checking timeouts
Previously we gave every event a 20 minute timeout. This meant we could eventually roll over to a new day and start querying the current indexes for data that lives in older indexes. When that happens every query fails because we are looking in the wrong index, and every failing query burns the full 20 minute timeout. The delays snowball until we are never able to check whether events are indexed.

Address this by using the gerrit eventCreatedOn timestamp to decide when the timeout is hit: we time out 20 minutes after that timestamp, regardless of how long interim processing has taken. Over longer periods of time this should ensure we query the current index for current events.

Change-Id: Ic9ed7fefae37d2668de5d89e0d06b8326eadfbb9
parent f8c8cc1cda
commit 02d0651f29
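In rough outline (a simplified sketch of the behaviour described above, not the project's code; wait_for_indexing, is_indexed, TIMEOUT and SLEEP_TIME are illustrative names), the check now polls until either the data appears or 20 minutes have passed since the gerrit event was created:

import time

TIMEOUT = 1200     # 20 minutes, measured from the gerrit eventCreatedOn value
SLEEP_TIME = 40    # pause between ElasticSearch queries


def wait_for_indexing(event_created_on, is_indexed):
    """Poll until the event's logs are indexed or the wall-clock deadline passes."""
    while True:
        if is_indexed():
            return True
        # The deadline is anchored to when gerrit created the event, not to
        # when we started (or retried) processing it, so slow intermediate
        # steps can no longer push queries into the next day's index.
        if time.time() > event_created_on + TIMEOUT:
            return False
        time.sleep(SLEEP_TIME)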
@@ -115,6 +115,7 @@ class FailEvent(object):
         self.project = event['change']['project']
         self.url = event['change']['url']
         self.comment = event["comment"]
+        self.created_on = event["eventCreatedOn"]
         # TODO(jogo): make FailEvent generate the jobs
         self.failed_jobs = failed_jobs
         self.config = config or er_conf.Config()
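For reference, eventCreatedOn is the integer Unix timestamp (seconds) that gerrit attaches to the event, as the fixture data later in this diff shows, so the stored created_on can be compared directly against time.time(). A minimal sketch of a payload trimmed to the fields this hunk reads (the comment text and project name are illustrative; the url is taken from the first fixture entry):

event = {
    "comment": "Build failed.",                    # illustrative comment text
    "eventCreatedOn": 1389371115,                  # epoch seconds, set by gerrit
    "change": {
        "project": "openstack/nova",               # illustrative project name
        "url": "https://review.openstack.org/60244",
    },
}

created_on = event["eventCreatedOn"]  # what this hunk stores as self.created_on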
@@ -257,12 +258,18 @@ class Stream(object):
 
     def _does_es_have_data(self, event):
         """Wait till ElasticSearch is ready, but return False if timeout."""
-        NUMBER_OF_RETRIES = 30
-        SLEEP_TIME = 40
-        started_at = datetime.datetime.now()
-        # this checks that we've got the console log uploaded, need to retry
+        # We wait 20 minutes wall time since receiving the event until we
+        # treat the logs as missing
+        timeout = 1200
+        # Wait 40 seconds between queries.
+        sleep_time = 40
+        timed_out = False
+        # This checks that we've got the console log uploaded, need to retry
         # in case ES goes bonkers on cold data, which it does some times.
-        for i in range(NUMBER_OF_RETRIES):
+        # We check at least once so that we can return success if data is
+        # there. But then only check again until we reach a timeout since
+        # the event was received.
+        while True:
             try:
                 for job in event.failed_jobs:
                     # TODO(jogo): if there are three failed jobs and only the
@@ -271,23 +278,32 @@ class Stream(object):
                     self._job_console_uploaded(
                         event.change, event.rev, job.name,
                         job.build_short_uuid)
+                    self._has_required_files(
+                        event.change, event.rev, job.name,
+                        job.build_short_uuid)
                 break
 
             except ConsoleNotReady as e:
                 self.log.debug(e)
-                time.sleep(SLEEP_TIME)
-                continue
+            except FilesNotReady as e:
+                self.log.info(e)
             except pyelasticsearch.exceptions.InvalidJsonResponseError:
                 # If ElasticSearch returns an error code, sleep and retry
                 # TODO(jogo): if this works pull out search into a helper
                 # function that does this.
                 self.log.exception(
-                    "Elastic Search not responding on attempt %d" % i)
-                time.sleep(SLEEP_TIME)
-                continue
-        else:
-            elapsed = format_timedelta(datetime.datetime.now() - started_at)
-            msg = ("Console logs not available after %ss for %s %d,%d,%s" %
+                    "Elastic Search not responding")
+            # If we fall through then we had a failure of some sort.
+            # Wait until timeout is exceeded.
+            now = time.time()
+            if now > event.created_on + timeout:
+                # We've waited too long for this event, move on.
+                timed_out = True
+                break
+            time.sleep(sleep_time)
+        if timed_out:
+            elapsed = now - event.created_on
+            msg = ("Required files not ready after %ss for %s %d,%d,%s" %
                    (elapsed, job.name, event.change, event.rev,
                     job.build_short_uuid))
             raise ResultTimedOut(msg)
@@ -295,28 +311,10 @@ class Stream(object):
         self.log.debug(
             "Found hits for change_number: %d, patch_number: %d"
             % (event.change, event.rev))
-
-        for i in range(NUMBER_OF_RETRIES):
-            try:
-                for job in event.failed_jobs:
-                    self._has_required_files(
-                        event.change, event.rev, job.name,
-                        job.build_short_uuid)
-                self.log.info(
-                    "All files present for change_number: %d, patch_number: %d"
-                    % (event.change, event.rev))
-                time.sleep(10)
-                return True
-            except FilesNotReady as e:
-                self.log.info(e)
-                time.sleep(SLEEP_TIME)
-
-        # if we get to the end, we're broken
-        elapsed = format_timedelta(datetime.datetime.now() - started_at)
-        msg = ("Required files not ready after %ss for %s %d,%d,%s" %
-               (elapsed, job.name, event.change, event.rev,
-                job.build_short_uuid))
-        raise ResultTimedOut(msg)
+        self.log.info(
+            "All files present for change_number: %d, patch_number: %d"
+            % (event.change, event.rev))
+        return True
 
     def get_failed_tempest(self):
         self.log.debug("entering get_failed_tempest")
@@ -24,6 +24,7 @@
         "revision": "74d75a1050c96eef498e9daaa80fcb9652003a20"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389371115,
     "change": {
         "topic": "bp\/mapping",
         "url": "https:\/\/review.openstack.org\/60244",
@@ -64,6 +65,7 @@
         "revision": "10d31584a0696cd77200a8fee45900d2343e295e"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389384794,
     "change": {
         "topic": "65361",
         "url": "https:\/\/review.openstack.org\/65361",
@@ -104,6 +106,7 @@
         "revision": "2e5a51c3f4e0121dbaf47c13b12809f6e7820d09"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389383805,
     "change": {
         "topic": "bp\/image-flavor-selection-ux-enhancement",
         "url": "https:\/\/review.openstack.org\/63078",
@@ -144,6 +147,7 @@
         "revision": "3ca110f8dc6a552e951074faaca3a11ce65e40f3"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389306576,
     "change": {
         "topic": "slow-down-there-cowboy",
         "url": "https:\/\/review.openstack.org\/65805",
@@ -184,6 +188,7 @@
         "revision": "4fe4f8b6dda390c432b2c6d2f6cff2152d3475f3"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389391347,
     "change": {
         "topic": "1187119-a",
         "url": "https:\/\/review.openstack.org\/65378",
@@ -224,6 +229,7 @@
         "revision": "06b14616cb2a11514b0c151cc213c1ab5cea324b"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389378289,
     "change": {
         "topic": "bug\/1265108",
         "url": "https:\/\/review.openstack.org\/64749",
@@ -264,6 +270,7 @@
         "revision": "06b14616cb2a11514b0c151cc213c1ab5cea324b"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389378289,
     "change": {
         "topic": "bug\/1265108",
         "url": "https:\/\/review.openstack.org\/64750",
@@ -304,6 +311,7 @@
         "revision": "8044bcbfe8f10aa3f965992e30e9548258d6c736"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389387264,
     "change": {
         "topic": "bugs\/1267682",
         "url": "https:\/\/review.openstack.org\/65902",
@@ -344,6 +352,7 @@
         "revision": "b05c9313f6873239897d7c88babdfa8ebe5fefed"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389388172,
     "change": {
         "topic": "bug\/1253497",
         "url": "https:\/\/review.openstack.org\/66064",
@@ -384,6 +393,7 @@
         "revision": "fd9bee400259d370e837799236b0a4a6f0343438"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1389354908,
     "change": {
         "topic": "del-param",
         "url": "https:\/\/review.openstack.org\/65925",
@@ -424,6 +434,7 @@
         "revision": "af064c7a5304c672f7d71e4c8df5f69c195c739a"
     },
     "type": "comment-added",
+    "eventCreatedOn": 1387806669,
     "change": {
         "topic": "bug\/1259078",
         "url": "https:\/\/review.openstack.org\/61074",
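Each fixture event above simply gains the eventCreatedOn field that the new timeout logic keys off. The values are Unix timestamps in seconds; as a quick sanity check (not part of the change), the first one decodes to 10 January 2014:

import datetime

created_on = 1389371115  # "eventCreatedOn" from the first fixture hunk
print(datetime.datetime.utcfromtimestamp(created_on).isoformat())
# 2014-01-10T16:25:15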