From 3bc4866a21b5fe255a808fb1742e40c3a5bee7b1 Mon Sep 17 00:00:00 2001 From: Josh Gachnang Date: Wed, 26 Mar 2014 10:57:25 -0700 Subject: [PATCH] Adding DynamicLoopingCall around lookup Currently, if a single lookup call to the Ironic API fails, the entire agent errors out and restarts. This allows the agent to retry for a set amount of time before throwing an uncaught exception forcing a restart. Change-Id: I39752fb3f42ad3e4f15a49194f1554e1d3463cf8 Closes-Bug: 1297019 --- ironic_python_agent/agent.py | 25 ++- ironic_python_agent/cmd/agent.py | 17 +- ironic_python_agent/ironic_api_client.py | 80 ++++++++-- .../openstack/common/loopingcall.py | 145 ++++++++++++++++++ ironic_python_agent/tests/agent.py | 4 +- .../tests/ironic_api_client.py | 143 +++++++++++++---- openstack-common.conf | 4 +- requirements.txt | 2 + test-requirements.txt | 1 - 9 files changed, 366 insertions(+), 55 deletions(-) create mode 100644 ironic_python_agent/openstack/common/loopingcall.py diff --git a/ironic_python_agent/agent.py b/ironic_python_agent/agent.py index a3fe39400..80a744125 100644 --- a/ironic_python_agent/agent.py +++ b/ironic_python_agent/agent.py @@ -106,7 +106,8 @@ class IronicPythonAgentHeartbeater(threading.Thread): class IronicPythonAgent(object): - def __init__(self, api_url, advertise_address, listen_address): + def __init__(self, api_url, advertise_address, listen_address, + lookup_timeout, lookup_interval): self.api_url = api_url self.api_client = ironic_api_client.APIClient(self.api_url) self.listen_address = listen_address @@ -127,6 +128,9 @@ class IronicPythonAgent(object): invoke_on_load=True, propagate_map_exceptions=True, ) + # lookup timeout in seconds + self.lookup_timeout = lookup_timeout + self.lookup_interval = lookup_interval def get_status(self): """Retrieve a serializable status.""" @@ -195,13 +199,14 @@ class IronicPythonAgent(object): def run(self): """Run the Ironic Python Agent.""" + # Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError + # if there is an issue (uncaught, restart agent) self.started_at = _time() - # Get the UUID so we can heartbeat to Ironic content = self.api_client.lookup_node( - hardware_info=self.hardware.list_hardware_info() - ) - if 'node' not in content or 'heartbeat_timeout' not in content: - raise LookupError('Lookup return needs node and heartbeat_timeout') + hardware_info=self.hardware.list_hardware_info(), + timeout=self.lookup_timeout, + starting_interval=self.lookup_interval) + self.node = content['node'] self.heartbeat_timeout = content['heartbeat_timeout'] self.heartbeater.start() @@ -223,8 +228,12 @@ def build_agent(api_url, advertise_host, advertise_port, listen_host, - listen_port): + listen_port, + lookup_timeout, + lookup_interval): return IronicPythonAgent(api_url, (advertise_host, advertise_port), - (listen_host, listen_port)) + (listen_host, listen_port), + lookup_timeout, + lookup_interval) diff --git a/ironic_python_agent/cmd/agent.py b/ironic_python_agent/cmd/agent.py index e12d37d03..14fdf3815 100644 --- a/ironic_python_agent/cmd/agent.py +++ b/ironic_python_agent/cmd/agent.py @@ -47,9 +47,24 @@ def run(): type=int, help='The port to tell Ironic to reply and send ' 'commands to.') + parser.add_argument('--lookup-timeout', + default=300, + type=int, + help='The amount of time to retry the initial lookup ' + 'call to Ironic. After the timeout, the agent ' + 'will exit with a non-zero exit code.') + parser.add_argument('--lookup-interval', + default=1, + type=int, + help='The initial interval for retries on the initial ' + 'lookup call to Ironic. The interval will be ' + 'doubled after each failure until timeout is ' + 'exceeded.') args = parser.parse_args() agent.build_agent(args.api_url, args.advertise_host, args.advertise_port, args.listen_host, - args.listen_port).run() + args.listen_port, + args.lookup_timeout, + args.lookup_interval).run() diff --git a/ironic_python_agent/ironic_api_client.py b/ironic_python_agent/ironic_api_client.py index 11ab86af3..296bac5b0 100644 --- a/ironic_python_agent/ironic_api_client.py +++ b/ironic_python_agent/ironic_api_client.py @@ -15,11 +15,12 @@ limitations under the License. """ import json - import requests from ironic_python_agent import encoding from ironic_python_agent import errors +from ironic_python_agent.openstack.common import log +from ironic_python_agent.openstack.common import loopingcall class APIClient(object): @@ -29,6 +30,7 @@ class APIClient(object): self.api_url = api_url.rstrip('/') self.session = requests.Session() self.encoder = encoding.RESTJSONEncoder() + self.log = log.getLogger(__name__) def _request(self, method, path, data=None): request_url = '{api_url}{path}'.format(api_url=self.api_url, path=path) @@ -70,38 +72,86 @@ class APIClient(object): except Exception: raise errors.HeartbeatError('Invalid Heartbeat-Before header') - def lookup_node(self, hardware_info): + def lookup_node(self, hardware_info, timeout, starting_interval): + timer = loopingcall.DynamicLoopingCall( + self._do_lookup, + hardware_info=hardware_info, + intervals=[starting_interval], + total_time=[0], + timeout=timeout) + node_content = timer.start().wait() + + # True is returned on timeout + if node_content is True: + raise errors.LookupNodeError('Could not look up node info. Check ' + 'logs for details.') + return node_content + + def _do_lookup(self, hardware_info, timeout, intervals=[1], + total_time=[0]): + """The actual call to lookup a node. Should be called inside + loopingcall.DynamicLoopingCall. + + intervals and total_time are mutable so it can be changed by each run + in the looping call and accessed/changed on the next run. + """ + def next_interval(timeout, intervals=[], total_time=[]): + """Function to calculate what the next interval should be. Uses + exponential backoff and raises an exception (that won't + be caught by do_lookup) to kill the looping call if it goes too + long + """ + new_interval = intervals[-1] * 2 + if total_time[0] + new_interval > timeout: + # No retvalue signifies error + raise loopingcall.LoopingCallDone() + + total_time[0] += new_interval + intervals.append(new_interval) + return new_interval + path = '/{api_version}/drivers/teeth/vendor_passthru/lookup'.format( api_version=self.api_version ) - # This hardware won't be saved on the node currently, because of how - # driver_vendor_passthru is implemented (no node saving). + # This hardware won't be saved on the node currently, because of + # how driver_vendor_passthru is implemented (no node saving). data = { - 'hardware': hardware_info, + 'hardware': hardware_info } + # Make the POST, make sure we get back normal data/status codes and + # content try: response = self._request('POST', path, data=data) except Exception as e: - raise errors.LookupNodeError(str(e)) + self.log.warning('POST failed: %s' % str(e)) + return next_interval(timeout, intervals, total_time) if response.status_code != requests.codes.OK: - msg = 'Invalid status code: {0}'.format(response.status_code) - raise errors.LookupNodeError(msg) + self.log.warning('Invalid status code: %s' % + response.status_code) + + return next_interval(timeout, intervals, total_time) try: content = json.loads(response.content) except Exception as e: - raise errors.LookupNodeError('Error decoding response: ' - + str(e)) + self.log.warning('Error decoding response: %s' % str(e)) + return next_interval(timeout, intervals, total_time) + # Check for valid response data if 'node' not in content or 'uuid' not in content['node']: - raise errors.LookupNodeError('Got invalid node data from the API:' - '%s' % content) + self.log.warning('Got invalid node data from the API: %s' % + content) + return next_interval(timeout, intervals, total_time) + if 'heartbeat_timeout' not in content: - raise errors.LookupNodeError('Got invalid heartbeat from the API:' - '%s' % content) - return content + self.log.warning('Got invalid heartbeat from the API: %s' % + content) + return next_interval(timeout, intervals, total_time) + + # Got valid content + raise loopingcall.LoopingCallDone(retvalue=content) def _get_agent_url(self, advertise_address): return 'http://{0}:{1}'.format(advertise_address[0], diff --git a/ironic_python_agent/openstack/common/loopingcall.py b/ironic_python_agent/openstack/common/loopingcall.py new file mode 100644 index 000000000..dc0f17b66 --- /dev/null +++ b/ironic_python_agent/openstack/common/loopingcall.py @@ -0,0 +1,145 @@ +# Copyright 2010 United States Government as represented by the +# Administrator of the National Aeronautics and Space Administration. +# Copyright 2011 Justin Santa Barbara +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys + +from eventlet import event +from eventlet import greenthread + +from ironic_python_agent.openstack.common.gettextutils import _LE, _LW +from ironic_python_agent.openstack.common import log as logging +from ironic_python_agent.openstack.common import timeutils + +LOG = logging.getLogger(__name__) + + +class LoopingCallDone(Exception): + """Exception to break out and stop a LoopingCall. + + The poll-function passed to LoopingCall can raise this exception to + break out of the loop normally. This is somewhat analogous to + StopIteration. + + An optional return-value can be included as the argument to the exception; + this return-value will be returned by LoopingCall.wait() + + """ + + def __init__(self, retvalue=True): + """:param retvalue: Value that LoopingCall.wait() should return.""" + self.retvalue = retvalue + + +class LoopingCallBase(object): + def __init__(self, f=None, *args, **kw): + self.args = args + self.kw = kw + self.f = f + self._running = False + self.done = None + + def stop(self): + self._running = False + + def wait(self): + return self.done.wait() + + +class FixedIntervalLoopingCall(LoopingCallBase): + """A fixed interval looping call.""" + + def start(self, interval, initial_delay=None): + self._running = True + done = event.Event() + + def _inner(): + if initial_delay: + greenthread.sleep(initial_delay) + + try: + while self._running: + start = timeutils.utcnow() + self.f(*self.args, **self.kw) + end = timeutils.utcnow() + if not self._running: + break + delay = interval - timeutils.delta_seconds(start, end) + if delay <= 0: + LOG.warn(_LW('task run outlasted interval by %s sec') % + -delay) + greenthread.sleep(delay if delay > 0 else 0) + except LoopingCallDone as e: + self.stop() + done.send(e.retvalue) + except Exception: + LOG.exception(_LE('in fixed duration looping call')) + done.send_exception(*sys.exc_info()) + return + else: + done.send(True) + + self.done = done + + greenthread.spawn_n(_inner) + return self.done + + +# TODO(mikal): this class name is deprecated in Havana and should be removed +# in the I release +LoopingCall = FixedIntervalLoopingCall + + +class DynamicLoopingCall(LoopingCallBase): + """A looping call which sleeps until the next known event. + + The function called should return how long to sleep for before being + called again. + """ + + def start(self, initial_delay=None, periodic_interval_max=None): + self._running = True + done = event.Event() + + def _inner(): + if initial_delay: + greenthread.sleep(initial_delay) + + try: + while self._running: + idle = self.f(*self.args, **self.kw) + if not self._running: + break + + if periodic_interval_max is not None: + idle = min(idle, periodic_interval_max) + LOG.debug('Dynamic looping call sleeping for %.02f ' + 'seconds', idle) + greenthread.sleep(idle) + except LoopingCallDone as e: + self.stop() + done.send(e.retvalue) + except Exception: + LOG.exception(_LE('in dynamic looping call')) + done.send_exception(*sys.exc_info()) + return + else: + done.send(True) + + self.done = done + + greenthread.spawn(_inner) + return self.done diff --git a/ironic_python_agent/tests/agent.py b/ironic_python_agent/tests/agent.py index 0e697d601..6ef421254 100644 --- a/ironic_python_agent/tests/agent.py +++ b/ironic_python_agent/tests/agent.py @@ -123,7 +123,9 @@ class TestBaseAgent(unittest.TestCase): self.agent = agent.IronicPythonAgent('https://fake_api.example.' 'org:8081/', ('203.0.113.1', 9990), - ('192.0.2.1', 9999)) + ('192.0.2.1', 9999), + lookup_timeout=300, + lookup_interval=1) def assertEqualEncoded(self, a, b): # Evidently JSONEncoder.default() can't handle None (??) so we have to diff --git a/ironic_python_agent/tests/ironic_api_client.py b/ironic_python_agent/tests/ironic_api_client.py index 3ea92fb27..888e3db83 100644 --- a/ironic_python_agent/tests/ironic_api_client.py +++ b/ironic_python_agent/tests/ironic_api_client.py @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. """ -import httmock import json import mock import time @@ -23,10 +22,20 @@ import unittest from ironic_python_agent import errors from ironic_python_agent import hardware from ironic_python_agent import ironic_api_client +from ironic_python_agent.openstack.common import loopingcall + API_URL = 'http://agent-api.ironic.example.org/' +class FakeResponse(object): + def __init__(self, content=None, status_code=200, headers=None): + content = content or {} + self.content = json.dumps(content) + self.status_code = status_code + self.headers = headers or {} + + class TestBaseIronicPythonAgent(unittest.TestCase): def setUp(self): self.api_client = ironic_api_client.APIClient(API_URL) @@ -39,7 +48,7 @@ class TestBaseIronicPythonAgent(unittest.TestCase): def test_successful_heartbeat(self): expected_heartbeat_before = time.time() + 120 - response = httmock.response(status_code=204, headers={ + response = FakeResponse(status_code=204, headers={ 'Heartbeat-Before': expected_heartbeat_before, }) @@ -69,7 +78,7 @@ class TestBaseIronicPythonAgent(unittest.TestCase): advertise_address=('192.0.2.1', '9999')) def test_heartbeat_invalid_status_code(self): - response = httmock.response(status_code=404) + response = FakeResponse(status_code=404) self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response @@ -79,7 +88,7 @@ class TestBaseIronicPythonAgent(unittest.TestCase): advertise_address=('192.0.2.1', '9999')) def test_heartbeat_missing_heartbeat_before_header(self): - response = httmock.response(status_code=204) + response = FakeResponse(status_code=204) self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response @@ -89,7 +98,7 @@ class TestBaseIronicPythonAgent(unittest.TestCase): advertise_address=('192.0.2.1', '9999')) def test_heartbeat_invalid_heartbeat_before_header(self): - response = httmock.response(status_code=204, headers={ + response = FakeResponse(status_code=204, headers={ 'Heartbeat-Before': 'tomorrow', }) self.api_client.session.request = mock.Mock() @@ -100,8 +109,47 @@ class TestBaseIronicPythonAgent(unittest.TestCase): uuid='deadbeef-dabb-ad00-b105-f00d00bab10c', advertise_address=('192.0.2.1', '9999')) - def test_lookup_node(self): - response = httmock.response(status_code=200, content={ + @mock.patch('eventlet.greenthread.sleep') + def test_lookup_node(self, sleep_mock): + content = { + 'node': { + 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' + }, + 'heartbeat_timeout': 300 + } + response = FakeResponse(status_code=200, content={ + 'node': { + 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' + }, + 'heartbeat_timeout': 300 + }) + self.api_client.session.request = mock.Mock() + self.api_client.session.request.return_value = response + returned_content = self.api_client.lookup_node( + hardware_info=self.hardware_info, + timeout=300, + starting_interval=1) + + self.assertEqual(content, returned_content) + + @mock.patch('eventlet.greenthread.sleep') + def test_lookup_node_exception(self, sleep_mock): + bad_response = FakeResponse(status_code=500, content={ + 'node': { + 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' + }, + 'heartbeat_timeout': 300 + }) + self.api_client.session.request = mock.Mock() + self.api_client.session.request.return_value = bad_response + self.assertRaises(errors.LookupNodeError, + self.api_client.lookup_node, + hardware_info=self.hardware_info, + timeout=300, + starting_interval=1) + + def test_do_lookup(self): + response = FakeResponse(status_code=200, content={ 'node': { 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' }, @@ -111,12 +159,16 @@ class TestBaseIronicPythonAgent(unittest.TestCase): self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response - self.api_client.lookup_node(hardware_info=self.hardware_info) + self.assertRaises(loopingcall.LoopingCallDone, + self.api_client._do_lookup, + hardware_info=self.hardware_info, + timeout=300, + intervals=[2]) request_args = self.api_client.session.request.call_args[0] self.assertEqual(request_args[0], 'POST') self.assertEqual(request_args[1], - API_URL + 'v1/drivers/teeth/vendor_passthru/lookup') + API_URL + 'v1/drivers/teeth/vendor_passthru/lookup') data = self.api_client.session.request.call_args[1]['data'] content = json.loads(data) @@ -131,8 +183,8 @@ class TestBaseIronicPythonAgent(unittest.TestCase): }, ]) - def test_lookup_node_bad_response_code(self): - response = httmock.response(status_code=400, content={ + def test_do_lookup_bad_response_code(self): + response = FakeResponse(status_code=400, content={ 'node': { 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' } @@ -141,24 +193,32 @@ class TestBaseIronicPythonAgent(unittest.TestCase): self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response - self.assertRaises(errors.LookupNodeError, - self.api_client.lookup_node, - hardware_info=self.hardware_info) + total_time = [0] + interval = self.api_client._do_lookup(self.hardware_info, + timeout=300, + total_time=total_time, + intervals=[2]) + self.assertEqual(4, interval) + self.assertEqual(4, total_time[0]) - def test_lookup_node_bad_response_data(self): - response = httmock.response(status_code=200, content={ + def test_do_lookup_bad_response_data(self): + response = FakeResponse(status_code=200, content={ 'heartbeat_timeout': 300 }) self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response - self.assertRaises(errors.LookupNodeError, - self.api_client.lookup_node, - hardware_info=self.hardware_info) + total_time = [0] + interval = self.api_client._do_lookup(self.hardware_info, + timeout=300, + total_time=total_time, + intervals=[2]) + self.assertEqual(4, interval) + self.assertEqual(4, total_time[0]) - def test_lookup_node_no_heartbeat_timeout(self): - response = httmock.response(status_code=200, content={ + def test_do_lookup_no_heartbeat_timeout(self): + response = FakeResponse(status_code=200, content={ 'node': { 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' } @@ -167,18 +227,47 @@ class TestBaseIronicPythonAgent(unittest.TestCase): self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response - self.assertRaises(errors.LookupNodeError, - self.api_client.lookup_node, - hardware_info=self.hardware_info) + total_time = [0] + interval = self.api_client._do_lookup(self.hardware_info, + timeout=300, + total_time=total_time, + intervals=[2]) + self.assertEqual(4, interval) + self.assertEqual(4, total_time[0]) - def test_lookup_node_bad_response_body(self): - response = httmock.response(status_code=200, content={ + def test_do_lookup_bad_response_body(self): + response = FakeResponse(status_code=200, content={ 'node_node': 'also_not_node' }) self.api_client.session.request = mock.Mock() self.api_client.session.request.return_value = response + total_time = [0] + interval = self.api_client._do_lookup(self.hardware_info, + timeout=300, + total_time=total_time, + intervals=[2]) + self.assertEqual(4, interval) + self.assertEqual(4, total_time[0]) + + @mock.patch('eventlet.greenthread.sleep') + def test_lookup_node_exponential_backoff(self, sleep_mock): + bad_response = FakeResponse(status_code=500, content={ + 'node': { + 'uuid': 'deadbeef-dabb-ad00-b105-f00d00bab10c' + }, + 'heartbeat_timeout': 300 + }) + self.api_client.session.request = mock.Mock() + self.api_client.session.request.return_value = bad_response self.assertRaises(errors.LookupNodeError, self.api_client.lookup_node, - hardware_info=self.hardware_info) + hardware_info=self.hardware_info, + timeout=300, + starting_interval=1) + # 254 seconds before timeout + expected_times = [mock.call(2), mock.call(4), mock.call(8), + mock.call(16), mock.call(32), mock.call(64), + mock.call(128)] + self.assertEqual(expected_times, sleep_mock.call_args_list) diff --git a/openstack-common.conf b/openstack-common.conf index fab48572f..3fc1d5e59 100644 --- a/openstack-common.conf +++ b/openstack-common.conf @@ -1,7 +1,7 @@ [DEFAULT] # The list of modules to copy from oslo-incubator -modules=log - +module=log +module=loopingcall # The base module to hold the copy of openstack.common base=ironic_python_agent diff --git a/requirements.txt b/requirements.txt index b0578ec34..7eae0c0c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ six>=1.5.2 oslo.config==1.2.1 Babel==1.3 iso8601==0.1.10 +eventlet>=0.13.0 + diff --git a/test-requirements.txt b/test-requirements.txt index 0f0b5c69a..08dfe4f42 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -5,4 +5,3 @@ mock>=1.0 testrepository>=0.0.18 testtools>=0.9.34 python-subunit>=0.0.18 -httmock \ No newline at end of file