From d65ca49990f1241a5a8afdc3ad0e1a99c878275a Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 30 Jun 2014 14:09:24 +0100 Subject: [PATCH] xenapi: deal with reboots while talking to agent The latest version of the agent has started changing the hostname through the official windows APIs, when resetnetwork is called. This means a reboot might happen after resetnetwork. A reboot happens when polling for the version. Until now we wait for the call timeout, before fetching the new dom id. This change ensures that if we spot a reboot, the plugin exits early rather than keeping polling the wrong dom id. Turns out its best to wait for the dom_id to change, before trying to poll the agent again. Once the dom_id in xenapi has been updated, the xenstore keys are always in place. Trying too early leads to lots of reboot detections because we are retrying with the old dom_id. XenServer continues to return the old dom_id for a little while after the reboot. Closes-Bug: #1370999 Change-Id: Id0bf5b64f2b271d162db5bbce50167ab1f665c87 --- nova/tests/virt/xenapi/test_agent.py | 83 ++++++++++++++++++- nova/virt/xenapi/agent.py | 23 +++++ .../xenserver/xenapi/etc/xapi.d/plugins/agent | 27 +++++- .../xenapi/etc/xapi.d/plugins/xenstore.py | 6 +- 4 files changed, 132 insertions(+), 7 deletions(-) diff --git a/nova/tests/virt/xenapi/test_agent.py b/nova/tests/virt/xenapi/test_agent.py index b4f7f427c2e4..5004b381d4aa 100644 --- a/nova/tests/virt/xenapi/test_agent.py +++ b/nova/tests/virt/xenapi/test_agent.py @@ -14,6 +14,7 @@ # under the License. import base64 +import time import uuid import mock @@ -41,8 +42,8 @@ def _get_fake_instance(**kwargs): class AgentTestCaseBase(test.NoDBTestCase): - def _create_agent(self, instance): - self.session = "session" + def _create_agent(self, instance, session="session"): + self.session = session self.virtapi = "virtapi" self.vm_ref = "vm_ref" return agent.XenAPIBasedAgent(self.session, self.virtapi, @@ -202,6 +203,84 @@ class FileInjectionTestCase(AgentTestCaseBase): agent.inject_files(files) +class FakeRebootException(Exception): + details = ["", "", "", "asdf REBOOT: asdf"] + + +class RebootRetryTestCase(AgentTestCaseBase): + @mock.patch.object(agent, '_wait_for_new_dom_id') + def test_retry_on_reboot(self, mock_wait): + mock_session = mock.Mock() + + def fake_call_plugin(*args, **kwargs): + if fake_call_plugin.called: + return {"returncode": '0', "message": "done"} + else: + fake_call_plugin.called = True + raise FakeRebootException() + + fake_call_plugin.called = False + mock_session.XenAPI.Failure = FakeRebootException + mock_session.VM.get_domid.return_value = "fake_dom_id" + mock_session.call_plugin.side_effect = fake_call_plugin + + agent = self._create_agent(None, mock_session) + + result = agent._call_agent("asdf") + self.assertEqual("done", result) + self.assertTrue(mock_session.VM.get_domid.called) + self.assertEqual(2, mock_session.call_plugin.call_count) + mock_wait.called_once_with(mock_session, self.vm_ref, + "fake_dom_id", "asdf") + + @mock.patch.object(time, 'sleep') + @mock.patch.object(time, 'time') + def test_wait_for_new_dom_id_found(self, mock_time, mock_sleep): + mock_session = mock.Mock() + mock_session.VM.get_domid.return_value = "new" + + agent._wait_for_new_dom_id(mock_session, "vm_ref", "old", "method") + + mock_session.VM.get_domid.assert_called_once_with("vm_ref") + self.assertFalse(mock_sleep.called) + + @mock.patch.object(time, 'sleep') + @mock.patch.object(time, 'time') + def test_wait_for_new_dom_id_after_retry(self, mock_time, mock_sleep): + self.flags(agent_timeout=3, group="xenserver") + mock_time.return_value = 0 + mock_session = mock.Mock() + old = 40 + new = 42 + mock_session.VM.get_domid.side_effect = [old, -1, new] + + agent._wait_for_new_dom_id(mock_session, "vm_ref", old, "method") + + mock_session.VM.get_domid.assert_called_with("vm_ref") + self.assertEqual(3, mock_session.VM.get_domid.call_count) + self.assertEqual(2, mock_sleep.call_count) + + @mock.patch.object(time, 'sleep') + @mock.patch.object(time, 'time') + def test_wait_for_new_dom_id_timeout(self, mock_time, mock_sleep): + self.flags(agent_timeout=3, group="xenserver") + + def fake_time(): + fake_time.time = fake_time.time + 1 + return fake_time.time + + fake_time.time = 0 + mock_time.side_effect = fake_time + mock_session = mock.Mock() + mock_session.VM.get_domid.return_value = "old" + + self.assertRaises(exception.AgentTimeout, + agent._wait_for_new_dom_id, + mock_session, "vm_ref", "old", "method") + + self.assertEqual(4, mock_session.VM.get_domid.call_count) + + class SetAdminPasswordTestCase(AgentTestCaseBase): @mock.patch.object(agent.XenAPIBasedAgent, '_call_agent') @mock.patch("nova.virt.xenapi.agent.SimpleDH") diff --git a/nova/virt/xenapi/agent.py b/nova/virt/xenapi/agent.py index 17480dc1f448..247c1dfa7fb0 100644 --- a/nova/virt/xenapi/agent.py +++ b/nova/virt/xenapi/agent.py @@ -113,6 +113,13 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None, 'args=%(args)r'), {'method': method, 'args': args}, instance=instance) raise exception.AgentTimeout(method=method) + elif 'REBOOT:' in err_msg: + LOG.debug('REBOOT: The call to %(method)s detected a reboot. ' + 'args=%(args)r', + {'method': method, 'args': args}, instance=instance) + _wait_for_new_dom_id(session, vm_ref, dom_id, method) + return _call_agent(session, instance, vm_ref, method, + addl_args, timeout, success_codes) elif 'NOT IMPLEMENTED:' in err_msg: LOG.error(_('NOT IMPLEMENTED: The call to %(method)s is not ' 'supported by the agent. args=%(args)r'), @@ -152,6 +159,22 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None, return ret['message'].replace('\\r\\n', '') +def _wait_for_new_dom_id(session, vm_ref, old_dom_id, method): + expiration = time.time() + CONF.xenserver.agent_timeout + while True: + dom_id = session.VM.get_domid(vm_ref) + + if dom_id and dom_id != -1 and dom_id != old_dom_id: + LOG.debug("Found new dom_id %s" % dom_id) + return + + if time.time() > expiration: + LOG.debug("Timed out waiting for new dom_id %s" % dom_id) + raise exception.AgentTimeout(method=method) + + time.sleep(1) + + def is_upgrade_required(current_version, available_version): # NOTE(johngarbutt): agent version numbers are four part, # so we need to use the loose version to compare them diff --git a/plugins/xenserver/xenapi/etc/xapi.d/plugins/agent b/plugins/xenserver/xenapi/etc/xapi.d/plugins/agent index 546e73368ded..5112cdedafa9 100755 --- a/plugins/xenserver/xenapi/etc/xapi.d/plugins/agent +++ b/plugins/xenserver/xenapi/etc/xapi.d/plugins/agent @@ -46,6 +46,10 @@ class TimeoutError(StandardError): pass +class RebootDetectedError(StandardError): + pass + + def version(self, arg_dict): """Get version of agent.""" timeout = int(arg_dict.pop('timeout', DEFAULT_TIMEOUT)) @@ -210,6 +214,7 @@ def _wait_for_agent(self, request_id, arg_dict, timeout): arg_dict["path"] = "data/guest/%s" % request_id arg_dict["ignore_missing_path"] = True start = time.time() + reboot_detected = False while time.time() - start < timeout: ret = xenstore.read_record(self, arg_dict) # Note: the response for None with be a string that includes @@ -220,12 +225,30 @@ def _wait_for_agent(self, request_id, arg_dict, timeout): time.sleep(.5) + # NOTE(johngarbutt) If we can't find this domid, then + # the VM has rebooted, so we must trigger domid refresh. + # Check after the sleep to give xenstore time to update + # after the VM reboot. + exists_args = { + "dom_id": arg_dict["dom_id"], + "path": "name", + } + dom_id_is_present = xenstore.record_exists(exists_args) + if not dom_id_is_present: + reboot_detected = True + break + # No response within the timeout period; bail out # First, delete the request record arg_dict["path"] = "data/host/%s" % request_id xenstore.delete_record(self, arg_dict) - raise TimeoutError(_("TIMEOUT: No response from agent within" - " %s seconds.") % timeout) + + if reboot_detected: + raise RebootDetectedError(_("REBOOT: dom_id %s no longer " + "present") % arg_dict["dom_id"]) + else: + raise TimeoutError(_("TIMEOUT: No response from agent within" + " %s seconds.") % timeout) if __name__ == "__main__": diff --git a/plugins/xenserver/xenapi/etc/xapi.d/plugins/xenstore.py b/plugins/xenserver/xenapi/etc/xapi.d/plugins/xenstore.py index 3d28fc617d19..033ef149f4b0 100755 --- a/plugins/xenserver/xenapi/etc/xapi.d/plugins/xenstore.py +++ b/plugins/xenserver/xenapi/etc/xapi.d/plugins/xenstore.py @@ -62,7 +62,7 @@ def jsonify(fnc): return wrapper -def _record_exists(arg_dict): +def record_exists(arg_dict): """Returns whether or not the given record exists. The record path is determined from the given path and dom_id in the arg_dict. """ @@ -93,7 +93,7 @@ def read_record(self, arg_dict): except XenstoreError, e: # noqa if not arg_dict.get("ignore_missing_path", False): raise - if not _record_exists(arg_dict): + if not record_exists(arg_dict): return "None" # Just try again in case the agent write won the race against # the record_exists check. If this fails again, it will likely raise @@ -129,7 +129,7 @@ def list_records(self, arg_dict): try: recs = _run_command(cmd) except XenstoreError, e: # noqa - if not _record_exists(arg_dict): + if not record_exists(arg_dict): return {} # Just try again in case the path was created in between # the "ls" and the existence check. If this fails again, it will