xenapi: deal with reboots while talking to agent

The latest version of the agent has started changing the hostname
through the official windows APIs, when resetnetwork is called. This
means a reboot might happen after resetnetwork.

A reboot happens when polling for the version. Until now we wait for the
call timeout, before fetching the new dom id. This change ensures that
if we spot a reboot, the plugin exits early rather than keeping polling
the wrong dom id.

Turns out its best to wait for the dom_id to change, before trying
to poll the agent again. Once the dom_id in xenapi has been updated,
the xenstore keys are always in place.

Trying too early leads to lots of reboot detections because we are
retrying with the old dom_id. XenServer continues to return the old
dom_id for a little while after the reboot.

Closes-Bug: #1370999

Change-Id: Id0bf5b64f2b271d162db5bbce50167ab1f665c87
This commit is contained in:
John Garbutt 2014-06-30 14:09:24 +01:00
parent 51de439a4d
commit d65ca49990
4 changed files with 132 additions and 7 deletions

View File

@ -14,6 +14,7 @@
# under the License.
import base64
import time
import uuid
import mock
@ -41,8 +42,8 @@ def _get_fake_instance(**kwargs):
class AgentTestCaseBase(test.NoDBTestCase):
def _create_agent(self, instance):
self.session = "session"
def _create_agent(self, instance, session="session"):
self.session = session
self.virtapi = "virtapi"
self.vm_ref = "vm_ref"
return agent.XenAPIBasedAgent(self.session, self.virtapi,
@ -202,6 +203,84 @@ class FileInjectionTestCase(AgentTestCaseBase):
agent.inject_files(files)
class FakeRebootException(Exception):
details = ["", "", "", "asdf REBOOT: asdf"]
class RebootRetryTestCase(AgentTestCaseBase):
@mock.patch.object(agent, '_wait_for_new_dom_id')
def test_retry_on_reboot(self, mock_wait):
mock_session = mock.Mock()
def fake_call_plugin(*args, **kwargs):
if fake_call_plugin.called:
return {"returncode": '0', "message": "done"}
else:
fake_call_plugin.called = True
raise FakeRebootException()
fake_call_plugin.called = False
mock_session.XenAPI.Failure = FakeRebootException
mock_session.VM.get_domid.return_value = "fake_dom_id"
mock_session.call_plugin.side_effect = fake_call_plugin
agent = self._create_agent(None, mock_session)
result = agent._call_agent("asdf")
self.assertEqual("done", result)
self.assertTrue(mock_session.VM.get_domid.called)
self.assertEqual(2, mock_session.call_plugin.call_count)
mock_wait.called_once_with(mock_session, self.vm_ref,
"fake_dom_id", "asdf")
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_wait_for_new_dom_id_found(self, mock_time, mock_sleep):
mock_session = mock.Mock()
mock_session.VM.get_domid.return_value = "new"
agent._wait_for_new_dom_id(mock_session, "vm_ref", "old", "method")
mock_session.VM.get_domid.assert_called_once_with("vm_ref")
self.assertFalse(mock_sleep.called)
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_wait_for_new_dom_id_after_retry(self, mock_time, mock_sleep):
self.flags(agent_timeout=3, group="xenserver")
mock_time.return_value = 0
mock_session = mock.Mock()
old = 40
new = 42
mock_session.VM.get_domid.side_effect = [old, -1, new]
agent._wait_for_new_dom_id(mock_session, "vm_ref", old, "method")
mock_session.VM.get_domid.assert_called_with("vm_ref")
self.assertEqual(3, mock_session.VM.get_domid.call_count)
self.assertEqual(2, mock_sleep.call_count)
@mock.patch.object(time, 'sleep')
@mock.patch.object(time, 'time')
def test_wait_for_new_dom_id_timeout(self, mock_time, mock_sleep):
self.flags(agent_timeout=3, group="xenserver")
def fake_time():
fake_time.time = fake_time.time + 1
return fake_time.time
fake_time.time = 0
mock_time.side_effect = fake_time
mock_session = mock.Mock()
mock_session.VM.get_domid.return_value = "old"
self.assertRaises(exception.AgentTimeout,
agent._wait_for_new_dom_id,
mock_session, "vm_ref", "old", "method")
self.assertEqual(4, mock_session.VM.get_domid.call_count)
class SetAdminPasswordTestCase(AgentTestCaseBase):
@mock.patch.object(agent.XenAPIBasedAgent, '_call_agent')
@mock.patch("nova.virt.xenapi.agent.SimpleDH")

View File

@ -113,6 +113,13 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
'args=%(args)r'),
{'method': method, 'args': args}, instance=instance)
raise exception.AgentTimeout(method=method)
elif 'REBOOT:' in err_msg:
LOG.debug('REBOOT: The call to %(method)s detected a reboot. '
'args=%(args)r',
{'method': method, 'args': args}, instance=instance)
_wait_for_new_dom_id(session, vm_ref, dom_id, method)
return _call_agent(session, instance, vm_ref, method,
addl_args, timeout, success_codes)
elif 'NOT IMPLEMENTED:' in err_msg:
LOG.error(_('NOT IMPLEMENTED: The call to %(method)s is not '
'supported by the agent. args=%(args)r'),
@ -152,6 +159,22 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
return ret['message'].replace('\\r\\n', '')
def _wait_for_new_dom_id(session, vm_ref, old_dom_id, method):
expiration = time.time() + CONF.xenserver.agent_timeout
while True:
dom_id = session.VM.get_domid(vm_ref)
if dom_id and dom_id != -1 and dom_id != old_dom_id:
LOG.debug("Found new dom_id %s" % dom_id)
return
if time.time() > expiration:
LOG.debug("Timed out waiting for new dom_id %s" % dom_id)
raise exception.AgentTimeout(method=method)
time.sleep(1)
def is_upgrade_required(current_version, available_version):
# NOTE(johngarbutt): agent version numbers are four part,
# so we need to use the loose version to compare them

View File

@ -46,6 +46,10 @@ class TimeoutError(StandardError):
pass
class RebootDetectedError(StandardError):
pass
def version(self, arg_dict):
"""Get version of agent."""
timeout = int(arg_dict.pop('timeout', DEFAULT_TIMEOUT))
@ -210,6 +214,7 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
arg_dict["path"] = "data/guest/%s" % request_id
arg_dict["ignore_missing_path"] = True
start = time.time()
reboot_detected = False
while time.time() - start < timeout:
ret = xenstore.read_record(self, arg_dict)
# Note: the response for None with be a string that includes
@ -220,12 +225,30 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
time.sleep(.5)
# NOTE(johngarbutt) If we can't find this domid, then
# the VM has rebooted, so we must trigger domid refresh.
# Check after the sleep to give xenstore time to update
# after the VM reboot.
exists_args = {
"dom_id": arg_dict["dom_id"],
"path": "name",
}
dom_id_is_present = xenstore.record_exists(exists_args)
if not dom_id_is_present:
reboot_detected = True
break
# No response within the timeout period; bail out
# First, delete the request record
arg_dict["path"] = "data/host/%s" % request_id
xenstore.delete_record(self, arg_dict)
raise TimeoutError(_("TIMEOUT: No response from agent within"
" %s seconds.") % timeout)
if reboot_detected:
raise RebootDetectedError(_("REBOOT: dom_id %s no longer "
"present") % arg_dict["dom_id"])
else:
raise TimeoutError(_("TIMEOUT: No response from agent within"
" %s seconds.") % timeout)
if __name__ == "__main__":

View File

@ -62,7 +62,7 @@ def jsonify(fnc):
return wrapper
def _record_exists(arg_dict):
def record_exists(arg_dict):
"""Returns whether or not the given record exists. The record path
is determined from the given path and dom_id in the arg_dict.
"""
@ -93,7 +93,7 @@ def read_record(self, arg_dict):
except XenstoreError, e: # noqa
if not arg_dict.get("ignore_missing_path", False):
raise
if not _record_exists(arg_dict):
if not record_exists(arg_dict):
return "None"
# Just try again in case the agent write won the race against
# the record_exists check. If this fails again, it will likely raise
@ -129,7 +129,7 @@ def list_records(self, arg_dict):
try:
recs = _run_command(cmd)
except XenstoreError, e: # noqa
if not _record_exists(arg_dict):
if not record_exists(arg_dict):
return {}
# Just try again in case the path was created in between
# the "ls" and the existence check. If this fails again, it will