xenapi: deal with reboots while talking to agent
The latest version of the agent has started changing the hostname through the official windows APIs, when resetnetwork is called. This means a reboot might happen after resetnetwork. A reboot happens when polling for the version. Until now we wait for the call timeout, before fetching the new dom id. This change ensures that if we spot a reboot, the plugin exits early rather than keeping polling the wrong dom id. Turns out its best to wait for the dom_id to change, before trying to poll the agent again. Once the dom_id in xenapi has been updated, the xenstore keys are always in place. Trying too early leads to lots of reboot detections because we are retrying with the old dom_id. XenServer continues to return the old dom_id for a little while after the reboot. Closes-Bug: #1370999 Change-Id: Id0bf5b64f2b271d162db5bbce50167ab1f665c87
This commit is contained in:
parent
51de439a4d
commit
d65ca49990
@ -14,6 +14,7 @@
|
||||
# under the License.
|
||||
|
||||
import base64
|
||||
import time
|
||||
import uuid
|
||||
|
||||
import mock
|
||||
@ -41,8 +42,8 @@ def _get_fake_instance(**kwargs):
|
||||
|
||||
|
||||
class AgentTestCaseBase(test.NoDBTestCase):
|
||||
def _create_agent(self, instance):
|
||||
self.session = "session"
|
||||
def _create_agent(self, instance, session="session"):
|
||||
self.session = session
|
||||
self.virtapi = "virtapi"
|
||||
self.vm_ref = "vm_ref"
|
||||
return agent.XenAPIBasedAgent(self.session, self.virtapi,
|
||||
@ -202,6 +203,84 @@ class FileInjectionTestCase(AgentTestCaseBase):
|
||||
agent.inject_files(files)
|
||||
|
||||
|
||||
class FakeRebootException(Exception):
|
||||
details = ["", "", "", "asdf REBOOT: asdf"]
|
||||
|
||||
|
||||
class RebootRetryTestCase(AgentTestCaseBase):
|
||||
@mock.patch.object(agent, '_wait_for_new_dom_id')
|
||||
def test_retry_on_reboot(self, mock_wait):
|
||||
mock_session = mock.Mock()
|
||||
|
||||
def fake_call_plugin(*args, **kwargs):
|
||||
if fake_call_plugin.called:
|
||||
return {"returncode": '0', "message": "done"}
|
||||
else:
|
||||
fake_call_plugin.called = True
|
||||
raise FakeRebootException()
|
||||
|
||||
fake_call_plugin.called = False
|
||||
mock_session.XenAPI.Failure = FakeRebootException
|
||||
mock_session.VM.get_domid.return_value = "fake_dom_id"
|
||||
mock_session.call_plugin.side_effect = fake_call_plugin
|
||||
|
||||
agent = self._create_agent(None, mock_session)
|
||||
|
||||
result = agent._call_agent("asdf")
|
||||
self.assertEqual("done", result)
|
||||
self.assertTrue(mock_session.VM.get_domid.called)
|
||||
self.assertEqual(2, mock_session.call_plugin.call_count)
|
||||
mock_wait.called_once_with(mock_session, self.vm_ref,
|
||||
"fake_dom_id", "asdf")
|
||||
|
||||
@mock.patch.object(time, 'sleep')
|
||||
@mock.patch.object(time, 'time')
|
||||
def test_wait_for_new_dom_id_found(self, mock_time, mock_sleep):
|
||||
mock_session = mock.Mock()
|
||||
mock_session.VM.get_domid.return_value = "new"
|
||||
|
||||
agent._wait_for_new_dom_id(mock_session, "vm_ref", "old", "method")
|
||||
|
||||
mock_session.VM.get_domid.assert_called_once_with("vm_ref")
|
||||
self.assertFalse(mock_sleep.called)
|
||||
|
||||
@mock.patch.object(time, 'sleep')
|
||||
@mock.patch.object(time, 'time')
|
||||
def test_wait_for_new_dom_id_after_retry(self, mock_time, mock_sleep):
|
||||
self.flags(agent_timeout=3, group="xenserver")
|
||||
mock_time.return_value = 0
|
||||
mock_session = mock.Mock()
|
||||
old = 40
|
||||
new = 42
|
||||
mock_session.VM.get_domid.side_effect = [old, -1, new]
|
||||
|
||||
agent._wait_for_new_dom_id(mock_session, "vm_ref", old, "method")
|
||||
|
||||
mock_session.VM.get_domid.assert_called_with("vm_ref")
|
||||
self.assertEqual(3, mock_session.VM.get_domid.call_count)
|
||||
self.assertEqual(2, mock_sleep.call_count)
|
||||
|
||||
@mock.patch.object(time, 'sleep')
|
||||
@mock.patch.object(time, 'time')
|
||||
def test_wait_for_new_dom_id_timeout(self, mock_time, mock_sleep):
|
||||
self.flags(agent_timeout=3, group="xenserver")
|
||||
|
||||
def fake_time():
|
||||
fake_time.time = fake_time.time + 1
|
||||
return fake_time.time
|
||||
|
||||
fake_time.time = 0
|
||||
mock_time.side_effect = fake_time
|
||||
mock_session = mock.Mock()
|
||||
mock_session.VM.get_domid.return_value = "old"
|
||||
|
||||
self.assertRaises(exception.AgentTimeout,
|
||||
agent._wait_for_new_dom_id,
|
||||
mock_session, "vm_ref", "old", "method")
|
||||
|
||||
self.assertEqual(4, mock_session.VM.get_domid.call_count)
|
||||
|
||||
|
||||
class SetAdminPasswordTestCase(AgentTestCaseBase):
|
||||
@mock.patch.object(agent.XenAPIBasedAgent, '_call_agent')
|
||||
@mock.patch("nova.virt.xenapi.agent.SimpleDH")
|
||||
|
@ -113,6 +113,13 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
|
||||
'args=%(args)r'),
|
||||
{'method': method, 'args': args}, instance=instance)
|
||||
raise exception.AgentTimeout(method=method)
|
||||
elif 'REBOOT:' in err_msg:
|
||||
LOG.debug('REBOOT: The call to %(method)s detected a reboot. '
|
||||
'args=%(args)r',
|
||||
{'method': method, 'args': args}, instance=instance)
|
||||
_wait_for_new_dom_id(session, vm_ref, dom_id, method)
|
||||
return _call_agent(session, instance, vm_ref, method,
|
||||
addl_args, timeout, success_codes)
|
||||
elif 'NOT IMPLEMENTED:' in err_msg:
|
||||
LOG.error(_('NOT IMPLEMENTED: The call to %(method)s is not '
|
||||
'supported by the agent. args=%(args)r'),
|
||||
@ -152,6 +159,22 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
|
||||
return ret['message'].replace('\\r\\n', '')
|
||||
|
||||
|
||||
def _wait_for_new_dom_id(session, vm_ref, old_dom_id, method):
|
||||
expiration = time.time() + CONF.xenserver.agent_timeout
|
||||
while True:
|
||||
dom_id = session.VM.get_domid(vm_ref)
|
||||
|
||||
if dom_id and dom_id != -1 and dom_id != old_dom_id:
|
||||
LOG.debug("Found new dom_id %s" % dom_id)
|
||||
return
|
||||
|
||||
if time.time() > expiration:
|
||||
LOG.debug("Timed out waiting for new dom_id %s" % dom_id)
|
||||
raise exception.AgentTimeout(method=method)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
def is_upgrade_required(current_version, available_version):
|
||||
# NOTE(johngarbutt): agent version numbers are four part,
|
||||
# so we need to use the loose version to compare them
|
||||
|
@ -46,6 +46,10 @@ class TimeoutError(StandardError):
|
||||
pass
|
||||
|
||||
|
||||
class RebootDetectedError(StandardError):
|
||||
pass
|
||||
|
||||
|
||||
def version(self, arg_dict):
|
||||
"""Get version of agent."""
|
||||
timeout = int(arg_dict.pop('timeout', DEFAULT_TIMEOUT))
|
||||
@ -210,6 +214,7 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
|
||||
arg_dict["path"] = "data/guest/%s" % request_id
|
||||
arg_dict["ignore_missing_path"] = True
|
||||
start = time.time()
|
||||
reboot_detected = False
|
||||
while time.time() - start < timeout:
|
||||
ret = xenstore.read_record(self, arg_dict)
|
||||
# Note: the response for None with be a string that includes
|
||||
@ -220,12 +225,30 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
|
||||
|
||||
time.sleep(.5)
|
||||
|
||||
# NOTE(johngarbutt) If we can't find this domid, then
|
||||
# the VM has rebooted, so we must trigger domid refresh.
|
||||
# Check after the sleep to give xenstore time to update
|
||||
# after the VM reboot.
|
||||
exists_args = {
|
||||
"dom_id": arg_dict["dom_id"],
|
||||
"path": "name",
|
||||
}
|
||||
dom_id_is_present = xenstore.record_exists(exists_args)
|
||||
if not dom_id_is_present:
|
||||
reboot_detected = True
|
||||
break
|
||||
|
||||
# No response within the timeout period; bail out
|
||||
# First, delete the request record
|
||||
arg_dict["path"] = "data/host/%s" % request_id
|
||||
xenstore.delete_record(self, arg_dict)
|
||||
raise TimeoutError(_("TIMEOUT: No response from agent within"
|
||||
" %s seconds.") % timeout)
|
||||
|
||||
if reboot_detected:
|
||||
raise RebootDetectedError(_("REBOOT: dom_id %s no longer "
|
||||
"present") % arg_dict["dom_id"])
|
||||
else:
|
||||
raise TimeoutError(_("TIMEOUT: No response from agent within"
|
||||
" %s seconds.") % timeout)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -62,7 +62,7 @@ def jsonify(fnc):
|
||||
return wrapper
|
||||
|
||||
|
||||
def _record_exists(arg_dict):
|
||||
def record_exists(arg_dict):
|
||||
"""Returns whether or not the given record exists. The record path
|
||||
is determined from the given path and dom_id in the arg_dict.
|
||||
"""
|
||||
@ -93,7 +93,7 @@ def read_record(self, arg_dict):
|
||||
except XenstoreError, e: # noqa
|
||||
if not arg_dict.get("ignore_missing_path", False):
|
||||
raise
|
||||
if not _record_exists(arg_dict):
|
||||
if not record_exists(arg_dict):
|
||||
return "None"
|
||||
# Just try again in case the agent write won the race against
|
||||
# the record_exists check. If this fails again, it will likely raise
|
||||
@ -129,7 +129,7 @@ def list_records(self, arg_dict):
|
||||
try:
|
||||
recs = _run_command(cmd)
|
||||
except XenstoreError, e: # noqa
|
||||
if not _record_exists(arg_dict):
|
||||
if not record_exists(arg_dict):
|
||||
return {}
|
||||
# Just try again in case the path was created in between
|
||||
# the "ls" and the existence check. If this fails again, it will
|
||||
|
Loading…
Reference in New Issue
Block a user