xenapi: deal with reboots while talking to agent
The latest version of the agent has started changing the hostname through the official windows APIs, when resetnetwork is called. This means a reboot might happen after resetnetwork. A reboot happens when polling for the version. Until now we wait for the call timeout, before fetching the new dom id. This change ensures that if we spot a reboot, the plugin exits early rather than keeping polling the wrong dom id. Turns out its best to wait for the dom_id to change, before trying to poll the agent again. Once the dom_id in xenapi has been updated, the xenstore keys are always in place. Trying too early leads to lots of reboot detections because we are retrying with the old dom_id. XenServer continues to return the old dom_id for a little while after the reboot. Closes-Bug: #1370999 Change-Id: Id0bf5b64f2b271d162db5bbce50167ab1f665c87
This commit is contained in:
parent
51de439a4d
commit
d65ca49990
@ -14,6 +14,7 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
import mock
|
import mock
|
||||||
@ -41,8 +42,8 @@ def _get_fake_instance(**kwargs):
|
|||||||
|
|
||||||
|
|
||||||
class AgentTestCaseBase(test.NoDBTestCase):
|
class AgentTestCaseBase(test.NoDBTestCase):
|
||||||
def _create_agent(self, instance):
|
def _create_agent(self, instance, session="session"):
|
||||||
self.session = "session"
|
self.session = session
|
||||||
self.virtapi = "virtapi"
|
self.virtapi = "virtapi"
|
||||||
self.vm_ref = "vm_ref"
|
self.vm_ref = "vm_ref"
|
||||||
return agent.XenAPIBasedAgent(self.session, self.virtapi,
|
return agent.XenAPIBasedAgent(self.session, self.virtapi,
|
||||||
@ -202,6 +203,84 @@ class FileInjectionTestCase(AgentTestCaseBase):
|
|||||||
agent.inject_files(files)
|
agent.inject_files(files)
|
||||||
|
|
||||||
|
|
||||||
|
class FakeRebootException(Exception):
|
||||||
|
details = ["", "", "", "asdf REBOOT: asdf"]
|
||||||
|
|
||||||
|
|
||||||
|
class RebootRetryTestCase(AgentTestCaseBase):
|
||||||
|
@mock.patch.object(agent, '_wait_for_new_dom_id')
|
||||||
|
def test_retry_on_reboot(self, mock_wait):
|
||||||
|
mock_session = mock.Mock()
|
||||||
|
|
||||||
|
def fake_call_plugin(*args, **kwargs):
|
||||||
|
if fake_call_plugin.called:
|
||||||
|
return {"returncode": '0', "message": "done"}
|
||||||
|
else:
|
||||||
|
fake_call_plugin.called = True
|
||||||
|
raise FakeRebootException()
|
||||||
|
|
||||||
|
fake_call_plugin.called = False
|
||||||
|
mock_session.XenAPI.Failure = FakeRebootException
|
||||||
|
mock_session.VM.get_domid.return_value = "fake_dom_id"
|
||||||
|
mock_session.call_plugin.side_effect = fake_call_plugin
|
||||||
|
|
||||||
|
agent = self._create_agent(None, mock_session)
|
||||||
|
|
||||||
|
result = agent._call_agent("asdf")
|
||||||
|
self.assertEqual("done", result)
|
||||||
|
self.assertTrue(mock_session.VM.get_domid.called)
|
||||||
|
self.assertEqual(2, mock_session.call_plugin.call_count)
|
||||||
|
mock_wait.called_once_with(mock_session, self.vm_ref,
|
||||||
|
"fake_dom_id", "asdf")
|
||||||
|
|
||||||
|
@mock.patch.object(time, 'sleep')
|
||||||
|
@mock.patch.object(time, 'time')
|
||||||
|
def test_wait_for_new_dom_id_found(self, mock_time, mock_sleep):
|
||||||
|
mock_session = mock.Mock()
|
||||||
|
mock_session.VM.get_domid.return_value = "new"
|
||||||
|
|
||||||
|
agent._wait_for_new_dom_id(mock_session, "vm_ref", "old", "method")
|
||||||
|
|
||||||
|
mock_session.VM.get_domid.assert_called_once_with("vm_ref")
|
||||||
|
self.assertFalse(mock_sleep.called)
|
||||||
|
|
||||||
|
@mock.patch.object(time, 'sleep')
|
||||||
|
@mock.patch.object(time, 'time')
|
||||||
|
def test_wait_for_new_dom_id_after_retry(self, mock_time, mock_sleep):
|
||||||
|
self.flags(agent_timeout=3, group="xenserver")
|
||||||
|
mock_time.return_value = 0
|
||||||
|
mock_session = mock.Mock()
|
||||||
|
old = 40
|
||||||
|
new = 42
|
||||||
|
mock_session.VM.get_domid.side_effect = [old, -1, new]
|
||||||
|
|
||||||
|
agent._wait_for_new_dom_id(mock_session, "vm_ref", old, "method")
|
||||||
|
|
||||||
|
mock_session.VM.get_domid.assert_called_with("vm_ref")
|
||||||
|
self.assertEqual(3, mock_session.VM.get_domid.call_count)
|
||||||
|
self.assertEqual(2, mock_sleep.call_count)
|
||||||
|
|
||||||
|
@mock.patch.object(time, 'sleep')
|
||||||
|
@mock.patch.object(time, 'time')
|
||||||
|
def test_wait_for_new_dom_id_timeout(self, mock_time, mock_sleep):
|
||||||
|
self.flags(agent_timeout=3, group="xenserver")
|
||||||
|
|
||||||
|
def fake_time():
|
||||||
|
fake_time.time = fake_time.time + 1
|
||||||
|
return fake_time.time
|
||||||
|
|
||||||
|
fake_time.time = 0
|
||||||
|
mock_time.side_effect = fake_time
|
||||||
|
mock_session = mock.Mock()
|
||||||
|
mock_session.VM.get_domid.return_value = "old"
|
||||||
|
|
||||||
|
self.assertRaises(exception.AgentTimeout,
|
||||||
|
agent._wait_for_new_dom_id,
|
||||||
|
mock_session, "vm_ref", "old", "method")
|
||||||
|
|
||||||
|
self.assertEqual(4, mock_session.VM.get_domid.call_count)
|
||||||
|
|
||||||
|
|
||||||
class SetAdminPasswordTestCase(AgentTestCaseBase):
|
class SetAdminPasswordTestCase(AgentTestCaseBase):
|
||||||
@mock.patch.object(agent.XenAPIBasedAgent, '_call_agent')
|
@mock.patch.object(agent.XenAPIBasedAgent, '_call_agent')
|
||||||
@mock.patch("nova.virt.xenapi.agent.SimpleDH")
|
@mock.patch("nova.virt.xenapi.agent.SimpleDH")
|
||||||
|
@ -113,6 +113,13 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
|
|||||||
'args=%(args)r'),
|
'args=%(args)r'),
|
||||||
{'method': method, 'args': args}, instance=instance)
|
{'method': method, 'args': args}, instance=instance)
|
||||||
raise exception.AgentTimeout(method=method)
|
raise exception.AgentTimeout(method=method)
|
||||||
|
elif 'REBOOT:' in err_msg:
|
||||||
|
LOG.debug('REBOOT: The call to %(method)s detected a reboot. '
|
||||||
|
'args=%(args)r',
|
||||||
|
{'method': method, 'args': args}, instance=instance)
|
||||||
|
_wait_for_new_dom_id(session, vm_ref, dom_id, method)
|
||||||
|
return _call_agent(session, instance, vm_ref, method,
|
||||||
|
addl_args, timeout, success_codes)
|
||||||
elif 'NOT IMPLEMENTED:' in err_msg:
|
elif 'NOT IMPLEMENTED:' in err_msg:
|
||||||
LOG.error(_('NOT IMPLEMENTED: The call to %(method)s is not '
|
LOG.error(_('NOT IMPLEMENTED: The call to %(method)s is not '
|
||||||
'supported by the agent. args=%(args)r'),
|
'supported by the agent. args=%(args)r'),
|
||||||
@ -152,6 +159,22 @@ def _call_agent(session, instance, vm_ref, method, addl_args=None,
|
|||||||
return ret['message'].replace('\\r\\n', '')
|
return ret['message'].replace('\\r\\n', '')
|
||||||
|
|
||||||
|
|
||||||
|
def _wait_for_new_dom_id(session, vm_ref, old_dom_id, method):
|
||||||
|
expiration = time.time() + CONF.xenserver.agent_timeout
|
||||||
|
while True:
|
||||||
|
dom_id = session.VM.get_domid(vm_ref)
|
||||||
|
|
||||||
|
if dom_id and dom_id != -1 and dom_id != old_dom_id:
|
||||||
|
LOG.debug("Found new dom_id %s" % dom_id)
|
||||||
|
return
|
||||||
|
|
||||||
|
if time.time() > expiration:
|
||||||
|
LOG.debug("Timed out waiting for new dom_id %s" % dom_id)
|
||||||
|
raise exception.AgentTimeout(method=method)
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
def is_upgrade_required(current_version, available_version):
|
def is_upgrade_required(current_version, available_version):
|
||||||
# NOTE(johngarbutt): agent version numbers are four part,
|
# NOTE(johngarbutt): agent version numbers are four part,
|
||||||
# so we need to use the loose version to compare them
|
# so we need to use the loose version to compare them
|
||||||
|
@ -46,6 +46,10 @@ class TimeoutError(StandardError):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class RebootDetectedError(StandardError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def version(self, arg_dict):
|
def version(self, arg_dict):
|
||||||
"""Get version of agent."""
|
"""Get version of agent."""
|
||||||
timeout = int(arg_dict.pop('timeout', DEFAULT_TIMEOUT))
|
timeout = int(arg_dict.pop('timeout', DEFAULT_TIMEOUT))
|
||||||
@ -210,6 +214,7 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
|
|||||||
arg_dict["path"] = "data/guest/%s" % request_id
|
arg_dict["path"] = "data/guest/%s" % request_id
|
||||||
arg_dict["ignore_missing_path"] = True
|
arg_dict["ignore_missing_path"] = True
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
reboot_detected = False
|
||||||
while time.time() - start < timeout:
|
while time.time() - start < timeout:
|
||||||
ret = xenstore.read_record(self, arg_dict)
|
ret = xenstore.read_record(self, arg_dict)
|
||||||
# Note: the response for None with be a string that includes
|
# Note: the response for None with be a string that includes
|
||||||
@ -220,12 +225,30 @@ def _wait_for_agent(self, request_id, arg_dict, timeout):
|
|||||||
|
|
||||||
time.sleep(.5)
|
time.sleep(.5)
|
||||||
|
|
||||||
|
# NOTE(johngarbutt) If we can't find this domid, then
|
||||||
|
# the VM has rebooted, so we must trigger domid refresh.
|
||||||
|
# Check after the sleep to give xenstore time to update
|
||||||
|
# after the VM reboot.
|
||||||
|
exists_args = {
|
||||||
|
"dom_id": arg_dict["dom_id"],
|
||||||
|
"path": "name",
|
||||||
|
}
|
||||||
|
dom_id_is_present = xenstore.record_exists(exists_args)
|
||||||
|
if not dom_id_is_present:
|
||||||
|
reboot_detected = True
|
||||||
|
break
|
||||||
|
|
||||||
# No response within the timeout period; bail out
|
# No response within the timeout period; bail out
|
||||||
# First, delete the request record
|
# First, delete the request record
|
||||||
arg_dict["path"] = "data/host/%s" % request_id
|
arg_dict["path"] = "data/host/%s" % request_id
|
||||||
xenstore.delete_record(self, arg_dict)
|
xenstore.delete_record(self, arg_dict)
|
||||||
raise TimeoutError(_("TIMEOUT: No response from agent within"
|
|
||||||
" %s seconds.") % timeout)
|
if reboot_detected:
|
||||||
|
raise RebootDetectedError(_("REBOOT: dom_id %s no longer "
|
||||||
|
"present") % arg_dict["dom_id"])
|
||||||
|
else:
|
||||||
|
raise TimeoutError(_("TIMEOUT: No response from agent within"
|
||||||
|
" %s seconds.") % timeout)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -62,7 +62,7 @@ def jsonify(fnc):
|
|||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
def _record_exists(arg_dict):
|
def record_exists(arg_dict):
|
||||||
"""Returns whether or not the given record exists. The record path
|
"""Returns whether or not the given record exists. The record path
|
||||||
is determined from the given path and dom_id in the arg_dict.
|
is determined from the given path and dom_id in the arg_dict.
|
||||||
"""
|
"""
|
||||||
@ -93,7 +93,7 @@ def read_record(self, arg_dict):
|
|||||||
except XenstoreError, e: # noqa
|
except XenstoreError, e: # noqa
|
||||||
if not arg_dict.get("ignore_missing_path", False):
|
if not arg_dict.get("ignore_missing_path", False):
|
||||||
raise
|
raise
|
||||||
if not _record_exists(arg_dict):
|
if not record_exists(arg_dict):
|
||||||
return "None"
|
return "None"
|
||||||
# Just try again in case the agent write won the race against
|
# Just try again in case the agent write won the race against
|
||||||
# the record_exists check. If this fails again, it will likely raise
|
# the record_exists check. If this fails again, it will likely raise
|
||||||
@ -129,7 +129,7 @@ def list_records(self, arg_dict):
|
|||||||
try:
|
try:
|
||||||
recs = _run_command(cmd)
|
recs = _run_command(cmd)
|
||||||
except XenstoreError, e: # noqa
|
except XenstoreError, e: # noqa
|
||||||
if not _record_exists(arg_dict):
|
if not record_exists(arg_dict):
|
||||||
return {}
|
return {}
|
||||||
# Just try again in case the path was created in between
|
# Just try again in case the path was created in between
|
||||||
# the "ls" and the existence check. If this fails again, it will
|
# the "ls" and the existence check. If this fails again, it will
|
||||||
|
Loading…
Reference in New Issue
Block a user