OpenStack Compute (Nova)
# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Handles all processes relating to instances (guest vms).

The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
handles RPC calls relating to creating instances. It is responsible for
building a disk image, launching it via the underlying virtualization driver,
responding to calls to check its state, attaching persistent storage, and
terminating it.

"""
import base64
import binascii
import contextlib
import copy
import functools
import inspect
import sys
import time
import traceback

from cinderclient import exceptions as cinder_exception
from cursive import exception as cursive_exception
import eventlet.event
from eventlet import greenthread
import eventlet.semaphore
import eventlet.timeout
from keystoneauth1 import exceptions as keystone_exception
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
from oslo_utils import uuidutils
import six
from six.moves import range

from nova import block_device
from nova.cells import rpcapi as cells_rpcapi
from nova import compute
from nova.compute import build_results
from nova.compute import claims
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute.utils import wrap_instance_event
from nova.compute import vm_states
from nova import conductor
import nova.conf
from nova.console import rpcapi as console_rpcapi
import nova.context
from nova import exception
from nova import exception_wrapper
from nova import hooks
from nova.i18n import _
from nova import image
from nova import manager
from nova import network
from nova.network import base_api as base_net_api
from nova.network import model as network_model
from nova.network.security_group import openstack_driver
from nova import objects
from nova.objects import base as obj_base
from nova.objects import fields
from nova.objects import instance as obj_instance
from nova.objects import migrate_data as migrate_data_obj
from nova.pci import whitelist
from nova import rpc
from nova import safe_utils
from nova.scheduler import client as scheduler_client
from nova.scheduler import utils as scheduler_utils
from nova import utils
from nova.virt import block_device as driver_block_device
from nova.virt import configdrive
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder


CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
                                   get_notifier=get_notifier,
                                   binary='nova-compute')


@contextlib.contextmanager
def errors_out_migration_ctxt(migration):
    """Context manager to error out migration on failure."""
    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            if migration:
                # We may have been passed None for our migration if we're
                # receiving from an older client. The migration will be
                # errored via the legacy path.
                migration.status = 'error'
                try:
                    with migration.obj_as_admin():
                        migration.save()
                except Exception:
                    LOG.debug(
                        'Error setting migration status for instance %s.',
                        migration.instance_uuid, exc_info=True)


@utils.expects_func_args('migration')
def errors_out_migration(function):
    """Decorator to error out migration on failure."""
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        wrapped_func = safe_utils.get_wrapped_function(function)
        keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                         *args, **kwargs)
        migration = keyed_args['migration']
        with errors_out_migration_ctxt(migration):
            return function(self, context, *args, **kwargs)

    return decorated_function
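
# Illustrative sketch (not part of the upstream module): the decorator above
# is meant for manager methods whose signature includes a 'migration'
# argument, so any failure flips that migration record to 'error', e.g.:
#
#     @errors_out_migration
#     def _example_finish_resize(self, context, instance, migration):
#         ...  # if this raises, migration.status is set to 'error'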


@utils.expects_func_args('instance')
def reverts_task_state(function):
    """Decorator to revert task_state on failure."""
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.UnexpectedTaskStateError as e:
            # Note(maoy): unexpected task state means the current
            # task is preempted. Do not clear task state in this
            # case.
            with excutils.save_and_reraise_exception():
                LOG.info("Task possibly preempted: %s",
                         e.format_message())
        except Exception:
            with excutils.save_and_reraise_exception():
                wrapped_func = safe_utils.get_wrapped_function(function)
                keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                                 *args, **kwargs)
                # NOTE(mriedem): 'instance' must be in keyed_args because we
                # have utils.expects_func_args('instance') decorating this
                # method.
                instance = keyed_args['instance']
                original_task_state = instance.task_state
                try:
                    self._instance_update(context, instance, task_state=None)
                    LOG.info("Successfully reverted task state from %s on "
                             "failure for instance.",
                             original_task_state, instance=instance)
                except exception.InstanceNotFound:
                    # We might delete an instance that failed to build shortly
                    # after it errored out; this is an expected case and we
                    # should not trace on it.
                    pass
                except Exception as e:
                    LOG.warning("Failed to revert task state for instance. "
                                "Error: %s", e, instance=instance)

    return decorated_function


@utils.expects_func_args('instance')
def wrap_instance_fault(function):
    """Wraps a method to catch exceptions related to instances.

    This decorator wraps a method to catch any exceptions having to do with
    an instance that may get thrown. It then logs an instance fault in the db.
    """
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.InstanceNotFound:
            raise
        except Exception as e:
            # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
            # we will get a KeyError exception which will cover up the real
            # exception. So, we update kwargs with the values from args first.
            # Then, we can get 'instance' from kwargs easily.
            kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
            with excutils.save_and_reraise_exception():
                compute_utils.add_instance_fault_from_exc(context,
                        kwargs['instance'], e, sys.exc_info())

    return decorated_function


@utils.expects_func_args('image_id', 'instance')
def delete_image_on_error(function):
    """Used for snapshot related method to ensure the image created in
    compute.api is deleted when an error occurs.
    """
    @functools.wraps(function)
    def decorated_function(self, context, image_id, instance,
                           *args, **kwargs):
        try:
            return function(self, context, image_id, instance,
                            *args, **kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                LOG.debug("Cleaning up image %s", image_id,
                          exc_info=True, instance=instance)
                try:
                    self.image_api.delete(context, image_id)
                except exception.ImageNotFound:
                    # Since we're trying to clean up an image, we don't care
                    # if it's already gone.
                    pass
                except Exception:
                    LOG.exception("Error while trying to clean up image %s",
                                  image_id, instance=instance)

    return decorated_function
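
# Illustrative stacking (hypothetical method name, not part of the upstream
# module): the decorators defined above are typically layered on snapshot-style
# RPC entry points so the image is cleaned up and the task state reverted on
# failure, e.g.:
#
#     @wrap_exception()
#     @reverts_task_state
#     @wrap_instance_fault
#     @delete_image_on_error
#     def example_snapshot(self, context, image_id, instance):
#         ...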


# TODO(danms): Remove me after Icehouse
# TODO(alaski): Actually remove this after Newton, assuming a major RPC bump
# NOTE(mikal): if the method being decorated has more than one decorator, then
# put this one first. Otherwise the various exception handling decorators do
# not function correctly.
def object_compat(function):
    """Wraps a method that expects a new-world instance

    This provides compatibility for callers passing old-style dict
    instances.
    """
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        def _load_instance(instance_or_dict):
            if isinstance(instance_or_dict, dict):
                # try to get metadata and system_metadata for most cases but
                # only attempt to load those if the db instance already has
                # those fields joined
                metas = [meta for meta in ('metadata', 'system_metadata')
                         if meta in instance_or_dict]
                instance = objects.Instance._from_db_object(
                    context, objects.Instance(), instance_or_dict,
                    expected_attrs=metas)
                instance._context = context
                return instance
            return instance_or_dict

        try:
            kwargs['instance'] = _load_instance(kwargs['instance'])
        except KeyError:
            args = (_load_instance(args[0]),) + args[1:]

        migration = kwargs.get('migration')
        if isinstance(migration, dict):
            migration = objects.Migration._from_db_object(
                context.elevated(), objects.Migration(),
                migration)
            kwargs['migration'] = migration

        return function(self, context, *args, **kwargs)

    return decorated_function


class InstanceEvents(object):
    def __init__(self):
        self._events = {}

    @staticmethod
    def _lock_name(instance):
        return '%s-%s' % (instance.uuid, 'events')

    def prepare_for_instance_event(self, instance, name, tag):
        """Prepare to receive an event for an instance.

        This will register an event for the given instance that we will
        wait on later. This should be called before initiating whatever
        action will trigger the event. The resulting eventlet.event.Event
        object should be wait()'d on to ensure completion.

        :param instance: the instance for which the event will be generated
        :param name: the name of the event we're expecting
        :param tag: the tag associated with the event we're expecting
        :returns: an event object that should be wait()'d on
        """
        if self._events is None:
            # NOTE(danms): We really should have a more specific error
            # here, but this is what we use for our default error case
            raise exception.NovaException('In shutdown, no new events '
                                          'can be scheduled')

        @utils.synchronized(self._lock_name(instance))
        def _create_or_get_event():
            instance_events = self._events.setdefault(instance.uuid, {})
            return instance_events.setdefault((name, tag),
                                              eventlet.event.Event())
        LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
                  {'name': name, 'tag': tag}, instance=instance)
        return _create_or_get_event()

    def pop_instance_event(self, instance, event):
        """Remove a pending event from the wait list.

        This will remove a pending event from the wait list so that it
        can be used to signal the waiters to wake up.

        :param instance: the instance for which the event was generated
        :param event: the nova.objects.external_event.InstanceExternalEvent
                      that describes the event
        :returns: the eventlet.event.Event object on which the waiters
                  are blocked
        """
        no_events_sentinel = object()
        no_matching_event_sentinel = object()

        @utils.synchronized(self._lock_name(instance))
        def _pop_event():
            if self._events is None:
                LOG.debug('Unexpected attempt to pop events during shutdown',
                          instance=instance)
                return no_events_sentinel
            events = self._events.get(instance.uuid)
            if not events:
                return no_events_sentinel
            _event = events.pop((event.name, event.tag), None)
            if not events:
                del self._events[instance.uuid]
            if _event is None:
                return no_matching_event_sentinel
            return _event

        result = _pop_event()
        if result is no_events_sentinel:
            LOG.debug('No waiting events found dispatching %(event)s',
                      {'event': event.key},
                      instance=instance)
            return None
        elif result is no_matching_event_sentinel:
            LOG.debug('No event matching %(event)s in %(events)s',
                      {'event': event.key,
                       'events': self._events.get(instance.uuid, {}).keys()},
                      instance=instance)
            return None
        else:
            return result

    def clear_events_for_instance(self, instance):
        """Remove all pending events for an instance.

        This will remove all events currently pending for an instance
        and return them (indexed by event name).

        :param instance: the instance for which events should be purged
        :returns: a dictionary of {event_name: eventlet.event.Event}
        """
        @utils.synchronized(self._lock_name(instance))
        def _clear_events():
            if self._events is None:
                LOG.debug('Unexpected attempt to clear events during '
                          'shutdown', instance=instance)
                return dict()
            # NOTE(danms): We have historically returned the raw internal
            # format here, which is {event.key: [events, ...])} so just
            # trivially convert it here.
            return {'%s-%s' % k: e
                    for k, e in self._events.pop(instance.uuid, {}).items()}
        return _clear_events()

    def cancel_all_events(self):
        if self._events is None:
            LOG.debug('Unexpected attempt to cancel events during shutdown.')
            return
        our_events = self._events
        # NOTE(danms): Block new events
        self._events = None

        for instance_uuid, events in our_events.items():
            for (name, tag), eventlet_event in events.items():
                LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
                          'instance %(instance_uuid)s',
                          {'name': name,
                           'tag': tag,
                           'instance_uuid': instance_uuid})
                event = objects.InstanceExternalEvent(
                    instance_uuid=instance_uuid,
                    name=name, status='failed',
                    tag=tag, data={})
                eventlet_event.send(event)
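
# Illustrative flow (not part of the upstream module): one greenthread
# registers for an event before kicking off the external action, and the
# handler for external events later pops and fires it, e.g.:
#
#     waiter = instance_events.prepare_for_instance_event(
#         instance, 'network-vif-plugged', tag=port_id)
#     ...  # start the action that will trigger the external event
#     ev = instance_events.pop_instance_event(instance, external_event)
#     if ev:
#         ev.send(external_event)   # wakes up waiter.wait()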


class ComputeVirtAPI(virtapi.VirtAPI):
    def __init__(self, compute):
        super(ComputeVirtAPI, self).__init__()
        self._compute = compute

    def _default_error_callback(self, event_name, instance):
        raise exception.NovaException(_('Instance event failed'))

    @contextlib.contextmanager
    def wait_for_instance_event(self, instance, event_names, deadline=300,
                                error_callback=None):
        """Plan to wait for some events, run some code, then wait.

        This context manager will first create plans to wait for the
        provided event_names, yield, and then wait for all the scheduled
        events to complete.

        Note that this uses an eventlet.timeout.Timeout to bound the
        operation, so callers should be prepared to catch that
        failure and handle that situation appropriately.

        If the event is not received by the specified timeout deadline,
        eventlet.timeout.Timeout is raised.

        If the event is received but did not have a 'completed'
        status, a NovaException is raised. If an error_callback is
        provided, instead of raising an exception as detailed above
        for the failure case, the callback will be called with the
        event_name and instance, and can return True to continue
        waiting for the rest of the events, False to stop processing,
        or raise an exception which will bubble up to the waiter.

        :param instance: The instance for which an event is expected
        :param event_names: A list of event names. Each element is a
                            tuple of strings to indicate (name, tag),
                            where name is required, but tag may be None.
        :param deadline: Maximum number of seconds we should wait for all
                         of the specified events to arrive.
        :param error_callback: A function to be called if an event arrives
        """
        if error_callback is None:
            error_callback = self._default_error_callback
        events = {}
        for event_name in event_names:
            name, tag = event_name
            event_name = objects.InstanceExternalEvent.make_key(name, tag)
            try:
                events[event_name] = (
                    self._compute.instance_events.prepare_for_instance_event(
                        instance, name, tag))
            except exception.NovaException:
                error_callback(event_name, instance)
                # NOTE(danms): Don't wait for any of the events. They
                # should all be canceled and fired immediately below,
                # but don't stick around if not.
                deadline = 0
        yield
        with eventlet.timeout.Timeout(deadline):
            for event_name, event in events.items():
                actual_event = event.wait()
                if actual_event.status == 'completed':
                    continue
                decision = error_callback(event_name, instance)
                if decision is False:
                    break
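
# Illustrative usage (not part of the upstream module): a virt driver would
# typically wrap the operation that triggers the events, for example waiting
# on Neutron's network-vif-plugged notifications while plugging VIFs:
#
#     events = [('network-vif-plugged', vif['id']) for vif in network_info]
#     with self.virtapi.wait_for_instance_event(instance, events,
#                                               deadline=300):
#         ...  # plug VIFs / start the guest; block until all events arrive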


class ComputeManager(manager.Manager):
    """Manages the running instances from creation to destruction."""

    target = messaging.Target(version='5.0')

    # How long to wait in seconds before re-issuing a shutdown
    # signal to an instance during power off. The overall
    # time to wait is set by CONF.shutdown_timeout.
    SHUTDOWN_RETRY_INTERVAL = 10

    def __init__(self, compute_driver=None, *args, **kwargs):
        """Load configuration options and connect to the hypervisor."""
        self.virtapi = ComputeVirtAPI(self)
        self.network_api = network.API()
        self.volume_api = cinder.API()
        self.image_api = image.API()
        self._last_host_check = 0
        self._last_bw_usage_poll = 0
        self._bw_usage_supported = True
        self._last_bw_usage_cell_update = 0
        self.compute_api = compute.API()
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()
        self.conductor_api = conductor.API()
        self.compute_task_api = conductor.ComputeTaskAPI()
        self.is_neutron_security_groups = (
            openstack_driver.is_neutron_security_groups())
        self.cells_rpcapi = cells_rpcapi.CellsAPI()
        self.scheduler_client = scheduler_client.SchedulerClient()
        self.reportclient = self.scheduler_client.reportclient
        self._resource_tracker = None
        self.instance_events = InstanceEvents()
        self._sync_power_pool = eventlet.GreenPool(
            size=CONF.sync_power_state_pool_size)
        self._syncs_in_progress = {}
        self.send_instance_updates = (
            CONF.filter_scheduler.track_instance_changes)
        if CONF.max_concurrent_builds != 0:
            self._build_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_builds)
        else:
            self._build_semaphore = compute_utils.UnlimitedSemaphore()
        if max(CONF.max_concurrent_live_migrations, 0) != 0:
            self._live_migration_semaphore = eventlet.semaphore.Semaphore(
                CONF.max_concurrent_live_migrations)
        else:
            self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()

        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)
        self.additional_endpoints.append(_ComputeV4Proxy(self))

        # NOTE(russellb) Load the driver last. It may call back into the
        # compute manager via the virtapi, so we want it to be fully
        # initialized before that happens.
        self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
        self.use_legacy_block_device_info = \
            self.driver.need_legacy_block_device_info

    def reset(self):
        LOG.info('Reloading compute RPC API')
        compute_rpcapi.LAST_VERSION = None
        self.compute_rpcapi = compute_rpcapi.ComputeAPI()

    def _get_resource_tracker(self):
        if not self._resource_tracker:
            rt = resource_tracker.ResourceTracker(self.host, self.driver)
            self._resource_tracker = rt
        return self._resource_tracker

    def _update_resource_tracker(self, context, instance):
        """Let the resource tracker know that an instance has changed
        state.
        """
        if instance.host == self.host:
            rt = self._get_resource_tracker()
            rt.update_usage(context, instance, instance.node)

    def _instance_update(self, context, instance, **kwargs):
        """Update an instance in the database using kwargs as value."""
        for k, v in kwargs.items():
            setattr(instance, k, v)
        instance.save()
        self._update_resource_tracker(context, instance)

    def _nil_out_instance_obj_host_and_node(self, instance):
        # NOTE(jwcroppe): We don't do instance.save() here for performance
        # reasons; a call to this is expected to be immediately followed by
        # another call that does instance.save(), thus avoiding two writes
        # to the database layer.
        instance.host = None
        instance.node = None
        # If the instance is not on a host, it's not in an aggregate and
        # therefore is not in an availability zone.
        instance.availability_zone = None

    def _set_instance_obj_error_state(self, context, instance,
                                      clean_task_state=False):
        try:
            instance.vm_state = vm_states.ERROR
            if clean_task_state:
                instance.task_state = None
            instance.save()
        except exception.InstanceNotFound:
            LOG.debug('Instance has been destroyed from under us while '
                      'trying to set it to ERROR', instance=instance)

    def _get_instances_on_driver(self, context, filters=None):
        """Return a list of instance records for the instances found
        on the hypervisor which satisfy the specified filters. If filters=None
        return a list of instance records for all the instances found on the
        hypervisor.
        """
        if not filters:
            filters = {}
        try:
            driver_uuids = self.driver.list_instance_uuids()
            if len(driver_uuids) == 0:
                # Short circuit, don't waste a DB call
                return objects.InstanceList()
            filters['uuid'] = driver_uuids
            local_instances = objects.InstanceList.get_by_filters(
                context, filters, use_slave=True)
            return local_instances
        except NotImplementedError:
            pass

        # The driver doesn't support uuids listing, so we'll have
        # to brute force.
        driver_instances = self.driver.list_instances()
        # NOTE(mjozefcz): In this case we need to apply host filter.
        # Without this all instance data would be fetched from db.
        filters['host'] = self.host
        instances = objects.InstanceList.get_by_filters(context, filters,
                                                        use_slave=True)
        name_map = {instance.name: instance for instance in instances}
        local_instances = []
        for driver_instance in driver_instances:
            instance = name_map.get(driver_instance)
            if not instance:
                continue
            local_instances.append(instance)
        return local_instances

    def _destroy_evacuated_instances(self, context):
        """Destroys evacuated instances.

        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
        (accepted), in-progress (pre-migrating) or migrated (done). From those
        migration records, local instances reported by the hypervisor are
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.

        Then allocations are removed from Placement for every instance that is
        evacuated from this host regardless if the instance is reported by the
        hypervisor or not.

        :param context: The request context
        :return: A dict keyed by instance uuid mapped to Migration objects
            for instances that were migrated away from this host
        """
        filters = {
            'source_compute': self.host,
            # NOTE(mriedem): Migration records that have been accepted are
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
            # NOTE(lyarwood): pre-migrating is also included here as the
            # source compute can come back online shortly after the RT
            # claims on the destination that in-turn moves the migration to
            # pre-migrating. If the evacuate fails on the destination host,
            # the user can rebuild the instance (in ERROR state) on the source
            # host.
            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': 'evacuation',
        }
        with utils.temporary_mutation(context, read_deleted='yes'):
            evacuations = objects.MigrationList.get_by_filters(context,
                                                               filters)
        if not evacuations:
            return {}
        evacuations = {mig.instance_uuid: mig for mig in evacuations}

        # The instances might be deleted in which case we need to avoid
        # InstanceNotFound being raised from lazy-loading fields on the
        # instances while cleaning up this host.
        read_deleted_context = context.elevated(read_deleted='yes')
        # TODO(mriedem): We could optimize by pre-loading the joined fields
        # we know we'll use, like info_cache and flavor. We can also replace
        # this with a generic solution: https://review.openstack.org/575190/
        local_instances = self._get_instances_on_driver(read_deleted_context)
        evacuated_local_instances = {inst.uuid: inst
                                     for inst in local_instances
                                     if inst.uuid in evacuations}

        for instance in evacuated_local_instances.values():
            LOG.info('Destroying instance as it has been evacuated from '
                     'this host but still exists in the hypervisor',
                     instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
                bdi = self._get_instance_block_device_info(context,
                                                           instance)
                destroy_disks = not (self._is_instance_storage_shared(
                    context, instance))
            except exception.InstanceNotFound:
                network_info = network_model.NetworkInfo()
                bdi = {}
                LOG.info('Instance has been marked deleted already, '
                         'removing it from the hypervisor.',
                         instance=instance)
                # always destroy disks if the instance was deleted
                destroy_disks = True
            self.driver.destroy(context, instance,
                                network_info,
                                bdi, destroy_disks)

        # NOTE(gibi): We are called from init_host and at this point the
        # compute_nodes of the resource tracker has not been populated yet so
        # we cannot rely on the resource tracker here.
        compute_nodes = {}

        for instance_uuid, migration in evacuations.items():
            try:
                if instance_uuid in evacuated_local_instances:
                    # Avoid the db call if we already have the instance loaded
                    # above
                    instance = evacuated_local_instances[instance_uuid]
                else:
                    instance = objects.Instance.get_by_uuid(
                        context, instance_uuid)
            except exception.InstanceNotFound:
                # The instance was already deleted, so we expect that every
                # allocation of that instance has already been cleaned up
                continue

            LOG.info('Cleaning up allocations of the instance as it has been '
                     'evacuated from this host',
                     instance=instance)
            if migration.source_node not in compute_nodes:
                try:
                    cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
                        context, self.host, migration.source_node).uuid
                    compute_nodes[migration.source_node] = cn_uuid
                except exception.ComputeHostNotFound:
                    LOG.error("Failed to clean allocation of evacuated "
                              "instance as the source node %s is not found",
                              migration.source_node, instance=instance)
                    continue
            cn_uuid = compute_nodes[migration.source_node]

            # If the instance was deleted in the interim, assume its
            # allocations were properly cleaned up (either by its hosting
            # compute service or the API).
            if (not instance.deleted and
                    not scheduler_utils.remove_allocation_from_compute(
                        context, instance, cn_uuid, self.reportclient)):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s",
                          cn_uuid, instance=instance)

            migration.status = 'completed'
            migration.save()
        return evacuations

    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
        data = None
        try:
            data = self.driver.check_instance_shared_storage_local(context,
                                                                   instance)
            if data:
                shared_storage = (self.compute_rpcapi.
                                  check_instance_shared_storage(context,
                                  instance, data, host=host))
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'instance shared storage check, '
                      'assuming it\'s not on shared storage',
                      instance=instance)
            shared_storage = False
        except Exception:
            LOG.exception('Failed to check if instance shared',
                          instance=instance)
        finally:
            if data:
                self.driver.check_instance_shared_storage_cleanup(context,
                                                                  data)
        return shared_storage

    def _complete_partial_deletion(self, context, instance):
        """Complete deletion for instances in DELETED status but not marked as
        deleted in the DB
        """
        system_meta = instance.system_metadata
        instance.destroy()
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        self._complete_deletion(context,
                                instance,
                                bdms,
                                system_meta)

    def _complete_deletion(self, context, instance, bdms,
                           system_meta):
        self._update_resource_tracker(context, instance)

        rt = self._get_resource_tracker()
        rt.reportclient.delete_allocation_for_instance(context, instance.uuid)

        self._notify_about_instance_usage(context, instance, "delete.end",
                                          system_metadata=system_meta)
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.END, bdms=bdms)
        self._delete_scheduler_instance_info(context, instance.uuid)

    def _init_instance(self, context, instance):
        """Initialize this instance during service init."""

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning('Instance %(uuid)s appears to not be owned '
                        'by this host, but by %(host)s. Startup '
                        'processing is being skipped.',
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,
        # which are dealt with further down.
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
             instance.task_state not in
             (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug("Instance is in %s state.",
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            try:
                self._complete_partial_deletion(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
            return

        if (instance.vm_state == vm_states.BUILDING or
                instance.task_state in [task_states.SCHEDULING,
                                        task_states.BLOCK_DEVICE_MAPPING,
                                        task_states.NETWORKING,
                                        task_states.SPAWNING]):
            # NOTE(dave-mcnally) compute stopped before instance was fully
            # spawned so set to ERROR state. This is safe to do as the state
            # may be set by the api but the host is not so if we get here the
            # instance has already been scheduled to this particular host.
            LOG.debug("Instance failed to spawn correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)
            return

        if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
                instance.task_state in [
                    task_states.REBUILDING,
                    task_states.REBUILD_BLOCK_DEVICE_MAPPING,
                    task_states.REBUILD_SPAWNING]):
            # NOTE(jichenjc) compute stopped before instance was fully
            # spawned so set to ERROR state. This is consistent with BUILD
            LOG.debug("Instance failed to rebuild correctly, "
                      "setting to ERROR state", instance=instance)
            self._set_instance_obj_error_state(
                context, instance, clean_task_state=True)
            return

        if (instance.vm_state != vm_states.ERROR and
                instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                        task_states.IMAGE_PENDING_UPLOAD,
                                        task_states.IMAGE_UPLOADING,
                                        task_states.IMAGE_SNAPSHOT]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            try:
                self._post_interrupted_snapshot_cleanup(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to cleanup snapshot.',
                              instance=instance)
            instance.task_state = None
            instance.save()

        if (instance.vm_state != vm_states.ERROR and
                instance.task_state in [task_states.RESIZE_PREP]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance['task_state'], instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                LOG.info('Service started deleting the instance during '
                         'the previous run, but did not finish. Restarting'
                         ' the deletion now.', instance=instance)
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)
                self._delete_instance(context, instance, bdms)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
                self._set_instance_obj_error_state(context, instance)
            return

        current_power_state = self._get_power_state(context, instance)
        try_reboot, reboot_type = self._retry_reboot(context, instance,
                                                     current_power_state)

        if try_reboot:
            LOG.debug("Instance in transitional state (%(task_state)s) at "
                      "start-up and power state is (%(power_state)s), "
                      "triggering reboot",
                      {'task_state': instance.task_state,
                       'power_state': current_power_state},
                      instance=instance)

            # NOTE(mikal): if the instance was doing a soft reboot that got as
            # far as shutting down the instance but not as far as starting it
            # again, then we've just become a hard reboot. That means the
            # task state for the instance needs to change so that we're in one
            # of the expected task states for a hard reboot.
            if (instance.task_state in task_states.soft_reboot_states and
                    reboot_type == 'HARD'):
                instance.task_state = task_states.REBOOT_PENDING_HARD
                instance.save()

            self.reboot_instance(context, instance, block_device_info=None,
                                 reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
              instance.task_state in [task_states.REBOOT_STARTED,
                                      task_states.REBOOT_STARTED_HARD,
                                      task_states.PAUSING,
                                      task_states.UNPAUSING]):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()
        elif (current_power_state == power_state.PAUSED and
              instance.task_state == task_states.UNPAUSING):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state "
                        "and unpausing the instance",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            try:
                self.unpause_instance(context, instance)
            except NotImplementedError:
                # Some virt drivers don't support pause and unpause
                pass
            except Exception:
                LOG.exception('Failed to unpause instance',
                              instance=instance)
            return

        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying stop request",
                          instance.task_state, instance=instance)
                self.stop_instance(context, instance, True)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to stop instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying start request",
                          instance.task_state, instance=instance)
                self.start_instance(context, instance)
            except Exception:
                # we don't want an exception to block init_host
                LOG.exception('Failed to start instance', instance=instance)
            return

        net_info = instance.get_network_info()
        try:
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            # NOTE(mriedem): If we get here, it could be because the vif_type
            # in the cache is "binding_failed" or "unbound". The only way to
            # fix this is to try and bind the ports again, which would be
            # expensive here on host startup. We could add a check to
            # _heal_instance_info_cache to handle this, but probably only if
            # the instance task_state is None.
            LOG.exception('Virtual interface plugging failed for instance. '
                          'The port binding:host_id may need to be manually '
                          'updated.', instance=instance)
            self._set_instance_obj_error_state(context, instance)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            # We crashed during resize/migration, so roll back for safety
            try:
                # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
                # not in system_metadata we default to True for backwards
                # compatibility
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)

                self.driver.finish_revert_migration(context,
                    instance, net_info, block_dev_info, power_on)

            except Exception:
                LOG.exception('Failed to revert crashed migration',
                              instance=instance)
            finally:
                LOG.info('Instance found in migrating state during '
                         'startup. Resetting task_state',
                         instance=instance)
                instance.task_state = None
                instance.save()
        if instance.task_state == task_states.MIGRATING:
            # Live migration did not complete, but instance is on this
            # host, so reset the state.
            instance.task_state = None
            instance.save(expected_task_state=[task_states.MIGRATING])

        db_state = instance.power_state
        drv_state = self._get_power_state(context, instance)
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug('Current state is %(drv_state)s, state in DB is '
                  '%(db_state)s.',
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            self._resume_guests_state(context, instance, net_info)
        elif drv_state == power_state.RUNNING:
            # VMwareAPI drivers will raise an exception
            try:
                self.driver.ensure_filtering_rules_for_instance(
                    instance, net_info)
            except NotImplementedError:
                LOG.debug('Hypervisor driver does not support '
                          'firewall rules', instance=instance)

    def _resume_guests_state(self, context, instance, net_info):
        LOG.info('Rebooting instance after nova-compute restart.',
                 instance=instance)
        block_device_info = \
            self._get_instance_block_device_info(context, instance)

        try:
            self.driver.resume_state_on_host_boot(
                context, instance, net_info, block_device_info)
        except NotImplementedError:
            LOG.warning('Hypervisor driver does not support '
                        'resume guests', instance=instance)
        except Exception:
            # NOTE(vish): The instance failed to resume, so we set the
            # instance to error and attempt to continue.
            LOG.warning('Failed to resume instance',
                        instance=instance)
            self._set_instance_obj_error_state(context, instance)

    def _retry_reboot(self, context, instance, current_power_state):
        current_task_state = instance.task_state
        retry_reboot = False
        reboot_type = compute_utils.get_reboot_type(current_task_state,
                                                    current_power_state)

        pending_soft = (current_task_state == task_states.REBOOT_PENDING and
                        instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
        pending_hard = (current_task_state == task_states.REBOOT_PENDING_HARD
                        and instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
        started_not_running = (current_task_state in
                               [task_states.REBOOT_STARTED,
                                task_states.REBOOT_STARTED_HARD] and
                               current_power_state != power_state.RUNNING)

        if pending_soft or pending_hard or started_not_running:
            retry_reboot = True

        return retry_reboot, reboot_type

    def handle_lifecycle_event(self, event):
        LOG.info("VM %(state)s (Lifecycle Event)",
                 {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        context = nova.context.get_admin_context(read_deleted='yes')
        instance = objects.Instance.get_by_uuid(context,
                                                event.get_instance_uuid(),
                                                expected_attrs=[])
        vm_power_state = None
        if event.get_transition() == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_PAUSED:
            vm_power_state = power_state.PAUSED
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_SUSPENDED:
            vm_power_state = power_state.SUSPENDED
        else:
            LOG.warning("Unexpected power state %d", event.get_transition())

        # Note(lpetrut): The event may be delayed, thus not reflecting
        # the current instance power state. In that case, ignore the event.
        current_power_state = self._get_power_state(context, instance)
        if current_power_state == vm_power_state:
            LOG.debug('Synchronizing instance power state after lifecycle '
                      'event "%(event)s"; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, VM power_state: '
                      '%(vm_power_state)s',
                      {'event': event.get_name(),
                       'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'vm_power_state': vm_power_state},
                      instance_uuid=instance.uuid)
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)

    def handle_events(self, event):
        if isinstance(event, virtevent.LifecycleEvent):
            try:
                self.handle_lifecycle_event(event)
            except exception.InstanceNotFound:
                LOG.debug("Event %s arrived for non-existent instance. The "
                          "instance was probably deleted.", event)
        else:
            LOG.debug("Ignoring event %s", event)

    def init_virt_events(self):
        if CONF.workarounds.handle_virt_lifecycle_events:
            self.driver.register_event_listener(self.handle_events)
        else:
            # NOTE(mriedem): If the _sync_power_states periodic task is
            # disabled we should emit a warning in the logs.
            if CONF.sync_power_state_interval < 0:
                LOG.warning('Instance lifecycle events from the compute '
                            'driver have been disabled. Note that lifecycle '
                            'changes to an instance outside of the compute '
                            'service will not be synchronized '
                            'automatically since the _sync_power_states '
                            'periodic task is also disabled.')
            else:
                LOG.info('Instance lifecycle events from the compute '
                         'driver have been disabled. Note that lifecycle '
                         'changes to an instance outside of the compute '
                         'service will only be synchronized by the '
                         '_sync_power_states periodic task.')

    def init_host(self):
        """Initialization for a standalone compute service."""

        if CONF.pci.passthrough_whitelist:
            # Simply loading the PCI passthrough whitelist will do a bunch of
            # validation that would otherwise wait until the PciDevTracker is
            # constructed when updating available resources for the compute
            # node(s) in the resource tracker, effectively killing that task.
            # So load up the whitelist when starting the compute service to
            # flush any invalid configuration early so we can kill the service
            # if the configuration is wrong.
            whitelist.Whitelist(CONF.pci.passthrough_whitelist)

        # NOTE(sbauza): We want the compute node to hard fail if it won't be
        # able to provide its resources to the placement API, or it will not
        # be able to be eligible as a destination.
        if CONF.placement.region_name is None:
            raise exception.PlacementNotConfigured()

        self.driver.init_host(host=self.host)
        context = nova.context.get_admin_context()
        instances = objects.InstanceList.get_by_host(
            context, self.host, expected_attrs=['info_cache', 'metadata'])

        if CONF.defer_iptables_apply:
            self.driver.filter_defer_apply_on()

        self.init_virt_events()

        try:
            # checking that instance was not already evacuated to other host
            evacuated_instances = self._destroy_evacuated_instances(context)

            # Initialise instances on the host that are not evacuating
            for instance in instances:
                if instance.uuid not in evacuated_instances:
                    self._init_instance(context, instance)

            # NOTE(gibi): collect all the instance uuids that were already
            # handled above in some way, either by _init_instance or by
            # _destroy_evacuated_instances. This way we can limit the scope of
            # the _error_out_instances_whose_build_was_interrupted call to
            # look only for instances that have allocations on this node and
            # were not handled by the above calls.
            already_handled = {instance.uuid for instance in instances}.union(
                evacuated_instances)

            # NOTE(gibi): If ironic and vcenter virt driver slow start time
            # becomes problematic here then we should consider adding a config
            # option or a driver flag to tell us if we should thread this out
            # in the background on startup
            self._error_out_instances_whose_build_was_interrupted(
                context, already_handled)

        finally:
            if CONF.defer_iptables_apply:
                self.driver.filter_defer_apply_off()

            if instances:
                # We only send the instance info to the scheduler on startup
                # if there is anything to send, otherwise this host might
                # not be mapped yet in a cell and the scheduler may have
                # issues dealing with the information. Later changes to
                # instances on this host will update the scheduler, or the
                # _sync_scheduler_instance_info periodic task will.
                self._update_scheduler_instance_info(context, instances)

    def _error_out_instances_whose_build_was_interrupted(
            self, context, already_handled_instances):
        """If there are instances in BUILDING state that are not
        assigned to this host but have allocations in placement towards
        this compute, that means the nova-compute service was
        restarted while those instances waited for the resource claim
        to finish and the _set_instance_host_and_node() to update the
        instance.host field. We need to push them to ERROR state here to
        prevent keeping them in BUILDING state forever.

        :param context: The request context
        :param already_handled_instances: The set of instance UUIDs that the
            host initialization process already handled in some way.
        """

        # Strategy:
        # 1) Get the allocations from placement for our compute node(s)
        # 2) Remove the already handled instances from the consumer list;
        #    they are either already initialized or need to be skipped.
        # 3) Check which remaining consumer is an instance in BUILDING state
        #    and push it to ERROR state.

        LOG.info(
            "Looking for unclaimed instances stuck in BUILDING status for "
            "nodes managed by this host")
        try:
            node_names = self.driver.get_available_nodes()
        except exception.VirtDriverNotReady:
            LOG.warning(
                "Virt driver is not ready. Therefore unable to error out any "
                "instances stuck in BUILDING state on this node. If this is "
                "the first time this service is starting on this host, then "
                "you can ignore this warning.")
            return

        for node_name in node_names:
            try:
                cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
                    context, self.host, node_name).uuid
            except exception.ComputeHostNotFound:
                LOG.warning(
                    "Compute node %s not found in the database and therefore "
                    "unable to error out any instances stuck in BUILDING "
                    "state on this node. If this is the first time this "
                    "service is starting on this host, then you can ignore "
                    "this warning.", node_name)
                continue

            f = self.reportclient.get_allocations_for_resource_provider
            allocations = f(context, cn_uuid)
            if allocations is None:
                LOG.error(
                    "Could not retrieve compute node resource provider %s and "
                    "therefore unable to error out any instances stuck in "
                    "BUILDING state.", cn_uuid)
                continue

            not_handled_consumers = (set(allocations) -
                                     already_handled_instances)

            if not not_handled_consumers:
                continue

            filters = {
                'vm_state': vm_states.BUILDING,
                'uuid': not_handled_consumers
            }

            instances = objects.InstanceList.get_by_filters(
                context, filters, expected_attrs=[])

            for instance in instances:
                LOG.debug(
                    "Instance spawn was interrupted before instance_claim, "
                    "setting instance to ERROR state", instance=instance)
                self._set_instance_obj_error_state(
                    context, instance, clean_task_state=True)

    def cleanup_host(self):
        self.driver.register_event_listener(None)
        self.instance_events.cancel_all_events()
        self.driver.cleanup_host(host=self.host)

    def pre_start_hook(self):
        """After the service is initialized, but before we fully bring
        the service up by listening on RPC queues, make sure to update
        our available resources (and indirectly our available nodes).
        """
        self.update_available_resource(nova.context.get_admin_context(),
                                       startup=True)

    def _get_power_state(self, context, instance):
        """Retrieve the power state for the given instance."""
        LOG.debug('Checking state', instance=instance)
        try:
            return self.driver.get_info(instance).state
        except exception.InstanceNotFound:
            return power_state.NOSTATE

    def get_console_topic(self, context):
        """Retrieves the console host for a project on this host.

        Currently this is just set in the flags for each compute host.
        """
        # TODO(mdragon): perhaps make this variable by console_type?
        return '%s.%s' % (console_rpcapi.RPC_TOPIC, CONF.console_host)

    @wrap_exception()
    def get_console_pool_info(self, context, console_type):
        return self.driver.get_console_pool_info(console_type)

    @wrap_exception()
    def refresh_instance_security_rules(self, context, instance):
        """Tell the virtualization driver to refresh security rules for
        an instance.

        Passes straight through to the virtualization driver.

        Synchronize the call because we may still be in the middle of
        creating the instance.
        """
        @utils.synchronized(instance.uuid)
        def _sync_refresh():
            try:
                return self.driver.refresh_instance_security_rules(instance)
            except NotImplementedError:
                LOG.debug('Hypervisor driver does not support '
                          'security groups.', instance=instance)

        return _sync_refresh()
  1205. def _await_block_device_map_created(self, context, vol_id):
  1206. # TODO(yamahata): creating volume simultaneously
  1207. # reduces creation time?
  1208. # TODO(yamahata): eliminate dumb polling
  1209. start = time.time()
  1210. retries = CONF.block_device_allocate_retries
  1211. if retries < 0:
  1212. LOG.warning("Treating negative config value (%(retries)s) for "
  1213. "'block_device_retries' as 0.",
  1214. {'retries': retries})
  1215. # (1) treat negative config value as 0
1216. # (2) if the configured value is 0, one attempt should be made
1217. # (3) if the configured value is > 0, the total number of attempts
1218. # is (retries + 1)
  1219. attempts = 1
  1220. if retries >= 1:
  1221. attempts = retries + 1
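# Illustrative mapping implied by the rules above:
#   block_device_allocate_retries = -1 -> 1 attempt (treated as 0)
#   block_device_allocate_retries =  0 -> 1 attempt
#   block_device_allocate_retries =  3 -> 4 attempts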
  1222. for attempt in range(1, attempts + 1):
  1223. volume = self.volume_api.get(context, vol_id)
  1224. volume_status = volume['status']
  1225. if volume_status not in ['creating', 'downloading']:
  1226. if volume_status == 'available':
  1227. return attempt
  1228. LOG.warning("Volume id: %(vol_id)s finished being "
  1229. "created but its status is %(vol_status)s.",
  1230. {'vol_id': vol_id,
  1231. 'vol_status': volume_status})
  1232. break
  1233. greenthread.sleep(CONF.block_device_allocate_retries_interval)
  1234. raise exception.VolumeNotCreated(volume_id=vol_id,
  1235. seconds=int(time.time() - start),
  1236. attempts=attempt,
  1237. volume_status=volume_status)
  1238. def _decode_files(self, injected_files):
  1239. """Base64 decode the list of files to inject."""
  1240. if not injected_files:
  1241. return []
  1242. def _decode(f):
  1243. path, contents = f
  1244. # Py3 raises binascii.Error instead of TypeError as in Py27
  1245. try:
  1246. decoded = base64.b64decode(contents)
  1247. return path, decoded
  1248. except (TypeError, binascii.Error):
  1249. raise exception.Base64Exception(path=path)
  1250. return [_decode(f) for f in injected_files]
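# Illustrative: an injected_files entry such as ('/etc/motd', 'aGVsbG8=')
# decodes to ('/etc/motd', b'hello'); anything that is not valid base64
# surfaces as a Base64Exception for that path.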
  1251. def _validate_instance_group_policy(self, context, instance,
  1252. scheduler_hints):
  1253. # NOTE(russellb) Instance group policy is enforced by the scheduler.
  1254. # However, there is a race condition with the enforcement of
  1255. # the policy. Since more than one instance may be scheduled at the
  1256. # same time, it's possible that more than one instance with an
  1257. # anti-affinity policy may end up here. It's also possible that
  1258. # multiple instances with an affinity policy could end up on different
  1259. # hosts. This is a validation step to make sure that starting the
  1260. # instance here doesn't violate the policy.
  1261. group_hint = scheduler_hints.get('group')
  1262. if not group_hint:
  1263. return
  1264. # The RequestSpec stores scheduler_hints as key=list pairs so we need
  1265. # to check the type on the value and pull the single entry out. The
  1266. # API request schema validates that the 'group' hint is a single value.
  1267. if isinstance(group_hint, list):
  1268. group_hint = group_hint[0]
  1269. @utils.synchronized(group_hint)
  1270. def _do_validation(context, instance, group_hint):
  1271. group = objects.InstanceGroup.get_by_hint(context, group_hint)
  1272. if 'anti-affinity' in group.policies:
  1273. group_hosts = group.get_hosts(exclude=[instance.uuid])
  1274. if self.host in group_hosts:
  1275. msg = _("Anti-affinity instance group policy "
  1276. "was violated.")
  1277. raise exception.RescheduledException(
  1278. instance_uuid=instance.uuid,
  1279. reason=msg)
  1280. elif 'affinity' in group.policies:
  1281. group_hosts = group.get_hosts(exclude=[instance.uuid])
  1282. if group_hosts and self.host not in group_hosts:
  1283. msg = _("Affinity instance group policy was violated.")
  1284. raise exception.RescheduledException(
  1285. instance_uuid=instance.uuid,
  1286. reason=msg)
  1287. if not CONF.workarounds.disable_group_policy_check_upcall:
  1288. _do_validation(context, instance, group_hint)
  1289. def _log_original_error(self, exc_info, instance_uuid):
  1290. LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
  1291. exc_info=exc_info)
  1292. def _reschedule(self, context, request_spec, filter_properties,
  1293. instance, reschedule_method, method_args, task_state,
  1294. exc_info=None, host_list=None):
  1295. """Attempt to re-schedule a compute operation."""
  1296. instance_uuid = instance.uuid
  1297. retry = filter_properties.get('retry')
  1298. if not retry:
  1299. # no retry information, do not reschedule.
  1300. LOG.debug("Retry info not present, will not reschedule",
  1301. instance_uuid=instance_uuid)
  1302. return
  1303. if not request_spec:
  1304. LOG.debug("No request spec, will not reschedule",
  1305. instance_uuid=instance_uuid)
  1306. return
  1307. LOG.debug("Re-scheduling %(method)s: attempt %(num)d",
  1308. {'method': reschedule_method.__name__,
  1309. 'num': retry['num_attempts']}, instance_uuid=instance_uuid)
  1310. # reset the task state:
  1311. self._instance_update(context, instance, task_state=task_state)
  1312. if exc_info:
  1313. # stringify to avoid circular ref problem in json serialization:
  1314. retry['exc'] = traceback.format_exception_only(exc_info[0],
  1315. exc_info[1])
  1316. reschedule_method(context, *method_args, host_list=host_list)
  1317. return True
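# Illustrative: retry['num_attempts'] tracks how many scheduling attempts
# have been made, and retry['exc'] carries the stringified failure so the
# next scheduling pass can log it.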
  1318. @periodic_task.periodic_task
  1319. def _check_instance_build_time(self, context):
  1320. """Ensure that instances are not stuck in build."""
  1321. timeout = CONF.instance_build_timeout
  1322. if timeout == 0:
  1323. return
  1324. filters = {'vm_state': vm_states.BUILDING,
  1325. 'host': self.host}
  1326. building_insts = objects.InstanceList.get_by_filters(context,
  1327. filters, expected_attrs=[], use_slave=True)
  1328. for instance in building_insts:
  1329. if timeutils.is_older_than(instance.created_at, timeout):
  1330. self._set_instance_obj_error_state(context, instance)
  1331. LOG.warning("Instance build timed out. Set to error "
  1332. "state.", instance=instance)
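# Illustrative: with instance_build_timeout = 600, any instance on this
# host still in BUILDING more than ten minutes after created_at is moved
# to ERROR by this periodic task.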
  1333. def _check_instance_exists(self, context, instance):
  1334. """Ensure an instance with the same name is not already present."""
  1335. if self.driver.instance_exists(instance):
  1336. raise exception.InstanceExists(name=instance.name)
  1337. def _allocate_network_async(self, context, instance, requested_networks,
  1338. macs, security_groups, is_vpn):
  1339. """Method used to allocate networks in the background.
  1340. Broken out for testing.
  1341. """
  1342. # First check to see if we're specifically not supposed to allocate
  1343. # networks because if so, we can exit early.
  1344. if requested_networks and requested_networks.no_allocate:
  1345. LOG.debug("Not allocating networking since 'none' was specified.",
  1346. instance=instance)
  1347. return network_model.NetworkInfo([])
  1348. LOG.debug("Allocating IP information in the background.",
  1349. instance=instance)
  1350. retries = CONF.network_allocate_retries
  1351. attempts = retries + 1
  1352. retry_time = 1
  1353. bind_host_id = self.driver.network_binding_host_id(context, instance)
  1354. for attempt in range(1, attempts + 1):
  1355. try:
  1356. nwinfo = self.network_api.allocate_for_instance(
  1357. context, instance, vpn=is_vpn,
  1358. requested_networks=requested_networks,
  1359. macs=macs,
  1360. security_groups=security_groups,
  1361. bind_host_id=bind_host_id)
  1362. LOG.debug('Instance network_info: |%s|', nwinfo,
  1363. instance=instance)
  1364. instance.system_metadata['network_allocated'] = 'True'
  1365. # NOTE(JoshNang) do not save the instance here, as it can cause
  1366. # races. The caller shares a reference to instance and waits
  1367. # for this async greenthread to finish before calling
  1368. # instance.save().
  1369. return nwinfo
  1370. except Exception:
  1371. exc_info = sys.exc_info()
  1372. log_info = {'attempt': attempt,
  1373. 'attempts': attempts}
  1374. if attempt == attempts:
  1375. LOG.exception('Instance failed network setup '
  1376. 'after %(attempts)d attempt(s)',
  1377. log_info)
  1378. six.reraise(*exc_info)
  1379. LOG.warning('Instance failed network setup '
  1380. '(attempt %(attempt)d of %(attempts)d)',
  1381. log_info, instance=instance)
  1382. time.sleep(retry_time)
  1383. retry_time *= 2
  1384. if retry_time > 30:
  1385. retry_time = 30
  1386. # Not reached.
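# Illustrative pacing from the doubling/cap logic above: failed attempts
# are retried after waits of 1, 2, 4, 8, 16, 30, 30, ... seconds.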
  1387. def _build_networks_for_instance(self, context, instance,
  1388. requested_networks, security_groups):
  1389. # If we're here from a reschedule the network may already be allocated.
  1390. if strutils.bool_from_string(
  1391. instance.system_metadata.get('network_allocated', 'False')):
1392. # NOTE(alex_xu): network_allocated being True means the network
1393. # resources were already allocated at a previous scheduling attempt
1394. # and the setup on that host was cleaned up. After rescheduling, the
1395. # network resources need to be set up on the new host.
  1396. self.network_api.setup_instance_network_on_host(
  1397. context, instance, instance.host)
  1398. return self.network_api.get_instance_nw_info(context, instance)
  1399. if not self.is_neutron_security_groups:
  1400. security_groups = []
  1401. macs = self.driver.macs_for_instance(instance)
  1402. network_info = self._allocate_network(context, instance,
  1403. requested_networks, macs, security_groups)
  1404. return network_info
  1405. def _allocate_network(self, context, instance, requested_networks, macs,
  1406. security_groups):
  1407. """Start network allocation asynchronously. Return an instance
  1408. of NetworkInfoAsyncWrapper that can be used to retrieve the
  1409. allocated networks when the operation has finished.
  1410. """
  1411. # NOTE(comstud): Since we're allocating networks asynchronously,
  1412. # this task state has little meaning, as we won't be in this
  1413. # state for very long.
  1414. instance.vm_state = vm_states.BUILDING
  1415. instance.task_state = task_states.NETWORKING
  1416. instance.save(expected_task_state=[None])
  1417. is_vpn = False
  1418. return network_model.NetworkInfoAsyncWrapper(
  1419. self._allocate_network_async, context, instance,
  1420. requested_networks, macs, security_groups, is_vpn)
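# Illustrative: the NetworkInfoAsyncWrapper returned above kicks off
# _allocate_network_async in a background greenthread right away; callers
# only block on the result when they first access it (or call wait()).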
  1421. def _default_root_device_name(self, instance, image_meta, root_bdm):
  1422. try:
  1423. return self.driver.default_root_device_name(instance,
  1424. image_meta,
  1425. root_bdm)
  1426. except NotImplementedError:
  1427. return compute_utils.get_next_device_name(instance, [])
  1428. def _default_device_names_for_instance(self, instance,
  1429. root_device_name,
  1430. *block_device_lists):
  1431. try:
  1432. self.driver.default_device_names_for_instance(instance,
  1433. root_device_name,
  1434. *block_device_lists)
  1435. except NotImplementedError:
  1436. compute_utils.default_device_names_for_instance(
  1437. instance, root_device_name, *block_device_lists)
  1438. def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
  1439. # NOTE(ndipanov): Copy obj to avoid changing the original
  1440. block_device_obj = block_device_obj.obj_clone()
  1441. try:
  1442. return self.driver.get_device_name_for_instance(
  1443. instance, bdms, block_device_obj)
  1444. except NotImplementedError:
  1445. return compute_utils.get_device_name_for_instance(
  1446. instance, bdms, block_device_obj.get("device_name"))
  1447. def _default_block_device_names(self, instance, image_meta, block_devices):
  1448. """Verify that all the devices have the device_name set. If not,
  1449. provide a default name.
1450. It also ensures that there is a root_device_name and that it is set
1451. to the first block device in the boot sequence (boot_index=0).
  1452. """
  1453. root_bdm = block_device.get_root_bdm(block_devices)
  1454. if not root_bdm:
  1455. return
  1456. # Get the root_device_name from the root BDM or the instance
  1457. root_device_name = None
  1458. update_root_bdm = False
  1459. if root_bdm.device_name:
  1460. root_device_name = root_bdm.device_name
  1461. instance.root_device_name = root_device_name
  1462. elif instance.root_device_name:
  1463. root_device_name = instance.root_device_name
  1464. root_bdm.device_name = root_device_name
  1465. update_root_bdm = True
  1466. else:
  1467. root_device_name = self._default_root_device_name(instance,
  1468. image_meta,
  1469. root_bdm)
  1470. instance.root_device_name = root_device_name
  1471. root_bdm.device_name = root_device_name
  1472. update_root_bdm = True
  1473. if update_root_bdm:
  1474. root_bdm.save()
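# Illustrative precedence for the root device name resolved above: an
# explicit device_name on the root BDM wins, then the instance's existing
# root_device_name, then a driver/default-generated name.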
  1475. ephemerals = list(filter(block_device.new_format_is_ephemeral,
  1476. block_devices))
  1477. swap = list(filter(block_device.new_format_is_swap,
  1478. block_devices))
  1479. block_device_mapping = list(filter(
  1480. driver_block_device.is_block_device_mapping, block_devices))
  1481. self._default_device_names_for_instance(instance,
  1482. root_device_name,
  1483. ephemerals,
  1484. swap,
  1485. block_device_mapping)
  1486. def _block_device_info_to_legacy(self, block_device_info):
  1487. """Convert BDI to the old format for drivers that need it."""
  1488. if self.use_legacy_block_device_info:
  1489. ephemerals = driver_block_device.legacy_block_devices(
  1490. driver.block_device_info_get_ephemerals(block_device_info))
  1491. mapping = driver_block_device.legacy_block_devices(
  1492. driver.block_device_info_get_mapping(block_device_info))
  1493. swap = block_device_info['swap']
  1494. if swap:
  1495. swap = swap.legacy()
  1496. block_device_info.update({
  1497. 'ephemerals': ephemerals,
  1498. 'swap': swap,
  1499. 'block_device_mapping': mapping})
  1500. def _add_missing_dev_names(self, bdms, instance):
  1501. for bdm in bdms:
  1502. if bdm.device_name is not None:
  1503. continue
  1504. device_name = self._get_device_name_for_instance(instance,
  1505. bdms, bdm)
  1506. values = {'device_name': device_name}
  1507. bdm.update(values)
  1508. bdm.save()
  1509. def _prep_block_device(self, context, instance, bdms):
  1510. """Set up the block device for an instance with error logging."""
  1511. try:
  1512. self._add_missing_dev_names(bdms, instance)
  1513. block_device_info = driver.get_block_device_info(instance, bdms)
  1514. mapping = driver.block_device_info_get_mapping(block_device_info)
  1515. driver_block_device.attach_block_devices(
  1516. mapping, context, instance, self.volume_api, self.driver,
  1517. wait_func=self._await_block_device_map_created)
  1518. self._block_device_info_to_legacy(block_device_info)
  1519. return block_device_info
  1520. except exception.OverQuota as e:
  1521. LOG.warning('Failed to create block device for instance due'
  1522. ' to exceeding volume related resource quota.'
  1523. ' Error: %s', e.message, instance=instance)
  1524. raise
  1525. except Exception as ex:
  1526. LOG.exception('Instance failed block device setup',
  1527. instance=instance)
  1528. # InvalidBDM will eventually result in a BuildAbortException when
  1529. # booting from volume, and will be recorded as an instance fault.
  1530. # Maintain the original exception message which most likely has
  1531. # useful details which the standard InvalidBDM error message lacks.
  1532. raise exception.InvalidBDM(six.text_type(ex))
  1533. def _update_instance_after_spawn(self, context, instance):
  1534. instance.power_state = self._get_power_state(context, instance)
  1535. instance.vm_state = vm_states.ACTIVE
  1536. instance.task_state = None
  1537. instance.launched_at = timeutils.utcnow()
  1538. configdrive.update_instance(instance)
  1539. def _update_scheduler_instance_info(self, context, instance):
  1540. """Sends an InstanceList with created or updated Instance objects to
  1541. the Scheduler client.
  1542. In the case of init_host, the value passed will already be an
  1543. InstanceList. Other calls will send individual Instance objects that
  1544. have been created or resized. In this case, we create an InstanceList
  1545. object containing that Instance.
  1546. """
  1547. if not self.send_instance_updates:
  1548. return
  1549. if isinstance(instance, obj_instance.Instance):
  1550. instance = objects.InstanceList(objects=[instance])
  1551. context = context.elevated()
  1552. self.scheduler_client.update_instance_info(context, self.host,
  1553. instance)
  1554. def _delete_scheduler_instance_info(self, context, instance_uuid):
  1555. """Sends the uuid of the deleted Instance to the Scheduler client."""
  1556. if not self.send_instance_updates:
  1557. return
  1558. context = context.elevated()
  1559. self.scheduler_client.delete_instance_info(context, self.host,
  1560. instance_uuid)
  1561. @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
  1562. def _sync_scheduler_instance_info(self, context):
  1563. if not self.send_instance_updates:
  1564. return
  1565. context = context.elevated()
  1566. instances = objects.InstanceList.get_by_host(context, self.host,
  1567. expected_attrs=[],
  1568. use_slave=True)
  1569. uuids = [instance.uuid for instance in instances]
  1570. self.scheduler_client.sync_instance_info(context, self.host, uuids)
  1571. def _notify_about_instance_usage(self, context, instance, event_suffix,
  1572. network_info=None, system_metadata=None,
  1573. extra_usage_info=None, fault=None):
  1574. compute_utils.notify_about_instance_usage(
  1575. self.notifier, context, instance, event_suffix,
  1576. network_info=network_info,
  1577. system_metadata=system_metadata,
  1578. extra_usage_info=extra_usage_info, fault=fault)
  1579. def _deallocate_network(self, context, instance,
  1580. requested_networks=None):
  1581. # If we were told not to allocate networks let's save ourselves
  1582. # the trouble of calling the network API.
  1583. if requested_networks and requested_networks.no_allocate:
  1584. LOG.debug("Skipping network deallocation for instance since "
  1585. "networking was not requested.", instance=instance)
  1586. return
  1587. LOG.debug('Deallocating network for instance', instance=instance)
  1588. with timeutils.StopWatch() as timer:
  1589. self.network_api.deallocate_for_instance(
  1590. context, instance, requested_networks=requested_networks)
  1591. # nova-network does an rpc call so we're OK tracking time spent here
  1592. LOG.info('Took %0.2f seconds to deallocate network for instance.',
  1593. timer.elapsed(), instance=instance)
  1594. def _get_instance_block_device_info(self, context, instance,
  1595. refresh_conn_info=False,
  1596. bdms=None):
  1597. """Transform block devices to the driver block_device format."""
  1598. if bdms is None:
  1599. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  1600. context, instance.uuid)
  1601. block_device_info = driver.get_block_device_info(instance, bdms)
  1602. if not refresh_conn_info:
  1603. # if the block_device_mapping has no value in connection_info
  1604. # (returned as None), don't include in the mapping
  1605. block_device_info['block_device_mapping'] = [
  1606. bdm for bdm in driver.block_device_info_get_mapping(
  1607. block_device_info)
  1608. if bdm.get('connection_info')]
  1609. else:
  1610. driver_block_device.refresh_conn_infos(
  1611. driver.block_device_info_get_mapping(block_device_info),
  1612. context, instance, self.volume_api, self.driver)
  1613. self._block_device_info_to_legacy(block_device_info)
  1614. return block_device_info
  1615. def _build_failed(self, node):
  1616. if CONF.compute.consecutive_build_service_disable_threshold:
  1617. rt = self._get_resource_tracker()
  1618. # NOTE(danms): Update our counter, but wait for the next
  1619. # update_available_resource() periodic to flush it to the DB
  1620. rt.build_failed(node)
  1621. def _build_succeeded(self, node):
  1622. rt = self._get_resource_tracker()
  1623. rt.build_succeeded(node)
  1624. @wrap_exception()
  1625. @reverts_task_state
  1626. @wrap_instance_fault
  1627. def build_and_run_instance(self, context, instance, image, request_spec,
  1628. filter_properties, admin_password=None,
  1629. injected_files=None, requested_networks=None,
  1630. security_groups=None, block_device_mapping=None,
  1631. node=None, limits=None, host_list=None):
  1632. @utils.synchronized(instance.uuid)
  1633. def _locked_do_build_and_run_instance(*args, **kwargs):
  1634. # NOTE(danms): We grab the semaphore with the instance uuid
  1635. # locked because we could wait in line to build this instance
  1636. # for a while and we want to make sure that nothing else tries
  1637. # to do anything with this instance while we wait.
  1638. with self._build_semaphore:
  1639. try:
  1640. result = self._do_build_and_run_instance(*args, **kwargs)
  1641. except Exception:
  1642. # NOTE(mriedem): This should really only happen if
  1643. # _decode_files in _do_build_and_run_instance fails, and
  1644. # that's before a guest is spawned so it's OK to remove
  1645. # allocations for the instance for this node from Placement
  1646. # below as there is no guest consuming resources anyway.
  1647. # The _decode_files case could be handled more specifically
  1648. # but that's left for another day.
  1649. result = build_results.FAILED
  1650. raise
  1651. finally:
  1652. if result == build_results.FAILED:
  1653. # Remove the allocation records from Placement for the
  1654. # instance if the build failed. The instance.host is
  1655. # likely set to None in _do_build_and_run_instance
  1656. # which means if the user deletes the instance, it
  1657. # will be deleted in the API, not the compute service.
  1658. # Setting the instance.host to None in
  1659. # _do_build_and_run_instance means that the
  1660. # ResourceTracker will no longer consider this instance
  1661. # to be claiming resources against it, so we want to
  1662. # reflect that same thing in Placement. No need to
  1663. # call this for a reschedule, as the allocations will
  1664. # have already been removed in
  1665. # self._do_build_and_run_instance().
  1666. self._delete_allocation_for_instance(context,
  1667. instance.uuid)
  1668. if result in (build_results.FAILED,
  1669. build_results.RESCHEDULED):
  1670. self._build_failed(node)
  1671. else:
  1672. self._build_succeeded(node)
  1673. # NOTE(danms): We spawn here to return the RPC worker thread back to
  1674. # the pool. Since what follows could take a really long time, we don't
  1675. # want to tie up RPC workers.
  1676. utils.spawn_n(_locked_do_build_and_run_instance,
  1677. context, instance, image, request_spec,
  1678. filter_properties, admin_password, injected_files,
  1679. requested_networks, security_groups,
  1680. block_device_mapping, node, limits, host_list)
  1681. def _delete_allocation_for_instance(self, context, instance_uuid):
  1682. rt = self._get_resource_tracker()
  1683. rt.reportclient.delete_allocation_for_instance(context, instance_uuid)
  1684. def _check_device_tagging(self, requested_networks, block_device_mapping):
  1685. tagging_requested = False
  1686. if requested_networks:
  1687. for net in requested_networks:
  1688. if 'tag' in net and net.tag is not None:
  1689. tagging_requested = True
  1690. break
  1691. if block_device_mapping and not tagging_requested:
  1692. for bdm in block_device_mapping:
  1693. if 'tag' in bdm and bdm.tag is not None:
  1694. tagging_requested = True
  1695. break
  1696. if (tagging_requested and
  1697. not self.driver.capabilities.get('supports_device_tagging')):
  1698. raise exception.BuildAbortException('Attempt to boot guest with '
  1699. 'tagged devices on host that '
  1700. 'does not support tagging.')
  1701. @hooks.add_hook('build_instance')
  1702. @wrap_exception()
  1703. @reverts_task_state
  1704. @wrap_instance_event(prefix='compute')
  1705. @wrap_instance_fault
  1706. def _do_build_and_run_instance(self, context, instance, image,
  1707. request_spec, filter_properties, admin_password, injected_files,
  1708. requested_networks, security_groups, block_device_mapping,
  1709. node=None, limits=None, host_list=None):
  1710. try:
  1711. LOG.debug('Starting instance...', instance=instance)
  1712. instance.vm_state = vm_states.BUILDING
  1713. instance.task_state = None
  1714. instance.save(expected_task_state=
  1715. (task_states.SCHEDULING, None))
  1716. except exception.InstanceNotFound:
  1717. msg = 'Instance disappeared before build.'
  1718. LOG.debug(msg, instance=instance)
  1719. return build_results.FAILED
  1720. except exception.UnexpectedTaskStateError as e:
  1721. LOG.debug(e.format_message(), instance=instance)
  1722. return build_results.FAILED
  1723. # b64 decode the files to inject:
  1724. decoded_files = self._decode_files(injected_files)
  1725. if limits is None:
  1726. limits = {}
  1727. if node is None:
  1728. node = self._get_nodename(instance, refresh=True)
  1729. try:
  1730. with timeutils.StopWatch() as timer:
  1731. self._build_and_run_instance(context, instance, image,
  1732. decoded_files, admin_password, requested_networks,
  1733. security_groups, block_device_mapping, node, limits,
  1734. filter_properties, request_spec)
  1735. LOG.info('Took %0.2f seconds to build instance.',
  1736. timer.elapsed(), instance=instance)
  1737. return build_results.ACTIVE
  1738. except exception.RescheduledException as e:
  1739. retry = filter_properties.get('retry')
  1740. if not retry:
  1741. # no retry information, do not reschedule.
  1742. LOG.debug("Retry info not present, will not reschedule",
  1743. instance=instance)
  1744. self._cleanup_allocated_networks(context, instance,
  1745. requested_networks)
  1746. self._cleanup_volumes(context, instance,
  1747. block_device_mapping, raise_exc=False)
  1748. compute_utils.add_instance_fault_from_exc(context,
  1749. instance, e, sys.exc_info(),
  1750. fault_message=e.kwargs['reason'])
  1751. self._nil_out_instance_obj_host_and_node(instance)
  1752. self._set_instance_obj_error_state(context, instance,
  1753. clean_task_state=True)
  1754. return build_results.FAILED
  1755. LOG.debug(e.format_message(), instance=instance)
  1756. # This will be used for logging the exception
  1757. retry['exc'] = traceback.format_exception(*sys.exc_info())
  1758. # This will be used for setting the instance fault message
  1759. retry['exc_reason'] = e.kwargs['reason']
  1760. # NOTE(comstud): Deallocate networks if the driver wants
  1761. # us to do so.
  1762. # NOTE(mriedem): Always deallocate networking when using Neutron.
  1763. # This is to unbind any ports that the user supplied in the server
  1764. # create request, or delete any ports that nova created which were
  1765. # meant to be bound to this host. This check intentionally bypasses
  1766. # the result of deallocate_networks_on_reschedule because the
  1767. # default value in the driver is False, but that method was really
  1768. # only meant for Ironic and should be removed when nova-network is
  1769. # removed (since is_neutron() will then always be True).
  1770. # NOTE(vladikr): SR-IOV ports should be deallocated to
  1771. # allow new sriov pci devices to be allocated on a new host.
  1772. # Otherwise, if devices with pci addresses are already allocated
  1773. # on the destination host, the instance will fail to spawn.
  1774. # info_cache.network_info should be present at this stage.
  1775. if (self.driver.deallocate_networks_on_reschedule(instance) or
  1776. utils.is_neutron() or
  1777. self.deallocate_sriov_ports_on_reschedule(instance)):
  1778. self._cleanup_allocated_networks(context, instance,
  1779. requested_networks)
  1780. else:
1781. # NOTE(alex_xu): Networks are already allocated and we don't
1782. # want to deallocate them before rescheduling. But we need
1783. # to clean up the network resources set up on this host before
1784. # rescheduling.
  1785. self.network_api.cleanup_instance_network_on_host(
  1786. context, instance, self.host)
  1787. self._nil_out_instance_obj_host_and_node(instance)
  1788. instance.task_state = task_states.SCHEDULING
  1789. instance.save()
  1790. # The instance will have already claimed resources from this host
  1791. # before this build was attempted. Now that it has failed, we need
  1792. # to unclaim those resources before casting to the conductor, so
  1793. # that if there are alternate hosts available for a retry, it can
  1794. # claim resources on that new host for the instance.
  1795. self._delete_allocation_for_instance(context, instance.uuid)
  1796. self.compute_task_api.build_instances(context, [instance],
  1797. image, filter_properties, admin_password,
  1798. injected_files, requested_networks, security_groups,
  1799. block_device_mapping, request_spec=request_spec,
  1800. host_lists=[host_list])
  1801. return build_results.RESCHEDULED
  1802. except (exception.InstanceNotFound,
  1803. exception.UnexpectedDeletingTaskStateError):
  1804. msg = 'Instance disappeared during build.'
  1805. LOG.debug(msg, instance=instance)
  1806. self._cleanup_allocated_networks(context, instance,
  1807. requested_networks)
  1808. return build_results.FAILED
  1809. except exception.BuildAbortException as e:
  1810. LOG.exception(e.format_message(), instance=instance)
  1811. self._cleanup_allocated_networks(context, instance,
  1812. requested_networks)
  1813. self._cleanup_volumes(context, instance,
  1814. block_device_mapping, raise_exc=False)
  1815. compute_utils.add_instance_fault_from_exc(context, instance,
  1816. e, sys.exc_info())
  1817. self._nil_out_instance_obj_host_and_node(instance)
  1818. self._set_instance_obj_error_state(context, instance,
  1819. clean_task_state=True)
  1820. return build_results.FAILED
  1821. except Exception as e:
  1822. # Should not reach here.
  1823. LOG.exception('Unexpected build failure, not rescheduling build.',
  1824. instance=instance)
  1825. self._cleanup_allocated_networks(context, instance,
  1826. requested_networks)
  1827. self._cleanup_volumes(context, instance,
  1828. block_device_mapping, raise_exc=False)
  1829. compute_utils.add_instance_fault_from_exc(context, instance,
  1830. e, sys.exc_info())
  1831. self._nil_out_instance_obj_host_and_node(instance)
  1832. self._set_instance_obj_error_state(context, instance,
  1833. clean_task_state=True)
  1834. return build_results.FAILED
  1835. def deallocate_sriov_ports_on_reschedule(self, instance):
1836. """Determine whether networks need to be deallocated before rescheduling.
1837. Check the cached network info for any assigned SR-IOV ports.
1838. SR-IOV ports should be deallocated prior to rescheduling
1839. in order to allow new SR-IOV PCI devices to be allocated on the new host.
  1840. """
  1841. info_cache = instance.info_cache
  1842. def _has_sriov_port(vif):
  1843. return vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV
  1844. if (info_cache and info_cache.network_info):
  1845. for vif in info_cache.network_info:
  1846. if _has_sriov_port(vif):
  1847. return True
  1848. return False
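# Illustrative: a VIF whose vnic_type is 'direct' (one of the SR-IOV
# types) makes this return True, while a plain 'normal' vnic_type does
# not trigger deallocation.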
  1849. @staticmethod
  1850. def _get_scheduler_hints(filter_properties, request_spec=None):
  1851. """Helper method to get scheduler hints.
  1852. This method prefers to get the hints out of the request spec, but that
  1853. might not be provided. Conductor will pass request_spec down to the
  1854. first compute chosen for a build but older computes will not pass
1855. the request_spec to conductor's build_instances method for a
1856. reschedule, so if we're on a host via a retry, request_spec may not
1857. be provided, so we need to fall back to using the filter_properties
  1858. to get scheduler hints.
  1859. """
  1860. hints = {}
  1861. if request_spec is not None and 'scheduler_hints' in request_spec:
  1862. hints = request_spec.scheduler_hints
  1863. if not hints:
  1864. hints = filter_properties.get('scheduler_hints') or {}
  1865. return hints
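# Illustrative: a RequestSpec may carry hints shaped like
# {'group': ['<group uuid>']} (key=list pairs), while legacy
# filter_properties carry {'scheduler_hints': {'group': '<group uuid>'}}.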
  1866. def _build_and_run_instance(self, context, instance, image, injected_files,
  1867. admin_password, requested_networks, security_groups,
  1868. block_device_mapping, node, limits, filter_properties,
  1869. request_spec=None):
  1870. image_name = image.get('name')
  1871. self._notify_about_instance_usage(context, instance, 'create.start',
  1872. extra_usage_info={'image_name': image_name})
  1873. compute_utils.notify_about_instance_create(
  1874. context, instance, self.host,
  1875. phase=fields.NotificationPhase.START,
  1876. bdms=block_device_mapping)
  1877. # NOTE(mikal): cache the keystone roles associated with the instance
  1878. # at boot time for later reference
  1879. instance.system_metadata.update(
  1880. {'boot_roles': ','.join(context.roles)})
  1881. self._check_device_tagging(requested_networks, block_device_mapping)
  1882. try:
  1883. scheduler_hints = self._get_scheduler_hints(filter_properties,
  1884. request_spec)
  1885. rt = self._get_resource_tracker()
  1886. with rt.instance_claim(context, instance, node, limits):
  1887. # NOTE(russellb) It's important that this validation be done
  1888. # *after* the resource tracker instance claim, as that is where
  1889. # the host is set on the instance.
  1890. self._validate_instance_group_policy(context, instance,
  1891. scheduler_hints)
  1892. image_meta = objects.ImageMeta.from_dict(image)
  1893. with self._build_resources(context, instance,
  1894. requested_networks, security_groups, image_meta,
  1895. block_device_mapping) as resources:
  1896. instance.vm_state = vm_states.BUILDING
  1897. instance.task_state = task_states.SPAWNING
  1898. # NOTE(JoshNang) This also saves the changes to the
  1899. # instance from _allocate_network_async, as they aren't
  1900. # saved in that function to prevent races.
  1901. instance.save(expected_task_state=
  1902. task_states.BLOCK_DEVICE_MAPPING)
  1903. block_device_info = resources['block_device_info']
  1904. network_info = resources['network_info']
  1905. allocs = resources['allocations']
  1906. LOG.debug('Start spawning the instance on the hypervisor.',
  1907. instance=instance)
  1908. with timeutils.StopWatch() as timer:
  1909. self.driver.spawn(context, instance, image_meta,
  1910. injected_files, admin_password,
  1911. allocs, network_info=network_info,
  1912. block_device_info=block_device_info)
  1913. LOG.info('Took %0.2f seconds to spawn the instance on '
  1914. 'the hypervisor.', timer.elapsed(),
  1915. instance=instance)
  1916. except (exception.InstanceNotFound,
  1917. exception.UnexpectedDeletingTaskStateError) as e:
  1918. with excutils.save_and_reraise_exception():
  1919. self._notify_about_instance_usage(context, instance,
  1920. 'create.error', fault=e)
  1921. compute_utils.notify_about_instance_create(
  1922. context, instance, self.host,
  1923. phase=fields.NotificationPhase.ERROR, exception=e,
  1924. bdms=block_device_mapping)
  1925. except exception.ComputeResourcesUnavailable as e:
  1926. LOG.debug(e.format_message(), instance=instance)
  1927. self._notify_about_instance_usage(context, instance,
  1928. 'create.error', fault=e)
  1929. compute_utils.notify_about_instance_create(
  1930. context, instance, self.host,
  1931. phase=fields.NotificationPhase.ERROR, exception=e,
  1932. bdms=block_device_mapping)
  1933. raise exception.RescheduledException(
  1934. instance_uuid=instance.uuid, reason=e.format_message())
  1935. except exception.BuildAbortException as e:
  1936. with excutils.save_and_reraise_exception():
  1937. LOG.debug(e.format_message(), instance=instance)
  1938. self._notify_about_instance_usage(context, instance,
  1939. 'create.error', fault=e)
  1940. compute_utils.notify_about_instance_create(
  1941. context, instance, self.host,
  1942. phase=fields.NotificationPhase.ERROR, exception=e,
  1943. bdms=block_device_mapping)
  1944. except (exception.FixedIpLimitExceeded,
  1945. exception.NoMoreNetworks, exception.NoMoreFixedIps) as e:
  1946. LOG.warning('No more network or fixed IP to be allocated',
  1947. instance=instance)
  1948. self._notify_about_instance_usage(context, instance,
  1949. 'create.error', fault=e)
  1950. compute_utils.notify_about_instance_create(
  1951. context, instance, self.host,
  1952. phase=fields.NotificationPhase.ERROR, exception=e,
  1953. bdms=block_device_mapping)
  1954. msg = _('Failed to allocate the network(s) with error %s, '
  1955. 'not rescheduling.') % e.format_message()
  1956. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1957. reason=msg)
  1958. except (exception.VirtualInterfaceCreateException,
  1959. exception.VirtualInterfaceMacAddressException,
  1960. exception.FixedIpInvalidOnHost,
  1961. exception.UnableToAutoAllocateNetwork) as e:
  1962. LOG.exception('Failed to allocate network(s)',
  1963. instance=instance)
  1964. self._notify_about_instance_usage(context, instance,
  1965. 'create.error', fault=e)
  1966. compute_utils.notify_about_instance_create(
  1967. context, instance, self.host,
  1968. phase=fields.NotificationPhase.ERROR, exception=e,
  1969. bdms=block_device_mapping)
  1970. msg = _('Failed to allocate the network(s), not rescheduling.')
  1971. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1972. reason=msg)
  1973. except (exception.FlavorDiskTooSmall,
  1974. exception.FlavorMemoryTooSmall,
  1975. exception.ImageNotActive,
  1976. exception.ImageUnacceptable,
  1977. exception.InvalidDiskInfo,
  1978. exception.InvalidDiskFormat,
  1979. cursive_exception.SignatureVerificationError,
  1980. exception.VolumeEncryptionNotSupported,
  1981. exception.InvalidInput,
  1982. # TODO(mriedem): We should be validating RequestedVRamTooHigh
  1983. # in the API during server create and rebuild.
  1984. exception.RequestedVRamTooHigh) as e:
  1985. self._notify_about_instance_usage(context, instance,
  1986. 'create.error', fault=e)
  1987. compute_utils.notify_about_instance_create(
  1988. context, instance, self.host,
  1989. phase=fields.NotificationPhase.ERROR, exception=e,
  1990. bdms=block_device_mapping)
  1991. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  1992. reason=e.format_message())
  1993. except Exception as e:
  1994. self._notify_about_instance_usage(context, instance,
  1995. 'create.error', fault=e)
  1996. compute_utils.notify_about_instance_create(
  1997. context, instance, self.host,
  1998. phase=fields.NotificationPhase.ERROR, exception=e,
  1999. bdms=block_device_mapping)
  2000. raise exception.RescheduledException(
  2001. instance_uuid=instance.uuid, reason=six.text_type(e))
  2002. # NOTE(alaski): This is only useful during reschedules, remove it now.
  2003. instance.system_metadata.pop('network_allocated', None)
  2004. # If CONF.default_access_ip_network_name is set, grab the
  2005. # corresponding network and set the access ip values accordingly.
  2006. network_name = CONF.default_access_ip_network_name
  2007. if (network_name and not instance.access_ip_v4 and
  2008. not instance.access_ip_v6):
  2009. # Note that when there are multiple ips to choose from, an
  2010. # arbitrary one will be chosen.
  2011. for vif in network_info:
  2012. if vif['network']['label'] == network_name:
  2013. for ip in vif.fixed_ips():
  2014. if not instance.access_ip_v4 and ip['version'] == 4:
  2015. instance.access_ip_v4 = ip['address']
  2016. if not instance.access_ip_v6 and ip['version'] == 6:
  2017. instance.access_ip_v6 = ip['address']
  2018. break
  2019. self._update_instance_after_spawn(context, instance)
  2020. try:
  2021. instance.save(expected_task_state=task_states.SPAWNING)
  2022. except (exception.InstanceNotFound,
  2023. exception.UnexpectedDeletingTaskStateError) as e:
  2024. with excutils.save_and_reraise_exception():
  2025. self._notify_about_instance_usage(context, instance,
  2026. 'create.error', fault=e)
  2027. compute_utils.notify_about_instance_create(
  2028. context, instance, self.host,
  2029. phase=fields.NotificationPhase.ERROR, exception=e,
  2030. bdms=block_device_mapping)
  2031. self._update_scheduler_instance_info(context, instance)
  2032. self._notify_about_instance_usage(context, instance, 'create.end',
  2033. extra_usage_info={'message': _('Success')},
  2034. network_info=network_info)
  2035. compute_utils.notify_about_instance_create(context, instance,
  2036. self.host, phase=fields.NotificationPhase.END,
  2037. bdms=block_device_mapping)
  2038. @contextlib.contextmanager
  2039. def _build_resources(self, context, instance, requested_networks,
  2040. security_groups, image_meta, block_device_mapping):
  2041. resources = {}
  2042. network_info = None
  2043. try:
  2044. LOG.debug('Start building networks asynchronously for instance.',
  2045. instance=instance)
  2046. network_info = self._build_networks_for_instance(context, instance,
  2047. requested_networks, security_groups)
  2048. resources['network_info'] = network_info
  2049. except (exception.InstanceNotFound,
  2050. exception.UnexpectedDeletingTaskStateError):
  2051. raise
  2052. except exception.UnexpectedTaskStateError as e:
  2053. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2054. reason=e.format_message())
  2055. except Exception:
  2056. # Because this allocation is async any failures are likely to occur
  2057. # when the driver accesses network_info during spawn().
  2058. LOG.exception('Failed to allocate network(s)',
  2059. instance=instance)
  2060. msg = _('Failed to allocate the network(s), not rescheduling.')
  2061. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2062. reason=msg)
  2063. try:
  2064. # Depending on a virt driver, some network configuration is
  2065. # necessary before preparing block devices.
  2066. self.driver.prepare_networks_before_block_device_mapping(
  2067. instance, network_info)
  2068. # Verify that all the BDMs have a device_name set and assign a
  2069. # default to the ones missing it with the help of the driver.
  2070. self._default_block_device_names(instance, image_meta,
  2071. block_device_mapping)
  2072. LOG.debug('Start building block device mappings for instance.',
  2073. instance=instance)
  2074. instance.vm_state = vm_states.BUILDING
  2075. instance.task_state = task_states.BLOCK_DEVICE_MAPPING
  2076. instance.save()
  2077. block_device_info = self._prep_block_device(context, instance,
  2078. block_device_mapping)
  2079. resources['block_device_info'] = block_device_info
  2080. except (exception.InstanceNotFound,
  2081. exception.UnexpectedDeletingTaskStateError):
  2082. with excutils.save_and_reraise_exception():
  2083. # Make sure the async call finishes
  2084. if network_info is not None:
  2085. network_info.wait(do_raise=False)
  2086. self.driver.clean_networks_preparation(instance,
  2087. network_info)
  2088. except (exception.UnexpectedTaskStateError,
  2089. exception.OverQuota, exception.InvalidBDM) as e:
  2090. # Make sure the async call finishes
  2091. if network_info is not None:
  2092. network_info.wait(do_raise=False)
  2093. self.driver.clean_networks_preparation(instance, network_info)
  2094. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2095. reason=e.format_message())
  2096. except Exception:
  2097. LOG.exception('Failure prepping block device',
  2098. instance=instance)
  2099. # Make sure the async call finishes
  2100. if network_info is not None:
  2101. network_info.wait(do_raise=False)
  2102. self.driver.clean_networks_preparation(instance, network_info)
  2103. msg = _('Failure prepping block device.')
  2104. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2105. reason=msg)
  2106. try:
  2107. resources['allocations'] = (
  2108. self.reportclient.get_allocations_for_consumer(context,
  2109. instance.uuid))
  2110. except Exception:
  2111. LOG.exception('Failure retrieving placement allocations',
  2112. instance=instance)
  2113. # Make sure the async call finishes
  2114. if network_info is not None:
  2115. network_info.wait(do_raise=False)
  2116. msg = _('Failure retrieving placement allocations')
  2117. raise exception.BuildAbortException(instance_uuid=instance.uuid,
  2118. reason=msg)
  2119. try:
  2120. yield resources
  2121. except Exception as exc:
  2122. with excutils.save_and_reraise_exception() as ctxt:
  2123. if not isinstance(exc, (
  2124. exception.InstanceNotFound,
  2125. exception.UnexpectedDeletingTaskStateError)):
  2126. LOG.exception('Instance failed to spawn',
  2127. instance=instance)
  2128. # Make sure the async call finishes
  2129. if network_info is not None:
  2130. network_info.wait(do_raise=False)
  2131. # if network_info is empty we're likely here because of
  2132. # network allocation failure. Since nothing can be reused on
  2133. # rescheduling it's better to deallocate network to eliminate
  2134. # the chance of orphaned ports in neutron
  2135. deallocate_networks = False if network_info else True
  2136. try:
  2137. self._shutdown_instance(context, instance,
  2138. block_device_mapping, requested_networks,
  2139. try_deallocate_networks=deallocate_networks)
  2140. except Exception as exc2:
  2141. ctxt.reraise = False
  2142. LOG.warning('Could not clean up failed build,'
  2143. ' not rescheduling. Error: %s',
  2144. six.text_type(exc2))
  2145. raise exception.BuildAbortException(
  2146. instance_uuid=instance.uuid,
  2147. reason=six.text_type(exc))
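# Illustrative flow of the _build_resources() context manager above:
# networks are started asynchronously, block devices are prepared,
# placement allocations are fetched, then the caller spawns the guest
# inside the 'with' block; a failure there triggers _shutdown_instance()
# cleanup and the original error is re-raised (or a BuildAbortException
# if the cleanup itself fails).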
  2148. def _cleanup_allocated_networks(self, context, instance,
  2149. requested_networks):
  2150. try:
  2151. self._deallocate_network(context, instance, requested_networks)
  2152. except Exception:
  2153. LOG.exception('Failed to deallocate networks', instance=instance)
  2154. return
  2155. instance.system_metadata['network_allocated'] = 'False'
  2156. try:
  2157. instance.save()
  2158. except exception.InstanceNotFound:
  2159. # NOTE(alaski): It's possible that we're cleaning up the networks
  2160. # because the instance was deleted. If that's the case then this
  2161. # exception will be raised by instance.save()
  2162. pass
  2163. def _try_deallocate_network(self, context, instance,
  2164. requested_networks=None):
  2165. try:
  2166. # tear down allocated network structure
  2167. self._deallocate_network(context, instance, requested_networks)
  2168. except Exception as ex:
  2169. with excutils.save_and_reraise_exception():
  2170. LOG.error('Failed to deallocate network for instance. '
  2171. 'Error: %s', ex, instance=instance)
  2172. self._set_instance_obj_error_state(context, instance)
  2173. def _get_power_off_values(self, context, instance, clean_shutdown):
  2174. """Get the timing configuration for powering down this instance."""
  2175. if clean_shutdown:
  2176. timeout = compute_utils.get_value_from_system_metadata(instance,
  2177. key='image_os_shutdown_timeout', type=int,
  2178. default=CONF.shutdown_timeout)
  2179. retry_interval = self.SHUTDOWN_RETRY_INTERVAL
  2180. else:
  2181. timeout = 0
  2182. retry_interval = 0
  2183. return timeout, retry_interval
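# Illustrative: clean_shutdown=True yields (image_os_shutdown_timeout or
# CONF.shutdown_timeout, SHUTDOWN_RETRY_INTERVAL); clean_shutdown=False
# yields (0, 0), i.e. no graceful shutdown window before power off.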
  2184. def _power_off_instance(self, context, instance, clean_shutdown=True):
  2185. """Power off an instance on this host."""
  2186. timeout, retry_interval = self._get_power_off_values(context,
  2187. instance, clean_shutdown)
  2188. self.driver.power_off(instance, timeout, retry_interval)
  2189. def _shutdown_instance(self, context, instance,
  2190. bdms, requested_networks=None, notify=True,
  2191. try_deallocate_networks=True):
2192. """Shut down an instance on this host.
2193. :param context: security context
2194. :param instance: a nova.objects.Instance object
2195. :param bdms: the block devices for the instance to be torn
2196. down
2197. :param requested_networks: the networks on which the instance
2198. has ports
2199. :param notify: true if a final usage notification should be
2200. emitted
2201. :param try_deallocate_networks: false if we should avoid
2202. trying to tear down networking
  2203. """
  2204. context = context.elevated()
  2205. LOG.info('Terminating instance', instance=instance)
  2206. if notify:
  2207. self._notify_about_instance_usage(context, instance,
  2208. "shutdown.start")
  2209. compute_utils.notify_about_instance_action(context, instance,
  2210. self.host, action=fields.NotificationAction.SHUTDOWN,
  2211. phase=fields.NotificationPhase.START, bdms=bdms)
  2212. network_info = instance.get_network_info()
  2213. # NOTE(vish) get bdms before destroying the instance
  2214. vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
  2215. block_device_info = self._get_instance_block_device_info(
  2216. context, instance, bdms=bdms)
  2217. # NOTE(melwitt): attempt driver destroy before releasing ip, may
  2218. # want to keep ip allocated for certain failures
  2219. timer = timeutils.StopWatch()
  2220. try:
  2221. LOG.debug('Start destroying the instance on the hypervisor.',
  2222. instance=instance)
  2223. timer.start()
  2224. self.driver.destroy(context, instance, network_info,
  2225. block_device_info)
  2226. LOG.info('Took %0.2f seconds to destroy the instance on the '
  2227. 'hypervisor.', timer.elapsed(), instance=instance)
  2228. except exception.InstancePowerOffFailure:
  2229. # if the instance can't power off, don't release the ip
  2230. with excutils.save_and_reraise_exception():
  2231. pass
  2232. except Exception:
  2233. with excutils.save_and_reraise_exception():
  2234. # deallocate ip and fail without proceeding to
  2235. # volume api calls, preserving current behavior
  2236. if try_deallocate_networks:
  2237. self._try_deallocate_network(context, instance,
  2238. requested_networks)
  2239. if try_deallocate_networks:
  2240. self._try_deallocate_network(context, instance, requested_networks)
  2241. timer.restart()
  2242. for bdm in vol_bdms:
  2243. try:
  2244. if bdm.attachment_id:
  2245. self.volume_api.attachment_delete(context,
  2246. bdm.attachment_id)
  2247. else:
  2248. # NOTE(vish): actual driver detach done in driver.destroy,
  2249. # so just tell cinder that we are done with it.
  2250. connector = self.driver.get_volume_connector(instance)
  2251. self.volume_api.terminate_connection(context,
  2252. bdm.volume_id,
  2253. connector)
  2254. self.volume_api.detach(context, bdm.volume_id,
  2255. instance.uuid)
  2256. except exception.VolumeAttachmentNotFound as exc:
  2257. LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
  2258. instance=instance)
  2259. except exception.DiskNotFound as exc:
  2260. LOG.debug('Ignoring DiskNotFound: %s', exc,
  2261. instance=instance)
  2262. except exception.VolumeNotFound as exc:
  2263. LOG.debug('Ignoring VolumeNotFound: %s', exc,
  2264. instance=instance)
  2265. except (cinder_exception.EndpointNotFound,
  2266. keystone_exception.EndpointNotFound) as exc:
  2267. LOG.warning('Ignoring EndpointNotFound for '
  2268. 'volume %(volume_id)s: %(exc)s',
  2269. {'exc': exc, 'volume_id': bdm.volume_id},
  2270. instance=instance)
  2271. except cinder_exception.ClientException as exc:
  2272. LOG.warning('Ignoring unknown cinder exception for '
  2273. 'volume %(volume_id)s: %(exc)s',
  2274. {'exc': exc, 'volume_id': bdm.volume_id},
  2275. instance=instance)
  2276. except Exception as exc:
  2277. LOG.warning('Ignoring unknown exception for '
  2278. 'volume %(volume_id)s: %(exc)s',
  2279. {'exc': exc, 'volume_id': bdm.volume_id},
  2280. instance=instance)
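# Illustrative: BDMs with an attachment_id (newer style Cinder volume
# attachments) are cleaned up via attachment_delete() above, while older
# BDMs fall back to terminate_connection() plus detach().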
  2281. if vol_bdms:
  2282. LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
  2283. 'for instance.',
  2284. {'time': timer.elapsed(), 'num': len(vol_bdms)},
  2285. instance=instance)
  2286. if notify:
  2287. self._notify_about_instance_usage(context, instance,
  2288. "shutdown.end")
  2289. compute_utils.notify_about_instance_action(context, instance,
  2290. self.host, action=fields.NotificationAction.SHUTDOWN,
  2291. phase=fields.NotificationPhase.END, bdms=bdms)
  2292. def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
  2293. detach=True):
  2294. exc_info = None
  2295. for bdm in bdms:
  2296. if detach and bdm.volume_id:
  2297. try:
  2298. LOG.debug("Detaching volume: %s", bdm.volume_id,
  2299. instance_uuid=instance.uuid)
  2300. destroy = bdm.delete_on_termination
  2301. self._detach_volume(context, bdm, instance,
  2302. destroy_bdm=destroy)
  2303. except Exception as exc:
  2304. exc_info = sys.exc_info()
  2305. LOG.warning('Failed to detach volume: %(volume_id)s '
  2306. 'due to %(exc)s',
  2307. {'volume_id': bdm.volume_id, 'exc': exc})
  2308. if bdm.volume_id and bdm.delete_on_termination:
  2309. try:
  2310. LOG.debug("Deleting volume: %s", bdm.volume_id,
  2311. instance_uuid=instance.uuid)
  2312. self.volume_api.delete(context, bdm.volume_id)
  2313. except Exception as exc:
  2314. exc_info = sys.exc_info()
  2315. LOG.warning('Failed to delete volume: %(volume_id)s '
  2316. 'due to %(exc)s',
  2317. {'volume_id': bdm.volume_id, 'exc': exc})
  2318. if exc_info is not None and raise_exc:
  2319. six.reraise(exc_info[0], exc_info[1], exc_info[2])
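# Illustrative: a volume BDM with delete_on_termination=True is first
# detached (when detach=True) and then deleted; failures are logged and,
# when raise_exc=True, the most recently captured exception is re-raised
# at the end.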
  2320. @hooks.add_hook("delete_instance")
  2321. def _delete_instance(self, context, instance, bdms):
  2322. """Delete an instance on this host.
  2323. :param context: nova request context
  2324. :param instance: nova.objects.instance.Instance object
  2325. :param bdms: nova.objects.block_device.BlockDeviceMappingList object
  2326. """
  2327. events = self.instance_events.clear_events_for_instance(instance)
  2328. if events:
  2329. LOG.debug('Events pending at deletion: %(events)s',
  2330. {'events': ','.join(events.keys())},
  2331. instance=instance)
  2332. self._notify_about_instance_usage(context, instance,
  2333. "delete.start")
  2334. compute_utils.notify_about_instance_action(context, instance,
  2335. self.host, action=fields.NotificationAction.DELETE,
  2336. phase=fields.NotificationPhase.START, bdms=bdms)
  2337. self._shutdown_instance(context, instance, bdms)
  2338. # NOTE(dims): instance.info_cache.delete() should be called after
  2339. # _shutdown_instance in the compute manager as shutdown calls
  2340. # deallocate_for_instance so the info_cache is still needed
  2341. # at this point.
  2342. if instance.info_cache is not None:
  2343. instance.info_cache.delete()
  2344. else:
  2345. # NOTE(yoshimatsu): Avoid AttributeError if instance.info_cache
2346. # is None. When the root cause of instance.info_cache becoming
2347. # None is fixed, the log level should be reconsidered.
  2348. LOG.warning("Info cache for instance could not be found. "
  2349. "Ignore.", instance=instance)
  2350. # NOTE(vish): We have already deleted the instance, so we have
  2351. # to ignore problems cleaning up the volumes. It
  2352. # would be nice to let the user know somehow that
  2353. # the volume deletion failed, but it is not
  2354. # acceptable to have an instance that can not be
  2355. # deleted. Perhaps this could be reworked in the
  2356. # future to set an instance fault the first time
  2357. # and to only ignore the failure if the instance
  2358. # is already in ERROR.
2359. # NOTE(ameeda): The volumes were already detached during the above
2360. # _shutdown_instance() call, which is why
2361. # detach is not requested from _cleanup_volumes()
2362. # in this case.
  2363. self._cleanup_volumes(context, instance, bdms,
  2364. raise_exc=False, detach=False)
  2365. # if a delete task succeeded, always update vm state and task
  2366. # state without expecting task state to be DELETING
  2367. instance.vm_state = vm_states.DELETED
  2368. instance.task_state = None
  2369. instance.power_state = power_state.NOSTATE
  2370. instance.terminated_at = timeutils.utcnow()
  2371. instance.save()
  2372. system_meta = instance.system_metadata
  2373. instance.destroy()
  2374. self._complete_deletion(context,
  2375. instance,
  2376. bdms,
  2377. system_meta)
  2378. @wrap_exception()
  2379. @reverts_task_state
  2380. @wrap_instance_event(prefix='compute')
  2381. @wrap_instance_fault
  2382. def terminate_instance(self, context, instance, bdms):
  2383. """Terminate an instance on this host."""
  2384. @utils.synchronized(instance.uuid)
  2385. def do_terminate_instance(instance, bdms):
  2386. # NOTE(mriedem): If we are deleting the instance while it was
  2387. # booting from volume, we could be racing with a database update of
  2388. # the BDM volume_id. Since the compute API passes the BDMs over RPC
  2389. # to compute here, the BDMs may be stale at this point. So check
  2390. # for any volume BDMs that don't have volume_id set and if we
  2391. # detect that, we need to refresh the BDM list before proceeding.
  2392. # TODO(mriedem): Move this into _delete_instance and make the bdms
  2393. # parameter optional.
  2394. for bdm in list(bdms):
  2395. if bdm.is_volume and not bdm.volume_id:
  2396. LOG.debug('There are potentially stale BDMs during '
  2397. 'delete, refreshing the BlockDeviceMappingList.',
  2398. instance=instance)
  2399. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2400. context, instance.uuid)
  2401. break
  2402. try:
  2403. self._delete_instance(context, instance, bdms)
  2404. except exception.InstanceNotFound:
  2405. LOG.info("Instance disappeared during terminate",
  2406. instance=instance)
  2407. except Exception:
2408. # As we're trying to delete, always go to ERROR if something
2409. # goes wrong that _delete_instance can't handle.
  2410. with excutils.save_and_reraise_exception():
  2411. LOG.exception('Setting instance vm_state to ERROR',
  2412. instance=instance)
  2413. self._set_instance_obj_error_state(context, instance)
  2414. do_terminate_instance(instance, bdms)
  2415. # NOTE(johannes): This is probably better named power_off_instance
  2416. # so it matches the driver method, but because of other issues, we
  2417. # can't use that name in grizzly.
  2418. @wrap_exception()
  2419. @reverts_task_state
  2420. @wrap_instance_event(prefix='compute')
  2421. @wrap_instance_fault
  2422. def stop_instance(self, context, instance, clean_shutdown):
  2423. """Stopping an instance on this host."""
  2424. @utils.synchronized(instance.uuid)
  2425. def do_stop_instance():
  2426. current_power_state = self._get_power_state(context, instance)
  2427. LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
  2428. 'current task_state: %(task_state)s, current DB '
  2429. 'power_state: %(db_power_state)s, current VM '
  2430. 'power_state: %(current_power_state)s',
  2431. {'vm_state': instance.vm_state,
  2432. 'task_state': instance.task_state,
  2433. 'db_power_state': instance.power_state,
  2434. 'current_power_state': current_power_state},
  2435. instance_uuid=instance.uuid)
  2436. # NOTE(mriedem): If the instance is already powered off, we are
  2437. # possibly tearing down and racing with other operations, so we can
  2438. # expect the task_state to be None if something else updates the
  2439. # instance and we're not locking it.
  2440. expected_task_state = [task_states.POWERING_OFF]
  2441. # The list of power states is from _sync_instance_power_state.
  2442. if current_power_state in (power_state.NOSTATE,
  2443. power_state.SHUTDOWN,
  2444. power_state.CRASHED):
  2445. LOG.info('Instance is already powered off in the '
  2446. 'hypervisor when stop is called.',
  2447. instance=instance)
  2448. expected_task_state.append(None)
  2449. self._notify_about_instance_usage(context, instance,
  2450. "power_off.start")
  2451. compute_utils.notify_about_instance_action(context, instance,
  2452. self.host, action=fields.NotificationAction.POWER_OFF,
  2453. phase=fields.NotificationPhase.START)
  2454. self._power_off_instance(context, instance, clean_shutdown)
  2455. instance.power_state = self._get_power_state(context, instance)
  2456. instance.vm_state = vm_states.STOPPED
  2457. instance.task_state = None
  2458. instance.save(expected_task_state=expected_task_state)
  2459. self._notify_about_instance_usage(context, instance,
  2460. "power_off.end")
  2461. compute_utils.notify_about_instance_action(context, instance,
  2462. self.host, action=fields.NotificationAction.POWER_OFF,
  2463. phase=fields.NotificationPhase.END)
  2464. do_stop_instance()
  2465. def _power_on(self, context, instance):
  2466. network_info = self.network_api.get_instance_nw_info(context, instance)
  2467. block_device_info = self._get_instance_block_device_info(context,
  2468. instance)
  2469. self.driver.power_on(context, instance,
  2470. network_info,
  2471. block_device_info)
  2472. def _delete_snapshot_of_shelved_instance(self, context, instance,
  2473. snapshot_id):
  2474. """Delete snapshot of shelved instance."""
  2475. try:
  2476. self.image_api.delete(context, snapshot_id)
  2477. except (exception.ImageNotFound,
  2478. exception.ImageNotAuthorized) as exc:
  2479. LOG.warning("Failed to delete snapshot "
  2480. "from shelved instance (%s).",
  2481. exc.format_message(), instance=instance)
  2482. except Exception:
  2483. LOG.exception("Something wrong happened when trying to "
  2484. "delete snapshot from shelved instance.",
  2485. instance=instance)
  2486. # NOTE(johannes): This is probably better named power_on_instance
  2487. # so it matches the driver method, but because of other issues, we
  2488. # can't use that name in grizzly.
  2489. @wrap_exception()
  2490. @reverts_task_state
  2491. @wrap_instance_event(prefix='compute')
  2492. @wrap_instance_fault
  2493. def start_instance(self, context, instance):
  2494. """Starting an instance on this host."""
  2495. self._notify_about_instance_usage(context, instance, "power_on.start")
  2496. compute_utils.notify_about_instance_action(context, instance,
  2497. self.host, action=fields.NotificationAction.POWER_ON,
  2498. phase=fields.NotificationPhase.START)
  2499. self._power_on(context, instance)
  2500. instance.power_state = self._get_power_state(context, instance)
  2501. instance.vm_state = vm_states.ACTIVE
  2502. instance.task_state = None
2503. # Delete an image (VM snapshot) for a shelved instance
  2504. snapshot_id = instance.system_metadata.get('shelved_image_id')
  2505. if snapshot_id:
  2506. self._delete_snapshot_of_shelved_instance(context, instance,
  2507. snapshot_id)
  2508. # Delete system_metadata for a shelved instance
  2509. compute_utils.remove_shelved_keys_from_system_metadata(instance)
  2510. instance.save(expected_task_state=task_states.POWERING_ON)
  2511. self._notify_about_instance_usage(context, instance, "power_on.end")
  2512. compute_utils.notify_about_instance_action(context, instance,
  2513. self.host, action=fields.NotificationAction.POWER_ON,
  2514. phase=fields.NotificationPhase.END)
  2515. @messaging.expected_exceptions(NotImplementedError,
  2516. exception.TriggerCrashDumpNotSupported,
  2517. exception.InstanceNotRunning)
  2518. @wrap_exception()
  2519. @wrap_instance_event(prefix='compute')
  2520. @wrap_instance_fault
  2521. def trigger_crash_dump(self, context, instance):
  2522. """Trigger crash dump in an instance."""
  2523. self._notify_about_instance_usage(context, instance,
  2524. "trigger_crash_dump.start")
  2525. compute_utils.notify_about_instance_action(context, instance,
  2526. self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
  2527. phase=fields.NotificationPhase.START)
  2528. # This method does not change task_state and power_state because the
2529. # effect of a trigger depends on the user's configuration.
  2530. self.driver.trigger_crash_dump(instance)
  2531. self._notify_about_instance_usage(context, instance,
  2532. "trigger_crash_dump.end")
  2533. compute_utils.notify_about_instance_action(context, instance,
  2534. self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
  2535. phase=fields.NotificationPhase.END)
  2536. @wrap_exception()
  2537. @reverts_task_state
  2538. @wrap_instance_event(prefix='compute')
  2539. @wrap_instance_fault
  2540. def soft_delete_instance(self, context, instance):
  2541. """Soft delete an instance on this host."""
  2542. with compute_utils.notify_about_instance_delete(
  2543. self.notifier, context, instance, 'soft_delete'):
  2544. compute_utils.notify_about_instance_action(context, instance,
  2545. self.host, action=fields.NotificationAction.SOFT_DELETE,
  2546. phase=fields.NotificationPhase.START)
  2547. try:
  2548. self.driver.soft_delete(instance)
  2549. except NotImplementedError:
  2550. # Fallback to just powering off the instance if the
  2551. # hypervisor doesn't implement the soft_delete method
  2552. self.driver.power_off(instance)
  2553. instance.power_state = self._get_power_state(context, instance)
  2554. instance.vm_state = vm_states.SOFT_DELETED
  2555. instance.task_state = None
  2556. instance.save(expected_task_state=[task_states.SOFT_DELETING])
  2557. compute_utils.notify_about_instance_action(
  2558. context, instance, self.host,
  2559. action=fields.NotificationAction.SOFT_DELETE,
  2560. phase=fields.NotificationPhase.END)
  2561. @wrap_exception()
  2562. @reverts_task_state
  2563. @wrap_instance_event(prefix='compute')
  2564. @wrap_instance_fault
  2565. def restore_instance(self, context, instance):
  2566. """Restore a soft-deleted instance on this host."""
  2567. self._notify_about_instance_usage(context, instance, "restore.start")
  2568. compute_utils.notify_about_instance_action(context, instance,
  2569. self.host, action=fields.NotificationAction.RESTORE,
  2570. phase=fields.NotificationPhase.START)
  2571. try:
  2572. self.driver.restore(instance)
  2573. except NotImplementedError:
  2574. # Fallback to just powering on the instance if the hypervisor
  2575. # doesn't implement the restore method
  2576. self._power_on(context, instance)
  2577. instance.power_state = self._get_power_state(context, instance)
  2578. instance.vm_state = vm_states.ACTIVE
  2579. instance.task_state = None
  2580. instance.save(expected_task_state=task_states.RESTORING)
  2581. self._notify_about_instance_usage(context, instance, "restore.end")
  2582. compute_utils.notify_about_instance_action(context, instance,
  2583. self.host, action=fields.NotificationAction.RESTORE,
  2584. phase=fields.NotificationPhase.END)
  2585. @staticmethod
  2586. def _set_migration_status(migration, status):
  2587. """Set the status, and guard against a None being passed in.
  2588. This is useful as some of the compute RPC calls will not pass
  2589. a migration object in older versions. The check can be removed when
  2590. we move past 4.x major version of the RPC API.
  2591. """
  2592. if migration:
  2593. migration.status = status
  2594. migration.save()
  2595. def _rebuild_default_impl(self, context, instance, image_meta,
  2596. injected_files, admin_password, allocations,
  2597. bdms, detach_block_devices, attach_block_devices,
  2598. network_info=None,
  2599. recreate=False, block_device_info=None,
  2600. preserve_ephemeral=False):
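# Outline of the default implementation below: for an evacuate
# (recreate=True) only the block devices are detached, since the source
# host is presumed unusable; otherwise the instance is powered off,
# detached and destroyed locally. In both cases the block devices are
# then re-attached and the guest is spawned again from image_meta, with
# task_state moving through REBUILD_BLOCK_DEVICE_MAPPING and
# REBUILD_SPAWNING along the way.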
  2601. if preserve_ephemeral:
  2602. # The default code path does not support preserving ephemeral
  2603. # partitions.
  2604. raise exception.PreserveEphemeralNotSupported()
  2605. if recreate:
  2606. detach_block_devices(context, bdms)
  2607. else:
  2608. self._power_off_instance(context, instance, clean_shutdown=True)
  2609. detach_block_devices(context, bdms)
  2610. self.driver.destroy(context, instance,
  2611. network_info=network_info,
  2612. block_device_info=block_device_info)
  2613. instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
  2614. instance.save(expected_task_state=[task_states.REBUILDING])
  2615. new_block_device_info = attach_block_devices(context, instance, bdms)
  2616. instance.task_state = task_states.REBUILD_SPAWNING
  2617. instance.save(
  2618. expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
  2619. with instance.mutated_migration_context():
  2620. self.driver.spawn(context, instance, image_meta, injected_files,
  2621. admin_password, allocations,
  2622. network_info=network_info,
  2623. block_device_info=new_block_device_info)
  2624. def _notify_instance_rebuild_error(self, context, instance, error, bdms):
  2625. self._notify_about_instance_usage(context, instance,
  2626. 'rebuild.error', fault=error)
  2627. compute_utils.notify_about_instance_action(
  2628. context, instance, self.host,
  2629. action=fields.NotificationAction.REBUILD,
  2630. phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)
  2631. @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
  2632. @wrap_exception()
  2633. @reverts_task_state
  2634. @wrap_instance_event(prefix='compute')
  2635. @wrap_instance_fault
  2636. def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
  2637. injected_files, new_pass, orig_sys_metadata,
  2638. bdms, recreate, on_shared_storage,
  2639. preserve_ephemeral, migration,
  2640. scheduled_node, limits, request_spec):
  2641. """Destroy and re-make this instance.
  2642. A 'rebuild' effectively purges all existing data from the system and
2643. remakes the VM with the given 'metadata' and 'personalities'.
  2644. :param context: `nova.RequestContext` object
  2645. :param instance: Instance object
  2646. :param orig_image_ref: Original image_ref before rebuild
  2647. :param image_ref: New image_ref for rebuild
  2648. :param injected_files: Files to inject
  2649. :param new_pass: password to set on rebuilt instance
  2650. :param orig_sys_metadata: instance system metadata from pre-rebuild
  2651. :param bdms: block-device-mappings to use for rebuild
  2652. :param recreate: True if the instance is being recreated (e.g. the
  2653. hypervisor it was on failed) - cleanup of old state will be
  2654. skipped.
  2655. :param on_shared_storage: True if instance files on shared storage.
  2656. If not provided then information from the
  2657. driver will be used to decide if the instance
  2658. files are available or not on the target host
  2659. :param preserve_ephemeral: True if the default ephemeral storage
  2660. partition must be preserved on rebuild
  2661. :param migration: a Migration object if one was created for this
  2662. rebuild operation (if it's a part of evacuate)
  2663. :param scheduled_node: A node of the host chosen by the scheduler. If a
  2664. host was specified by the user, this will be
  2665. None
  2666. :param limits: Overcommit limits set by the scheduler. If a host was
  2667. specified by the user, this will be None
  2668. :param request_spec: a RequestSpec object used to schedule the instance
  2669. """
  2670. context = context.elevated()
  2671. LOG.info("Rebuilding instance", instance=instance)
  2672. rt = self._get_resource_tracker()
  2673. if recreate:
  2674. # This is an evacuation to a new host, so we need to perform a
  2675. # resource claim.
  2676. rebuild_claim = rt.rebuild_claim
  2677. else:
  2678. # This is a rebuild to the same host, so we don't need to make
  2679. # a claim since the instance is already on this host.
  2680. rebuild_claim = claims.NopClaim
  2681. image_meta = {}
  2682. if image_ref:
  2683. image_meta = self.image_api.get(context, image_ref)
  2684. # NOTE(mriedem): On a recreate (evacuate), we need to update
2685. # the instance's host and node properties to reflect its
  2686. # destination node for the recreate.
  2687. if not scheduled_node:
  2688. if recreate:
  2689. try:
  2690. compute_node = self._get_compute_info(context, self.host)
  2691. scheduled_node = compute_node.hypervisor_hostname
  2692. except exception.ComputeHostNotFound:
  2693. LOG.exception('Failed to get compute_info for %s',
  2694. self.host)
  2695. else:
  2696. scheduled_node = instance.node
  2697. with self._error_out_instance_on_exception(context, instance):
  2698. try:
  2699. claim_ctxt = rebuild_claim(
  2700. context, instance, scheduled_node,
  2701. limits=limits, image_meta=image_meta,
  2702. migration=migration)
  2703. self._do_rebuild_instance_with_claim(
  2704. claim_ctxt, context, instance, orig_image_ref,
  2705. image_ref, injected_files, new_pass, orig_sys_metadata,
  2706. bdms, recreate, on_shared_storage, preserve_ephemeral,
  2707. migration, request_spec)
  2708. except (exception.ComputeResourcesUnavailable,
  2709. exception.RescheduledException) as e:
  2710. if isinstance(e, exception.ComputeResourcesUnavailable):
  2711. LOG.debug("Could not rebuild instance on this host, not "
  2712. "enough resources available.", instance=instance)
  2713. else:
  2714. # RescheduledException is raised by the late server group
  2715. # policy check during evacuation if a parallel scheduling
  2716. # violated the policy.
  2717. # We catch the RescheduledException here but we don't have
  2718. # the plumbing to do an actual reschedule so we abort the
  2719. # operation.
  2720. LOG.debug("Could not rebuild instance on this host, "
  2721. "late server group check failed.",
  2722. instance=instance)
  2723. # NOTE(ndipanov): We just abort the build for now and leave a
  2724. # migration record for potential cleanup later
  2725. self._set_migration_status(migration, 'failed')
  2726. # Since the claim failed, we need to remove the allocation
  2727. # created against the destination node. Note that we can only
  2728. # get here when evacuating to a destination node. Rebuilding
  2729. # on the same host (not evacuate) uses the NopClaim which will
  2730. # not raise ComputeResourcesUnavailable.
  2731. rt.delete_allocation_for_evacuated_instance(
  2732. context, instance, scheduled_node, node_type='destination')
  2733. self._notify_instance_rebuild_error(context, instance, e, bdms)
  2734. raise exception.BuildAbortException(
  2735. instance_uuid=instance.uuid, reason=e.format_message())
  2736. except (exception.InstanceNotFound,
  2737. exception.UnexpectedDeletingTaskStateError) as e:
  2738. LOG.debug('Instance was deleted while rebuilding',
  2739. instance=instance)
  2740. self._set_migration_status(migration, 'failed')
  2741. self._notify_instance_rebuild_error(context, instance, e, bdms)
  2742. except Exception as e:
  2743. self._set_migration_status(migration, 'failed')
  2744. if recreate or scheduled_node is not None:
  2745. rt.delete_allocation_for_evacuated_instance(
  2746. context, instance, scheduled_node,
  2747. node_type='destination')
  2748. self._notify_instance_rebuild_error(context, instance, e, bdms)
  2749. raise
  2750. else:
  2751. instance.apply_migration_context()
  2752. # NOTE (ndipanov): This save will now update the host and node
  2753. # attributes making sure that next RT pass is consistent since
  2754. # it will be based on the instance and not the migration DB
  2755. # entry.
  2756. instance.host = self.host
  2757. instance.node = scheduled_node
  2758. instance.save()
  2759. instance.drop_migration_context()
  2760. # NOTE (ndipanov): Mark the migration as done only after we
  2761. # mark the instance as belonging to this host.
  2762. self._set_migration_status(migration, 'done')
  2763. def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
  2764. """Helper to avoid deep nesting in the top-level method."""
  2765. with claim_context:
  2766. self._do_rebuild_instance(*args, **kwargs)
  2767. @staticmethod
  2768. def _get_image_name(image_meta):
  2769. if image_meta.obj_attr_is_set("name"):
  2770. return image_meta.name
  2771. else:
  2772. return ''
  2773. def _do_rebuild_instance(self, context, instance, orig_image_ref,
  2774. image_ref, injected_files, new_pass,
  2775. orig_sys_metadata, bdms, recreate,
  2776. on_shared_storage, preserve_ephemeral,
  2777. migration, request_spec):
  2778. orig_vm_state = instance.vm_state
  2779. if recreate:
  2780. if request_spec:
  2781. # NOTE(gibi): Do a late check of server group policy as
  2782. # parallel scheduling could violate such policy. This will
  2783. # cause the evacuate to fail as rebuild does not implement
  2784. # reschedule.
  2785. hints = self._get_scheduler_hints({}, request_spec)
  2786. self._validate_instance_group_policy(context, instance, hints)
  2787. if not self.driver.capabilities["supports_recreate"]:
  2788. raise exception.InstanceRecreateNotSupported
  2789. self._check_instance_exists(context, instance)
  2790. if on_shared_storage is None:
2791. LOG.debug('on_shared_storage is not provided, using driver '
2792. 'information to decide if the instance needs to '
2793. 'be recreated')
  2794. on_shared_storage = self.driver.instance_on_disk(instance)
  2795. elif (on_shared_storage !=
  2796. self.driver.instance_on_disk(instance)):
  2797. # To cover case when admin expects that instance files are
  2798. # on shared storage, but not accessible and vice versa
  2799. raise exception.InvalidSharedStorage(
  2800. _("Invalid state of instance files on shared"
  2801. " storage"))
  2802. if on_shared_storage:
  2803. LOG.info('disk on shared storage, recreating using'
  2804. ' existing disk')
  2805. else:
  2806. image_ref = orig_image_ref = instance.image_ref
  2807. LOG.info("disk not on shared storage, rebuilding from:"
  2808. " '%s'", str(image_ref))
  2809. if image_ref:
  2810. image_meta = objects.ImageMeta.from_image_ref(
  2811. context, self.image_api, image_ref)
  2812. else:
  2813. image_meta = instance.image_meta
  2814. # This instance.exists message should contain the original
  2815. # image_ref, not the new one. Since the DB has been updated
  2816. # to point to the new one... we have to override it.
  2817. orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
  2818. context)
  2819. extra_usage_info = {'image_ref_url': orig_image_ref_url}
  2820. compute_utils.notify_usage_exists(
  2821. self.notifier, context, instance,
  2822. current_period=True, system_metadata=orig_sys_metadata,
  2823. extra_usage_info=extra_usage_info)
  2824. # This message should contain the new image_ref
  2825. extra_usage_info = {'image_name': self._get_image_name(image_meta)}
  2826. self._notify_about_instance_usage(context, instance,
  2827. "rebuild.start", extra_usage_info=extra_usage_info)
  2828. # NOTE: image_name is not included in the versioned notification
  2829. # because we already provide the image_uuid in the notification
  2830. # payload and the image details can be looked up via the uuid.
  2831. compute_utils.notify_about_instance_action(
  2832. context, instance, self.host,
  2833. action=fields.NotificationAction.REBUILD,
  2834. phase=fields.NotificationPhase.START,
  2835. bdms=bdms)
  2836. instance.power_state = self._get_power_state(context, instance)
  2837. instance.task_state = task_states.REBUILDING
  2838. instance.save(expected_task_state=[task_states.REBUILDING])
  2839. if recreate:
  2840. self.network_api.setup_networks_on_host(
  2841. context, instance, self.host)
  2842. # For nova-network this is needed to move floating IPs
  2843. # For neutron this updates the host in the port binding
  2844. # TODO(cfriesen): this network_api call and the one above
  2845. # are so similar, we should really try to unify them.
  2846. self.network_api.setup_instance_network_on_host(
  2847. context, instance, self.host, migration)
  2848. # TODO(mriedem): Consider decorating setup_instance_network_on_host
  2849. # with @base_api.refresh_cache and then we wouldn't need this
  2850. # explicit call to get_instance_nw_info.
  2851. network_info = self.network_api.get_instance_nw_info(context,
  2852. instance)
  2853. else:
  2854. network_info = instance.get_network_info()
  2855. allocations = self.reportclient.get_allocations_for_consumer(
  2856. context, instance.uuid)
  2857. if bdms is None:
  2858. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2859. context, instance.uuid)
  2860. block_device_info = \
  2861. self._get_instance_block_device_info(
  2862. context, instance, bdms=bdms)
  2863. def detach_block_devices(context, bdms):
  2864. for bdm in bdms:
  2865. if bdm.is_volume:
  2866. # NOTE (ildikov): Having the attachment_id set in the BDM
  2867. # means that it's the new Cinder attach/detach flow
  2868. # (available from v3.44). In that case we explicitly
  2869. # attach and detach the volumes through attachment level
  2870. # operations. In this scenario _detach_volume will delete
  2871. # the existing attachment which would make the volume
  2872. # status change to 'available' if we don't pre-create
  2873. # another empty attachment before deleting the old one.
  2874. attachment_id = None
  2875. if bdm.attachment_id:
  2876. attachment_id = self.volume_api.attachment_create(
  2877. context, bdm['volume_id'], instance.uuid)['id']
  2878. self._detach_volume(context, bdm, instance,
  2879. destroy_bdm=False)
  2880. if attachment_id:
  2881. bdm.attachment_id = attachment_id
  2882. bdm.save()
  2883. files = self._decode_files(injected_files)
  2884. kwargs = dict(
  2885. context=context,
  2886. instance=instance,
  2887. image_meta=image_meta,
  2888. injected_files=files,
  2889. admin_password=new_pass,
  2890. allocations=allocations,
  2891. bdms=bdms,
  2892. detach_block_devices=detach_block_devices,
  2893. attach_block_devices=self._prep_block_device,
  2894. block_device_info=block_device_info,
  2895. network_info=network_info,
  2896. preserve_ephemeral=preserve_ephemeral,
  2897. recreate=recreate)
  2898. try:
  2899. with instance.mutated_migration_context():
  2900. self.driver.rebuild(**kwargs)
  2901. except NotImplementedError:
  2902. # NOTE(rpodolyaka): driver doesn't provide specialized version
  2903. # of rebuild, fall back to the default implementation
  2904. self._rebuild_default_impl(**kwargs)
  2905. self._update_instance_after_spawn(context, instance)
  2906. instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
  2907. if orig_vm_state == vm_states.STOPPED:
  2908. LOG.info("bringing vm to original state: '%s'",
  2909. orig_vm_state, instance=instance)
  2910. instance.vm_state = vm_states.ACTIVE
  2911. instance.task_state = task_states.POWERING_OFF
  2912. instance.progress = 0
  2913. instance.save()
  2914. self.stop_instance(context, instance, False)
  2915. self._update_scheduler_instance_info(context, instance)
  2916. self._notify_about_instance_usage(
  2917. context, instance, "rebuild.end",
  2918. network_info=network_info,
  2919. extra_usage_info=extra_usage_info)
  2920. compute_utils.notify_about_instance_action(
  2921. context, instance, self.host,
  2922. action=fields.NotificationAction.REBUILD,
  2923. phase=fields.NotificationPhase.END,
  2924. bdms=bdms)
  2925. def _handle_bad_volumes_detached(self, context, instance, bad_devices,
  2926. block_device_info):
  2927. """Handle cases where the virt-layer had to detach non-working volumes
  2928. in order to complete an operation.
  2929. """
  2930. for bdm in block_device_info['block_device_mapping']:
  2931. if bdm.get('mount_device') in bad_devices:
  2932. try:
  2933. volume_id = bdm['connection_info']['data']['volume_id']
  2934. except KeyError:
  2935. continue
  2936. # NOTE(sirp): ideally we'd just call
  2937. # `compute_api.detach_volume` here but since that hits the
  2938. # DB directly, that's off limits from within the
  2939. # compute-manager.
  2940. #
  2941. # API-detach
  2942. LOG.info("Detaching from volume api: %s", volume_id)
  2943. self.volume_api.begin_detaching(context, volume_id)
  2944. # Manager-detach
  2945. self.detach_volume(context, volume_id, instance)
  2946. @wrap_exception()
  2947. @reverts_task_state
  2948. @wrap_instance_event(prefix='compute')
  2949. @wrap_instance_fault
  2950. def reboot_instance(self, context, instance, block_device_info,
  2951. reboot_type):
  2952. @utils.synchronized(instance.uuid)
  2953. def do_reboot_instance(context, instance, block_device_info,
  2954. reboot_type):
  2955. self._reboot_instance(context, instance, block_device_info,
  2956. reboot_type)
  2957. do_reboot_instance(context, instance, block_device_info, reboot_type)
  2958. def _reboot_instance(self, context, instance, block_device_info,
  2959. reboot_type):
  2960. """Reboot an instance on this host."""
  2961. # acknowledge the request made it to the manager
  2962. if reboot_type == "SOFT":
  2963. instance.task_state = task_states.REBOOT_PENDING
  2964. expected_states = task_states.soft_reboot_states
  2965. else:
  2966. instance.task_state = task_states.REBOOT_PENDING_HARD
  2967. expected_states = task_states.hard_reboot_states
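# (The matching REBOOT_STARTED / REBOOT_STARTED_HARD transition happens
# further down, just before self.driver.reboot() is invoked.)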
  2968. context = context.elevated()
  2969. LOG.info("Rebooting instance", instance=instance)
  2970. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2971. context, instance.uuid)
  2972. block_device_info = self._get_instance_block_device_info(
  2973. context, instance, bdms=bdms)
  2974. network_info = self.network_api.get_instance_nw_info(context, instance)
  2975. self._notify_about_instance_usage(context, instance, "reboot.start")
  2976. compute_utils.notify_about_instance_action(
  2977. context, instance, self.host,
  2978. action=fields.NotificationAction.REBOOT,
  2979. phase=fields.NotificationPhase.START,
  2980. bdms=bdms
  2981. )
  2982. instance.power_state = self._get_power_state(context, instance)
  2983. instance.save(expected_task_state=expected_states)
  2984. if instance.power_state != power_state.RUNNING:
  2985. state = instance.power_state
  2986. running = power_state.RUNNING
  2987. LOG.warning('trying to reboot a non-running instance:'
  2988. ' (state: %(state)s expected: %(running)s)',
  2989. {'state': state, 'running': running},
  2990. instance=instance)
  2991. def bad_volumes_callback(bad_devices):
  2992. self._handle_bad_volumes_detached(
  2993. context, instance, bad_devices, block_device_info)
  2994. try:
  2995. # Don't change it out of rescue mode
  2996. if instance.vm_state == vm_states.RESCUED:
  2997. new_vm_state = vm_states.RESCUED
  2998. else:
  2999. new_vm_state = vm_states.ACTIVE
  3000. new_power_state = None
  3001. if reboot_type == "SOFT":
  3002. instance.task_state = task_states.REBOOT_STARTED
  3003. expected_state = task_states.REBOOT_PENDING
  3004. else:
  3005. instance.task_state = task_states.REBOOT_STARTED_HARD
  3006. expected_state = task_states.REBOOT_PENDING_HARD
  3007. instance.save(expected_task_state=expected_state)
  3008. self.driver.reboot(context, instance,
  3009. network_info,
  3010. reboot_type,
  3011. block_device_info=block_device_info,
  3012. bad_volumes_callback=bad_volumes_callback)
  3013. except Exception as error:
  3014. with excutils.save_and_reraise_exception() as ctxt:
  3015. exc_info = sys.exc_info()
  3016. # if the reboot failed but the VM is running don't
  3017. # put it into an error state
  3018. new_power_state = self._get_power_state(context, instance)
  3019. if new_power_state == power_state.RUNNING:
  3020. LOG.warning('Reboot failed but instance is running',
  3021. instance=instance)
  3022. compute_utils.add_instance_fault_from_exc(context,
  3023. instance, error, exc_info)
  3024. self._notify_about_instance_usage(context, instance,
  3025. 'reboot.error', fault=error)
  3026. compute_utils.notify_about_instance_action(
  3027. context, instance, self.host,
  3028. action=fields.NotificationAction.REBOOT,
  3029. phase=fields.NotificationPhase.ERROR,
  3030. exception=error, bdms=bdms
  3031. )
  3032. ctxt.reraise = False
  3033. else:
  3034. LOG.error('Cannot reboot instance: %s', error,
  3035. instance=instance)
  3036. self._set_instance_obj_error_state(context, instance)
  3037. if not new_power_state:
  3038. new_power_state = self._get_power_state(context, instance)
  3039. try:
  3040. instance.power_state = new_power_state
  3041. instance.vm_state = new_vm_state
  3042. instance.task_state = None
  3043. instance.save()
  3044. except exception.InstanceNotFound:
  3045. LOG.warning("Instance disappeared during reboot",
  3046. instance=instance)
  3047. self._notify_about_instance_usage(context, instance, "reboot.end")
  3048. compute_utils.notify_about_instance_action(
  3049. context, instance, self.host,
  3050. action=fields.NotificationAction.REBOOT,
  3051. phase=fields.NotificationPhase.END,
  3052. bdms=bdms
  3053. )
  3054. @delete_image_on_error
  3055. def _do_snapshot_instance(self, context, image_id, instance):
  3056. self._snapshot_instance(context, image_id, instance,
  3057. task_states.IMAGE_BACKUP)
  3058. @wrap_exception()
  3059. @reverts_task_state
  3060. @wrap_instance_event(prefix='compute')
  3061. @wrap_instance_fault
  3062. def backup_instance(self, context, image_id, instance, backup_type,
  3063. rotation):
  3064. """Backup an instance on this host.
  3065. :param backup_type: daily | weekly
  3066. :param rotation: int representing how many backups to keep around
  3067. """
  3068. self._do_snapshot_instance(context, image_id, instance)
  3069. self._rotate_backups(context, instance, backup_type, rotation)
  3070. @wrap_exception()
  3071. @reverts_task_state
  3072. @wrap_instance_event(prefix='compute')
  3073. @wrap_instance_fault
  3074. @delete_image_on_error
  3075. def snapshot_instance(self, context, image_id, instance):
  3076. """Snapshot an instance on this host.
  3077. :param context: security context
  3078. :param image_id: glance.db.sqlalchemy.models.Image.Id
  3079. :param instance: a nova.objects.instance.Instance object
  3080. """
3081. # NOTE(dave-mcnally) The task state will already be set by the API,
3082. # but if the compute manager has crashed or been restarted before the
3083. # request got here, the task state may have been cleared, so we set
3084. # it again and things continue normally.
  3085. try:
  3086. instance.task_state = task_states.IMAGE_SNAPSHOT
  3087. instance.save(
  3088. expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
  3089. except exception.InstanceNotFound:
3090. # the instance possibly no longer exists; no point in continuing
  3091. LOG.debug("Instance not found, could not set state %s "
  3092. "for instance.",
  3093. task_states.IMAGE_SNAPSHOT, instance=instance)
  3094. return
  3095. except exception.UnexpectedDeletingTaskStateError:
  3096. LOG.debug("Instance being deleted, snapshot cannot continue",
  3097. instance=instance)
  3098. return
  3099. self._snapshot_instance(context, image_id, instance,
  3100. task_states.IMAGE_SNAPSHOT)
  3101. def _snapshot_instance(self, context, image_id, instance,
  3102. expected_task_state):
  3103. context = context.elevated()
  3104. instance.power_state = self._get_power_state(context, instance)
  3105. try:
  3106. instance.save()
  3107. LOG.info('instance snapshotting', instance=instance)
  3108. if instance.power_state != power_state.RUNNING:
  3109. state = instance.power_state
  3110. running = power_state.RUNNING
  3111. LOG.warning('trying to snapshot a non-running instance: '
  3112. '(state: %(state)s expected: %(running)s)',
  3113. {'state': state, 'running': running},
  3114. instance=instance)
  3115. self._notify_about_instance_usage(
  3116. context, instance, "snapshot.start")
  3117. compute_utils.notify_about_instance_snapshot(context, instance,
  3118. self.host, phase=fields.NotificationPhase.START,
  3119. snapshot_image_id=image_id)
  3120. def update_task_state(task_state,
  3121. expected_state=expected_task_state):
  3122. instance.task_state = task_state
  3123. instance.save(expected_task_state=expected_state)
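# update_task_state above is passed to the driver so it can advance the
# task_state as the snapshot progresses; by the time driver.snapshot()
# returns, the task_state is expected to be IMAGE_UPLOADING (see the
# save() call below).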
  3124. self.driver.snapshot(context, instance, image_id,
  3125. update_task_state)
  3126. instance.task_state = None
  3127. instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
  3128. self._notify_about_instance_usage(context, instance,
  3129. "snapshot.end")
  3130. compute_utils.notify_about_instance_snapshot(context, instance,
  3131. self.host, phase=fields.NotificationPhase.END,
  3132. snapshot_image_id=image_id)
  3133. except (exception.InstanceNotFound,
  3134. exception.UnexpectedDeletingTaskStateError):
  3135. # the instance got deleted during the snapshot
  3136. # Quickly bail out of here
  3137. msg = 'Instance disappeared during snapshot'
  3138. LOG.debug(msg, instance=instance)
  3139. try:
  3140. image = self.image_api.get(context, image_id)
  3141. if image['status'] != 'active':
  3142. self.image_api.delete(context, image_id)
  3143. except Exception:
  3144. LOG.warning("Error while trying to clean up image %s",
  3145. image_id, instance=instance)
  3146. except exception.ImageNotFound:
  3147. instance.task_state = None
  3148. instance.save()
  3149. LOG.warning("Image not found during snapshot", instance=instance)
  3150. def _post_interrupted_snapshot_cleanup(self, context, instance):
  3151. self.driver.post_interrupted_snapshot_cleanup(context, instance)
  3152. @messaging.expected_exceptions(NotImplementedError)
  3153. @wrap_exception()
  3154. def volume_snapshot_create(self, context, instance, volume_id,
  3155. create_info):
  3156. self.driver.volume_snapshot_create(context, instance, volume_id,
  3157. create_info)
  3158. @messaging.expected_exceptions(NotImplementedError)
  3159. @wrap_exception()
  3160. def volume_snapshot_delete(self, context, instance, volume_id,
  3161. snapshot_id, delete_info):
  3162. self.driver.volume_snapshot_delete(context, instance, volume_id,
  3163. snapshot_id, delete_info)
  3164. @wrap_instance_fault
  3165. def _rotate_backups(self, context, instance, backup_type, rotation):
  3166. """Delete excess backups associated to an instance.
  3167. Instances are allowed a fixed number of backups (the rotation number);
  3168. this method deletes the oldest backups that exceed the rotation
  3169. threshold.
  3170. :param context: security context
  3171. :param instance: Instance dict
  3172. :param backup_type: a user-defined type, like "daily" or "weekly" etc.
  3173. :param rotation: int representing how many backups to keep around;
  3174. None if rotation shouldn't be used (as in the case of snapshots)
  3175. """
  3176. filters = {'property-image_type': 'backup',
  3177. 'property-backup_type': backup_type,
  3178. 'property-instance_uuid': instance.uuid}
  3179. images = self.image_api.get_all(context, filters=filters,
  3180. sort_key='created_at', sort_dir='desc')
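# The listing is sorted by created_at in descending order, so the newest
# backups sit at the front of the list and images.pop() below removes
# the oldest ones first.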
  3181. num_images = len(images)
  3182. LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
  3183. {'num_images': num_images, 'rotation': rotation},
  3184. instance=instance)
  3185. if num_images > rotation:
  3186. # NOTE(sirp): this deletes all backups that exceed the rotation
  3187. # limit
  3188. excess = len(images) - rotation
  3189. LOG.debug("Rotating out %d backups", excess,
  3190. instance=instance)
  3191. for i in range(excess):
  3192. image = images.pop()
  3193. image_id = image['id']
  3194. LOG.debug("Deleting image %s", image_id,
  3195. instance=instance)
  3196. try:
  3197. self.image_api.delete(context, image_id)
  3198. except exception.ImageNotFound:
  3199. LOG.info("Failed to find image %(image_id)s to "
  3200. "delete", {'image_id': image_id},
  3201. instance=instance)
  3202. except (exception.ImageDeleteConflict, Exception) as exc:
  3203. LOG.info("Failed to delete image %(image_id)s during "
  3204. "deleting excess backups. "
  3205. "Continuing for next image.. %(exc)s",
  3206. {'image_id': image_id, 'exc': exc},
  3207. instance=instance)
  3208. @wrap_exception()
  3209. @reverts_task_state
  3210. @wrap_instance_event(prefix='compute')
  3211. @wrap_instance_fault
  3212. def set_admin_password(self, context, instance, new_pass):
  3213. """Set the root/admin password for an instance on this host.
  3214. This is generally only called by API password resets after an
  3215. image has been built.
3216. :param context: Nova auth context.
3217. :param instance: Nova instance object.
3218. :param new_pass: The admin password for the instance.
  3219. """
  3220. context = context.elevated()
  3221. if new_pass is None:
  3222. # Generate a random password
  3223. new_pass = utils.generate_password()
  3224. current_power_state = self._get_power_state(context, instance)
  3225. expected_state = power_state.RUNNING
  3226. if current_power_state != expected_state:
  3227. instance.task_state = None
  3228. instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
  3229. _msg = _('instance %s is not running') % instance.uuid
  3230. raise exception.InstancePasswordSetFailed(
  3231. instance=instance.uuid, reason=_msg)
  3232. try:
  3233. self.driver.set_admin_password(instance, new_pass)
  3234. LOG.info("Admin password set", instance=instance)
  3235. instance.task_state = None
  3236. instance.save(
  3237. expected_task_state=task_states.UPDATING_PASSWORD)
  3238. except exception.InstanceAgentNotEnabled:
  3239. with excutils.save_and_reraise_exception():
  3240. LOG.debug('Guest agent is not enabled for the instance.',
  3241. instance=instance)
  3242. instance.task_state = None
  3243. instance.save(
  3244. expected_task_state=task_states.UPDATING_PASSWORD)
  3245. except exception.SetAdminPasswdNotSupported:
  3246. with excutils.save_and_reraise_exception():
  3247. LOG.info('set_admin_password is not supported '
  3248. 'by this driver or guest instance.',
  3249. instance=instance)
  3250. instance.task_state = None
  3251. instance.save(
  3252. expected_task_state=task_states.UPDATING_PASSWORD)
  3253. except NotImplementedError:
  3254. LOG.warning('set_admin_password is not implemented '
  3255. 'by this driver or guest instance.',
  3256. instance=instance)
  3257. instance.task_state = None
  3258. instance.save(
  3259. expected_task_state=task_states.UPDATING_PASSWORD)
  3260. raise NotImplementedError(_('set_admin_password is not '
  3261. 'implemented by this driver or guest '
  3262. 'instance.'))
  3263. except exception.UnexpectedTaskStateError:
  3264. # interrupted by another (most likely delete) task
  3265. # do not retry
  3266. raise
  3267. except Exception:
  3268. # Catch all here because this could be anything.
  3269. LOG.exception('set_admin_password failed', instance=instance)
  3270. # We create a new exception here so that we won't
  3271. # potentially reveal password information to the
  3272. # API caller. The real exception is logged above
  3273. _msg = _('error setting admin password')
  3274. raise exception.InstancePasswordSetFailed(
  3275. instance=instance.uuid, reason=_msg)
  3276. @wrap_exception()
  3277. @reverts_task_state
  3278. @wrap_instance_fault
  3279. def inject_file(self, context, path, file_contents, instance):
  3280. """Write a file to the specified path in an instance on this host."""
  3281. # NOTE(russellb) Remove this method, as well as the underlying virt
  3282. # driver methods, when the compute rpc interface is bumped to 4.x
  3283. # as it is no longer used.
  3284. context = context.elevated()
  3285. current_power_state = self._get_power_state(context, instance)
  3286. expected_state = power_state.RUNNING
  3287. if current_power_state != expected_state:
3288. LOG.warning('trying to inject a file into a non-running instance '
  3289. '(state: %(current_state)s expected: '
  3290. '%(expected_state)s)',
  3291. {'current_state': current_power_state,
  3292. 'expected_state': expected_state},
  3293. instance=instance)
  3294. LOG.info('injecting file to %s', path, instance=instance)
  3295. self.driver.inject_file(instance, path, file_contents)
  3296. def _get_rescue_image(self, context, instance, rescue_image_ref=None):
  3297. """Determine what image should be used to boot the rescue VM."""
  3298. # 1. If rescue_image_ref is passed in, use that for rescue.
  3299. # 2. Else, use the base image associated with instance's current image.
  3300. # The idea here is to provide the customer with a rescue
  3301. # environment which they are familiar with.
  3302. # So, if they built their instance off of a Debian image,
  3303. # their rescue VM will also be Debian.
  3304. # 3. As a last resort, use instance's current image.
  3305. if not rescue_image_ref:
  3306. system_meta = utils.instance_sys_meta(instance)
  3307. rescue_image_ref = system_meta.get('image_base_image_ref')
  3308. if not rescue_image_ref:
  3309. LOG.warning('Unable to find a different image to use for '
  3310. 'rescue VM, using instance\'s current image',
  3311. instance=instance)
  3312. rescue_image_ref = instance.image_ref
  3313. return objects.ImageMeta.from_image_ref(
  3314. context, self.image_api, rescue_image_ref)
  3315. @wrap_exception()
  3316. @reverts_task_state
  3317. @wrap_instance_event(prefix='compute')
  3318. @wrap_instance_fault
  3319. def rescue_instance(self, context, instance, rescue_password,
  3320. rescue_image_ref, clean_shutdown):
  3321. context = context.elevated()
  3322. LOG.info('Rescuing', instance=instance)
  3323. admin_password = (rescue_password if rescue_password else
  3324. utils.generate_password())
  3325. network_info = self.network_api.get_instance_nw_info(context, instance)
  3326. rescue_image_meta = self._get_rescue_image(context, instance,
  3327. rescue_image_ref)
  3328. extra_usage_info = {'rescue_image_name':
  3329. self._get_image_name(rescue_image_meta)}
  3330. self._notify_about_instance_usage(context, instance,
  3331. "rescue.start", extra_usage_info=extra_usage_info,
  3332. network_info=network_info)
  3333. compute_utils.notify_about_instance_rescue_action(
  3334. context, instance, self.host, rescue_image_ref,
  3335. action=fields.NotificationAction.RESCUE,
  3336. phase=fields.NotificationPhase.START)
  3337. try:
  3338. self._power_off_instance(context, instance, clean_shutdown)
  3339. self.driver.rescue(context, instance,
  3340. network_info,
  3341. rescue_image_meta, admin_password)
  3342. except Exception as e:
  3343. LOG.exception("Error trying to Rescue Instance",
  3344. instance=instance)
  3345. self._set_instance_obj_error_state(context, instance)
  3346. raise exception.InstanceNotRescuable(
  3347. instance_id=instance.uuid,
  3348. reason=_("Driver Error: %s") % e)
  3349. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3350. current_period=True)
  3351. instance.vm_state = vm_states.RESCUED
  3352. instance.task_state = None
  3353. instance.power_state = self._get_power_state(context, instance)
  3354. instance.launched_at = timeutils.utcnow()
  3355. instance.save(expected_task_state=task_states.RESCUING)
  3356. self._notify_about_instance_usage(context, instance,
  3357. "rescue.end", extra_usage_info=extra_usage_info,
  3358. network_info=network_info)
  3359. compute_utils.notify_about_instance_rescue_action(
  3360. context, instance, self.host, rescue_image_ref,
  3361. action=fields.NotificationAction.RESCUE,
  3362. phase=fields.NotificationPhase.END)
  3363. @wrap_exception()
  3364. @reverts_task_state
  3365. @wrap_instance_event(prefix='compute')
  3366. @wrap_instance_fault
  3367. def unrescue_instance(self, context, instance):
  3368. context = context.elevated()
  3369. LOG.info('Unrescuing', instance=instance)
  3370. network_info = self.network_api.get_instance_nw_info(context, instance)
  3371. self._notify_about_instance_usage(context, instance,
  3372. "unrescue.start", network_info=network_info)
  3373. compute_utils.notify_about_instance_action(context, instance,
  3374. self.host, action=fields.NotificationAction.UNRESCUE,
  3375. phase=fields.NotificationPhase.START)
  3376. with self._error_out_instance_on_exception(context, instance):
  3377. self.driver.unrescue(instance,
  3378. network_info)
  3379. instance.vm_state = vm_states.ACTIVE
  3380. instance.task_state = None
  3381. instance.power_state = self._get_power_state(context, instance)
  3382. instance.save(expected_task_state=task_states.UNRESCUING)
  3383. self._notify_about_instance_usage(context,
  3384. instance,
  3385. "unrescue.end",
  3386. network_info=network_info)
  3387. compute_utils.notify_about_instance_action(context, instance,
  3388. self.host, action=fields.NotificationAction.UNRESCUE,
  3389. phase=fields.NotificationPhase.END)
  3390. @wrap_exception()
  3391. @wrap_instance_fault
  3392. def change_instance_metadata(self, context, diff, instance):
  3393. """Update the metadata published to the instance."""
  3394. LOG.debug("Changing instance metadata according to %r",
  3395. diff, instance=instance)
  3396. self.driver.change_instance_metadata(context, instance, diff)
  3397. @wrap_exception()
  3398. @wrap_instance_event(prefix='compute')
  3399. @errors_out_migration
  3400. @wrap_instance_fault
  3401. def confirm_resize(self, context, instance, migration):
  3402. """Confirms a migration/resize and deletes the 'old' instance.
  3403. This is called from the API and runs on the source host.
  3404. Nothing needs to happen on the destination host at this point since
  3405. the instance is already running there. This routine just cleans up the
  3406. source host.
  3407. """
  3408. @utils.synchronized(instance.uuid)
  3409. def do_confirm_resize(context, instance, migration_id):
3410. # NOTE(wangpan): Get the migration status from the DB; if it has been
3411. # confirmed, we do nothing and return here.
  3412. LOG.debug("Going to confirm migration %s", migration_id,
  3413. instance=instance)
  3414. try:
  3415. # TODO(russellb) Why are we sending the migration object just
  3416. # to turn around and look it up from the db again?
  3417. migration = objects.Migration.get_by_id(
  3418. context.elevated(), migration_id)
  3419. except exception.MigrationNotFound:
  3420. LOG.error("Migration %s is not found during confirmation",
  3421. migration_id, instance=instance)
  3422. return
  3423. if migration.status == 'confirmed':
  3424. LOG.info("Migration %s is already confirmed",
  3425. migration_id, instance=instance)
  3426. return
  3427. elif migration.status not in ('finished', 'confirming'):
  3428. LOG.warning("Unexpected confirmation status '%(status)s' "
  3429. "of migration %(id)s, exit confirmation process",
  3430. {"status": migration.status, "id": migration_id},
  3431. instance=instance)
  3432. return
3433. # NOTE(wangpan): Get the instance from the DB; if it has been
3434. # deleted, we do nothing and return here.
  3435. expected_attrs = ['metadata', 'system_metadata', 'flavor']
  3436. try:
  3437. instance = objects.Instance.get_by_uuid(
  3438. context, instance.uuid,
  3439. expected_attrs=expected_attrs)
  3440. except exception.InstanceNotFound:
  3441. LOG.info("Instance is not found during confirmation",
  3442. instance=instance)
  3443. return
  3444. with self._error_out_instance_on_exception(context, instance):
  3445. old_instance_type = instance.old_flavor
  3446. try:
  3447. self._confirm_resize(
  3448. context, instance, migration=migration)
  3449. except Exception:
  3450. # Something failed when cleaning up the source host so
  3451. # log a traceback and leave a hint about hard rebooting
  3452. # the server to correct its state in the DB.
  3453. with excutils.save_and_reraise_exception(logger=LOG):
  3454. LOG.exception(
  3455. 'Confirm resize failed on source host %s. '
  3456. 'Resource allocations in the placement service '
  3457. 'will be removed regardless because the instance '
  3458. 'is now on the destination host %s. You can try '
  3459. 'hard rebooting the instance to correct its '
  3460. 'state.', self.host, migration.dest_compute,
  3461. instance=instance)
  3462. finally:
  3463. # Whether an error occurred or not, at this point the
  3464. # instance is on the dest host so to avoid leaking
  3465. # allocations in placement, delete them here.
  3466. # NOTE(mriedem): _delete_allocation_after_move is tightly
  3467. # coupled to the migration status on the confirm step so
  3468. # we unfortunately have to mutate the migration status to
  3469. # have _delete_allocation_after_move cleanup the allocation
  3470. # held by the migration consumer.
  3471. with utils.temporary_mutation(
  3472. migration, status='confirmed'):
  3473. self._delete_allocation_after_move(
  3474. context, instance, migration, old_instance_type,
  3475. migration.source_node)
  3476. do_confirm_resize(context, instance, migration.id)
  3477. def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
3478. # NOTE(adrianc): This method returns a copy of nw_info if modifications
3479. # are made; otherwise it returns the original nw_info.
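# Illustrative example (hypothetical addresses): if a SR-IOV VIF profile
# carries pci_slot '0000:04:10.1' and pci_mapping maps that address to a
# device at '0000:81:00.2', the profile is rewritten to the new address
# so the virt driver operates on the correct device.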
  3480. updated_nw_info = nw_info
  3481. if nw_info and pci_mapping:
  3482. updated_nw_info = copy.deepcopy(nw_info)
  3483. for vif in updated_nw_info:
  3484. if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
  3485. try:
  3486. vif_pci_addr = vif['profile']['pci_slot']
  3487. new_addr = pci_mapping[vif_pci_addr].address
  3488. vif['profile']['pci_slot'] = new_addr
  3489. LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
  3490. "Original value %(orig_val)s, "
  3491. "new value %(new_val)s",
  3492. {'id': vif['id'],
  3493. 'orig_val': vif_pci_addr,
  3494. 'new_val': new_addr})
  3495. except (KeyError, AttributeError):
  3496. with excutils.save_and_reraise_exception():
  3497. # NOTE(adrianc): This should never happen. If we
  3498. # get here it means there is some inconsistency
  3499. # with either 'nw_info' or 'pci_mapping'.
  3500. LOG.error("Unexpected error when updating network "
  3501. "information with PCI mapping.")
  3502. return updated_nw_info
  3503. def _confirm_resize(self, context, instance, migration=None):
  3504. """Destroys the source instance."""
  3505. self._notify_about_instance_usage(context, instance,
  3506. "resize.confirm.start")
  3507. compute_utils.notify_about_instance_action(context, instance,
  3508. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3509. phase=fields.NotificationPhase.START)
  3510. # NOTE(danms): delete stashed migration information
  3511. old_instance_type = instance.old_flavor
  3512. instance.old_flavor = None
  3513. instance.new_flavor = None
  3514. instance.system_metadata.pop('old_vm_state', None)
  3515. instance.save()
  3516. # NOTE(tr3buchet): tear down networks on source host
  3517. self.network_api.setup_networks_on_host(context, instance,
  3518. migration.source_compute, teardown=True)
  3519. network_info = self.network_api.get_instance_nw_info(context,
  3520. instance)
  3521. # NOTE(adrianc): Populate old PCI device in VIF profile
  3522. # to allow virt driver to properly unplug it from Hypervisor.
  3523. pci_mapping = (instance.migration_context.
  3524. get_pci_mapping_for_migration(True))
  3525. network_info = self._get_updated_nw_info_with_pci_mapping(
  3526. network_info, pci_mapping)
  3527. # TODO(mriedem): Get BDMs here and pass them to the driver.
  3528. self.driver.confirm_migration(context, migration, instance,
  3529. network_info)
  3530. migration.status = 'confirmed'
  3531. with migration.obj_as_admin():
  3532. migration.save()
  3533. rt = self._get_resource_tracker()
  3534. rt.drop_move_claim(context, instance, migration.source_node,
  3535. old_instance_type, prefix='old_')
  3536. instance.drop_migration_context()
  3537. # NOTE(mriedem): The old_vm_state could be STOPPED but the user
  3538. # might have manually powered up the instance to confirm the
  3539. # resize/migrate, so we need to check the current power state
  3540. # on the instance and set the vm_state appropriately. We default
  3541. # to ACTIVE because if the power state is not SHUTDOWN, we
  3542. # assume _sync_instance_power_state will clean it up.
  3543. p_state = instance.power_state
  3544. vm_state = None
  3545. if p_state == power_state.SHUTDOWN:
  3546. vm_state = vm_states.STOPPED
  3547. LOG.debug("Resized/migrated instance is powered off. "
  3548. "Setting vm_state to '%s'.", vm_state,
  3549. instance=instance)
  3550. else:
  3551. vm_state = vm_states.ACTIVE
  3552. instance.vm_state = vm_state
  3553. instance.task_state = None
  3554. instance.save(expected_task_state=[None, task_states.DELETING,
  3555. task_states.SOFT_DELETING])
  3556. self._notify_about_instance_usage(
  3557. context, instance, "resize.confirm.end",
  3558. network_info=network_info)
  3559. compute_utils.notify_about_instance_action(context, instance,
  3560. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3561. phase=fields.NotificationPhase.END)
  3562. def _delete_allocation_after_move(self, context, instance, migration,
  3563. flavor, nodename):
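# Rough summary of the branches below: on the source node of a confirmed
# (or completed) migration we try to delete the allocation held by the
# migration record; if the migration record itself still holds
# allocations on this node (newer, migration-based accounting) nothing
# is deleted here; only when neither case applies do we fall back to the
# legacy "doubled-up" allocation math at the bottom of the method.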
  3564. rt = self._get_resource_tracker()
  3565. cn_uuid = rt.get_node_uuid(nodename)
  3566. if migration.source_node == nodename:
  3567. if migration.status in ('confirmed', 'completed'):
  3568. # NOTE(danms): We're finishing on the source node, so try to
  3569. # delete the allocation based on the migration uuid
  3570. deleted = self.reportclient.delete_allocation_for_instance(
  3571. context, migration.uuid)
  3572. if deleted:
  3573. LOG.info(_('Source node %(node)s confirmed migration '
  3574. '%(mig)s; deleted migration-based '
  3575. 'allocation'),
  3576. {'node': nodename, 'mig': migration.uuid})
  3577. # NOTE(danms): We succeeded, which means we do not
  3578. # need to do the complex double allocation dance
  3579. return
  3580. else:
  3581. # We're reverting (or failed) on the source, so we
  3582. # need to check if our migration holds a claim and if
  3583. # so, avoid doing the legacy behavior below.
  3584. mig_allocs = (
  3585. self.reportclient.get_allocations_for_consumer_by_provider(
  3586. context, cn_uuid, migration.uuid))
  3587. if mig_allocs:
  3588. LOG.info(_('Source node %(node)s reverted migration '
  3589. '%(mig)s; not deleting migration-based '
  3590. 'allocation'),
  3591. {'node': nodename, 'mig': migration.uuid})
  3592. return
  3593. elif migration.dest_node == nodename:
  3594. # NOTE(danms): We're reverting on the destination node
  3595. # (and we must not be doing a same-host migration if we
  3596. # made it past the check above), so we need to check to
  3597. # see if the source did migration-based allocation
  3598. # accounting
  3599. allocs = (
  3600. self.reportclient.get_allocations_for_consumer_by_provider(
  3601. context, cn_uuid, migration.uuid))
  3602. if allocs:
  3603. # NOTE(danms): The source did migration-based allocation
  3604. # accounting, so we should let the source node rejigger
  3605. # the allocations in finish_resize_revert()
  3606. LOG.info(_('Destination node %(node)s reverted migration '
  3607. '%(mig)s; not deleting migration-based '
  3608. 'allocation'),
  3609. {'node': nodename, 'mig': migration.uuid})
  3610. return
  3611. # TODO(danms): Remove below this line when we remove compatibility
  3612. # for double-accounting migrations (likely rocky)
  3613. LOG.info(_('Doing legacy allocation math for migration %(mig)s after '
  3614. 'instance move'),
  3615. {'mig': migration.uuid},
  3616. instance=instance)
  3617. # NOTE(jaypipes): This sucks, but due to the fact that confirm_resize()
  3618. # only runs on the source host and revert_resize() runs on the
  3619. # destination host, we need to do this here. Basically, what we're
  3620. # doing here is grabbing the existing allocations for this instance
  3621. # from the placement API, dropping the resources in the doubled-up
  3622. # allocation set that refer to the source host UUID and calling PUT
  3623. # /allocations back to the placement API. The allocation that gets
  3624. # PUT'd back to placement will only include the destination host and
  3625. # any shared providers in the case of a confirm_resize operation and
  3626. # the source host and shared providers for a revert_resize operation.
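# Illustrative shape of such a doubled-up allocation as returned from
# placement (example values, not from the original source):
#   {'<source-cn-uuid>': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}},
#    '<dest-cn-uuid>': {'resources': {'VCPU': 4, 'MEMORY_MB': 4096}}}
# remove_allocation_from_compute() drops the resources that refer to the
# provider identified by cn_uuid and PUTs the remainder back to
# placement, as described above.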
  3627. if not scheduler_utils.remove_allocation_from_compute(
  3628. context, instance, cn_uuid, self.reportclient, flavor):
  3629. LOG.error("Failed to save manipulated allocation",
  3630. instance=instance)
  3631. @wrap_exception()
  3632. @reverts_task_state
  3633. @wrap_instance_event(prefix='compute')
  3634. @errors_out_migration
  3635. @wrap_instance_fault
  3636. def revert_resize(self, context, instance, migration):
  3637. """Destroys the new instance on the destination machine.
  3638. Reverts the model changes, and powers on the old instance on the
  3639. source machine.
  3640. """
  3641. # NOTE(comstud): A revert_resize is essentially a resize back to
  3642. # the old size, so we need to send a usage event here.
  3643. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3644. current_period=True)
  3645. with self._error_out_instance_on_exception(context, instance):
  3646. # NOTE(tr3buchet): tear down networks on destination host
  3647. self.network_api.setup_networks_on_host(context, instance,
  3648. teardown=True)
  3649. migration_p = obj_base.obj_to_primitive(migration)
  3650. self.network_api.migrate_instance_start(context,
  3651. instance,
  3652. migration_p)
  3653. network_info = self.network_api.get_instance_nw_info(context,
  3654. instance)
  3655. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3656. context, instance.uuid)
  3657. block_device_info = self._get_instance_block_device_info(
  3658. context, instance, bdms=bdms)
  3659. destroy_disks = not self._is_instance_storage_shared(
  3660. context, instance, host=migration.source_compute)
  3661. self.driver.destroy(context, instance, network_info,
  3662. block_device_info, destroy_disks)
  3663. self._terminate_volume_connections(context, instance, bdms)
  3664. migration.status = 'reverted'
  3665. with migration.obj_as_admin():
  3666. migration.save()
  3667. # NOTE(ndipanov): We need to do this here because dropping the
  3668. # claim means we lose the migration_context data. We really should
  3669. # fix this by moving the drop_move_claim call to the
  3670. # finish_revert_resize method as this is racy (revert is dropped,
  3671. # but instance resources will be tracked with the new flavor until
  3672. # it gets rolled back in finish_revert_resize, which is
  3673. # potentially wrong for a period of time).
  3674. instance.revert_migration_context()
  3675. instance.save()
  3676. rt = self._get_resource_tracker()
  3677. rt.drop_move_claim(context, instance, instance.node)
  3678. self._delete_allocation_after_move(context, instance, migration,
  3679. instance.flavor,
  3680. instance.node)
  3681. # RPC cast back to the source host to finish the revert there.
  3682. self.compute_rpcapi.finish_revert_resize(context, instance,
  3683. migration, migration.source_compute)
  3684. @wrap_exception()
  3685. @reverts_task_state
  3686. @wrap_instance_event(prefix='compute')
  3687. @errors_out_migration
  3688. @wrap_instance_fault
  3689. def finish_revert_resize(self, context, instance, migration):
  3690. """Finishes the second half of reverting a resize on the source host.
  3691. Bring the original source instance state back (active/shutoff) and
  3692. revert the resized attributes in the database.
  3693. """
  3694. with self._error_out_instance_on_exception(context, instance):
  3695. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3696. context, instance.uuid)
  3697. self._notify_about_instance_usage(
  3698. context, instance, "resize.revert.start")
  3699. compute_utils.notify_about_instance_action(context, instance,
  3700. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  3701. phase=fields.NotificationPhase.START, bdms=bdms)
  3702. # NOTE(mriedem): delete stashed old_vm_state information; we
  3703. # default to ACTIVE for backwards compatibility if old_vm_state
  3704. # is not set
  3705. old_vm_state = instance.system_metadata.pop('old_vm_state',
  3706. vm_states.ACTIVE)
  3707. self._set_instance_info(instance, instance.old_flavor)
  3708. instance.old_flavor = None
  3709. instance.new_flavor = None
  3710. instance.host = migration.source_compute
  3711. instance.node = migration.source_node
  3712. instance.save()
  3713. self._revert_allocation(context, instance, migration)
  3714. self.network_api.setup_networks_on_host(context, instance,
  3715. migration.source_compute)
  3716. migration_p = obj_base.obj_to_primitive(migration)
  3717. # NOTE(hanrong): we need to temporarily change
  3718. # migration_p['dest_compute'] to the source host.
  3719. # "network_api.migrate_instance_finish" sets up the network for the
  3720. # instance on the destination host, but for a revert resize the
  3721. # instance goes back to the source host, so the network should be
  3722. # set up there; hence migration_p['dest_compute'] is set to it here.
  3723. migration_p['dest_compute'] = migration.source_compute
  3724. self.network_api.migrate_instance_finish(context,
  3725. instance,
  3726. migration_p)
  3727. network_info = self.network_api.get_instance_nw_info(context,
  3728. instance)
  3729. # revert_resize deleted any volume attachments for the instance
  3730. # and created new ones to be used on this host, but we
  3731. # have to update those attachments with the host connector so the
  3732. # BDM.connection_info will get set in the call to
  3733. # _get_instance_block_device_info below with refresh_conn_info=True
  3734. # and then the volumes can be re-connected via the driver on this
  3735. # host.
  3736. self._update_volume_attachments(context, instance, bdms)
  3737. block_device_info = self._get_instance_block_device_info(
  3738. context, instance, refresh_conn_info=True, bdms=bdms)
  3739. power_on = old_vm_state != vm_states.STOPPED
  3740. self.driver.finish_revert_migration(context, instance,
  3741. network_info,
  3742. block_device_info, power_on)
  3743. instance.drop_migration_context()
  3744. instance.launched_at = timeutils.utcnow()
  3745. instance.save(expected_task_state=task_states.RESIZE_REVERTING)
  3746. # Complete any volume attachments so the volumes are in-use.
  3747. self._complete_volume_attachments(context, bdms)
  3748. # if the original vm state was STOPPED, set it back to STOPPED
  3749. LOG.info("Updating instance to original state: '%s'",
  3750. old_vm_state, instance=instance)
  3751. if power_on:
  3752. instance.vm_state = vm_states.ACTIVE
  3753. instance.task_state = None
  3754. instance.save()
  3755. else:
  3756. instance.task_state = task_states.POWERING_OFF
  3757. instance.save()
  3758. self.stop_instance(context, instance=instance,
  3759. clean_shutdown=True)
  3760. self._notify_about_instance_usage(
  3761. context, instance, "resize.revert.end")
  3762. compute_utils.notify_about_instance_action(context, instance,
  3763. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  3764. phase=fields.NotificationPhase.END, bdms=bdms)
  3765. def _revert_allocation(self, context, instance, migration):
  3766. """Revert an allocation that is held by migration to our instance."""
  3767. # Fetch the original allocation that the instance had on the source
  3768. # node, which are now held by the migration
  3769. orig_alloc = self.reportclient.get_allocations_for_consumer(
  3770. context, migration.uuid)
  3771. if not orig_alloc:
  3772. # NOTE(danms): This migration did not do per-migration allocation
  3773. # accounting, so nothing to do here.
  3774. LOG.info('Old-style migration %(mig)s is being reverted; '
  3775. 'no migration claims found on original node '
  3776. 'to swap.',
  3777. {'mig': migration.uuid},
  3778. instance=instance)
  3779. return False
  3780. if len(orig_alloc) > 1:
  3781. # NOTE(danms): This may change later if we have other allocations
  3782. # against other providers that need to be held by the migration
  3783. # as well. Perhaps something like shared storage resources that
  3784. # will actually be duplicated during a resize type operation.
  3785. LOG.error('New-style migration %(mig)s has allocations against '
  3786. 'more than one provider %(rps)s. This should not be '
  3787. 'possible, but reverting it anyway.',
  3788. {'mig': migration.uuid,
  3789. 'rps': ','.join(orig_alloc.keys())},
  3790. instance=instance)
  3791. # We only have a claim against one provider, it is the source node
  3792. cn_uuid = list(orig_alloc.keys())[0]
  3793. # Get just the resources part of the one allocation we need below
  3794. orig_alloc = orig_alloc[cn_uuid].get('resources', {})
  3795. # FIXME(danms): This method is flawed in that it assumes allocations
  3796. # against only one provider. So, this may overwrite allocations against
  3797. # a shared provider, if we had one.
  3798. LOG.info('Swapping old allocation on %(node)s held by migration '
  3799. '%(mig)s for instance',
  3800. {'node': cn_uuid, 'mig': migration.uuid},
  3801. instance=instance)
  3802. # TODO(cdent): Should we be doing anything with return values here?
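# Sketch of the intended effect (an assumption based on the call below,
# not part of the original source):
#   before: consumer=migration.uuid holds orig_alloc on cn_uuid
#   after:  consumer=instance.uuid holds orig_alloc on cn_uuid and the
#           migration.uuid consumer's allocations are cleared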
  3803. self.reportclient.set_and_clear_allocations(
  3804. context, cn_uuid, instance.uuid, orig_alloc, instance.project_id,
  3805. instance.user_id, consumer_to_clear=migration.uuid)
  3806. return True
  3807. def _prep_resize(self, context, image, instance, instance_type,
  3808. filter_properties, node, migration, clean_shutdown=True):
  3809. if not filter_properties:
  3810. filter_properties = {}
  3811. if not instance.host:
  3812. self._set_instance_obj_error_state(context, instance)
  3813. msg = _('Instance has no source host')
  3814. raise exception.MigrationError(reason=msg)
  3815. same_host = instance.host == self.host
  3816. # if the flavor IDs match, it's migrate; otherwise resize
  3817. if same_host and instance_type.id == instance['instance_type_id']:
  3818. # check whether the driver supports migrating to the same host
  3819. if not self.driver.capabilities['supports_migrate_to_same_host']:
  3820. raise exception.UnableToMigrateToSelf(
  3821. instance_id=instance.uuid, host=self.host)
  3822. # NOTE(danms): Stash the new instance_type to avoid having to
  3823. # look it up in the database later
  3824. instance.new_flavor = instance_type
  3825. # NOTE(mriedem): Stash the old vm_state so we can set the
  3826. # resized/reverted instance back to the same state later.
  3827. vm_state = instance.vm_state
  3828. LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
  3829. instance.system_metadata['old_vm_state'] = vm_state
  3830. instance.save()
  3831. limits = filter_properties.get('limits', {})
  3832. rt = self._get_resource_tracker()
  3833. with rt.resize_claim(context, instance, instance_type, node,
  3834. migration, image_meta=image,
  3835. limits=limits) as claim:
  3836. LOG.info('Migrating', instance=instance)
  3837. # RPC cast to the source host to start the actual resize/migration.
  3838. self.compute_rpcapi.resize_instance(
  3839. context, instance, claim.migration, image,
  3840. instance_type, clean_shutdown)
  3841. @wrap_exception()
  3842. @reverts_task_state
  3843. @wrap_instance_event(prefix='compute')
  3844. @wrap_instance_fault
  3845. def prep_resize(self, context, image, instance, instance_type,
  3846. request_spec, filter_properties, node,
  3847. clean_shutdown, migration, host_list):
  3848. """Initiates the process of moving a running instance to another host.
  3849. Possibly changes the VCPU, RAM and disk size in the process.
  3850. This is initiated from conductor and runs on the destination host.
  3851. The main purpose of this method is performing some checks on the
  3852. destination host and making a claim for resources. If the claim fails
  3853. then a reschedule to another host may be attempted which involves
  3854. calling back to conductor to start the process over again.
  3855. """
  3856. if node is None:
  3857. node = self._get_nodename(instance, refresh=True)
  3858. with self._error_out_instance_on_exception(context, instance), \
  3859. errors_out_migration_ctxt(migration):
  3860. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3861. current_period=True)
  3862. self._notify_about_instance_usage(
  3863. context, instance, "resize.prep.start")
  3864. compute_utils.notify_about_resize_prep_instance(
  3865. context, instance, self.host,
  3866. fields.NotificationPhase.START, instance_type)
  3867. try:
  3868. self._prep_resize(context, image, instance,
  3869. instance_type, filter_properties,
  3870. node, migration, clean_shutdown)
  3871. except Exception:
  3872. # Since we hit a failure, we're either rescheduling or dead
  3873. # and either way we need to cleanup any allocations created
  3874. # by the scheduler for the destination node.
  3875. if migration and not self._revert_allocation(
  3876. context, instance, migration):
  3877. # We did not do a migration-based
  3878. # allocation. Note that for a resize to the
  3879. # same host, the scheduler will merge the
  3880. # flavors, so here we'd be subtracting the new
  3881. # flavor from the allocated resources on this
  3882. # node.
  3883. # FIXME(danms): Remove this in Rocky
  3884. rt = self._get_resource_tracker()
  3885. rt.delete_allocation_for_failed_resize(
  3886. context, instance, node, instance_type)
  3887. # try to re-schedule the resize elsewhere:
  3888. exc_info = sys.exc_info()
  3889. self._reschedule_resize_or_reraise(context, image, instance,
  3890. exc_info, instance_type, request_spec,
  3891. filter_properties, host_list)
  3892. finally:
  3893. extra_usage_info = dict(
  3894. new_instance_type=instance_type.name,
  3895. new_instance_type_id=instance_type.id)
  3896. self._notify_about_instance_usage(
  3897. context, instance, "resize.prep.end",
  3898. extra_usage_info=extra_usage_info)
  3899. compute_utils.notify_about_resize_prep_instance(
  3900. context, instance, self.host,
  3901. fields.NotificationPhase.END, instance_type)
  3902. def _reschedule_resize_or_reraise(self, context, image, instance, exc_info,
  3903. instance_type, request_spec, filter_properties, host_list):
  3904. """Try to re-schedule the resize or re-raise the original error to
  3905. error out the instance.
  3906. """
  3907. if not request_spec:
  3908. request_spec = {}
  3909. if not filter_properties:
  3910. filter_properties = {}
  3911. rescheduled = False
  3912. instance_uuid = instance.uuid
  3913. try:
  3914. reschedule_method = self.compute_task_api.resize_instance
  3915. scheduler_hint = dict(filter_properties=filter_properties)
  3916. method_args = (instance, None, scheduler_hint, instance_type)
  3917. task_state = task_states.RESIZE_PREP
  3918. rescheduled = self._reschedule(context, request_spec,
  3919. filter_properties, instance, reschedule_method,
  3920. method_args, task_state, exc_info, host_list=host_list)
  3921. except Exception as error:
  3922. rescheduled = False
  3923. LOG.exception("Error trying to reschedule",
  3924. instance_uuid=instance_uuid)
  3925. compute_utils.add_instance_fault_from_exc(context,
  3926. instance, error,
  3927. exc_info=sys.exc_info())
  3928. self._notify_about_instance_usage(context, instance,
  3929. 'resize.error', fault=error)
  3930. compute_utils.notify_about_instance_action(
  3931. context, instance, self.host,
  3932. action=fields.NotificationAction.RESIZE,
  3933. phase=fields.NotificationPhase.ERROR,
  3934. exception=error)
  3935. if rescheduled:
  3936. self._log_original_error(exc_info, instance_uuid)
  3937. compute_utils.add_instance_fault_from_exc(context,
  3938. instance, exc_info[1], exc_info=exc_info)
  3939. self._notify_about_instance_usage(context, instance,
  3940. 'resize.error', fault=exc_info[1])
  3941. compute_utils.notify_about_instance_action(
  3942. context, instance, self.host,
  3943. action=fields.NotificationAction.RESIZE,
  3944. phase=fields.NotificationPhase.ERROR,
  3945. exception=exc_info[1])
  3946. else:
  3947. # not re-scheduling
  3948. six.reraise(*exc_info)
  3949. @wrap_exception()
  3950. @reverts_task_state
  3951. @wrap_instance_event(prefix='compute')
  3952. @wrap_instance_fault
  3953. def resize_instance(self, context, instance, image,
  3954. migration, instance_type, clean_shutdown):
  3955. """Starts the migration of a running instance to another host.
  3956. This is initiated from the destination host's ``prep_resize`` routine
  3957. and runs on the source host.
  3958. """
  3959. try:
  3960. self._resize_instance(context, instance, image, migration,
  3961. instance_type, clean_shutdown)
  3962. except Exception:
  3963. with excutils.save_and_reraise_exception():
  3964. self._revert_allocation(context, instance, migration)
  3965. def _resize_instance(self, context, instance, image,
  3966. migration, instance_type, clean_shutdown):
  3967. with self._error_out_instance_on_exception(context, instance), \
  3968. errors_out_migration_ctxt(migration):
  3969. network_info = self.network_api.get_instance_nw_info(context,
  3970. instance)
  3971. migration.status = 'migrating'
  3972. with migration.obj_as_admin():
  3973. migration.save()
  3974. instance.task_state = task_states.RESIZE_MIGRATING
  3975. instance.save(expected_task_state=task_states.RESIZE_PREP)
  3976. self._notify_about_instance_usage(
  3977. context, instance, "resize.start", network_info=network_info)
  3978. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3979. context, instance.uuid)
  3980. compute_utils.notify_about_instance_action(context, instance,
  3981. self.host, action=fields.NotificationAction.RESIZE,
  3982. phase=fields.NotificationPhase.START, bdms=bdms)
  3983. block_device_info = self._get_instance_block_device_info(
  3984. context, instance, bdms=bdms)
  3985. timeout, retry_interval = self._get_power_off_values(context,
  3986. instance, clean_shutdown)
  3987. disk_info = self.driver.migrate_disk_and_power_off(
  3988. context, instance, migration.dest_host,
  3989. instance_type, network_info,
  3990. block_device_info,
  3991. timeout, retry_interval)
  3992. self._terminate_volume_connections(context, instance, bdms)
  3993. migration_p = obj_base.obj_to_primitive(migration)
  3994. self.network_api.migrate_instance_start(context,
  3995. instance,
  3996. migration_p)
  3997. migration.status = 'post-migrating'
  3998. with migration.obj_as_admin():
  3999. migration.save()
  4000. instance.host = migration.dest_compute
  4001. instance.node = migration.dest_node
  4002. instance.task_state = task_states.RESIZE_MIGRATED
  4003. instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
  4004. # RPC cast to the destination host to finish the resize/migration.
  4005. self.compute_rpcapi.finish_resize(context, instance,
  4006. migration, image, disk_info, migration.dest_compute)
  4007. self._notify_about_instance_usage(context, instance, "resize.end",
  4008. network_info=network_info)
  4009. compute_utils.notify_about_instance_action(context, instance,
  4010. self.host, action=fields.NotificationAction.RESIZE,
  4011. phase=fields.NotificationPhase.END, bdms=bdms)
  4012. self.instance_events.clear_events_for_instance(instance)
  4013. def _terminate_volume_connections(self, context, instance, bdms):
  4014. connector = None
  4015. for bdm in bdms:
  4016. if bdm.is_volume:
  4017. if bdm.attachment_id:
  4018. # NOTE(jdg): The idea behind the new attach APIs was to have a new
  4019. # code fork/path that we followed. We are not doing that here, so we
  4020. # have to do some extra work to make this *behave* just like the old
  4021. # code. Cinder's new attach code does not allow an attachment to be
  4022. # disconnected and reconnected; you just delete the attachment and
  4023. # get a new one. So, below we create a new attachment for the volume
  4024. # without a connector (which acts as a reserve) and then delete the
  4025. # old attachment, saving the new attachment id on the BDM. In other
  4026. # words, beware.
  4027. attachment_id = self.volume_api.attachment_create(
  4028. context, bdm.volume_id, instance.uuid)['id']
  4029. self.volume_api.attachment_delete(context,
  4030. bdm.attachment_id)
  4031. bdm.attachment_id = attachment_id
  4032. bdm.save()
  4033. else:
  4034. if connector is None:
  4035. connector = self.driver.get_volume_connector(instance)
  4036. self.volume_api.terminate_connection(context,
  4037. bdm.volume_id,
  4038. connector)
  4039. @staticmethod
  4040. def _set_instance_info(instance, instance_type):
  4041. instance.instance_type_id = instance_type.id
  4042. instance.memory_mb = instance_type.memory_mb
  4043. instance.vcpus = instance_type.vcpus
  4044. instance.root_gb = instance_type.root_gb
  4045. instance.ephemeral_gb = instance_type.ephemeral_gb
  4046. instance.flavor = instance_type
  4047. def _update_volume_attachments(self, context, instance, bdms):
  4048. """Updates volume attachments using the virt driver host connector.
  4049. :param context: nova.context.RequestContext - user request context
  4050. :param instance: nova.objects.Instance
  4051. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4052. device mappings for the given instance
  4053. """
  4054. if bdms:
  4055. connector = None
  4056. for bdm in bdms:
  4057. if bdm.is_volume and bdm.attachment_id:
  4058. if connector is None:
  4059. connector = self.driver.get_volume_connector(instance)
  4060. self.volume_api.attachment_update(
  4061. context, bdm.attachment_id, connector, bdm.device_name)
  4062. def _complete_volume_attachments(self, context, bdms):
  4063. """Completes volume attachments for the instance
  4064. :param context: nova.context.RequestContext - user request context
  4065. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4066. device mappings for the given instance
  4067. """
  4068. if bdms:
  4069. for bdm in bdms:
  4070. if bdm.is_volume and bdm.attachment_id:
  4071. self.volume_api.attachment_complete(
  4072. context, bdm.attachment_id)
  4073. def _finish_resize(self, context, instance, migration, disk_info,
  4074. image_meta, bdms):
  4075. resize_instance = False
  4076. old_instance_type_id = migration['old_instance_type_id']
  4077. new_instance_type_id = migration['new_instance_type_id']
  4078. old_instance_type = instance.get_flavor()
  4079. # NOTE(mriedem): Get the old_vm_state so we know if we should
  4080. # power on the instance. If old_vm_state is not set we need to default
  4081. # to ACTIVE for backwards compatibility
  4082. old_vm_state = instance.system_metadata.get('old_vm_state',
  4083. vm_states.ACTIVE)
  4084. instance.old_flavor = old_instance_type
  4085. if old_instance_type_id != new_instance_type_id:
  4086. instance_type = instance.get_flavor('new')
  4087. self._set_instance_info(instance, instance_type)
  4088. for key in ('root_gb', 'swap', 'ephemeral_gb'):
  4089. if old_instance_type[key] != instance_type[key]:
  4090. resize_instance = True
  4091. break
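# At this point resize_instance is True only if the disk layout changed
# (root, swap or ephemeral size differs between the old and new
# flavors); it is passed through to driver.finish_migration() below.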
  4092. instance.apply_migration_context()
  4093. # NOTE(tr3buchet): setup networks on destination host
  4094. self.network_api.setup_networks_on_host(context, instance,
  4095. migration['dest_compute'])
  4096. migration_p = obj_base.obj_to_primitive(migration)
  4097. self.network_api.migrate_instance_finish(context,
  4098. instance,
  4099. migration_p)
  4100. network_info = self.network_api.get_instance_nw_info(context, instance)
  4101. instance.task_state = task_states.RESIZE_FINISH
  4102. instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
  4103. self._notify_about_instance_usage(
  4104. context, instance, "finish_resize.start",
  4105. network_info=network_info)
  4106. compute_utils.notify_about_instance_action(context, instance,
  4107. self.host, action=fields.NotificationAction.RESIZE_FINISH,
  4108. phase=fields.NotificationPhase.START, bdms=bdms)
  4109. # We need to update any volume attachments using the destination
  4110. # host connector so that we can update the BDM.connection_info
  4111. # before calling driver.finish_migration otherwise the driver
  4112. # won't know how to connect the volumes to this host.
  4113. # Note that _get_instance_block_device_info with
  4114. # refresh_conn_info=True will update the BDM.connection_info value
  4115. # in the database so we must do this before calling that method.
  4116. self._update_volume_attachments(context, instance, bdms)
  4117. block_device_info = self._get_instance_block_device_info(
  4118. context, instance, refresh_conn_info=True, bdms=bdms)
  4119. # NOTE(mriedem): If the original vm_state was STOPPED, we don't
  4120. # automatically power on the instance after it's migrated
  4121. power_on = old_vm_state != vm_states.STOPPED
  4122. try:
  4123. self.driver.finish_migration(context, migration, instance,
  4124. disk_info,
  4125. network_info,
  4126. image_meta, resize_instance,
  4127. block_device_info, power_on)
  4128. except Exception:
  4129. with excutils.save_and_reraise_exception():
  4130. if old_instance_type_id != new_instance_type_id:
  4131. self._set_instance_info(instance,
  4132. old_instance_type)
  4133. # Now complete any volume attachments that were previously updated.
  4134. self._complete_volume_attachments(context, bdms)
  4135. migration.status = 'finished'
  4136. with migration.obj_as_admin():
  4137. migration.save()
  4138. instance.vm_state = vm_states.RESIZED
  4139. instance.task_state = None
  4140. instance.launched_at = timeutils.utcnow()
  4141. instance.save(expected_task_state=task_states.RESIZE_FINISH)
  4142. return network_info
  4143. @wrap_exception()
  4144. @reverts_task_state
  4145. @wrap_instance_event(prefix='compute')
  4146. @wrap_instance_fault
  4147. def finish_resize(self, context, disk_info, image, instance,
  4148. migration):
  4149. """Completes the migration process.
  4150. Sets up the newly transferred disk and turns on the instance at its
  4151. new host machine.
  4152. """
  4153. # _finish_resize sets instance.old_flavor to instance.flavor and
  4154. # changes instance.flavor to instance.new_flavor (if doing a resize
  4155. # rather than a cold migration). We save off the old_flavor here in
  4156. # case we need it for error handling below.
  4157. old_flavor = instance.flavor
  4158. try:
  4159. self._finish_resize_helper(context, disk_info, image, instance,
  4160. migration)
  4161. except Exception:
  4162. with excutils.save_and_reraise_exception():
  4163. # At this point, resize_instance (which runs on the source) has
  4164. # already updated the instance host/node values to point to
  4165. # this (the dest) compute, so we need to leave the allocations
  4166. # against the dest node resource provider intact and drop the
  4167. # allocations against the source node resource provider. If the
  4168. # user tries to recover the server by hard rebooting it, it
  4169. # will happen on this host so that's where the allocations
  4170. # should go.
  4171. LOG.info('Deleting allocations for old flavor on source node '
  4172. '%s after finish_resize failure. You may be able to '
  4173. 'recover the instance by hard rebooting it.',
  4174. migration.source_compute, instance=instance)
  4175. # NOTE(mriedem): We can't use _delete_allocation_after_move
  4176. # because it relies on the resource tracker to look up the
  4177. # node uuid and since we are on the dest host, passing the
  4178. # source nodename won't work since the RT isn't tracking that
  4179. # node here. So we just try to remove the migration-based
  4180. # allocations directly and handle the case they don't exist.
  4181. if not self.reportclient.delete_allocation_for_instance(
  4182. context, migration.uuid):
  4183. # No migration-based allocation. Try to cleanup directly.
  4184. cn = objects.ComputeNode.get_by_host_and_nodename(
  4185. context, migration.source_compute,
  4186. migration.source_node)
  4187. if not scheduler_utils.remove_allocation_from_compute(
  4188. context, instance, cn.uuid, self.reportclient,
  4189. flavor=old_flavor):
  4190. LOG.error('Failed to delete allocations for old '
  4191. 'flavor %s against source node %s. The '
  4192. 'instance is now on the dest node %s. The '
  4193. 'allocations against the source node need '
  4194. 'to be manually cleaned up in Placement.',
  4195. old_flavor.flavorid, migration.source_node,
  4196. migration.dest_node, instance=instance)
  4197. def _finish_resize_helper(self, context, disk_info, image, instance,
  4198. migration):
  4199. """Completes the migration process.
  4200. The caller must revert the instance's allocations if the migration
  4201. process failed.
  4202. """
  4203. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4204. context, instance.uuid)
  4205. with self._error_out_instance_on_exception(context, instance), \
  4206. errors_out_migration_ctxt(migration):
  4207. image_meta = objects.ImageMeta.from_dict(image)
  4208. network_info = self._finish_resize(context, instance, migration,
  4209. disk_info, image_meta, bdms)
  4210. self._update_scheduler_instance_info(context, instance)
  4211. self._notify_about_instance_usage(
  4212. context, instance, "finish_resize.end",
  4213. network_info=network_info)
  4214. compute_utils.notify_about_instance_action(context, instance,
  4215. self.host, action=fields.NotificationAction.RESIZE_FINISH,
  4216. phase=fields.NotificationPhase.END, bdms=bdms)
  4217. @wrap_exception()
  4218. @wrap_instance_fault
  4219. def add_fixed_ip_to_instance(self, context, network_id, instance):
  4220. """Calls network_api to add new fixed_ip to instance
  4221. then injects the new network info and resets instance networking.
  4222. """
  4223. self._notify_about_instance_usage(
  4224. context, instance, "create_ip.start")
  4225. network_info = self.network_api.add_fixed_ip_to_instance(context,
  4226. instance,
  4227. network_id)
  4228. self._inject_network_info(context, instance, network_info)
  4229. self.reset_network(context, instance)
  4230. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  4231. instance.updated_at = timeutils.utcnow()
  4232. instance.save()
  4233. self._notify_about_instance_usage(
  4234. context, instance, "create_ip.end", network_info=network_info)
  4235. @wrap_exception()
  4236. @wrap_instance_fault
  4237. def remove_fixed_ip_from_instance(self, context, address, instance):
  4238. """Calls network_api to remove existing fixed_ip from instance
  4239. by injecting the altered network info and resetting
  4240. instance networking.
  4241. """
  4242. self._notify_about_instance_usage(
  4243. context, instance, "delete_ip.start")
  4244. network_info = self.network_api.remove_fixed_ip_from_instance(context,
  4245. instance,
  4246. address)
  4247. self._inject_network_info(context, instance, network_info)
  4248. self.reset_network(context, instance)
  4249. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  4250. instance.updated_at = timeutils.utcnow()
  4251. instance.save()
  4252. self._notify_about_instance_usage(
  4253. context, instance, "delete_ip.end", network_info=network_info)
  4254. @wrap_exception()
  4255. @reverts_task_state
  4256. @wrap_instance_event(prefix='compute')
  4257. @wrap_instance_fault
  4258. def pause_instance(self, context, instance):
  4259. """Pause an instance on this host."""
  4260. context = context.elevated()
  4261. LOG.info('Pausing', instance=instance)
  4262. self._notify_about_instance_usage(context, instance, 'pause.start')
  4263. compute_utils.notify_about_instance_action(context, instance,
  4264. self.host, action=fields.NotificationAction.PAUSE,
  4265. phase=fields.NotificationPhase.START)
  4266. self.driver.pause(instance)
  4267. instance.power_state = self._get_power_state(context, instance)
  4268. instance.vm_state = vm_states.PAUSED
  4269. instance.task_state = None
  4270. instance.save(expected_task_state=task_states.PAUSING)
  4271. self._notify_about_instance_usage(context, instance, 'pause.end')
  4272. compute_utils.notify_about_instance_action(context, instance,
  4273. self.host, action=fields.NotificationAction.PAUSE,
  4274. phase=fields.NotificationPhase.END)
  4275. @wrap_exception()
  4276. @reverts_task_state
  4277. @wrap_instance_event(prefix='compute')
  4278. @wrap_instance_fault
  4279. def unpause_instance(self, context, instance):
  4280. """Unpause a paused instance on this host."""
  4281. context = context.elevated()
  4282. LOG.info('Unpausing', instance=instance)
  4283. self._notify_about_instance_usage(context, instance, 'unpause.start')
  4284. compute_utils.notify_about_instance_action(context, instance,
  4285. self.host, action=fields.NotificationAction.UNPAUSE,
  4286. phase=fields.NotificationPhase.START)
  4287. self.driver.unpause(instance)
  4288. instance.power_state = self._get_power_state(context, instance)
  4289. instance.vm_state = vm_states.ACTIVE
  4290. instance.task_state = None
  4291. instance.save(expected_task_state=task_states.UNPAUSING)
  4292. self._notify_about_instance_usage(context, instance, 'unpause.end')
  4293. compute_utils.notify_about_instance_action(context, instance,
  4294. self.host, action=fields.NotificationAction.UNPAUSE,
  4295. phase=fields.NotificationPhase.END)
  4296. @wrap_exception()
  4297. def host_power_action(self, context, action):
  4298. """Reboots, shuts down or powers up the host."""
  4299. return self.driver.host_power_action(action)
  4300. @wrap_exception()
  4301. def host_maintenance_mode(self, context, host, mode):
  4302. """Start/Stop host maintenance window. On start, it triggers
  4303. guest VMs evacuation.
  4304. """
  4305. return self.driver.host_maintenance_mode(host, mode)
  4306. @wrap_exception()
  4307. def set_host_enabled(self, context, enabled):
  4308. """Sets the specified host's ability to accept new instances."""
  4309. return self.driver.set_host_enabled(enabled)
  4310. @wrap_exception()
  4311. def get_host_uptime(self, context):
  4312. """Returns the result of calling "uptime" on the target host."""
  4313. return self.driver.get_host_uptime()
  4314. @wrap_exception()
  4315. @wrap_instance_fault
  4316. def get_diagnostics(self, context, instance):
  4317. """Retrieve diagnostics for an instance on this host."""
  4318. current_power_state = self._get_power_state(context, instance)
  4319. if current_power_state == power_state.RUNNING:
  4320. LOG.info("Retrieving diagnostics", instance=instance)
  4321. return self.driver.get_diagnostics(instance)
  4322. else:
  4323. raise exception.InstanceInvalidState(
  4324. attr='power state',
  4325. instance_uuid=instance.uuid,
  4326. state=power_state.STATE_MAP[instance.power_state],
  4327. method='get_diagnostics')
  4328. @wrap_exception()
  4329. @wrap_instance_fault
  4330. def get_instance_diagnostics(self, context, instance):
  4331. """Retrieve diagnostics for an instance on this host."""
  4332. current_power_state = self._get_power_state(context, instance)
  4333. if current_power_state == power_state.RUNNING:
  4334. LOG.info("Retrieving diagnostics", instance=instance)
  4335. return self.driver.get_instance_diagnostics(instance)
  4336. else:
  4337. raise exception.InstanceInvalidState(
  4338. attr='power state',
  4339. instance_uuid=instance.uuid,
  4340. state=power_state.STATE_MAP[instance.power_state],
  4341. method='get_instance_diagnostics')
  4342. @wrap_exception()
  4343. @reverts_task_state
  4344. @wrap_instance_event(prefix='compute')
  4345. @wrap_instance_fault
  4346. def suspend_instance(self, context, instance):
  4347. """Suspend the given instance."""
  4348. context = context.elevated()
  4349. # Store the old state
  4350. instance.system_metadata['old_vm_state'] = instance.vm_state
  4351. self._notify_about_instance_usage(context, instance, 'suspend.start')
  4352. compute_utils.notify_about_instance_action(context, instance,
  4353. self.host, action=fields.NotificationAction.SUSPEND,
  4354. phase=fields.NotificationPhase.START)
  4355. with self._error_out_instance_on_exception(context, instance,
  4356. instance_state=instance.vm_state):
  4357. self.driver.suspend(context, instance)
  4358. instance.power_state = self._get_power_state(context, instance)
  4359. instance.vm_state = vm_states.SUSPENDED
  4360. instance.task_state = None
  4361. instance.save(expected_task_state=task_states.SUSPENDING)
  4362. self._notify_about_instance_usage(context, instance, 'suspend.end')
  4363. compute_utils.notify_about_instance_action(context, instance,
  4364. self.host, action=fields.NotificationAction.SUSPEND,
  4365. phase=fields.NotificationPhase.END)
  4366. @wrap_exception()
  4367. @reverts_task_state
  4368. @wrap_instance_event(prefix='compute')
  4369. @wrap_instance_fault
  4370. def resume_instance(self, context, instance):
  4371. """Resume the given suspended instance."""
  4372. context = context.elevated()
  4373. LOG.info('Resuming', instance=instance)
  4374. self._notify_about_instance_usage(context, instance, 'resume.start')
  4375. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4376. context, instance.uuid)
  4377. block_device_info = self._get_instance_block_device_info(
  4378. context, instance, bdms=bdms)
  4379. compute_utils.notify_about_instance_action(context, instance,
  4380. self.host, action=fields.NotificationAction.RESUME,
  4381. phase=fields.NotificationPhase.START, bdms=bdms)
  4382. network_info = self.network_api.get_instance_nw_info(context, instance)
  4383. with self._error_out_instance_on_exception(context, instance,
  4384. instance_state=instance.vm_state):
  4385. self.driver.resume(context, instance, network_info,
  4386. block_device_info)
  4387. instance.power_state = self._get_power_state(context, instance)
  4388. # We default to the ACTIVE state for backwards compatibility
  4389. instance.vm_state = instance.system_metadata.pop('old_vm_state',
  4390. vm_states.ACTIVE)
  4391. instance.task_state = None
  4392. instance.save(expected_task_state=task_states.RESUMING)
  4393. self._notify_about_instance_usage(context, instance, 'resume.end')
  4394. compute_utils.notify_about_instance_action(context, instance,
  4395. self.host, action=fields.NotificationAction.RESUME,
  4396. phase=fields.NotificationPhase.END, bdms=bdms)
  4397. @wrap_exception()
  4398. @reverts_task_state
  4399. @wrap_instance_event(prefix='compute')
  4400. @wrap_instance_fault
  4401. def shelve_instance(self, context, instance, image_id,
  4402. clean_shutdown):
  4403. """Shelve an instance.
  4404. This should be used when you want to take a snapshot of the instance.
  4405. It also adds system_metadata that can be used by a periodic task to
  4406. offload the shelved instance after a period of time.
  4407. :param context: request context
  4408. :param instance: an Instance object
  4409. :param image_id: an image id to snapshot to.
  4410. :param clean_shutdown: give the GuestOS a chance to stop
  4411. """
  4412. @utils.synchronized(instance.uuid)
  4413. def do_shelve_instance():
  4414. self._shelve_instance(context, instance, image_id, clean_shutdown)
  4415. do_shelve_instance()
  4416. def _shelve_instance(self, context, instance, image_id,
  4417. clean_shutdown):
  4418. LOG.info('Shelving', instance=instance)
  4419. offload = CONF.shelved_offload_time == 0
  4420. if offload:
  4421. # Get the BDMs early so we can pass them into versioned
  4422. # notifications since _shelve_offload_instance needs the
  4423. # BDMs anyway.
  4424. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4425. context, instance.uuid)
  4426. else:
  4427. bdms = None
  4428. compute_utils.notify_usage_exists(self.notifier, context, instance,
  4429. current_period=True)
  4430. self._notify_about_instance_usage(context, instance, 'shelve.start')
  4431. compute_utils.notify_about_instance_action(context, instance,
  4432. self.host, action=fields.NotificationAction.SHELVE,
  4433. phase=fields.NotificationPhase.START, bdms=bdms)
  4434. def update_task_state(task_state, expected_state=task_states.SHELVING):
  4435. shelving_state_map = {
  4436. task_states.IMAGE_PENDING_UPLOAD:
  4437. task_states.SHELVING_IMAGE_PENDING_UPLOAD,
  4438. task_states.IMAGE_UPLOADING:
  4439. task_states.SHELVING_IMAGE_UPLOADING,
  4440. task_states.SHELVING: task_states.SHELVING}
  4441. task_state = shelving_state_map[task_state]
  4442. expected_state = shelving_state_map[expected_state]
  4443. instance.task_state = task_state
  4444. instance.save(expected_task_state=expected_state)
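# driver.snapshot() below calls update_task_state as the snapshot
# progresses; the map above translates the generic image upload task
# states into their SHELVING_* equivalents so the instance reflects
# that it is being shelved rather than merely snapshotted.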
  4445. # Do not attempt a clean shutdown of a paused guest since some
  4446. # hypervisors will fail the clean shutdown if the guest is not
  4447. # running.
  4448. if instance.power_state == power_state.PAUSED:
  4449. clean_shutdown = False
  4450. self._power_off_instance(context, instance, clean_shutdown)
  4451. self.driver.snapshot(context, instance, image_id, update_task_state)
  4452. instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat()
  4453. instance.system_metadata['shelved_image_id'] = image_id
  4454. instance.system_metadata['shelved_host'] = self.host
  4455. instance.vm_state = vm_states.SHELVED
  4456. instance.task_state = None
  4457. if CONF.shelved_offload_time == 0:
  4458. instance.task_state = task_states.SHELVING_OFFLOADING
  4459. instance.power_state = self._get_power_state(context, instance)
  4460. instance.save(expected_task_state=[
  4461. task_states.SHELVING,
  4462. task_states.SHELVING_IMAGE_UPLOADING])
  4463. self._notify_about_instance_usage(context, instance, 'shelve.end')
  4464. compute_utils.notify_about_instance_action(context, instance,
  4465. self.host, action=fields.NotificationAction.SHELVE,
  4466. phase=fields.NotificationPhase.END, bdms=bdms)
  4467. if offload:
  4468. self._shelve_offload_instance(context, instance,
  4469. clean_shutdown=False, bdms=bdms)
  4470. @wrap_exception()
  4471. @reverts_task_state
  4472. @wrap_instance_event(prefix='compute')
  4473. @wrap_instance_fault
  4474. def shelve_offload_instance(self, context, instance, clean_shutdown):
  4475. """Remove a shelved instance from the hypervisor.
  4476. This frees up those resources for use by other instances, but may lead
  4477. to slower unshelve times for this instance. This method is used by
  4478. volume backed instances since restoring them doesn't involve the
  4479. potentially large download of an image.
  4480. :param context: request context
  4481. :param instance: nova.objects.instance.Instance
  4482. :param clean_shutdown: give the GuestOS a chance to stop
  4483. """
  4484. @utils.synchronized(instance.uuid)
  4485. def do_shelve_offload_instance():
  4486. self._shelve_offload_instance(context, instance, clean_shutdown)
  4487. do_shelve_offload_instance()
  4488. def _shelve_offload_instance(self, context, instance, clean_shutdown,
  4489. bdms=None):
  4490. LOG.info('Shelve offloading', instance=instance)
  4491. if bdms is None:
  4492. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4493. context, instance.uuid)
  4494. self._notify_about_instance_usage(context, instance,
  4495. 'shelve_offload.start')
  4496. compute_utils.notify_about_instance_action(context, instance,
  4497. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  4498. phase=fields.NotificationPhase.START, bdms=bdms)
  4499. self._power_off_instance(context, instance, clean_shutdown)
  4500. current_power_state = self._get_power_state(context, instance)
  4501. self.network_api.cleanup_instance_network_on_host(context, instance,
  4502. instance.host)
  4503. network_info = self.network_api.get_instance_nw_info(context, instance)
  4504. block_device_info = self._get_instance_block_device_info(context,
  4505. instance,
  4506. bdms=bdms)
  4507. self.driver.destroy(context, instance, network_info,
  4508. block_device_info)
  4509. # the instance is going to be removed from the host so we want to
  4510. # terminate all the connections with the volume server and the host
  4511. self._terminate_volume_connections(context, instance, bdms)
  4512. instance.power_state = current_power_state
  4513. # NOTE(mriedem): The vm_state has to be set before updating the
  4514. # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node
  4515. # values cannot be nulled out until after updating the resource tracker
  4516. # though.
  4517. instance.vm_state = vm_states.SHELVED_OFFLOADED
  4518. instance.task_state = None
  4519. instance.save(expected_task_state=[task_states.SHELVING,
  4520. task_states.SHELVING_OFFLOADING])
  4521. # NOTE(ndipanov): Free resources from the resource tracker
  4522. self._update_resource_tracker(context, instance)
  4523. rt = self._get_resource_tracker()
  4524. rt.delete_allocation_for_shelve_offloaded_instance(context, instance)
  4525. # NOTE(sfinucan): RPC calls should no longer be attempted against this
  4526. # instance, so ensure any calls result in errors
  4527. self._nil_out_instance_obj_host_and_node(instance)
  4528. instance.save(expected_task_state=None)
  4529. self._delete_scheduler_instance_info(context, instance.uuid)
  4530. self._notify_about_instance_usage(context, instance,
  4531. 'shelve_offload.end')
  4532. compute_utils.notify_about_instance_action(context, instance,
  4533. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  4534. phase=fields.NotificationPhase.END, bdms=bdms)
  4535. @wrap_exception()
  4536. @reverts_task_state
  4537. @wrap_instance_event(prefix='compute')
  4538. @wrap_instance_fault
  4539. def unshelve_instance(self, context, instance, image,
  4540. filter_properties, node):
  4541. """Unshelve the instance.
  4542. :param context: request context
  4543. :param instance: a nova.objects.instance.Instance object
  4544. :param image: an image to build from. If None we assume a
  4545. volume backed instance.
  4546. :param filter_properties: dict containing limits, retry info etc.
  4547. :param node: target compute node
  4548. """
  4549. if filter_properties is None:
  4550. filter_properties = {}
  4551. @utils.synchronized(instance.uuid)
  4552. def do_unshelve_instance():
  4553. self._unshelve_instance(context, instance, image,
  4554. filter_properties, node)
  4555. do_unshelve_instance()
  4556. def _unshelve_instance_key_scrub(self, instance):
  4557. """Remove data from the instance that may cause side effects."""
  4558. cleaned_keys = dict(
  4559. key_data=instance.key_data,
  4560. auto_disk_config=instance.auto_disk_config)
  4561. instance.key_data = None
  4562. instance.auto_disk_config = False
  4563. return cleaned_keys
  4564. def _unshelve_instance_key_restore(self, instance, keys):
  4565. """Restore previously scrubbed keys before saving the instance."""
  4566. instance.update(keys)
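# Round trip of the two helpers above (a restatement, not new behavior):
# _unshelve_instance_key_scrub() stashes key_data and auto_disk_config
# and nulls them on the instance; after the driver has spawned the
# guest, _unshelve_instance_key_restore() writes the stashed values back
# via instance.update() before the final save.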
  4567. def _unshelve_instance(self, context, instance, image, filter_properties,
  4568. node):
  4569. LOG.info('Unshelving', instance=instance)
  4570. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4571. context, instance.uuid)
  4572. self._notify_about_instance_usage(context, instance, 'unshelve.start')
  4573. compute_utils.notify_about_instance_action(context, instance,
  4574. self.host, action=fields.NotificationAction.UNSHELVE,
  4575. phase=fields.NotificationPhase.START, bdms=bdms)
  4576. instance.task_state = task_states.SPAWNING
  4577. instance.save()
  4578. block_device_info = self._prep_block_device(context, instance, bdms)
  4579. scrubbed_keys = self._unshelve_instance_key_scrub(instance)
  4580. if node is None:
  4581. node = self._get_nodename(instance)
  4582. rt = self._get_resource_tracker()
  4583. limits = filter_properties.get('limits', {})
  4584. allocations = self.reportclient.get_allocations_for_consumer(
  4585. context, instance.uuid)
  4586. shelved_image_ref = instance.image_ref
  4587. if image:
  4588. instance.image_ref = image['id']
  4589. image_meta = objects.ImageMeta.from_dict(image)
  4590. else:
  4591. image_meta = objects.ImageMeta.from_dict(
  4592. utils.get_image_from_system_metadata(
  4593. instance.system_metadata))
  4594. self.network_api.setup_instance_network_on_host(context, instance,
  4595. self.host)
  4596. network_info = self.network_api.get_instance_nw_info(context, instance)
  4597. try:
  4598. with rt.instance_claim(context, instance, node, limits):
  4599. self.driver.spawn(context, instance, image_meta,
  4600. injected_files=[],
  4601. admin_password=None,
  4602. allocations=allocations,
  4603. network_info=network_info,
  4604. block_device_info=block_device_info)
  4605. except Exception:
  4606. with excutils.save_and_reraise_exception(logger=LOG):
  4607. LOG.exception('Instance failed to spawn',
  4608. instance=instance)
  4609. # Cleanup allocations created by the scheduler on this host
  4610. # since we failed to spawn the instance. We do this both if
  4611. # the instance claim failed with ComputeResourcesUnavailable
  4612. # or if we did claim but the spawn failed, because aborting the
  4613. # instance claim will not remove the allocations.
  4614. rt.reportclient.delete_allocation_for_instance(context,
  4615. instance.uuid)
  4616. # FIXME: Umm, shouldn't we be rolling back port bindings too?
  4617. self._terminate_volume_connections(context, instance, bdms)
  4618. # The reverts_task_state decorator on unshelve_instance will
  4619. # eventually save these updates.
  4620. self._nil_out_instance_obj_host_and_node(instance)
  4621. if image:
  4622. instance.image_ref = shelved_image_ref
  4623. self._delete_snapshot_of_shelved_instance(context, instance,
  4624. image['id'])
  4625. self._unshelve_instance_key_restore(instance, scrubbed_keys)
  4626. self._update_instance_after_spawn(context, instance)
  4627. # Delete system_metadata for a shelved instance
  4628. compute_utils.remove_shelved_keys_from_system_metadata(instance)
  4629. instance.save(expected_task_state=task_states.SPAWNING)
  4630. self._update_scheduler_instance_info(context, instance)
  4631. self._notify_about_instance_usage(context, instance, 'unshelve.end')
  4632. compute_utils.notify_about_instance_action(context, instance,
  4633. self.host, action=fields.NotificationAction.UNSHELVE,
  4634. phase=fields.NotificationPhase.END, bdms=bdms)
  4635. @messaging.expected_exceptions(NotImplementedError)
  4636. @wrap_instance_fault
  4637. def reset_network(self, context, instance):
  4638. """Reset networking on the given instance."""
  4639. LOG.debug('Reset network', instance=instance)
  4640. self.driver.reset_network(instance)
  4641. def _inject_network_info(self, context, instance, network_info):
  4642. """Inject network info for the given instance."""
  4643. LOG.debug('Inject network info', instance=instance)
  4644. LOG.debug('network_info to inject: |%s|', network_info,
  4645. instance=instance)
  4646. self.driver.inject_network_info(instance,
  4647. network_info)
  4648. @wrap_instance_fault
  4649. def inject_network_info(self, context, instance):
  4650. """Inject network info, but don't return the info."""
  4651. network_info = self.network_api.get_instance_nw_info(context, instance)
  4652. self._inject_network_info(context, instance, network_info)
  4653. @messaging.expected_exceptions(NotImplementedError,
  4654. exception.ConsoleNotAvailable,
  4655. exception.InstanceNotFound)
  4656. @wrap_exception()
  4657. @wrap_instance_fault
  4658. def get_console_output(self, context, instance, tail_length):
  4659. """Send the console output for the given instance."""
  4660. context = context.elevated()
  4661. LOG.info("Get console output", instance=instance)
  4662. output = self.driver.get_console_output(context, instance)
  4663. if type(output) is six.text_type:
  4664. output = six.b(output)
  4665. if tail_length is not None:
  4666. output = self._tail_log(output, tail_length)
  4667. return output.decode('ascii', 'replace')
  4668. def _tail_log(self, log, length):
  4669. try:
  4670. length = int(length)
  4671. except ValueError:
  4672. length = 0
  4673. if length == 0:
  4674. return b''
  4675. else:
  4676. return b'\n'.join(log.split(b'\n')[-length:])
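# Illustrative usage (example values, not from the original source):
#   self._tail_log(b'one\ntwo\nthree', 2)  -> b'two\nthree'
#   self._tail_log(b'one\ntwo\nthree', '') -> b''  (invalid length -> 0)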
  4677. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4678. exception.InstanceNotReady,
  4679. exception.InstanceNotFound,
  4680. exception.ConsoleTypeUnavailable,
  4681. NotImplementedError)
  4682. @wrap_exception()
  4683. @wrap_instance_fault
  4684. def get_vnc_console(self, context, console_type, instance):
  4685. """Return connection information for a vnc console."""
  4686. context = context.elevated()
  4687. LOG.debug("Getting vnc console", instance=instance)
  4688. token = uuidutils.generate_uuid()
  4689. if not CONF.vnc.enabled:
  4690. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4691. if console_type == 'novnc':
  4692. # For essex, novncproxy_base_url must include the full path
  4693. # including the html file (like http://myhost/vnc_auto.html)
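# Illustrative result (example host/port, not from the original source):
# with novncproxy_base_url set to 'http://myhost:6080/vnc_auto.html',
# the access_url below becomes
# 'http://myhost:6080/vnc_auto.html?token=<generated-uuid>'.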
  4694. access_url = '%s?token=%s' % (CONF.vnc.novncproxy_base_url, token)
  4695. elif console_type == 'xvpvnc':
  4696. access_url = '%s?token=%s' % (CONF.vnc.xvpvncproxy_base_url, token)
  4697. else:
  4698. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4699. try:
  4700. # Retrieve connect info from driver, and then decorate with our
  4701. # access info token
  4702. console = self.driver.get_vnc_console(context, instance)
  4703. connect_info = console.get_connection_info(token, access_url)
  4704. except exception.InstanceNotFound:
  4705. if instance.vm_state != vm_states.BUILDING:
  4706. raise
  4707. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4708. return connect_info
  4709. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4710. exception.InstanceNotReady,
  4711. exception.InstanceNotFound,
  4712. exception.ConsoleTypeUnavailable,
  4713. NotImplementedError)
  4714. @wrap_exception()
  4715. @wrap_instance_fault
  4716. def get_spice_console(self, context, console_type, instance):
  4717. """Return connection information for a spice console."""
  4718. context = context.elevated()
  4719. LOG.debug("Getting spice console", instance=instance)
  4720. token = uuidutils.generate_uuid()
  4721. if not CONF.spice.enabled:
  4722. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4723. if console_type == 'spice-html5':
  4724. # For essex, spicehtml5proxy_base_url must include the full path
  4725. # including the html file (like http://myhost/spice_auto.html)
  4726. access_url = '%s?token=%s' % (CONF.spice.html5proxy_base_url,
  4727. token)
  4728. else:
  4729. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4730. try:
  4731. # Retrieve connect info from driver, and then decorate with our
  4732. # access info token
  4733. console = self.driver.get_spice_console(context, instance)
  4734. connect_info = console.get_connection_info(token, access_url)
  4735. except exception.InstanceNotFound:
  4736. if instance.vm_state != vm_states.BUILDING:
  4737. raise
  4738. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4739. return connect_info
  4740. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4741. exception.InstanceNotReady,
  4742. exception.InstanceNotFound,
  4743. exception.ConsoleTypeUnavailable,
  4744. NotImplementedError)
  4745. @wrap_exception()
  4746. @wrap_instance_fault
  4747. def get_rdp_console(self, context, console_type, instance):
  4748. """Return connection information for a RDP console."""
  4749. context = context.elevated()
  4750. LOG.debug("Getting RDP console", instance=instance)
  4751. token = uuidutils.generate_uuid()
  4752. if not CONF.rdp.enabled:
  4753. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4754. if console_type == 'rdp-html5':
  4755. access_url = '%s?token=%s' % (CONF.rdp.html5_proxy_base_url,
  4756. token)
  4757. else:
  4758. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4759. try:
  4760. # Retrieve connect info from driver, and then decorate with our
  4761. # access info token
  4762. console = self.driver.get_rdp_console(context, instance)
  4763. connect_info = console.get_connection_info(token, access_url)
  4764. except exception.InstanceNotFound:
  4765. if instance.vm_state != vm_states.BUILDING:
  4766. raise
  4767. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4768. return connect_info
  4769. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4770. exception.InstanceNotReady,
  4771. exception.InstanceNotFound,
  4772. exception.ConsoleTypeUnavailable,
  4773. NotImplementedError)
  4774. @wrap_exception()
  4775. @wrap_instance_fault
  4776. def get_mks_console(self, context, console_type, instance):
  4777. """Return connection information for a MKS console."""
  4778. context = context.elevated()
  4779. LOG.debug("Getting MKS console", instance=instance)
  4780. token = uuidutils.generate_uuid()
  4781. if not CONF.mks.enabled:
  4782. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4783. if console_type == 'webmks':
  4784. access_url = '%s?token=%s' % (CONF.mks.mksproxy_base_url,
  4785. token)
  4786. else:
  4787. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4788. try:
  4789. # Retrieve connect info from driver, and then decorate with our
  4790. # access info token
  4791. console = self.driver.get_mks_console(context, instance)
  4792. connect_info = console.get_connection_info(token, access_url)
  4793. except exception.InstanceNotFound:
  4794. if instance.vm_state != vm_states.BUILDING:
  4795. raise
  4796. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4797. return connect_info
  4798. @messaging.expected_exceptions(
  4799. exception.ConsoleTypeInvalid,
  4800. exception.InstanceNotReady,
  4801. exception.InstanceNotFound,
  4802. exception.ConsoleTypeUnavailable,
  4803. exception.SocketPortRangeExhaustedException,
  4804. exception.ImageSerialPortNumberInvalid,
  4805. exception.ImageSerialPortNumberExceedFlavorValue,
  4806. NotImplementedError)
  4807. @wrap_exception()
  4808. @wrap_instance_fault
  4809. def get_serial_console(self, context, console_type, instance):
  4810. """Returns connection information for a serial console."""
  4811. LOG.debug("Getting serial console", instance=instance)
  4812. if not CONF.serial_console.enabled:
  4813. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4814. context = context.elevated()
  4815. token = uuidutils.generate_uuid()
  4816. access_url = '%s?token=%s' % (CONF.serial_console.base_url, token)
  4817. try:
  4818. # Retrieve connect info from driver, and then decorate with our
  4819. # access info token
  4820. console = self.driver.get_serial_console(context, instance)
  4821. connect_info = console.get_connection_info(token, access_url)
  4822. except exception.InstanceNotFound:
  4823. if instance.vm_state != vm_states.BUILDING:
  4824. raise
  4825. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4826. return connect_info
  4827. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4828. exception.InstanceNotReady,
  4829. exception.InstanceNotFound)
  4830. @wrap_exception()
  4831. @wrap_instance_fault
  4832. def validate_console_port(self, ctxt, instance, port, console_type):
  4833. if console_type == "spice-html5":
  4834. console_info = self.driver.get_spice_console(ctxt, instance)
  4835. elif console_type == "rdp-html5":
  4836. console_info = self.driver.get_rdp_console(ctxt, instance)
  4837. elif console_type == "serial":
  4838. console_info = self.driver.get_serial_console(ctxt, instance)
  4839. elif console_type == "webmks":
  4840. console_info = self.driver.get_mks_console(ctxt, instance)
  4841. else:
  4842. console_info = self.driver.get_vnc_console(ctxt, instance)
  4843. return console_info.port == port
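# Added note: this is typically called over RPC when a console proxy
# authorizes a connection, to confirm that the port baked into the console
# token still matches the port the driver currently reports.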
  4844. @wrap_exception()
  4845. @reverts_task_state
  4846. @wrap_instance_fault
  4847. def reserve_block_device_name(self, context, instance, device,
  4848. volume_id, disk_bus, device_type, tag,
  4849. multiattach):
  4850. if (tag and not
  4851. self.driver.capabilities.get('supports_tagged_attach_volume',
  4852. False)):
  4853. raise exception.VolumeTaggedAttachNotSupported()
  4854. if (multiattach and not
  4855. self.driver.capabilities.get('supports_multiattach', False)):
  4856. raise exception.MultiattachNotSupportedByVirtDriver(
  4857. volume_id=volume_id)
  4858. @utils.synchronized(instance.uuid)
  4859. def do_reserve():
  4860. bdms = (
  4861. objects.BlockDeviceMappingList.get_by_instance_uuid(
  4862. context, instance.uuid))
  4863. # NOTE(ndipanov): We need to explicitly set all the fields on the
  4864. # object so that obj_load_attr does not fail
  4865. new_bdm = objects.BlockDeviceMapping(
  4866. context=context,
  4867. source_type='volume', destination_type='volume',
  4868. instance_uuid=instance.uuid, boot_index=None,
  4869. volume_id=volume_id,
  4870. device_name=device, guest_format=None,
  4871. disk_bus=disk_bus, device_type=device_type, tag=tag)
  4872. new_bdm.device_name = self._get_device_name_for_instance(
  4873. instance, bdms, new_bdm)
  4874. # NOTE(vish): create bdm here to avoid race condition
  4875. new_bdm.create()
  4876. return new_bdm
  4877. return do_reserve()
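# Added note: the BDM is created before any actual attach happens, so the
# device_name chosen by _get_device_name_for_instance (for example /dev/vdb
# with the libvirt driver) is reserved under the per-instance lock above,
# avoiding a race with concurrent attach requests.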
  4878. @wrap_exception()
  4879. @wrap_instance_event(prefix='compute')
  4880. @wrap_instance_fault
  4881. def attach_volume(self, context, instance, bdm):
  4882. """Attach a volume to an instance."""
  4883. driver_bdm = driver_block_device.convert_volume(bdm)
  4884. @utils.synchronized(instance.uuid)
  4885. def do_attach_volume(context, instance, driver_bdm):
  4886. try:
  4887. return self._attach_volume(context, instance, driver_bdm)
  4888. except Exception:
  4889. with excutils.save_and_reraise_exception():
  4890. bdm.destroy()
  4891. do_attach_volume(context, instance, driver_bdm)
  4892. def _attach_volume(self, context, instance, bdm):
  4893. context = context.elevated()
  4894. LOG.info('Attaching volume %(volume_id)s to %(mountpoint)s',
  4895. {'volume_id': bdm.volume_id,
  4896. 'mountpoint': bdm['mount_device']},
  4897. instance=instance)
  4898. compute_utils.notify_about_volume_attach_detach(
  4899. context, instance, self.host,
  4900. action=fields.NotificationAction.VOLUME_ATTACH,
  4901. phase=fields.NotificationPhase.START,
  4902. volume_id=bdm.volume_id)
  4903. try:
  4904. bdm.attach(context, instance, self.volume_api, self.driver,
  4905. do_driver_attach=True)
  4906. except Exception as e:
  4907. with excutils.save_and_reraise_exception():
  4908. LOG.exception("Failed to attach %(volume_id)s "
  4909. "at %(mountpoint)s",
  4910. {'volume_id': bdm.volume_id,
  4911. 'mountpoint': bdm['mount_device']},
  4912. instance=instance)
  4913. if bdm['attachment_id']:
  4914. # Try to delete the attachment to make the volume
  4915. # available again. Note that DriverVolumeBlockDevice
  4916. # may have already deleted the attachment so ignore
  4917. # VolumeAttachmentNotFound.
  4918. try:
  4919. self.volume_api.attachment_delete(
  4920. context, bdm['attachment_id'])
  4921. except exception.VolumeAttachmentNotFound as exc:
  4922. LOG.debug('Ignoring VolumeAttachmentNotFound: %s',
  4923. exc, instance=instance)
  4924. else:
  4925. self.volume_api.unreserve_volume(context, bdm.volume_id)
  4926. compute_utils.notify_about_volume_attach_detach(
  4927. context, instance, self.host,
  4928. action=fields.NotificationAction.VOLUME_ATTACH,
  4929. phase=fields.NotificationPhase.ERROR,
  4930. exception=e,
  4931. volume_id=bdm.volume_id)
  4932. info = {'volume_id': bdm.volume_id}
  4933. self._notify_about_instance_usage(
  4934. context, instance, "volume.attach", extra_usage_info=info)
  4935. compute_utils.notify_about_volume_attach_detach(
  4936. context, instance, self.host,
  4937. action=fields.NotificationAction.VOLUME_ATTACH,
  4938. phase=fields.NotificationPhase.END,
  4939. volume_id=bdm.volume_id)
  4940. def _notify_volume_usage_detach(self, context, instance, bdm):
  4941. if CONF.volume_usage_poll_interval <= 0:
  4942. return
  4943. mp = bdm.device_name
4944. # Strip the leading '/dev/' when present; bootable volumes may not include it
  4945. if '/dev/' in mp:
  4946. mp = mp[5:]
  4947. try:
  4948. vol_stats = self.driver.block_stats(instance, mp)
  4949. if vol_stats is None:
  4950. return
  4951. except NotImplementedError:
  4952. return
  4953. LOG.debug("Updating volume usage cache with totals", instance=instance)
  4954. rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats
  4955. vol_usage = objects.VolumeUsage(context)
  4956. vol_usage.volume_id = bdm.volume_id
  4957. vol_usage.instance_uuid = instance.uuid
  4958. vol_usage.project_id = instance.project_id
  4959. vol_usage.user_id = instance.user_id
  4960. vol_usage.availability_zone = instance.availability_zone
  4961. vol_usage.curr_reads = rd_req
  4962. vol_usage.curr_read_bytes = rd_bytes
  4963. vol_usage.curr_writes = wr_req
  4964. vol_usage.curr_write_bytes = wr_bytes
  4965. vol_usage.save(update_totals=True)
  4966. self.notifier.info(context, 'volume.usage',
  4967. compute_utils.usage_volume_info(vol_usage))
  4968. def _detach_volume(self, context, bdm, instance, destroy_bdm=True,
  4969. attachment_id=None):
  4970. """Detach a volume from an instance.
  4971. :param context: security context
  4972. :param bdm: nova.objects.BlockDeviceMapping volume bdm to detach
  4973. :param instance: the Instance object to detach the volume from
  4974. :param destroy_bdm: if True, the corresponding BDM entry will be marked
  4975. as deleted. Disabling this is useful for operations
  4976. like rebuild, when we don't want to destroy BDM
  4977. :param attachment_id: The volume attachment_id for the given instance
  4978. and volume.
  4979. """
  4980. volume_id = bdm.volume_id
  4981. compute_utils.notify_about_volume_attach_detach(
  4982. context, instance, self.host,
  4983. action=fields.NotificationAction.VOLUME_DETACH,
  4984. phase=fields.NotificationPhase.START,
  4985. volume_id=volume_id)
  4986. self._notify_volume_usage_detach(context, instance, bdm)
  4987. LOG.info('Detaching volume %(volume_id)s',
  4988. {'volume_id': volume_id}, instance=instance)
  4989. driver_bdm = driver_block_device.convert_volume(bdm)
  4990. driver_bdm.detach(context, instance, self.volume_api, self.driver,
  4991. attachment_id=attachment_id, destroy_bdm=destroy_bdm)
  4992. info = dict(volume_id=volume_id)
  4993. self._notify_about_instance_usage(
  4994. context, instance, "volume.detach", extra_usage_info=info)
  4995. compute_utils.notify_about_volume_attach_detach(
  4996. context, instance, self.host,
  4997. action=fields.NotificationAction.VOLUME_DETACH,
  4998. phase=fields.NotificationPhase.END,
  4999. volume_id=volume_id)
  5000. if 'tag' in bdm and bdm.tag:
  5001. self._delete_disk_metadata(instance, bdm)
  5002. if destroy_bdm:
  5003. bdm.destroy()
  5004. def _delete_disk_metadata(self, instance, bdm):
  5005. for device in instance.device_metadata.devices:
  5006. if isinstance(device, objects.DiskMetadata):
  5007. if 'serial' in device:
  5008. if device.serial == bdm.volume_id:
  5009. instance.device_metadata.devices.remove(device)
  5010. instance.save()
  5011. break
  5012. else:
  5013. # NOTE(artom) We log the entire device object because all
  5014. # fields are nullable and may not be set
  5015. LOG.warning('Unable to determine whether to clean up '
  5016. 'device metadata for disk %s', device,
  5017. instance=instance)
  5018. @wrap_exception()
  5019. @wrap_instance_event(prefix='compute')
  5020. @wrap_instance_fault
  5021. def detach_volume(self, context, volume_id, instance, attachment_id):
  5022. """Detach a volume from an instance.
  5023. :param context: security context
  5024. :param volume_id: the volume id
  5025. :param instance: the Instance object to detach the volume from
  5026. :param attachment_id: The volume attachment_id for the given instance
  5027. and volume.
  5028. """
  5029. @utils.synchronized(instance.uuid)
  5030. def do_detach_volume(context, volume_id, instance, attachment_id):
  5031. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5032. context, volume_id, instance.uuid)
  5033. self._detach_volume(context, bdm, instance,
  5034. attachment_id=attachment_id)
  5035. do_detach_volume(context, volume_id, instance, attachment_id)
  5036. def _init_volume_connection(self, context, new_volume,
  5037. old_volume_id, connector, bdm,
  5038. new_attachment_id, mountpoint):
  5039. new_volume_id = new_volume['id']
  5040. if new_attachment_id is None:
  5041. # We're dealing with an old-style attachment so initialize the
  5042. # connection so we can get the connection_info.
  5043. new_cinfo = self.volume_api.initialize_connection(context,
  5044. new_volume_id,
  5045. connector)
  5046. else:
  5047. # Check for multiattach on the new volume and if True, check to
  5048. # see if the virt driver supports multiattach.
  5049. # TODO(mriedem): This is copied from DriverVolumeBlockDevice
  5050. # and should be consolidated into some common code at some point.
  5051. vol_multiattach = new_volume.get('multiattach', False)
  5052. virt_multiattach = self.driver.capabilities['supports_multiattach']
  5053. if vol_multiattach and not virt_multiattach:
  5054. raise exception.MultiattachNotSupportedByVirtDriver(
  5055. volume_id=new_volume_id)
  5056. # This is a new style attachment and the API created the new
  5057. # volume attachment and passed the id to the compute over RPC.
  5058. # At this point we need to update the new volume attachment with
  5059. # the host connector, which will give us back the new attachment
  5060. # connection_info.
  5061. new_cinfo = self.volume_api.attachment_update(
  5062. context, new_attachment_id, connector,
  5063. mountpoint)['connection_info']
  5064. if vol_multiattach:
  5065. # This will be used by the volume driver to determine the
  5066. # proper disk configuration.
  5067. new_cinfo['multiattach'] = True
  5068. old_cinfo = jsonutils.loads(bdm['connection_info'])
  5069. if old_cinfo and 'serial' not in old_cinfo:
  5070. old_cinfo['serial'] = old_volume_id
  5071. # NOTE(lyarwood): serial is not always present in the returned
  5072. # connection_info so set it if it is missing as we do in
  5073. # DriverVolumeBlockDevice.attach().
  5074. if 'serial' not in new_cinfo:
  5075. new_cinfo['serial'] = new_volume_id
  5076. return (old_cinfo, new_cinfo)
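# Rough shape of the connection_info dicts handled here (illustrative and
# driver dependent): {'driver_volume_type': 'iscsi', 'data': {...},
# 'serial': <volume id>} plus 'multiattach' when set above.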
  5077. def _swap_volume(self, context, instance, bdm, connector,
  5078. old_volume_id, new_volume, resize_to,
  5079. new_attachment_id, is_cinder_migration):
  5080. new_volume_id = new_volume['id']
  5081. mountpoint = bdm['device_name']
  5082. failed = False
  5083. new_cinfo = None
  5084. try:
  5085. old_cinfo, new_cinfo = self._init_volume_connection(
  5086. context, new_volume, old_volume_id, connector,
  5087. bdm, new_attachment_id, mountpoint)
  5088. # NOTE(lyarwood): The Libvirt driver, the only virt driver
  5089. # currently implementing swap_volume, will modify the contents of
  5090. # new_cinfo when connect_volume is called. This is then saved to
  5091. # the BDM in swap_volume for future use outside of this flow.
  5092. msg = ("swap_volume: Calling driver volume swap with "
  5093. "connection infos: new: %(new_cinfo)s; "
  5094. "old: %(old_cinfo)s" %
  5095. {'new_cinfo': new_cinfo, 'old_cinfo': old_cinfo})
  5096. # Both new and old info might contain password
  5097. LOG.debug(strutils.mask_password(msg), instance=instance)
  5098. self.driver.swap_volume(context, old_cinfo, new_cinfo, instance,
  5099. mountpoint, resize_to)
  5100. if new_attachment_id:
  5101. self.volume_api.attachment_complete(context, new_attachment_id)
  5102. msg = ("swap_volume: Driver volume swap returned, new "
  5103. "connection_info is now : %(new_cinfo)s" %
  5104. {'new_cinfo': new_cinfo})
  5105. LOG.debug(strutils.mask_password(msg))
  5106. except Exception as ex:
  5107. failed = True
  5108. with excutils.save_and_reraise_exception():
  5109. compute_utils.notify_about_volume_swap(
  5110. context, instance, self.host,
  5111. fields.NotificationAction.VOLUME_SWAP,
  5112. fields.NotificationPhase.ERROR,
  5113. old_volume_id, new_volume_id, ex)
  5114. if new_cinfo:
  5115. msg = ("Failed to swap volume %(old_volume_id)s "
  5116. "for %(new_volume_id)s")
  5117. LOG.exception(msg, {'old_volume_id': old_volume_id,
  5118. 'new_volume_id': new_volume_id},
  5119. instance=instance)
  5120. else:
  5121. msg = ("Failed to connect to volume %(volume_id)s "
  5122. "with volume at %(mountpoint)s")
  5123. LOG.exception(msg, {'volume_id': new_volume_id,
  5124. 'mountpoint': bdm['device_name']},
  5125. instance=instance)
  5126. # The API marked the volume as 'detaching' for the old volume
  5127. # so we need to roll that back so the volume goes back to
  5128. # 'in-use' state.
  5129. self.volume_api.roll_detaching(context, old_volume_id)
  5130. if new_attachment_id is None:
  5131. # The API reserved the new volume so it would be in
  5132. # 'attaching' status, so we need to unreserve it so it
  5133. # goes back to 'available' status.
  5134. self.volume_api.unreserve_volume(context, new_volume_id)
  5135. else:
  5136. # This is a new style attachment for the new volume, which
  5137. # was created in the API. We just need to delete it here
  5138. # to put the new volume back into 'available' status.
  5139. self.volume_api.attachment_delete(
  5140. context, new_attachment_id)
  5141. finally:
  5142. # TODO(mriedem): This finally block is terribly confusing and is
  5143. # trying to do too much. We should consider removing the finally
  5144. # block and move whatever needs to happen on success and failure
  5145. # into the blocks above for clarity, even if it means a bit of
  5146. # redundant code.
  5147. conn_volume = new_volume_id if failed else old_volume_id
  5148. if new_cinfo:
  5149. LOG.debug("swap_volume: removing Cinder connection "
  5150. "for volume %(volume)s", {'volume': conn_volume},
  5151. instance=instance)
  5152. if bdm.attachment_id is None:
  5153. # This is the pre-3.44 flow for new-style volume
  5154. # attachments so just terminate the connection.
  5155. self.volume_api.terminate_connection(context,
  5156. conn_volume,
  5157. connector)
  5158. else:
  5159. # This is a new style volume attachment. If we failed, then
  5160. # the new attachment was already deleted above in the
  5161. # exception block and we have nothing more to do here. If
  5162. # swap_volume was successful in the driver, then we need to
  5163. # "detach" the original attachment by deleting it.
  5164. if not failed:
  5165. self.volume_api.attachment_delete(
  5166. context, bdm.attachment_id)
  5167. # Need to make some decisions based on whether this was
  5168. # a Cinder initiated migration or not. The callback to
  5169. # migration completion isn't needed in the case of a
  5170. # nova initiated simple swap of two volume
  5171. # "volume-update" call so skip that. The new attachment
  5172. # scenarios will give us a new attachment record and
  5173. # that's what we want.
  5174. if bdm.attachment_id and not is_cinder_migration:
  5175. # we don't callback to cinder
  5176. comp_ret = {'save_volume_id': new_volume_id}
  5177. else:
  5178. # NOTE(lyarwood): The following call to
  5179. # os-migrate-volume-completion returns a dict containing
  5180. # save_volume_id, this volume id has two possible values :
  5181. # 1. old_volume_id if we are migrating (retyping) volumes
  5182. # 2. new_volume_id if we are swapping between two existing
  5183. # volumes
  5184. # This volume id is later used to update the volume_id and
  5185. # connection_info['serial'] of the BDM.
  5186. comp_ret = self.volume_api.migrate_volume_completion(
  5187. context,
  5188. old_volume_id,
  5189. new_volume_id,
  5190. error=failed)
  5191. LOG.debug("swap_volume: Cinder migrate_volume_completion "
  5192. "returned: %(comp_ret)s", {'comp_ret': comp_ret},
  5193. instance=instance)
  5194. return (comp_ret, new_cinfo)
  5195. @wrap_exception()
  5196. @wrap_instance_event(prefix='compute')
  5197. @wrap_instance_fault
  5198. def swap_volume(self, context, old_volume_id, new_volume_id, instance,
  5199. new_attachment_id):
  5200. """Swap volume for an instance."""
  5201. context = context.elevated()
  5202. compute_utils.notify_about_volume_swap(
  5203. context, instance, self.host,
  5204. fields.NotificationAction.VOLUME_SWAP,
  5205. fields.NotificationPhase.START,
  5206. old_volume_id, new_volume_id)
  5207. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5208. context, old_volume_id, instance.uuid)
  5209. connector = self.driver.get_volume_connector(instance)
  5210. resize_to = 0
  5211. old_volume = self.volume_api.get(context, old_volume_id)
  5212. # Yes this is a tightly-coupled state check of what's going on inside
  5213. # cinder, but we need this while we still support old (v1/v2) and
  5214. # new style attachments (v3.44). Once we drop support for old style
  5215. # attachments we could think about cleaning up the cinder-initiated
  5216. # swap volume API flows.
  5217. is_cinder_migration = False
  5218. if 'migration_status' in old_volume:
  5219. is_cinder_migration = old_volume['migration_status'] == 'migrating'
  5220. old_vol_size = old_volume['size']
  5221. new_volume = self.volume_api.get(context, new_volume_id)
  5222. new_vol_size = new_volume['size']
  5223. if new_vol_size > old_vol_size:
  5224. resize_to = new_vol_size
  5225. LOG.info('Swapping volume %(old_volume)s for %(new_volume)s',
  5226. {'old_volume': old_volume_id, 'new_volume': new_volume_id},
  5227. instance=instance)
  5228. comp_ret, new_cinfo = self._swap_volume(context,
  5229. instance,
  5230. bdm,
  5231. connector,
  5232. old_volume_id,
  5233. new_volume,
  5234. resize_to,
  5235. new_attachment_id,
  5236. is_cinder_migration)
  5237. # NOTE(lyarwood): Update the BDM with the modified new_cinfo and
  5238. # correct volume_id returned by Cinder.
  5239. save_volume_id = comp_ret['save_volume_id']
  5240. new_cinfo['serial'] = save_volume_id
  5241. values = {
  5242. 'connection_info': jsonutils.dumps(new_cinfo),
  5243. 'source_type': 'volume',
  5244. 'destination_type': 'volume',
  5245. 'snapshot_id': None,
  5246. 'volume_id': save_volume_id,
  5247. 'no_device': None}
  5248. if resize_to:
  5249. values['volume_size'] = resize_to
  5250. if new_attachment_id is not None:
  5251. # This was a volume swap for a new-style attachment so we
  5252. # need to update the BDM attachment_id for the new attachment.
  5253. values['attachment_id'] = new_attachment_id
  5254. LOG.debug("swap_volume: Updating volume %(volume_id)s BDM record with "
  5255. "%(updates)s", {'volume_id': bdm.volume_id,
  5256. 'updates': values},
  5257. instance=instance)
  5258. bdm.update(values)
  5259. bdm.save()
  5260. compute_utils.notify_about_volume_swap(
  5261. context, instance, self.host,
  5262. fields.NotificationAction.VOLUME_SWAP,
  5263. fields.NotificationPhase.END,
  5264. old_volume_id, new_volume_id)
  5265. @wrap_exception()
  5266. def remove_volume_connection(self, context, volume_id, instance):
  5267. """Remove the volume connection on this host
  5268. Detach the volume from this instance on this host, and if this is
  5269. the cinder v2 flow, call cinder to terminate the connection.
  5270. """
  5271. try:
  5272. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5273. context, volume_id, instance.uuid)
  5274. driver_bdm = driver_block_device.convert_volume(bdm)
  5275. driver_bdm.driver_detach(context, instance,
  5276. self.volume_api, self.driver)
  5277. if bdm.attachment_id is None:
  5278. # cinder v2 api flow
  5279. connector = self.driver.get_volume_connector(instance)
  5280. self.volume_api.terminate_connection(context, volume_id,
  5281. connector)
  5282. except exception.NotFound:
  5283. pass
  5284. @wrap_exception()
  5285. @wrap_instance_event(prefix='compute')
  5286. @wrap_instance_fault
  5287. def attach_interface(self, context, instance, network_id, port_id,
  5288. requested_ip, tag):
  5289. """Use hotplug to add an network adapter to an instance."""
  5290. if not self.driver.capabilities['supports_attach_interface']:
  5291. raise exception.AttachInterfaceNotSupported(
  5292. instance_uuid=instance.uuid)
  5293. if (tag and not
  5294. self.driver.capabilities.get('supports_tagged_attach_interface',
  5295. False)):
  5296. raise exception.NetworkInterfaceTaggedAttachNotSupported()
  5297. compute_utils.notify_about_instance_action(
  5298. context, instance, self.host,
  5299. action=fields.NotificationAction.INTERFACE_ATTACH,
  5300. phase=fields.NotificationPhase.START)
  5301. bind_host_id = self.driver.network_binding_host_id(context, instance)
  5302. network_info = self.network_api.allocate_port_for_instance(
  5303. context, instance, port_id, network_id, requested_ip,
  5304. bind_host_id=bind_host_id, tag=tag)
  5305. if len(network_info) != 1:
  5306. LOG.error('allocate_port_for_instance returned %(ports)s '
  5307. 'ports', {'ports': len(network_info)})
  5308. # TODO(elod.illes): an instance.interface_attach.error notification
  5309. # should be sent here
  5310. raise exception.InterfaceAttachFailed(
  5311. instance_uuid=instance.uuid)
  5312. image_meta = objects.ImageMeta.from_instance(instance)
  5313. try:
  5314. self.driver.attach_interface(context, instance, image_meta,
  5315. network_info[0])
  5316. except exception.NovaException as ex:
  5317. port_id = network_info[0].get('id')
  5318. LOG.warning("attach interface failed , try to deallocate "
  5319. "port %(port_id)s, reason: %(msg)s",
  5320. {'port_id': port_id, 'msg': ex},
  5321. instance=instance)
  5322. try:
  5323. self.network_api.deallocate_port_for_instance(
  5324. context, instance, port_id)
  5325. except Exception:
  5326. LOG.warning("deallocate port %(port_id)s failed",
  5327. {'port_id': port_id}, instance=instance)
  5328. compute_utils.notify_about_instance_action(
  5329. context, instance, self.host,
  5330. action=fields.NotificationAction.INTERFACE_ATTACH,
  5331. phase=fields.NotificationPhase.ERROR,
  5332. exception=ex)
  5333. raise exception.InterfaceAttachFailed(
  5334. instance_uuid=instance.uuid)
  5335. compute_utils.notify_about_instance_action(
  5336. context, instance, self.host,
  5337. action=fields.NotificationAction.INTERFACE_ATTACH,
  5338. phase=fields.NotificationPhase.END)
  5339. return network_info[0]
  5340. @wrap_exception()
  5341. @wrap_instance_event(prefix='compute')
  5342. @wrap_instance_fault
  5343. def detach_interface(self, context, instance, port_id):
  5344. """Detach a network adapter from an instance."""
  5345. network_info = instance.info_cache.network_info
  5346. condemned = None
  5347. for vif in network_info:
  5348. if vif['id'] == port_id:
  5349. condemned = vif
  5350. break
  5351. if condemned is None:
  5352. raise exception.PortNotFound(_("Port %s is not "
  5353. "attached") % port_id)
  5354. compute_utils.notify_about_instance_action(
  5355. context, instance, self.host,
  5356. action=fields.NotificationAction.INTERFACE_DETACH,
  5357. phase=fields.NotificationPhase.START)
  5358. try:
  5359. self.driver.detach_interface(context, instance, condemned)
  5360. except exception.NovaException as ex:
  5361. # If the instance was deleted before the interface was detached,
  5362. # just log it at debug.
  5363. log_level = (logging.DEBUG
  5364. if isinstance(ex, exception.InstanceNotFound)
  5365. else logging.WARNING)
  5366. LOG.log(log_level,
  5367. "Detach interface failed, port_id=%(port_id)s, reason: "
  5368. "%(msg)s", {'port_id': port_id, 'msg': ex},
  5369. instance=instance)
  5370. raise exception.InterfaceDetachFailed(instance_uuid=instance.uuid)
  5371. else:
  5372. try:
  5373. self.network_api.deallocate_port_for_instance(
  5374. context, instance, port_id)
  5375. except Exception as ex:
  5376. with excutils.save_and_reraise_exception():
  5377. # Since this is a cast operation, log the failure for
  5378. # triage.
  5379. LOG.warning('Failed to deallocate port %(port_id)s '
  5380. 'for instance. Error: %(error)s',
  5381. {'port_id': port_id, 'error': ex},
  5382. instance=instance)
  5383. compute_utils.notify_about_instance_action(
  5384. context, instance, self.host,
  5385. action=fields.NotificationAction.INTERFACE_DETACH,
  5386. phase=fields.NotificationPhase.END)
  5387. def _get_compute_info(self, context, host):
  5388. return objects.ComputeNode.get_first_node_by_host_for_old_compat(
  5389. context, host)
  5390. @wrap_exception()
  5391. def check_instance_shared_storage(self, ctxt, instance, data):
  5392. """Check if the instance files are shared
  5393. :param ctxt: security context
  5394. :param instance: dict of instance data
  5395. :param data: result of driver.check_instance_shared_storage_local
5396. Returns True if the instance disks are located on shared storage and
5397. False otherwise.
  5398. """
  5399. return self.driver.check_instance_shared_storage_remote(ctxt, data)
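# Added note: with the libvirt driver, for example, the source host drops a
# temporary file in the instance directory
# (check_instance_shared_storage_local) and this remote check simply looks
# for that file, so finding it implies the directory is on shared storage.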
  5400. @wrap_exception()
  5401. @wrap_instance_event(prefix='compute')
  5402. @wrap_instance_fault
  5403. def check_can_live_migrate_destination(self, ctxt, instance,
  5404. block_migration, disk_over_commit):
  5405. """Check if it is possible to execute live migration.
  5406. This runs checks on the destination host, and then calls
  5407. back to the source host to check the results.
5408. :param ctxt: security context
  5409. :param instance: dict of instance data
  5410. :param block_migration: if true, prepare for block migration
  5411. if None, calculate it in driver
  5412. :param disk_over_commit: if true, allow disk over commit
  5413. if None, ignore disk usage checking
  5414. :returns: a dict containing migration info
  5415. """
  5416. return self._do_check_can_live_migrate_destination(ctxt, instance,
  5417. block_migration,
  5418. disk_over_commit)
  5419. def _do_check_can_live_migrate_destination(self, ctxt, instance,
  5420. block_migration,
  5421. disk_over_commit):
  5422. src_compute_info = obj_base.obj_to_primitive(
  5423. self._get_compute_info(ctxt, instance.host))
  5424. dst_compute_info = obj_base.obj_to_primitive(
  5425. self._get_compute_info(ctxt, CONF.host))
  5426. dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
  5427. instance, src_compute_info, dst_compute_info,
  5428. block_migration, disk_over_commit)
  5429. LOG.debug('destination check data is %s', dest_check_data)
  5430. try:
  5431. migrate_data = self.compute_rpcapi.\
  5432. check_can_live_migrate_source(ctxt, instance,
  5433. dest_check_data)
  5434. finally:
  5435. self.driver.cleanup_live_migration_destination_check(ctxt,
  5436. dest_check_data)
  5437. return migrate_data
  5438. @wrap_exception()
  5439. @wrap_instance_event(prefix='compute')
  5440. @wrap_instance_fault
  5441. def check_can_live_migrate_source(self, ctxt, instance, dest_check_data):
  5442. """Check if it is possible to execute live migration.
  5443. This checks if the live migration can succeed, based on the
  5444. results from check_can_live_migrate_destination.
  5445. :param ctxt: security context
  5446. :param instance: dict of instance data
  5447. :param dest_check_data: result of check_can_live_migrate_destination
  5448. :returns: a dict containing migration info
  5449. """
  5450. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5451. ctxt, instance.uuid)
  5452. is_volume_backed = compute_utils.is_volume_backed_instance(
  5453. ctxt, instance, bdms)
  5454. dest_check_data.is_volume_backed = is_volume_backed
  5455. block_device_info = self._get_instance_block_device_info(
  5456. ctxt, instance, refresh_conn_info=False, bdms=bdms)
  5457. result = self.driver.check_can_live_migrate_source(ctxt, instance,
  5458. dest_check_data,
  5459. block_device_info)
  5460. LOG.debug('source check data is %s', result)
  5461. return result
  5462. @wrap_exception()
  5463. @wrap_instance_event(prefix='compute')
  5464. @wrap_instance_fault
  5465. def pre_live_migration(self, context, instance, block_migration, disk,
  5466. migrate_data):
  5467. """Preparations for live migration at dest host.
  5468. :param context: security context
  5469. :param instance: dict of instance data
  5470. :param block_migration: if true, prepare for block migration
  5471. :param disk: disk info of instance
  5472. :param migrate_data: A dict or LiveMigrateData object holding data
  5473. required for live migration without shared
  5474. storage.
  5475. :returns: migrate_data containing additional migration info
  5476. """
  5477. LOG.debug('pre_live_migration data is %s', migrate_data)
  5478. migrate_data.old_vol_attachment_ids = {}
  5479. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5480. context, instance.uuid)
  5481. network_info = self.network_api.get_instance_nw_info(context, instance)
  5482. self._notify_about_instance_usage(
  5483. context, instance, "live_migration.pre.start",
  5484. network_info=network_info)
  5485. compute_utils.notify_about_instance_action(
  5486. context, instance, self.host,
  5487. action=fields.NotificationAction.LIVE_MIGRATION_PRE,
  5488. phase=fields.NotificationPhase.START, bdms=bdms)
  5489. connector = self.driver.get_volume_connector(instance)
  5490. try:
  5491. for bdm in bdms:
  5492. if bdm.is_volume and bdm.attachment_id is not None:
  5493. # This bdm uses the new cinder v3.44 API.
  5494. # We will create a new attachment for this
  5495. # volume on this migration destination host. The old
  5496. # attachment will be deleted on the source host
  5497. # when the migration succeeds. The old attachment_id
  5498. # is stored in dict with the key being the bdm.volume_id
  5499. # so it can be restored on rollback.
  5500. #
  5501. # Also note that attachment_update is not needed as we
  5502. # are providing the connector in the create call.
  5503. attach_ref = self.volume_api.attachment_create(
  5504. context, bdm.volume_id, bdm.instance_uuid,
  5505. connector=connector, mountpoint=bdm.device_name)
  5506. # save current attachment so we can detach it on success,
  5507. # or restore it on a rollback.
  5508. # NOTE(mdbooth): This data is no longer used by the source
  5509. # host since change I0390c9ff. We can't remove it until we
  5510. # are sure the source host has been upgraded.
  5511. migrate_data.old_vol_attachment_ids[bdm.volume_id] = \
  5512. bdm.attachment_id
  5513. # update the bdm with the new attachment_id.
  5514. bdm.attachment_id = attach_ref['id']
  5515. bdm.save()
  5516. block_device_info = self._get_instance_block_device_info(
  5517. context, instance, refresh_conn_info=True,
  5518. bdms=bdms)
  5519. migrate_data = self.driver.pre_live_migration(context,
  5520. instance,
  5521. block_device_info,
  5522. network_info,
  5523. disk,
  5524. migrate_data)
  5525. LOG.debug('driver pre_live_migration data is %s', migrate_data)
  5526. # NOTE(tr3buchet): setup networks on destination host
  5527. self.network_api.setup_networks_on_host(context, instance,
  5528. self.host)
  5529. # Creating filters to hypervisors and firewalls.
  5530. # An example is that nova-instance-instance-xxx,
  5531. # which is written to libvirt.xml(Check "virsh nwfilter-list")
  5532. # This nwfilter is necessary on the destination host.
  5533. # In addition, this method is creating filtering rule
  5534. # onto destination host.
  5535. self.driver.ensure_filtering_rules_for_instance(instance,
  5536. network_info)
  5537. except Exception:
  5538. # If we raise, migrate_data with the updated attachment ids
  5539. # will not be returned to the source host for rollback.
  5540. # So we need to rollback new attachments here.
  5541. with excutils.save_and_reraise_exception():
  5542. old_attachments = migrate_data.old_vol_attachment_ids
  5543. for bdm in bdms:
  5544. if (bdm.is_volume and bdm.attachment_id is not None and
  5545. bdm.volume_id in old_attachments):
  5546. self.volume_api.attachment_delete(context,
  5547. bdm.attachment_id)
  5548. bdm.attachment_id = old_attachments[bdm.volume_id]
  5549. bdm.save()
  5550. # Volume connections are complete, tell cinder that all the
  5551. # attachments have completed.
  5552. for bdm in bdms:
  5553. if bdm.is_volume and bdm.attachment_id is not None:
  5554. self.volume_api.attachment_complete(context,
  5555. bdm.attachment_id)
  5556. self._notify_about_instance_usage(
  5557. context, instance, "live_migration.pre.end",
  5558. network_info=network_info)
  5559. compute_utils.notify_about_instance_action(
  5560. context, instance, self.host,
  5561. action=fields.NotificationAction.LIVE_MIGRATION_PRE,
  5562. phase=fields.NotificationPhase.END, bdms=bdms)
  5563. LOG.debug('pre_live_migration result data is %s', migrate_data)
  5564. return migrate_data
  5565. @staticmethod
  5566. def _neutron_failed_live_migration_callback(event_name, instance):
  5567. msg = ('Neutron reported failure during live migration '
  5568. 'with %(event)s for instance %(uuid)s')
  5569. msg_args = {'event': event_name, 'uuid': instance.uuid}
  5570. if CONF.vif_plugging_is_fatal:
  5571. raise exception.VirtualInterfacePlugException(msg % msg_args)
  5572. LOG.error(msg, msg_args)
  5573. @staticmethod
  5574. def _get_neutron_events_for_live_migration(instance):
  5575. # We don't generate events if CONF.vif_plugging_timeout=0
  5576. # or if waiting during live migration is disabled,
  5577. # meaning that the operator disabled using them.
  5578. if (CONF.vif_plugging_timeout and utils.is_neutron() and
  5579. CONF.compute.live_migration_wait_for_vif_plug):
  5580. return [('network-vif-plugged', vif['id'])
  5581. for vif in instance.get_network_info()]
  5582. else:
  5583. return []
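# Illustrative result when events are enabled: a list such as
# [('network-vif-plugged', '<port uuid>'), ...] with one entry per VIF,
# which wait_for_instance_event() in _do_live_migration uses to block
# until Neutron confirms plugging on the destination host.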
  5584. def _cleanup_pre_live_migration(self, context, dest, instance,
  5585. migration, migrate_data, source_bdms):
  5586. """Helper method for when pre_live_migration fails
  5587. Sets the migration status to "error" and rolls back the live migration
  5588. setup on the destination host.
  5589. :param context: The user request context.
  5590. :type context: nova.context.RequestContext
  5591. :param dest: The live migration destination hostname.
  5592. :type dest: str
  5593. :param instance: The instance being live migrated.
  5594. :type instance: nova.objects.Instance
  5595. :param migration: The migration record tracking this live migration.
  5596. :type migration: nova.objects.Migration
  5597. :param migrate_data: Data about the live migration, populated from
  5598. the destination host.
  5599. :type migrate_data: Subclass of nova.objects.LiveMigrateData
  5600. :param source_bdms: BDMs prior to modification by the destination
  5601. compute host. Set by _do_live_migration and not
  5602. part of the callback interface, so this is never
  5603. None
  5604. """
  5605. self._set_migration_status(migration, 'error')
  5606. # Make sure we set this for _rollback_live_migration()
  5607. # so it can find it, as expected if it was called later
  5608. migrate_data.migration = migration
  5609. self._rollback_live_migration(context, instance, dest,
  5610. migrate_data=migrate_data,
  5611. source_bdms=source_bdms)
  5612. def _do_live_migration(self, context, dest, instance, block_migration,
  5613. migration, migrate_data):
  5614. # NOTE(danms): We should enhance the RT to account for migrations
  5615. # and use the status field to denote when the accounting has been
  5616. # done on source/destination. For now, this is just here for status
  5617. # reporting
  5618. self._set_migration_status(migration, 'preparing')
  5619. source_bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5620. context, instance.uuid)
  5621. events = self._get_neutron_events_for_live_migration(instance)
  5622. try:
  5623. if ('block_migration' in migrate_data and
  5624. migrate_data.block_migration):
  5625. block_device_info = self._get_instance_block_device_info(
  5626. context, instance, bdms=source_bdms)
  5627. disk = self.driver.get_instance_disk_info(
  5628. instance, block_device_info=block_device_info)
  5629. else:
  5630. disk = None
  5631. deadline = CONF.vif_plugging_timeout
  5632. error_cb = self._neutron_failed_live_migration_callback
  5633. # In order to avoid a race with the vif plugging that the virt
  5634. # driver does on the destination host, we register our events
  5635. # to wait for before calling pre_live_migration.
  5636. with self.virtapi.wait_for_instance_event(
  5637. instance, events, deadline=deadline,
  5638. error_callback=error_cb):
  5639. migrate_data = self.compute_rpcapi.pre_live_migration(
  5640. context, instance,
  5641. block_migration, disk, dest, migrate_data)
  5642. except exception.VirtualInterfacePlugException:
  5643. with excutils.save_and_reraise_exception():
  5644. LOG.exception('Failed waiting for network virtual interfaces '
  5645. 'to be plugged on the destination host %s.',
  5646. dest, instance=instance)
  5647. self._cleanup_pre_live_migration(
  5648. context, dest, instance, migration, migrate_data,
  5649. source_bdms)
  5650. except eventlet.timeout.Timeout:
  5651. msg = 'Timed out waiting for events: %s'
  5652. LOG.warning(msg, events, instance=instance)
  5653. if CONF.vif_plugging_is_fatal:
  5654. self._cleanup_pre_live_migration(
  5655. context, dest, instance, migration, migrate_data,
  5656. source_bdms)
  5657. raise exception.MigrationError(reason=msg % events)
  5658. except Exception:
  5659. with excutils.save_and_reraise_exception():
  5660. LOG.exception('Pre live migration failed at %s',
  5661. dest, instance=instance)
  5662. self._cleanup_pre_live_migration(
  5663. context, dest, instance, migration, migrate_data,
  5664. source_bdms)
  5665. self._set_migration_status(migration, 'running')
  5666. if migrate_data:
  5667. migrate_data.migration = migration
  5668. # NOTE(mdbooth): pre_live_migration will update connection_info and
  5669. # attachment_id on all volume BDMS to reflect the new destination
  5670. # host attachment. We fetch BDMs before that to retain connection_info
  5671. # and attachment_id relating to the source host for post migration
  5672. # cleanup.
  5673. post_live_migration = functools.partial(self._post_live_migration,
  5674. source_bdms=source_bdms)
  5675. rollback_live_migration = functools.partial(
  5676. self._rollback_live_migration, source_bdms=source_bdms)
  5677. LOG.debug('live_migration data is %s', migrate_data)
  5678. try:
  5679. self.driver.live_migration(context, instance, dest,
  5680. post_live_migration,
  5681. rollback_live_migration,
  5682. block_migration, migrate_data)
  5683. except Exception:
  5684. LOG.exception('Live migration failed.', instance=instance)
  5685. with excutils.save_and_reraise_exception():
  5686. # Put instance and migration into error state,
5687. # as it's almost certainly too late to roll back
  5688. self._set_migration_status(migration, 'error')
  5689. # first refresh instance as it may have got updated by
  5690. # post_live_migration_at_destination
  5691. instance.refresh()
  5692. self._set_instance_obj_error_state(context, instance,
  5693. clean_task_state=True)
  5694. @wrap_exception()
  5695. @wrap_instance_event(prefix='compute')
  5696. @wrap_instance_fault
  5697. def live_migration(self, context, dest, instance, block_migration,
  5698. migration, migrate_data):
  5699. """Executing live migration.
  5700. :param context: security context
  5701. :param dest: destination host
  5702. :param instance: a nova.objects.instance.Instance object
  5703. :param block_migration: if true, prepare for block migration
  5704. :param migration: an nova.objects.Migration object
  5705. :param migrate_data: implementation specific params
  5706. """
  5707. self._set_migration_status(migration, 'queued')
  5708. def dispatch_live_migration(*args, **kwargs):
  5709. with self._live_migration_semaphore:
  5710. self._do_live_migration(*args, **kwargs)
  5711. # NOTE(danms): We spawn here to return the RPC worker thread back to
  5712. # the pool. Since what follows could take a really long time, we don't
  5713. # want to tie up RPC workers.
  5714. utils.spawn_n(dispatch_live_migration,
  5715. context, dest, instance,
  5716. block_migration, migration,
  5717. migrate_data)
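# Added note: _live_migration_semaphore bounds how many of these spawned
# migrations run at once on this host; it is sized from the
# max_concurrent_live_migrations option (unlimited when that option is 0).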
  5718. @wrap_exception()
  5719. @wrap_instance_event(prefix='compute')
  5720. @wrap_instance_fault
  5721. def live_migration_force_complete(self, context, instance):
  5722. """Force live migration to complete.
  5723. :param context: Security context
  5724. :param instance: The instance that is being migrated
  5725. """
  5726. self._notify_about_instance_usage(
  5727. context, instance, 'live.migration.force.complete.start')
  5728. self.driver.live_migration_force_complete(instance)
  5729. self._notify_about_instance_usage(
  5730. context, instance, 'live.migration.force.complete.end')
  5731. @wrap_exception()
  5732. @wrap_instance_event(prefix='compute')
  5733. @wrap_instance_fault
  5734. def live_migration_abort(self, context, instance, migration_id):
  5735. """Abort an in-progress live migration.
  5736. :param context: Security context
  5737. :param instance: The instance that is being migrated
  5738. :param migration_id: ID of in-progress live migration
  5739. """
  5740. migration = objects.Migration.get_by_id(context, migration_id)
  5741. if migration.status != 'running':
  5742. raise exception.InvalidMigrationState(migration_id=migration_id,
  5743. instance_uuid=instance.uuid,
  5744. state=migration.status,
  5745. method='abort live migration')
  5746. self._notify_about_instance_usage(
  5747. context, instance, 'live.migration.abort.start')
  5748. compute_utils.notify_about_instance_action(
  5749. context, instance, self.host,
  5750. action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
  5751. phase=fields.NotificationPhase.START)
  5752. self.driver.live_migration_abort(instance)
  5753. self._notify_about_instance_usage(
  5754. context, instance, 'live.migration.abort.end')
  5755. compute_utils.notify_about_instance_action(
  5756. context, instance, self.host,
  5757. action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
  5758. phase=fields.NotificationPhase.END)
  5759. def _live_migration_cleanup_flags(self, migrate_data):
  5760. """Determine whether disks or instance path need to be cleaned up after
  5761. live migration (at source on success, at destination on rollback)
5762. Block migration needs an empty image at the destination host before the
5763. migration starts, so if any failure occurs, any empty images have to be deleted.
5764. Also, volume-backed live migration w/o shared storage needs to delete the
5765. newly created instance-xxx dir on the destination as part of its
5766. rollback process.
  5767. :param migrate_data: implementation specific data
  5768. :returns: (bool, bool) -- do_cleanup, destroy_disks
  5769. """
  5770. # NOTE(pkoniszewski): block migration specific params are set inside
  5771. # migrate_data objects for drivers that expose block live migration
  5772. # information (i.e. Libvirt, Xenapi and HyperV). For other drivers
  5773. # cleanup is not needed.
  5774. is_shared_block_storage = True
  5775. is_shared_instance_path = True
  5776. if isinstance(migrate_data, migrate_data_obj.LibvirtLiveMigrateData):
  5777. is_shared_block_storage = migrate_data.is_shared_block_storage
  5778. is_shared_instance_path = migrate_data.is_shared_instance_path
  5779. elif isinstance(migrate_data, migrate_data_obj.XenapiLiveMigrateData):
  5780. is_shared_block_storage = not migrate_data.block_migration
  5781. is_shared_instance_path = not migrate_data.block_migration
  5782. elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
  5783. is_shared_instance_path = migrate_data.is_shared_instance_path
  5784. is_shared_block_storage = migrate_data.is_shared_instance_path
5785. # No instance is booting at the source host, but the instance dir
5786. # must be deleted to prepare for the next block migration or
5787. # live migration w/o shared storage.
  5788. do_cleanup = not is_shared_instance_path
  5789. destroy_disks = not is_shared_block_storage
  5790. return (do_cleanup, destroy_disks)
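# Summarising the logic above: fully shared storage yields (False, False),
# i.e. nothing to clean up, while a block migration with neither shared
# instance path nor shared block storage yields (True, True).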
  5791. @wrap_exception()
  5792. @wrap_instance_fault
  5793. def _post_live_migration(self, ctxt, instance, dest,
  5794. block_migration=False, migrate_data=None,
  5795. source_bdms=None):
  5796. """Post operations for live migration.
  5797. This method is called from live_migration
5798. and mainly updates the database record.
  5799. :param ctxt: security context
  5800. :param instance: instance dict
  5801. :param dest: destination host
  5802. :param block_migration: if true, prepare for block migration
5803. :param migrate_data: if not None, it is a dict which has data
5804. required for live migration without shared storage
5805. :param source_bdms: BDMs prior to modification by the destination
5806. compute host. Set by _do_live_migration and not
5807. part of the callback interface, so this is never
5808. None
  5809. """
5810. LOG.info('_post_live_migration() started.',
  5811. instance=instance)
  5812. # Cleanup source host post live-migration
  5813. block_device_info = self._get_instance_block_device_info(
  5814. ctxt, instance, bdms=source_bdms)
  5815. self.driver.post_live_migration(ctxt, instance, block_device_info,
  5816. migrate_data)
  5817. # Detaching volumes.
  5818. connector = self.driver.get_volume_connector(instance)
  5819. for bdm in source_bdms:
  5820. if bdm.is_volume:
  5821. # Detaching volumes is a call to an external API that can fail.
  5822. # If it does, we need to handle it gracefully so that the call
  5823. # to post_live_migration_at_destination - where we set instance
  5824. # host and task state - still happens. We need to rethink the
  5825. # current approach of setting instance host and task state
  5826. # AFTER a whole bunch of things that could fail in unhandled
  5827. # ways, but that is left as a TODO(artom).
  5828. try:
  5829. if bdm.attachment_id is None:
  5830. # Prior to cinder v3.44:
  5831. # We don't want to actually mark the volume detached,
  5832. # or delete the bdm, just remove the connection from
  5833. # this host.
  5834. #
  5835. # remove the volume connection without detaching from
  5836. # hypervisor because the instance is not running
  5837. # anymore on the current host
  5838. self.volume_api.terminate_connection(ctxt,
  5839. bdm.volume_id,
  5840. connector)
  5841. else:
  5842. # cinder v3.44 api flow - delete the old attachment
  5843. # for the source host
  5844. self.volume_api.attachment_delete(ctxt,
  5845. bdm.attachment_id)
  5846. except Exception as e:
  5847. if bdm.attachment_id is None:
  5848. LOG.error('Connection for volume %s not terminated on '
  5849. 'source host %s during post_live_migration: '
  5850. '%s', bdm.volume_id, self.host,
  5851. six.text_type(e), instance=instance)
  5852. else:
  5853. LOG.error('Volume attachment %s not deleted on source '
  5854. 'host %s during post_live_migration: %s',
  5855. bdm.attachment_id, self.host,
  5856. six.text_type(e), instance=instance)
  5857. # Releasing vlan.
  5858. # (not necessary in current implementation?)
  5859. network_info = self.network_api.get_instance_nw_info(ctxt, instance)
  5860. self._notify_about_instance_usage(ctxt, instance,
  5861. "live_migration._post.start",
  5862. network_info=network_info)
  5863. # Releasing security group ingress rule.
  5864. LOG.debug('Calling driver.unfilter_instance from _post_live_migration',
  5865. instance=instance)
  5866. self.driver.unfilter_instance(instance,
  5867. network_info)
  5868. migration = {'source_compute': self.host,
  5869. 'dest_compute': dest, }
  5870. self.network_api.migrate_instance_start(ctxt,
  5871. instance,
  5872. migration)
  5873. destroy_vifs = False
  5874. try:
  5875. self.driver.post_live_migration_at_source(ctxt, instance,
  5876. network_info)
  5877. except NotImplementedError as ex:
  5878. LOG.debug(ex, instance=instance)
  5879. # For all hypervisors other than libvirt, there is a possibility
  5880. # they are unplugging networks from source node in the cleanup
  5881. # method
  5882. destroy_vifs = True
  5883. # NOTE(danms): Save source node before calling post method on
  5884. # destination, which will update it
  5885. source_node = instance.node
  5886. # Define domain at destination host, without doing it,
  5887. # pause/suspend/terminate do not work.
  5888. post_at_dest_success = True
  5889. try:
  5890. self.compute_rpcapi.post_live_migration_at_destination(ctxt,
  5891. instance, block_migration, dest)
  5892. except Exception as error:
  5893. post_at_dest_success = False
  5894. # We don't want to break _post_live_migration() if
  5895. # post_live_migration_at_destination() fails as it should never
  5896. # affect cleaning up source node.
  5897. LOG.exception("Post live migration at destination %s failed",
  5898. dest, instance=instance, error=error)
  5899. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  5900. migrate_data)
  5901. if do_cleanup:
  5902. LOG.debug('Calling driver.cleanup from _post_live_migration',
  5903. instance=instance)
  5904. self.driver.cleanup(ctxt, instance, network_info,
  5905. destroy_disks=destroy_disks,
  5906. migrate_data=migrate_data,
  5907. destroy_vifs=destroy_vifs)
  5908. self.instance_events.clear_events_for_instance(instance)
  5909. # NOTE(timello): make sure we update available resources on source
  5910. # host even before next periodic task.
  5911. self.update_available_resource(ctxt)
  5912. self._update_scheduler_instance_info(ctxt, instance)
  5913. self._notify_about_instance_usage(ctxt, instance,
  5914. "live_migration._post.end",
  5915. network_info=network_info)
  5916. if post_at_dest_success:
  5917. LOG.info('Migrating instance to %s finished successfully.',
  5918. dest, instance=instance)
  5919. if migrate_data and migrate_data.obj_attr_is_set('migration'):
  5920. migrate_data.migration.status = 'completed'
  5921. migrate_data.migration.save()
  5922. migration = migrate_data.migration
  5923. rc = self.scheduler_client.reportclient
  5924. # Check to see if our migration has its own allocations
  5925. allocs = rc.get_allocations_for_consumer(ctxt, migration.uuid)
  5926. else:
  5927. # We didn't have data on a migration, which means we can't
  5928. # look up to see if we had new-style migration-based
  5929. # allocations. This should really only happen in cases of
  5930. # a buggy virt driver or some really old component in the
  5931. # system. Log a warning so we know it happened.
  5932. allocs = None
  5933. LOG.warning('Live migration ended with no migrate_data '
  5934. 'record. Unable to clean up migration-based '
  5935. 'allocations which is almost certainly not '
  5936. 'an expected situation.')
  5937. if allocs:
  5938. # We had a migration-based allocation that we need to handle
  5939. self._delete_allocation_after_move(ctxt,
  5940. instance,
  5941. migrate_data.migration,
  5942. instance.flavor,
  5943. source_node)
  5944. else:
  5945. # No migration-based allocations, so do the old thing and
  5946. # attempt to clean up any doubled per-instance allocation
  5947. rt = self._get_resource_tracker()
  5948. rt.delete_allocation_for_migrated_instance(
  5949. ctxt, instance, source_node)
  5950. def _consoles_enabled(self):
  5951. """Returns whether a console is enable."""
  5952. return (CONF.vnc.enabled or CONF.spice.enabled or
  5953. CONF.rdp.enabled or CONF.serial_console.enabled or
  5954. CONF.mks.enabled)
  5955. @wrap_exception()
  5956. @wrap_instance_event(prefix='compute')
  5957. @wrap_instance_fault
  5958. def post_live_migration_at_destination(self, context, instance,
  5959. block_migration):
  5960. """Post operations for live migration .
  5961. :param context: security context
  5962. :param instance: Instance dict
  5963. :param block_migration: if true, prepare for block migration
  5964. """
  5965. LOG.info('Post operation of migration started',
  5966. instance=instance)
  5967. # NOTE(tr3buchet): setup networks on destination host
  5968. # this is called a second time because
  5969. # multi_host does not create the bridge in
  5970. # plug_vifs
  5971. self.network_api.setup_networks_on_host(context, instance,
  5972. self.host)
  5973. migration = {'source_compute': instance.host,
  5974. 'dest_compute': self.host, }
  5975. self.network_api.migrate_instance_finish(context,
  5976. instance,
  5977. migration)
  5978. network_info = self.network_api.get_instance_nw_info(context, instance)
  5979. self._notify_about_instance_usage(
  5980. context, instance, "live_migration.post.dest.start",
  5981. network_info=network_info)
  5982. block_device_info = self._get_instance_block_device_info(context,
  5983. instance)
  5984. try:
  5985. self.driver.post_live_migration_at_destination(
  5986. context, instance, network_info, block_migration,
  5987. block_device_info)
  5988. except Exception:
  5989. with excutils.save_and_reraise_exception():
  5990. instance.vm_state = vm_states.ERROR
  5991. LOG.error('Unexpected error during post live migration at '
  5992. 'destination host.', instance=instance)
  5993. finally:
  5994. # Restore instance state and update host
  5995. current_power_state = self._get_power_state(context, instance)
  5996. node_name = None
  5997. prev_host = instance.host
  5998. try:
  5999. compute_node = self._get_compute_info(context, self.host)
  6000. node_name = compute_node.hypervisor_hostname
  6001. except exception.ComputeHostNotFound:
  6002. LOG.exception('Failed to get compute_info for %s', self.host)
  6003. finally:
  6004. instance.host = self.host
  6005. instance.power_state = current_power_state
  6006. instance.task_state = None
  6007. instance.node = node_name
  6008. instance.progress = 0
  6009. instance.save(expected_task_state=task_states.MIGRATING)
  6010. # NOTE(tr3buchet): tear down networks on source host
  6011. self.network_api.setup_networks_on_host(context, instance,
  6012. prev_host, teardown=True)
  6013. # NOTE(vish): this is necessary to update dhcp
  6014. self.network_api.setup_networks_on_host(context, instance, self.host)
  6015. self._notify_about_instance_usage(
  6016. context, instance, "live_migration.post.dest.end",
  6017. network_info=network_info)
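# ---------------------------------------------------------------------------
# Illustrative sketch (standalone, not Nova code): the expected_task_state
# guard used by instance.save() above behaves like a compare-and-swap.  The
# update is only applied if the stored task_state still matches what the
# caller expects (MIGRATING here); otherwise the save raises, so a racing
# state change (for example a concurrent delete) is not silently clobbered.
# The class below is a toy in-memory stand-in for a DB-backed record.
class UnexpectedTaskStateError(Exception):
    pass


class FakeInstanceRecord(object):
    def __init__(self, task_state):
        self.task_state = task_state
        self.host = None

    def save(self, updates, expected_task_state=None):
        # Compare-and-swap: only apply the updates if the stored task_state
        # is still one of the expected values.
        if (expected_task_state is not None and
                self.task_state not in expected_task_state):
            raise UnexpectedTaskStateError(
                'expected %s, found %s' % (expected_task_state,
                                           self.task_state))
        for key, value in updates.items():
            setattr(self, key, value)


record = FakeInstanceRecord(task_state='migrating')
record.save({'host': 'dest-host', 'task_state': None},
            expected_task_state=['migrating'])
assert record.host == 'dest-host' and record.task_state is None
# ---------------------------------------------------------------------------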
  6018. @wrap_exception()
  6019. @wrap_instance_fault
  6020. def _rollback_live_migration(self, context, instance,
  6021. dest, migrate_data=None,
  6022. migration_status='error',
  6023. source_bdms=None):
  6024. """Recovers Instance/volume state from migrating -> running.
  6025. :param context: security context
  6026. :param instance: nova.objects.instance.Instance object
  6027. :param dest:
6028. This method is called on the live migration source host;
6029. this param specifies the destination host.
  6030. :param migrate_data:
  6031. if not none, contains implementation specific data.
  6032. :param migration_status:
  6033. Contains the status we want to set for the migration object
  6034. :param source_bdms: BDMs prior to modification by the destination
  6035. compute host. Set by _do_live_migration and not
  6036. part of the callback interface, so this is never
  6037. None
  6038. """
  6039. if (isinstance(migrate_data, migrate_data_obj.LiveMigrateData) and
  6040. migrate_data.obj_attr_is_set('migration')):
  6041. migration = migrate_data.migration
  6042. else:
  6043. migration = None
  6044. if migration:
  6045. # Remove allocations created in Placement for the dest node.
6046. # If migration is None, we must be so old we don't have placement,
6047. # so there is no need to do anything else.
  6048. self._revert_allocation(context, instance, migration)
  6049. else:
  6050. LOG.error('Unable to revert allocations during live migration '
  6051. 'rollback; compute driver did not provide migrate_data',
  6052. instance=instance)
  6053. instance.task_state = None
  6054. instance.progress = 0
  6055. instance.save(expected_task_state=[task_states.MIGRATING])
  6056. # NOTE(tr3buchet): setup networks on source host (really it's re-setup)
  6057. self.network_api.setup_networks_on_host(context, instance, self.host)
  6058. source_bdms_by_volid = {bdm.volume_id: bdm for bdm in source_bdms
  6059. if bdm.is_volume}
  6060. # NOTE(lyarwood): Fetch the current list of BDMs and delete any volume
  6061. # attachments used by the destination host before rolling back to the
  6062. # original and still valid source host volume attachments.
  6063. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  6064. context, instance.uuid)
  6065. for bdm in bdms:
  6066. if bdm.is_volume:
  6067. # remove the connection on the destination host
  6068. # NOTE(lyarwood): This actually calls the cinderv2
  6069. # os-terminate_connection API if required.
  6070. self.compute_rpcapi.remove_volume_connection(
  6071. context, instance, bdm.volume_id, dest)
  6072. if bdm.attachment_id:
  6073. # 3.44 cinder api flow. Set the bdm's
  6074. # attachment_id to the old attachment of the source
  6075. # host. If old_attachments is not there, then
  6076. # there was an error before the new attachment was made.
  6077. # TODO(lyarwood): migrate_data.old_vol_attachment_ids can
  6078. # be removed now as we can lookup the original
  6079. # attachment_ids from the source_bdms list here.
  6080. old_attachments = migrate_data.old_vol_attachment_ids \
  6081. if 'old_vol_attachment_ids' in migrate_data else None
  6082. if old_attachments and bdm.volume_id in old_attachments:
  6083. self.volume_api.attachment_delete(context,
  6084. bdm.attachment_id)
  6085. bdm.attachment_id = old_attachments[bdm.volume_id]
  6086. # NOTE(lyarwood): Rollback the connection_info stored within
  6087. # the BDM to that used by the source and not the destination.
  6088. source_bdm = source_bdms_by_volid[bdm.volume_id]
  6089. bdm.connection_info = source_bdm.connection_info
  6090. bdm.save()
  6091. self._notify_about_instance_usage(context, instance,
  6092. "live_migration._rollback.start")
  6093. compute_utils.notify_about_instance_action(context, instance,
  6094. self.host,
  6095. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
  6096. phase=fields.NotificationPhase.START,
  6097. bdms=bdms)
  6098. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  6099. migrate_data)
  6100. if do_cleanup:
  6101. self.compute_rpcapi.rollback_live_migration_at_destination(
  6102. context, instance, dest, destroy_disks=destroy_disks,
  6103. migrate_data=migrate_data)
  6104. elif utils.is_neutron():
  6105. # The port binding profiles need to be cleaned up.
  6106. with errors_out_migration_ctxt(migration):
  6107. try:
  6108. self.network_api.setup_networks_on_host(
  6109. context, instance, teardown=True)
  6110. except Exception:
  6111. with excutils.save_and_reraise_exception():
  6112. LOG.exception(
  6113. 'An error occurred while cleaning up networking '
  6114. 'during live migration rollback.',
  6115. instance=instance)
  6116. self._notify_about_instance_usage(context, instance,
  6117. "live_migration._rollback.end")
  6118. compute_utils.notify_about_instance_action(context, instance,
  6119. self.host,
  6120. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
  6121. phase=fields.NotificationPhase.END,
  6122. bdms=bdms)
  6123. self._set_migration_status(migration, migration_status)
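# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): the volume BDM rollback above in
# miniature.  Current BDMs, which may still point at destination-host
# attachments, are rolled back to the connection_info recorded in the
# source-host BDMs, keyed by volume_id.  Plain dicts stand in for the real
# BlockDeviceMapping objects.
def rollback_volume_bdms(current_bdms, source_bdms):
    """Restore source-host connection_info on every volume BDM."""
    source_by_volid = {bdm['volume_id']: bdm for bdm in source_bdms
                       if bdm.get('is_volume')}
    for bdm in current_bdms:
        if not bdm.get('is_volume'):
            continue
        # Roll the connection back to what the source host was using.
        source_bdm = source_by_volid[bdm['volume_id']]
        bdm['connection_info'] = source_bdm['connection_info']
    return current_bdms


current = [{'volume_id': 'vol-1', 'is_volume': True,
            'connection_info': {'host': 'dest'}}]
source = [{'volume_id': 'vol-1', 'is_volume': True,
           'connection_info': {'host': 'source'}}]
assert rollback_volume_bdms(current, source)[0]['connection_info'] == \
    {'host': 'source'}
# ---------------------------------------------------------------------------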
  6124. @wrap_exception()
  6125. @wrap_instance_event(prefix='compute')
  6126. @wrap_instance_fault
  6127. def rollback_live_migration_at_destination(self, context, instance,
  6128. destroy_disks,
  6129. migrate_data):
  6130. """Cleaning up image directory that is created pre_live_migration.
  6131. :param context: security context
  6132. :param instance: a nova.objects.instance.Instance object sent over rpc
  6133. :param destroy_disks: whether to destroy volumes or not
  6134. :param migrate_data: contains migration info
  6135. """
  6136. network_info = self.network_api.get_instance_nw_info(context, instance)
  6137. self._notify_about_instance_usage(
  6138. context, instance, "live_migration.rollback.dest.start",
  6139. network_info=network_info)
  6140. try:
  6141. # NOTE(tr3buchet): tear down networks on destination host
  6142. self.network_api.setup_networks_on_host(context, instance,
  6143. self.host, teardown=True)
  6144. except Exception:
  6145. with excutils.save_and_reraise_exception():
6146. # NOTE(tdurakov): even if tearing down networks fails, the driver
6147. # should still try to roll back the live migration on the destination.
  6148. LOG.exception('An error occurred while deallocating network.',
  6149. instance=instance)
  6150. finally:
  6151. # always run this even if setup_networks_on_host fails
  6152. # NOTE(vish): The mapping is passed in so the driver can disconnect
  6153. # from remote volumes if necessary
  6154. block_device_info = self._get_instance_block_device_info(context,
  6155. instance)
  6156. self.driver.rollback_live_migration_at_destination(
  6157. context, instance, network_info, block_device_info,
  6158. destroy_disks=destroy_disks, migrate_data=migrate_data)
  6159. self._notify_about_instance_usage(
  6160. context, instance, "live_migration.rollback.dest.end",
  6161. network_info=network_info)
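# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): the ordering guarantee in the method
# above, reduced to its core.  Network teardown failures are logged and
# re-raised in the real code, but the finally block means the driver-level
# rollback always runs before any exception propagates.  Both callables here
# are hypothetical stand-ins.
def rollback_at_destination(teardown_networks, driver_rollback):
    try:
        teardown_networks()
    finally:
        # Runs whether or not teardown_networks raised, so a teardown
        # failure can never skip the driver rollback.
        driver_rollback()


calls = []


def failing_teardown():
    calls.append('teardown')
    raise RuntimeError('network teardown failed')


try:
    rollback_at_destination(failing_teardown,
                            lambda: calls.append('driver_rollback'))
except RuntimeError:
    pass
assert calls == ['teardown', 'driver_rollback']
# ---------------------------------------------------------------------------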
  6162. @periodic_task.periodic_task(
  6163. spacing=CONF.heal_instance_info_cache_interval)
  6164. def _heal_instance_info_cache(self, context):
  6165. """Called periodically. On every call, try to update the
  6166. info_cache's network information for another instance by
  6167. calling to the network manager.
  6168. This is implemented by keeping a cache of uuids of instances
  6169. that live on this host. On each call, we pop one off of a
  6170. list, pull the DB record, and try the call to the network API.
  6171. If anything errors don't fail, as it's possible the instance
  6172. has been deleted, etc.
  6173. """
  6174. heal_interval = CONF.heal_instance_info_cache_interval
  6175. if not heal_interval:
  6176. return
  6177. instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
  6178. instance = None
  6179. LOG.debug('Starting heal instance info cache')
  6180. if not instance_uuids:
  6181. # The list of instances to heal is empty so rebuild it
  6182. LOG.debug('Rebuilding the list of instances to heal')
  6183. db_instances = objects.InstanceList.get_by_host(
  6184. context, self.host, expected_attrs=[], use_slave=True)
  6185. for inst in db_instances:
  6186. # We don't want to refresh the cache for instances
  6187. # which are building or deleting so don't put them
  6188. # in the list. If they are building they will get
  6189. # added to the list next time we build it.
  6190. if (inst.vm_state == vm_states.BUILDING):
  6191. LOG.debug('Skipping network cache update for instance '
  6192. 'because it is Building.', instance=inst)
  6193. continue
  6194. if (inst.task_state == task_states.DELETING):
  6195. LOG.debug('Skipping network cache update for instance '
  6196. 'because it is being deleted.', instance=inst)
  6197. continue
  6198. if not instance:
  6199. # Save the first one we find so we don't
  6200. # have to get it again
  6201. instance = inst
  6202. else:
  6203. instance_uuids.append(inst['uuid'])
  6204. self._instance_uuids_to_heal = instance_uuids
  6205. else:
  6206. # Find the next valid instance on the list
  6207. while instance_uuids:
  6208. try:
  6209. inst = objects.Instance.get_by_uuid(
  6210. context, instance_uuids.pop(0),
  6211. expected_attrs=['system_metadata', 'info_cache',
  6212. 'flavor'],
  6213. use_slave=True)
  6214. except exception.InstanceNotFound:
  6215. # Instance is gone. Try to grab another.
  6216. continue
  6217. # Check the instance hasn't been migrated
  6218. if inst.host != self.host:
  6219. LOG.debug('Skipping network cache update for instance '
  6220. 'because it has been migrated to another '
  6221. 'host.', instance=inst)
6222. # Check the instance isn't being deleted
  6223. elif inst.task_state == task_states.DELETING:
  6224. LOG.debug('Skipping network cache update for instance '
  6225. 'because it is being deleted.', instance=inst)
  6226. else:
  6227. instance = inst
  6228. break
  6229. if instance:
  6230. # We have an instance now to refresh
  6231. try:
6232. # Call to the network API to get instance info; this will
6233. # force an update to the instance's info_cache.
  6234. self.network_api.get_instance_nw_info(context, instance)
  6235. LOG.debug('Updated the network info_cache for instance',
  6236. instance=instance)
  6237. except exception.InstanceNotFound:
  6238. # Instance is gone.
  6239. LOG.debug('Instance no longer exists. Unable to refresh',
  6240. instance=instance)
  6241. return
  6242. except exception.InstanceInfoCacheNotFound:
  6243. # InstanceInfoCache is gone.
  6244. LOG.debug('InstanceInfoCache no longer exists. '
  6245. 'Unable to refresh', instance=instance)
  6246. except Exception:
  6247. LOG.error('An error occurred while refreshing the network '
  6248. 'cache.', instance=instance, exc_info=True)
  6249. else:
  6250. LOG.debug("Didn't find any instances for network info cache "
  6251. "update.")
  6252. @periodic_task.periodic_task
  6253. def _poll_rebooting_instances(self, context):
  6254. if CONF.reboot_timeout > 0:
  6255. filters = {'task_state':
  6256. [task_states.REBOOTING,
  6257. task_states.REBOOT_STARTED,
  6258. task_states.REBOOT_PENDING],
  6259. 'host': self.host}
  6260. rebooting = objects.InstanceList.get_by_filters(
  6261. context, filters, expected_attrs=[], use_slave=True)
  6262. to_poll = []
  6263. for instance in rebooting:
  6264. if timeutils.is_older_than(instance.updated_at,
  6265. CONF.reboot_timeout):
  6266. to_poll.append(instance)
  6267. self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll)
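# ---------------------------------------------------------------------------
# Illustrative sketch (standalone): the age check driving the polling task
# above, written with plain datetimes instead of oslo's timeutils.  An
# instance whose record has not been touched for longer than the configured
# timeout is considered stuck and handed to the driver for follow-up.
import datetime


def older_than(timestamp, seconds, now=None):
    """Return True if 'timestamp' is more than 'seconds' in the past."""
    now = now or datetime.datetime.utcnow()
    return now - timestamp > datetime.timedelta(seconds=seconds)


now = datetime.datetime(2018, 1, 1, 12, 0, 0)
stuck = datetime.datetime(2018, 1, 1, 11, 0, 0)      # rebooting for an hour
recent = datetime.datetime(2018, 1, 1, 11, 59, 30)   # rebooting for 30 seconds
assert older_than(stuck, 120, now=now)
assert not older_than(recent, 120, now=now)
# ---------------------------------------------------------------------------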
  6268. @periodic_task.periodic_task
  6269. def _poll_rescued_instances(self, context):
  6270. if CONF.rescue_timeout > 0:
  6271. filters = {'vm_state': vm_states.RESCUED,
  6272. 'host': self.host}
  6273. rescued_instances = objects.InstanceList.get_by_filters(
  6274. context, filters, expected_attrs=["system_metadata"],
  6275. use_slave=True)
  6276. to_unrescue = []
  6277. for instance in rescued_instances:
  6278. if timeutils.is_older_than(instance.launched_at,
  6279. CONF.rescue_timeout):
  6280. to_unrescue.append(instance)
  6281. for instance in to_unrescue:
  6282. self.compute_api.unrescue(context, instance)
  6283. @periodic_task.periodic_task
  6284. def _poll_unconfirmed_resizes(self, context):
  6285. if CONF.resize_confirm_window == 0:
  6286. return
  6287. migrations = objects.MigrationList.get_unconfirmed_by_dest_compute(
  6288. context, CONF.resize_confirm_window, self.host,
  6289. use_slave=True)
  6290. migrations_info = dict(migration_count=len(migrations),
  6291. confirm_window=CONF.resize_confirm_window)
  6292. if migrations_info["migration_count"] > 0:
  6293. LOG.info("Found %(migration_count)d unconfirmed migrations "
  6294. "older than %(confirm_window)d seconds",
  6295. migrations_info)
  6296. def _set_migration_to_error(migration, reason, **kwargs):
  6297. LOG.warning("Setting migration %(migration_id)s to error: "
  6298. "%(reason)s",
  6299. {'migration_id': migration['id'], 'reason': reason},
  6300. **kwargs)
  6301. migration.status = 'error'
  6302. with migration.obj_as_admin():
  6303. migration.save()
  6304. for migration in migrations:
  6305. instance_uuid = migration.instance_uuid
  6306. LOG.info("Automatically confirming migration "
  6307. "%(migration_id)s for instance %(instance_uuid)s",
  6308. {'migration_id': migration.id,
  6309. 'instance_uuid': instance_uuid})
  6310. expected_attrs = ['metadata', 'system_metadata']
  6311. try:
  6312. instance = objects.Instance.get_by_uuid(context,
  6313. instance_uuid, expected_attrs=expected_attrs,
  6314. use_slave=True)
  6315. except exception.InstanceNotFound:
  6316. reason = (_("Instance %s not found") %
  6317. instance_uuid)
  6318. _set_migration_to_error(migration, reason)
  6319. continue
  6320. if instance.vm_state == vm_states.ERROR:
  6321. reason = _("In ERROR state")
  6322. _set_migration_to_error(migration, reason,
  6323. instance=instance)
  6324. continue
6325. # race condition: For an instance in DELETING state we should not
6326. # set the migration status to error; otherwise an instance that is
6327. # being deleted while in RESIZED state
6328. # will not be able to confirm the resize.
  6329. if instance.task_state in [task_states.DELETING,
  6330. task_states.SOFT_DELETING]:
  6331. msg = ("Instance being deleted or soft deleted during resize "
  6332. "confirmation. Skipping.")
  6333. LOG.debug(msg, instance=instance)
  6334. continue
  6335. # race condition: This condition is hit when this method is
  6336. # called between the save of the migration record with a status of
  6337. # finished and the save of the instance object with a state of
  6338. # RESIZED. The migration record should not be set to error.
  6339. if instance.task_state == task_states.RESIZE_FINISH:
  6340. msg = ("Instance still resizing during resize "
  6341. "confirmation. Skipping.")
  6342. LOG.debug(msg, instance=instance)
  6343. continue
  6344. vm_state = instance.vm_state
  6345. task_state = instance.task_state
  6346. if vm_state != vm_states.RESIZED or task_state is not None:
  6347. reaso